In [1]:
#pip install spotipy

In [2]:
#pip install tqdm

In [3]:
#pip install pymongo

In [1]:
print('initializing...')

# import libraries

import configparser
import csv
import json
import os
import time
from configparser import ConfigParser
from urllib.parse import quote_plus

import boto3
import pymongo
import spotipy
from bson import json_util
from spotipy.oauth2 import SpotifyClientCredentials
from tqdm import tqdm

initializing...


In [2]:
# place the config files outside of the current directory
# this will set the location of the config files as the parent directory

curr_path = os.getcwd()
config_path = os.path.abspath(os.path.join(curr_path, os.pardir))

# build the file names with os.path.join so the separator matches the host OS
# (a hard-coded '\\' only resolves correctly on Windows)
general_config_file = os.path.join(config_path, 'general_config.ini')
project_config_file = os.path.join(config_path, 'spotify_data_proj.ini')

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the file was not found.
# Fail here with the offending path instead of a KeyError further down.
general_config = configparser.ConfigParser()
if not general_config.read(general_config_file):
    raise FileNotFoundError('could not read config file: ' + general_config_file)

config = configparser.ConfigParser()
if not config.read(project_config_file):
    raise FileNotFoundError('could not read config file: ' + project_config_file)


# build the Spotify API client from the client id / secret pair stored in the
# [credentials] section of the project config

spotify_credentials = config['credentials']
client_id = spotify_credentials['client_id']
client_secret = spotify_credentials['client_secret']

client_credentials_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)


# AWS / S3 settings
# NOTE(review): an earlier comment suggested this section could be deleted, but
# the CSV export cell still writes to local_file_path and the upload cell still
# uses bucket_name and s3_object_name, so it has to stay while those cells exist.

# general settings: the key pair is exported as environment variables
# (presumably so that boto3 picks it up when the S3 client is created)
s3_general = general_config['s3_access']
os.environ['AWS_ACCESS_KEY_ID'] = s3_general['access_key_id']
os.environ['AWS_SECRET_ACCESS_KEY'] = s3_general['secret_access_key']
bucket_name = s3_general['bucket_name']

# project specific settings
s3_project = config['s3_access']
s3_object_name = s3_project['object_name']
local_file_path = s3_project['local_file_path']

# MongoDB settings, read from the [mongodb_access] section of each config file

# general settings: login shared across projects
mongodb_general = general_config['mongodb_access']
mongodb_pw = mongodb_general['password']
mongodb_username = mongodb_general['username']

# project specific settings: where this project's data lives
mongodb_project = config['mongodb_access']
mongodb_cluster = mongodb_project['cluster_name']
mongodb_db = mongodb_project['database']
mongodb_dbname = mongodb_project['dbname']
mongodb_collection = mongodb_project['collection']


In [38]:
# get the number of batches from the user

batches_answer = input('each batch contains track data for 50 artists. how many batches would you like to retrieve?')

# reject anything that is not a non-negative whole number with a readable message
try:
    num_batches = int(batches_answer)
except ValueError:
    raise ValueError('the number of batches must be a whole number, got ' + repr(batches_answer)) from None
if num_batches < 0:
    raise ValueError('the number of batches cannot be negative, got ' + str(num_batches))


# get a list of the top artist IDs, 50 at a time

offst = 0
top_artist_list = []

for _ in tqdm(range(num_batches), desc = 'generating list of top artists'):
    top_artists = sp.search(q='year:2023', type='artist', offset=offst, limit=50)
    artist_items = top_artists['artists']['items']

    # an empty page means the search has no more results to offer
    # (presumably later offsets are empty as well), so stop requesting
    if not artist_items:
        break

    top_artist_list.extend(artist['id'] for artist in artist_items)
    offst += 50

    # short pause between requests
    time.sleep(0.1)

each batch contains track data for 50 artists. how many batches would you like to retrieve? 20


generating list of top artists: 100%|██████████| 20/20 [00:18<00:00,  1.09it/s]


In [3]:
# define your MongoDB Atlas connection using PyMongo
#
# the username and password are percent-escaped with quote_plus before being
# placed in the URI: reserved characters such as '@', ':', '/' or '%' in a raw
# credential would otherwise make the connection string invalid.
# NOTE(review): this expects the config file to hold the RAW (unescaped)
# username and password - confirm they were not pre-escaped by hand.

client = pymongo.MongoClient(
    "mongodb+srv://"
    + quote_plus(mongodb_username) + ":" + quote_plus(mongodb_pw)
    + "@" + mongodb_cluster + "." + mongodb_db
    + ".mongodb.net/?retryWrites=true&w=majority"
)


# select variables for the database and collection you want to upload to using the pymongo syntax

mongodb_db_var = client[mongodb_dbname]
mongodb_collection_var = mongodb_db_var[mongodb_collection]

In [39]:
# retrieve the track data from the API for each artist
#
# NOTE: every run of this cell inserts new documents; nothing here checks for
# tracks that are already in the collection, so re-running adds duplicates.

for artist_id in tqdm(top_artist_list, desc = "retrieving track data for each artist"):
    artist_tracks = sp.artist_top_tracks(artist_id=artist_id, country="US")["tracks"]

    # write all of this artist's tracks to the MongoDB collection in one call;
    # insert_many rejects an empty list, so skip artists that returned no tracks

    if artist_tracks:
        mongodb_collection_var.insert_many(artist_tracks)

    # short pause between requests
    time.sleep(0.1)

print('data retrieval complete')

retrieving track data for each artist: 100%|██████████| 1000/1000 [08:09<00:00,  2.04it/s]

data retrieval complete





In [37]:
# if you need to delete everything from your MongoDB collection and start over...

#result = mongodb_collection_var.delete_many({})
#print(result.deleted_count, "documents deleted.")

4670 documents deleted.


In [4]:
# write and define the mongodb query
# an empty query selects every document; the projection drops the listed fields

query = {}
projection = {"album":0, "href":0, "external_urls":0, "images":0, "artists":0}

# export data to a local csv file (it is uploaded to s3 in a later cell)

cursor = mongodb_collection_var.find(query, projection)

# set up csv writer fields

csv_headers = ['_id', 'disc_number', 'duration_ms', 'explicit', 'external_ids', 'id', 'is_local', 'is_playable', 'name', 'popularity', 'preview_url', 'track_number', 'type', 'uri']

# create the csv file and write the header row

with open(local_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    # extrasaction='ignore': a document carrying a field that is not listed in
    # csv_headers is written without that field instead of raising ValueError
    # and aborting the export part-way through
    writer = csv.DictWriter(csvfile, fieldnames=csv_headers, extrasaction='ignore')
    writer.writeheader()

    # write each document to the CSV file

    writer.writerows(cursor)

# don't close the MongoDB connection until you are done using MongoDB!

client.close()

In [5]:
# Create the S3 client, then send the exported csv file to the bucket
# under the configured object name

s3 = boto3.client('s3')
s3.upload_file(
    Filename=local_file_path,
    Bucket=bucket_name,
    Key=s3_object_name,
)