In [1]:
# import relevant libraries
import base64
import concurrent.futures
import logging
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from notebook.services.config import ConfigManager
from ratelimit import limits, sleep_and_retry

In [2]:
# read the cleaned streams CSV written by the data-fetch notebook into a DataFrame
# NOTE(review): absolute, user-specific path -- this cell only runs on the original machine
streams_df = pd.read_csv(
    filepath_or_buffer=r'/Users/adityamxr/Desktop/spotify-time-series/data-fetching/streams_df_1.csv'
)

In [3]:
# verify import: preview the first five rows of the loaded frame
streams_df.head(n=5)

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,instrumentalness,liveness,loudness,mode,popularity,speechiness,tempo,valence,year
0,0.995,['Carl Woitschach'],0.708,158648,0.195,0,0.563,0.151,-12.428,1,0,0.0506,118.469,0.779,1928
1,0.994,"['Robert Schumann', 'Vladimir Horowitz']",0.379,282133,0.0135,0,0.901,0.0763,-28.454,1,0,0.0462,83.972,0.0767,1928
2,0.604,['Seweryn Goszczyński'],0.749,104300,0.22,0,0.0,0.119,-19.924,0,0,0.929,107.177,0.88,1928
3,0.995,['Francisco Canaro'],0.781,180760,0.13,0,0.887,0.111,-14.734,0,0,0.0926,108.003,0.72,1928
4,0.99,"['Frédéric Chopin', 'Vladimir Horowitz']",0.21,687733,0.204,0,0.908,0.098,-16.829,1,1,0.0424,62.149,0.0693,1928


In [4]:
# raise the IOPub data rate limit so the notebook does not stop emitting output
# when processing large volumes of data or frequent API rate-limiting messages
# NOTE(review): iopub_data_rate_limit is normally passed to the server at launch --
# confirm that writing it through ConfigManager actually takes effect here.
# `cm` is bound to whatever update() returns, not to the ConfigManager instance.
cm = ConfigManager().update(
    'notebook',
    {'NotebookApp': {'iopub_data_rate_limit': 100_000_000}},
)

In [5]:
# set up logging
logging.basicConfig(level=logging.INFO)

# spotify API credentials
# Read from the environment so the secrets never live in the notebook or its
# version history; export SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET before
# starting Jupyter.
# NOTE: the key pair that used to be hardcoded here has been exposed and should
# be rotated in the Spotify developer dashboard.
client_id = os.environ.get("SPOTIFY_CLIENT_ID", "")
client_secret = os.environ.get("SPOTIFY_CLIENT_SECRET", "")
if not (client_id and client_secret):
    # warn instead of raising so the remaining setup cells still run
    logging.warning(
        "SPOTIFY_CLIENT_ID / SPOTIFY_CLIENT_SECRET are not set; "
        "Spotify API requests cannot be authenticated until they are."
    )

# cache to store artist-genre mappings (artist name -> list of genres)
genre_cache = {}

# function to get the access token
def get_access_token(client_id, client_secret, timeout=10):
    """Request an access token from Spotify using the client-credentials grant.

    Args:
        client_id: Spotify application client ID.
        client_secret: Spotify application client secret.
        timeout: Seconds to wait for the token endpoint before giving up.

    Returns:
        The value of 'access_token' in the token endpoint's JSON response.

    Raises:
        ValueError: If either credential is empty or None.
        requests.exceptions.RequestException: On connection failure, timeout,
            or an HTTP error status from the token endpoint.
        RuntimeError: If the response carries no 'access_token'.
    """
    # fail early with a clear message instead of sending a request that cannot succeed
    if not client_id or not client_secret:
        raise ValueError("Spotify client_id and client_secret must both be provided")

    token_url = "https://accounts.spotify.com/api/token"

    # HTTP Basic auth: base64("<client_id>:<client_secret>")
    credentials = f"{client_id}:{client_secret}"
    encoded_credentials = base64.b64encode(credentials.encode()).decode()

    headers = {
        "Authorization": f"Basic {encoded_credentials}"
    }
    data = {
        "grant_type": "client_credentials"
    }

    # without a timeout, requests waits indefinitely on an unresponsive server
    response = requests.post(token_url, headers=headers, data=data, timeout=timeout)
    response.raise_for_status()

    access_token = response.json().get('access_token')
    if not access_token:
        # returning None here would surface later as "Bearer None" on every API call
        raise RuntimeError("Spotify token response did not contain an access_token")
    return access_token

In [7]:
# exchange the configured client ID / client secret for an access token
access_token = get_access_token(client_id=client_id, client_secret=client_secret)

# define a rate-limited function to fetch genres for a given artist from the spotify api
# the function is rate-limited to 10 calls per second using the 'limits' decorator
@sleep_and_retry  # when the local call budget is exhausted, sleep until it resets, then call again
@limits(calls=10, period=1)  # limit to 10 API calls per second
def get_artist_genres(artist_name, access_token, timeout=10):
    """Look up an artist by name on Spotify and return that artist's genres.

    The search is limited to one result, so the genres are those of the first
    artist the search endpoint returns for `artist_name`.

    Note that the decorators only throttle calls made from this process; an
    HTTP error from the API (including a server-side rate-limit response) is
    handled like any other request failure below.

    Args:
        artist_name: Artist name used as the search query.
        access_token: Bearer token sent in the Authorization header.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The 'genres' list of the first matching artist, or an empty list when
        no artist is found or the request fails. Failures are logged, not raised.
    """
    # spotify api endpoint to search for an artist
    search_url = "https://api.spotify.com/v1/search"

    # set up the authorization header with the Bearer token
    headers = {
        "Authorization": f"Bearer {access_token}"
    }

    # set up the query parameters to search for the artist by name
    params = {
        "q": artist_name,  # artist name to search for
        "type": "artist",  # specify that we are searching for an artist
        "limit": 1  # limit the search to 1 artist (the most relevant one)
    }

    try:
        # make the GET request to the Spotify API to search for the artist;
        # the timeout keeps a stalled connection from blocking the whole run
        response = requests.get(search_url, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()  # Raise an exception for any HTTP errors

        # parse the JSON response to extract artist data
        data = response.json()
        artists = data.get('artists', {}).get('items', [])

        # if no artist is found, log a warning and return an empty list
        if not artists:
            logging.warning(f"No artist found for {artist_name}")
            return []

        # get the genres associated with the first (most relevant) artist found
        artist_info = artists[0]
        return artist_info.get('genres', [])

    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException subclass
        logging.error(f"Error fetching data for {artist_name}: {e}")
        return []

In [8]:
# cached front-end for get_artist_genres: each artist name is looked up at most once
def get_artist_genres_cached(artist_name, access_token):
    """Return the genres for `artist_name`, consulting `genre_cache` first.

    On a miss the genres are fetched with get_artist_genres and stored in the
    module-level `genre_cache` under `artist_name`. Whatever that call returns
    is cached as-is, including an empty list.
    """
    if artist_name not in genre_cache:
        # miss: fetch from the Spotify API and remember the result
        fetched_genres = get_artist_genres(artist_name, access_token)
        genre_cache[artist_name] = fetched_genres
        logging.info(f"Fetched and cached genres for '{artist_name}': {fetched_genres}")
        return fetched_genres

    # hit: serve the stored list without touching the API
    logging.info(f"Cache hit for '{artist_name}'")
    return genre_cache[artist_name]