In [1]:
import json
import pandas as pd
from datetime import datetime
import spotipy
from spotipy.oauth2 import SpotifyOAuth
from spotipy.oauth2 import SpotifyClientCredentials
import time
from tqdm import tqdm
import numpy as np

client_id = 'replace with your client id'
client_secret = 'replace with your client secret'
redirect_uri = 'replace with your redirect uri'

In [2]:
client_credentials_manager = SpotifyClientCredentials(
        client_id=client_id,
        client_secret=client_secret
    )
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [3]:
# Load extended streaming history
with open("RawData/Streaming_History_Audio_2024_9.json", 'r', encoding='utf-8') as file:
    listen24 = pd.DataFrame(json.load(file))
    
with open("RawData/Streaming_History_Audio_2023_7.json", 'r', encoding='utf-8') as file:
    listen23 = pd.DataFrame(json.load(file))
                            
with open("RawData", 'r', encoding='utf-8') as file:
    listen21_22 = pd.DataFrame(json.load(file))

In [7]:

def get_artist_id(track_uri):
    try:
        return sp.track(track_uri)['artists'][0]['id']
    except:
        return pd.NA

def get_artist_top_track_popularity(artist_id):
    top_track_popularities = []
    try:
        for track in sp.artist_top_tracks(artist_id)['tracks']:
            top_track_popularities.append(track['popularity'])
        return np.mean(top_track_popularities)
    except:
        return pd.NA

def get_artist_genres(artist_id):
    try:
        return sp.artist(artist_id)['genres']
    except:
        return []

def get_track_popularity(track_uri):
    try:
        return sp.track(track_uri)['popularity']
    except:
        return pd.NA

In [8]:
def get_spotify_features(df, client_id, client_secret):
    """
    Add Spotify audio features to each row of the dataframe with better error handling
    """
    # Initialize Spotify client
    client_credentials_manager = SpotifyClientCredentials(
        client_id=client_id,
        client_secret=client_secret
    )
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
    
    # Get unique URIs
    unique_uris = df['spotify_track_uri'].unique()
    print(f"Getting features for {len(unique_uris)} unique tracks...")
    
    # Create dictionary to store features
    features_dict = {}
    failed_uris = []
    
    # Process unique URIs in batches of 100
    for i in tqdm(range(0, len(unique_uris), 100)):
        batch = unique_uris[i:i+100]
        
        try:
            # Get audio features for batch
            audio_features = sp.audio_features(batch)
            
            
            # Store features in dictionary and track failures
            for uri, features in zip(batch, audio_features):
                if features is not None:
                    features_dict[uri] = features
                else:
                    failed_uris.append(uri)
                    print(f"Failed to get features for URI: {uri}")
                    # Try to get track info to see if it exists
                    try:
                        track_info = sp.track(uri)
                        print(f"Track exists: {track_info['name']} by {track_info['artists'][0]['name']}")
                    except:
                        print("Could not get track info - track may not exist")
                    
        except Exception as e:
            print(f"Error processing batch {i}-{i+100}: {str(e)}")
            failed_uris.extend(batch)
        
        time.sleep(0.25)  # Rate limiting
    
    # Print summary of failures
    if failed_uris:
        print(f"\nFailed to get features for {len(failed_uris)} tracks")
        print("\nSample of failed tracks:")
        for uri in failed_uris[:5]:
            tracks = df[df['spotify_track_uri'] == uri]
            print(f"\nURI: {uri}")
            print(f"Track name(s): {tracks['master_metadata_track_name'].unique()}")
            print(f"Artist(s): {tracks['master_metadata_album_artist_name'].unique()}")
    
    # Add feature columns to original dataframe
    feature_cols = ['danceability', 'energy', 'key', 'loudness', 'mode', 
                   'speechiness', 'acousticness', 'instrumentalness',
                   'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']
    
    # Initialize new columns with None
    for col in feature_cols:
        df[col] = None
    
    # Fill in features for each row
    print("\nAdding features to rows...")
    for i, row in tqdm(df.iterrows(), total=len(df)):
        uri = row['spotify_track_uri']
        if uri in features_dict:
            for col in feature_cols:
                df.at[i, col] = features_dict[uri][col]
    
    # Print summary
    total_tracks = len(df)
    tracks_with_features = df[df['danceability'].notna()].shape[0]
    print(f"\nSummary:")
    print(f"Total tracks: {total_tracks}")
    print(f"Tracks with features: {tracks_with_features}")
    print(f"Tracks without features: {total_tracks - tracks_with_features}")
    
    return df

In [9]:
listen_2024 = get_spotify_features(listen24, client_id, client_secret)
listen_2023 = get_spotify_features(listen24, client_id, client_secret)
listen_2021_2022 = get_spotify_features(listen21_22, client_id, client_secret)

Getting features for 1498 unique tracks...


  0%|          | 0/15 [00:00<?, ?it/s]

Error processing batch 0-100: expected string or bytes-like object


100%|██████████| 15/15 [00:08<00:00,  1.85it/s]



Failed to get features for 100 tracks

Sample of failed tracks:

URI: spotify:track:0RGGAP5gpvToJyUZbiEcXO
Track name(s): ['Bloody Samaritan (with Kelly Rowland) - Remix']
Artist(s): ['Ayra Starr']

URI: spotify:track:1D1hdsuMYmaAVOXnz9hdmt
Track name(s): ['LEFT RIGHT']
Artist(s): ['Wisa Greid']

URI: spotify:track:1dxl7CSBstnOcLzWrQiBLj
Track name(s): ['Allo']
Artist(s): ['Youka']

URI: spotify:track:66gOtDjvW09YUpXfAO5opn
Track name(s): ['NO IDEA #jerseyclub (Ziahfyah & Indo2x Remix) - SLOWED & REVERB']
Artist(s): ['Don Toliver']

URI: spotify:track:5J1GXL8FDxHH1ki8oNz4hD
Track name(s): ['Choco']
Artist(s): ['Dj Verigal']

Adding features to rows...


100%|██████████| 4517/4517 [00:00<00:00, 12220.10it/s]



Summary:
Total tracks: 4517
Tracks with features: 3878
Tracks without features: 639
Getting features for 2251 unique tracks...


  0%|          | 0/23 [00:00<?, ?it/s]

Error processing batch 0-100: expected string or bytes-like object


100%|██████████| 23/23 [00:12<00:00,  1.86it/s]



Failed to get features for 100 tracks

Sample of failed tracks:

URI: spotify:track:42m3eP1JJhtzffal9B136J
Track name(s): ['7am On Bridle Path']
Artist(s): ['Drake']

URI: spotify:track:4s7TIubkdsgdtDEDFEAYVL
Track name(s): ['Intro (Hate on Me)']
Artist(s): ['Meek Mill']

URI: spotify:track:5J6rTmMjF9DVIAF8G3M9n4
Track name(s): ['Wandered To LA (with Justin Bieber)']
Artist(s): ['Juice WRLD']

URI: spotify:track:0lk5tzmaJWVAv5GBvpm3xu
Track name(s): ['Peru']
Artist(s): ['Fireboy DML']

URI: spotify:track:2sw9r0DEwO1Nqg6eBtsCcc
Track name(s): ['Enchanted Waterfall']
Artist(s): ['Tory Lanez']

Adding features to rows...


100%|██████████| 16620/16620 [00:01<00:00, 13625.98it/s]


Summary:
Total tracks: 16620
Tracks with features: 12320
Tracks without features: 4300





In [16]:
extended_history = pd.concat([listen_2021_2022, listen_2023, listen_2024], ignore_index=True)
extended_history.to_csv('Detailed Listening History/full_history.csv')