In [1]:
import json
import os
import time
from datetime import datetime

import numpy as np
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyOAuth
from spotipy.oauth2 import SpotifyClientCredentials
from tqdm import tqdm

# Spotify API credentials.
# Prefer the SPOTIPY_* environment variables so real secrets never have to be
# typed into (and saved with) the notebook; the placeholder strings remain as
# the fallback when a variable is not set.
client_id = os.environ.get('SPOTIPY_CLIENT_ID', 'replace with your client id')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET', 'replace with your client secret')
redirect_uri = os.environ.get('SPOTIPY_REDIRECT_URI', 'replace with your redirect uri')

In [59]:
# Authenticate with the app's client id / secret and build the API client
# (`sp`) that the lookup function in a later cell uses for its requests.
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [71]:
# Load extended streaming history
def _read_streaming_history(path):
    """Parse one streaming-history JSON export file into a DataFrame."""
    with open(path, 'r', encoding='utf-8') as handle:
        return pd.DataFrame(json.load(handle))


listen24 = _read_streaming_history("Spotify Extended Streaming History/Streaming_History_Audio_2024_9.json")
listen23 = _read_streaming_history("Spotify Extended Streaming History/Streaming_History_Audio_2023_7.json")
listen21_22 = _read_streaming_history("Spotify Extended Streaming History/Streaming_History_Audio_2021-2022_5.json")

In [60]:
def get_track_and_artist_info_batch(track_uris, batch_size=50):
    """Fetch track popularity and primary-artist details for many tracks.

    Tracks are requested in batches through the module-level ``sp`` client.
    The first artist listed on each track is treated as its primary artist.

    Args:
        track_uris: Sliceable sequence (list or NumPy array) of Spotify track
            URIs/IDs.
        batch_size: Number of tracks requested per batch. NOTE(review): the
            Spotify several-tracks / several-artists endpoints are documented
            as accepting at most 50 IDs, so larger values are expected to fail.

    Returns:
        dict with the keys ``'artist_ids'``, ``'top_track_popularities'``,
        ``'genres'`` and ``'track_popularities'``. Each value is a list with
        exactly one entry per element of ``track_uris``, in the same order.
        Entries are ``None`` for tracks whose batch failed or that came back
        as ``None`` from the API.
    """
    results = {
        'artist_ids': [],
        'top_track_popularities': [],
        'genres': [],
        'track_popularities': []
    }
    # artist_id -> highest popularity among that artist's top tracks, so an
    # artist who appears on many tracks costs a single top-tracks request.
    top_popularity_cache = {}
    empty_row = (None, None, None, None)

    for i in tqdm(range(0, len(track_uris), batch_size)):
        batch = track_uris[i:i + batch_size]
        try:
            # Get track info for whole batch at once
            tracks = sp.tracks(batch)['tracks']

            # Unique primary-artist IDs in this batch (None entries are skipped)
            artist_ids = list(set(
                track['artists'][0]['id'] for track in tracks if track is not None
            ))

            # Get artist info for all artists in batch at once
            artists = sp.artists(artist_ids)['artists'] if artist_ids else []
            artist_info = {artist['id']: artist for artist in artists}

            # Build the rows locally first. Nothing reaches `results` until
            # the whole batch has succeeded, so a failure part-way through
            # cannot leave the four lists with different lengths.
            rows = []
            for track in tracks:
                if track is None:
                    # Only this track is blank; the rest of the batch is kept.
                    rows.append(empty_row)
                    continue

                artist_id = track['artists'][0]['id']
                artist = artist_info[artist_id]

                if artist_id not in top_popularity_cache:
                    top_tracks = sp.artist_top_tracks(artist_id)['tracks']
                    top_popularity_cache[artist_id] = (
                        max(t['popularity'] for t in top_tracks) if top_tracks else None
                    )

                rows.append((
                    artist_id,
                    top_popularity_cache[artist_id],
                    artist['genres'],
                    track['popularity'],
                ))

            if len(rows) != len(batch):
                # Positional alignment with track_uris would be lost.
                raise ValueError(
                    f"expected {len(batch)} tracks in response, got {len(rows)}"
                )

        except Exception as e:
            print(f"Error processing batch starting at {i}: {e}")
            # Add None values for failed batch
            rows = [empty_row] * len(batch)
        else:
            # Small delay to avoid rate limits, plus a longer pause on every
            # fifth batch (this includes the very first one, i == 0).
            time.sleep(1)
            if i % (batch_size * 5) == 0:
                time.sleep(30)

        for artist_id, top_popularity, genres, popularity in rows:
            results['artist_ids'].append(artist_id)
            results['top_track_popularities'].append(top_popularity)
            results['genres'].append(genres)
            results['track_popularities'].append(popularity)

    return results

# Look up metadata for every distinct track in each export and checkpoint the
# raw lookup results to CSV. Each result is stored as a DataFrame (row order
# matches `unique_tracks`) because the merge cell further down calls `.merge`
# on it. Note that the merge cell later writes to these same CSV paths.
unique_tracks = listen24['spotify_track_uri'].dropna().unique()
res24 = pd.DataFrame(get_track_and_artist_info_batch(unique_tracks))
res24.to_csv('Cleaned_Data/2024RESULTS_REAL.csv')

unique_tracks = listen23['spotify_track_uri'].dropna().unique()
res23 = pd.DataFrame(get_track_and_artist_info_batch(unique_tracks))
res23.to_csv('Cleaned_Data/2023RESULTS_REAL.csv')

unique_tracks = listen21_22['spotify_track_uri'].dropna().unique()
res21 = pd.DataFrame(get_track_and_artist_info_batch(unique_tracks))
res21.to_csv('Cleaned_Data/2021_22RESULTS_REAL.csv')

100%|██████████| 30/30 [06:57<00:00, 13.92s/it]


In [65]:
# Per-period copies of the history restricted to rows that have a track URI.
listen22_na, listen23_na, listen24_na = (
    history.dropna(subset=['spotify_track_uri'])
    for history in (listen21_22, listen23, listen24)
)

In [81]:
# Columns of the raw streaming export that are left out of the cleaned CSVs.
UNUSED_HISTORY_COLUMNS = [
    'ts', 'username', 'platform', 'conn_country',
    'ip_addr_decrypted', 'user_agent_decrypted', 'episode_name',
    'episode_show_name', 'spotify_episode_uri', 'reason_start',
    'reason_end', 'shuffle', 'skipped', 'offline', 'offline_timestamp',
    'incognito_mode',
]


def unique_track_uris(listen_na):
    """Return the distinct track URIs of a history frame as a one-column DataFrame."""
    return pd.DataFrame({"spotify_track_uri": listen_na['spotify_track_uri'].dropna().unique()})


def attach_track_uris(track_results, uris):
    """Pair each lookup-result row with the track URI it was fetched for.

    Args:
        track_results: Lookup results, either the dict of equal-length lists
            returned by the batch lookup or a DataFrame built from it.
        uris: One-column DataFrame of URIs in the order they were looked up.

    Returns:
        DataFrame with the result columns plus ``spotify_track_uri``.
    """
    results_df = pd.DataFrame(track_results)
    # The join below is purely positional, so a length mismatch would attach
    # metadata to the wrong tracks without any error.
    assert len(results_df) == len(uris), (
        f"lookup results ({len(results_df)}) and URIs ({len(uris)}) differ in length"
    )
    return results_df.merge(uris, left_index=True, right_index=True, how='left')


def export_cleaned_history(listen, track_info, out_path):
    """Join track metadata onto the history, drop unused columns and incomplete rows, write CSV."""
    (
        listen
        .merge(track_info, on='spotify_track_uri', how='left')
        .drop(columns=UNUSED_HISTORY_COLUMNS)
        .dropna()
        .to_csv(out_path)
    )


uri24 = unique_track_uris(listen24_na)
step1 = attach_track_uris(res24, uri24)
export_cleaned_history(listen24, step1, 'Cleaned_Data/2024RESULTS_REAL.csv')

uri23 = unique_track_uris(listen23_na)
step1 = attach_track_uris(res23, uri23)
export_cleaned_history(listen23, step1, 'Cleaned_Data/2023RESULTS_REAL.csv')

uri21 = unique_track_uris(listen22_na)
step1 = attach_track_uris(res21, uri21)
# The 2021-2022 history frame is named `listen21_22`; no `listen21` exists.
export_cleaned_history(listen21_22, step1, 'Cleaned_Data/2021_22RESULTS_REAL.csv')