In [None]:
import pandas as pd
import numpy as np

df_stream = pd.read_json('/Users/michellezhang/Documents/HMS/Jiam Lab/Spotify Project/Data/29/29_Spotify Extended Streaming History/Streaming_History_Audio_2013-2024_29.json')

df_stream['UniqueID'] = df_stream['master_metadata_album_artist_name'] + ":" + df_stream['master_metadata_track_name']

df_stream.head()

In [None]:
#Cleaning df_stream by getting rid of rows without URIs

df_stream_cleaned = df_stream.dropna(subset=['spotify_track_uri'])

df_stream_cleaned.head()


In [None]:

# create blank dictionary to store track URI, artist URI, and genres
feature_dict = {}

# convert track_uri column to an iterable list
track_uris = df_stream_cleaned['spotify_track_uri'].to_list()

num_entries = len(track_uris)

print("Number of entries in track_uris",num_entries)

# Removing duplicates
unique_track_uris = list(set(track_uris))

num_entries_unique = len(unique_track_uris)

print("Number of entries in unique_track_uris",num_entries_unique)

print(unique_track_uris[:10])



In [5]:
import spotipy
import time
import pandas as pd
import requests
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.exceptions import SpotifyException

client_id = '400cdcd09b0c49559616bee70d4a1afa'
client_secret = '840d6f90e6934ab0aa70a9c07099534c'
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
sp.trace=False


In [6]:

# Put URI to dataframe
df = pd.DataFrame(unique_track_uris, columns=['URI'])

# Initialize empty columns for audio features
features_list = ['energy', 'loudness', 'speechiness', 'valence', 'liveness', 'tempo', 'danceability', 'acousticness', 'instrumentalness']
for feature in features_list:
    df[feature] = ''

# Define a function to process a batch of URIs
def process_batch(batch_uris, df):
    while True:
        try:
            features = sp.audio_features(batch_uris)
            for feature in features:
                if feature:
                    URI = feature['uri']
                    df.loc[df['URI'] == URI, 'energy'] = feature['energy']
                    df.loc[df['URI'] == URI, 'acousticness'] = feature['acousticness']
                    df.loc[df['URI'] == URI, 'instrumentalness'] = feature['instrumentalness']
                    df.loc[df['URI'] == URI, 'loudness'] = feature['loudness']
                    df.loc[df['URI'] == URI, 'speechiness'] = feature['speechiness']
                    df.loc[df['URI'] == URI, 'valence'] = feature['valence']
                    df.loc[df['URI'] == URI, 'liveness'] = feature['liveness']
                    df.loc[df['URI'] == URI, 'tempo'] = feature['tempo']
                    df.loc[df['URI'] == URI, 'danceability'] = feature['danceability']
            break  # Exit the while loop if request is successful
        except SpotifyException as e:
            if e.http_status == 429:
                print("429 Too Many Requests encountered. Waiting 30 minutes before retrying...")
                time.sleep(1800)  # Sleep for 1800 seconds (30 minutes)
            else:
                print(f"An error occurred: {e}")
                break  # Exit the while loop for other errors
            
batch_size = 100
for start in range(0, len(unique_track_uris), batch_size):
    end = start + batch_size
    batch_uris = unique_track_uris[start:end]
    process_batch(batch_uris, df)
    print(f"Processed batch {start // batch_size + 1}")
    time.sleep(15)  # Fixed 15 second delay between each batch

Processed batch 1
Processed batch 2
Processed batch 3
Processed batch 4
Processed batch 5
Processed batch 6
Processed batch 7
Processed batch 8
Processed batch 9
Processed batch 10
Processed batch 11
Processed batch 12
Processed batch 13
Processed batch 14
Processed batch 15
Processed batch 16
Processed batch 17
Processed batch 18
Processed batch 19
Processed batch 20
Processed batch 21
Processed batch 22


In [None]:
#Make URI the index column
df=df[['URI']+[col for col in df.columns if col != 'URI']]

print(df.head())

In [62]:
# Method 1 - using dictionaries

# Step 1: Create a dictionary to store the count of occurrences of each URI in track_uris
uri_count = {}
for uri in track_uris:
    uri_count[uri] = uri_count.get(uri, 0) + 1

# Step 2-4: Calculate the weighted average of each audio feature
weighted_avg = {}
for feature in features_list:
    total_weighted_sum = 0
    total_count = 0
    for index, row in df.iterrows():
        uri = row['URI']
        if uri in uri_count:
            count = uri_count[uri]
            value = row[feature]
            if value:  # Only consider non-empty values
                total_weighted_sum += value * count
                total_count += count
    if total_count != 0:
        weighted_avg[feature] = total_weighted_sum / total_count
    else:
        weighted_avg[feature] = 0  # To avoid division by zero if no URI matches

print("Weighted Average of Audio Features:")
for feature, value in weighted_avg.items():
    print(f"{feature}: {value}")

Weighted Average of Audio Features:
energy: 5.028122065511312e+21
loudness: -3.473626312434061e+22
speechiness: 1.300327726457464e+21
valence: 2.2111057964665807e+21
liveness: 1.5682038285024808e+21
tempo: 8.505350386307177e+23
danceability: 2.551275918999628e+21
acousticness: 8.334509857906367e+19
instrumentalness: 1.3688006801543696e+20


In [61]:
# Method 2 - using numpy

import numpy as np

# Convert the DataFrame to a numpy array
df_values = df[features_list].to_numpy(dtype=float)

# Initialize an array to store the weighted sums
weighted_sums = np.zeros(len(features_list))

# Initialize a variable to store the total count of URIs in track_uris
total_count = 0

# Iterate through each row of the DataFrame
for index, row in df.iterrows():
    uri = row['URI']
    if uri in uri_count:
        count = uri_count[uri]
        weighted_sums += row[features_list] * count
        total_count += count

# Calculate the weighted average
weighted_avg = weighted_sums / total_count

# Display the weighted average of audio features
print("Weighted Average of Audio Features:")
for feature, value in zip(features_list, weighted_avg):
    print(f"{feature}: {value}")

Weighted Average of Audio Features:
energy: 5.028122065511312e+21
loudness: -3.473626312434061e+22
speechiness: 1.2995255501919135e+21
valence: 2.2097417583441462e+21
liveness: 1.5682038285024808e+21
tempo: 8.50010340889428e+23
danceability: 2.549702028858357e+21
acousticness: 8.334509857906367e+19
instrumentalness: 1.0253640417125356e+20
