In [1]:
from datetime import datetime
import pandas as pd
import time
from tqdm.auto import tqdm
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Insert your spotify credentials
ID = ''
SECRET = ''
sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(client_id=ID,
                                                                         client_secret=SECRET))
tqdm.pandas()

In [2]:
# Import data
json_paths = ['MyData/StreamingHistory0.json', 'MyData/StreamingHistory1.json'] # Insert your straming history files
frame = pd.DataFrame()
for path in json_paths:
    frame = pd.concat([frame, pd.read_json(path)])

In [3]:
# Original data
print(f'Original Dataset size: {len(frame)}')
print(f"Original data from {frame.endTime.min()} to {frame.endTime.max()}")

# Filter very short plays
df = frame[frame['msPlayed'] > 500]
# Convert end time to datetime
df.rename(columns={'endTime': 'datetime'}, inplace=True)
df.loc[:, 'datetime'] = pd.to_datetime(df.datetime, format='%Y-%m-%d %H:%M')
df.loc[:, 'fullName'] = df.trackName + ' - ' + df.artistName
      
# Cleaned data
count = len(df)
days = (df.datetime.max() - df.datetime.min()).days
print(f'Cleaned Dataset size: {count}')
print(f'Cleaned data from {df.datetime.min()} to {df.datetime.max()}')
print(f'Columns: {df.columns}')

Original Dataset size: 11190
Original data from 2019-10-01 20:18 to 2020-10-19 22:15
Cleaned Dataset size: 11140
Cleaned data from 2019-10-01 20:18:00 to 2020-10-19 22:15:00
Columns: Index(['artistName', 'datetime', 'msPlayed', 'trackName', 'fullName'], dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)


In [4]:
# Functions to fetch track features
def matches(sp_track, track_name, artist):
    return sp_track['name'] == track_name and artist in [el['name'] for el in sp_track['artists']]

def get_track_uri(track_name, artist):
    results = sp.search(q=f'{track_name} artist:{artist}', type='track')
    if results['tracks']['items']:
        for sp_track in results['tracks']['items']:
            if matches(sp_track, track_name, artist):
                return sp_track['uri']
    return None

def get_track_features(uri):
    if uri is None:
        return None
    return sp.audio_features(uri)

def generate_features_series(x):
    uri = get_track_uri(x['trackName'], x['artistName'])
    features = get_track_features(uri)
    if features:
        return pd.Series(
            [uri,
             features[0]['duration_ms'],
             features[0]['key'],
             features[0]['time_signature'],
             features[0]['acousticness'],
             features[0]['danceability'],
             features[0]['energy'],
             features[0]['instrumentalness'],
             features[0]['liveness'],
             features[0]['loudness'],
             features[0]['speechiness'],
             features[0]['valence'],
             features[0]['tempo']])
    else:
        return pd.Series([None] * 13)

In [None]:
# Apply this to dataframe
df[['uri',
    'duration_ms',
    'key',
    'time_signature',
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'valence',
    'tempo']] = df.progress_apply(generate_features_series, axis=1)

In [None]:
# Save dataframe to disk, as it is quite expensive to recompute
filename = f'MyData/data{int(time.time())}.pkl'
df.to_pickle(filename)