# Library Imports

In [3]:
import pandas as pd
import glob
import requests
import time

# Data Imports / Exports

In [35]:
# THESE FILES ARE EXCLUDED FROM THE REPOSITORY
stream_json = glob.glob('datasets/Streaming_History_Audio*.json')

In [36]:
# see the list of globbed files
stream_json

['datasets\\Streaming_History_Audio_2016-2017_0.json',
 'datasets\\Streaming_History_Audio_2017-2018_1.json',
 'datasets\\Streaming_History_Audio_2018-2019_2.json',
 'datasets\\Streaming_History_Audio_2019-2020_3.json',
 'datasets\\Streaming_History_Audio_2020-2021_4.json',
 'datasets\\Streaming_History_Audio_2021-2022_5.json',
 'datasets\\Streaming_History_Audio_2022-2023_6.json',
 'datasets\\Streaming_History_Audio_2023_7.json',
 'datasets\\Streaming_History_Audio_2023_8.json']

In [40]:
# load the globbed files into a dataframe
stream_df = pd.concat([pd.read_json(f) for f in stream_json])

In [38]:
tracks_df = pd.read_csv('datasets/tracks_df.csv')
podcasts_df = pd.read_csv('datasets/podcasts_df.csv')

  tracks_df = pd.read_csv('datasets/tracks_df.csv')


In [41]:
print(stream_df.shape)
print(stream_df.columns)

(128775, 21)
Index(['ts', 'username', 'platform', 'ms_played', 'conn_country',
       'ip_addr_decrypted', 'user_agent_decrypted',
       'master_metadata_track_name', 'master_metadata_album_artist_name',
       'master_metadata_album_album_name', 'spotify_track_uri', 'episode_name',
       'episode_show_name', 'spotify_episode_uri', 'reason_start',
       'reason_end', 'shuffle', 'skipped', 'offline', 'offline_timestamp',
       'incognito_mode'],
      dtype='object')


In [42]:
stream_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 128775 entries, 0 to 1545
Data columns (total 21 columns):
 #   Column                             Non-Null Count   Dtype 
---  ------                             --------------   ----- 
 0   ts                                 128775 non-null  object
 1   username                           128775 non-null  object
 2   platform                           128775 non-null  object
 3   ms_played                          128775 non-null  int64 
 4   conn_country                       128775 non-null  object
 5   ip_addr_decrypted                  108167 non-null  object
 6   user_agent_decrypted               108167 non-null  object
 7   master_metadata_track_name         127961 non-null  object
 8   master_metadata_album_artist_name  127961 non-null  object
 9   master_metadata_album_album_name   127961 non-null  object
 10  spotify_track_uri                  127961 non-null  object
 11  episode_name                       639 non-null     object


In [43]:
# number of unique values in each column
stream_df.nunique()

ts                                   121283
username                                  1
platform                                 33
ms_played                             26836
conn_country                              9
ip_addr_decrypted                      2737
user_agent_decrypted                      6
master_metadata_track_name             4714
master_metadata_album_artist_name      1639
master_metadata_album_album_name       2680
spotify_track_uri                      5172
episode_name                            295
episode_show_name                        31
spotify_episode_uri                     295
reason_start                              8
reason_end                               10
shuffle                                   2
skipped                                   2
offline                                   2
offline_timestamp                    126902
incognito_mode                            2
dtype: int64

In [44]:
stream_df.isna().sum()

ts                                        0
username                                  0
platform                                  0
ms_played                                 0
conn_country                              0
ip_addr_decrypted                     20608
user_agent_decrypted                  20608
master_metadata_track_name              814
master_metadata_album_artist_name       814
master_metadata_album_album_name        814
spotify_track_uri                       814
episode_name                         128136
episode_show_name                    128136
spotify_episode_uri                  128136
reason_start                              0
reason_end                            20608
shuffle                                   0
skipped                              106195
offline                                   0
offline_timestamp                         0
incognito_mode                            0
dtype: int64

In [45]:
stream_df['user_agent_decrypted'].value_counts()

user_agent_decrypted
unknown                                                                                                                                                                  108091
Mozilla%2F5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit%2F537.36%20(KHTML,%20like%20Gecko)%20Chrome%2F95.0.4638.54%20Safari%2F537.36%20Edg%2F95.0.1020.40        66
Mozilla%2F5.0%20(Windows%20NT%2010.0;%20Win64;%20x64;%20rv:84.0)%20Gecko%2F20100101%20Firefox%2F84.0                                                                          6
Mozilla%2F5.0%20(Windows%20NT%2010.0;%20Win64;%20x64;%20rv:61.0)%20Gecko%2F20100101%20Firefox%2F61.0                                                                          2
Mozilla%2F5.0%20(Windows%20NT%2010.0;%20Win64;%20x64;%20rv:69.0)%20Gecko%2F20100101%20Firefox%2F69.0                                                                          1
Mozilla%2F5.0%20(Windows%20NT%2010.0;%20Win64;%20x64;%20rv:83.0)%20Gecko%2F20100101%20Firefox%2F83.

In [46]:
# dropping some columns that might cnotain sensitive information or just not useful
stream_df.drop(columns=['username', 'ip_addr_decrypted', 'user_agent_decrypted'], inplace=True)

there are a huge number of null value of 'spotify_episode_uri', indicating that tracks/songs and podcasts don't share same uri, we need to separate this into its own dataframe (for tracks and podcasts)

# Preparing for Data Collection
we need 4 things here.
1. tracks streaming history
2. podcasts streaming history
3. list of unique tracks streamed
4. ~~list of unique podcasts streamed~~ not really needed actually

In [55]:
# rows with null in both spotify_track_uri and spotify_episode_uri
print(stream_df[stream_df['spotify_episode_uri'].isna() & stream_df['spotify_track_uri'].isna()].shape)
print(stream_df[stream_df['spotify_episode_uri'].notna() & stream_df['spotify_track_uri'].notna()].shape)

(175, 18)
(0, 18)


so apparently there exists some rows are null on both spotify_episode_uri and spotify_track_uri, so those should be excluded as well

In [56]:
# get rows with null values in 'spotify_episode_uri' column and put it into a new dataframe
tracks_df = stream_df[stream_df['spotify_track_uri'].notna()]
podcasts_df = stream_df[stream_df['spotify_episode_uri'].notna()]

In [57]:
print(tracks_df.shape)
print(podcasts_df.shape)

(127961, 18)
(639, 18)


In [58]:
# list of tracks, containing unique values in 'spotify_track_uri' column
tracks_uri_list = tracks_df['spotify_track_uri'].unique().tolist()
podcasts_uri_list = podcasts_df['spotify_episode_uri'].unique().tolist()

In [59]:
print(len(tracks_uri_list))
print(len(podcasts_uri_list))

5172
295


In [60]:
# save tracks_df and podcasts_df into csv
tracks_df.to_csv('datasets/tracks_df.csv', index=False)
podcasts_df.to_csv('datasets/podcasts_df.csv', index=False)

# Data Collection (Spotipy)
now we're going to collect the using [Spotipy](https://spotipy.readthedocs.io/en/2.22.1/), it'll require user credentials (client_id and client_secret) which you can get [here](https://developer.spotify.com/dashboard) by registering your app at the dashboard

In [61]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [62]:
# Spotify API credentials (client_id and client_secret used will be excluded from the repository)
client_id = 'YOUR CLIENT ID HERE'
client_secret = 'YOUR CLIENT SECRET HERE'

In [63]:
#Authentication - without user
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

## get track info

In [65]:
# since they have the batch request limit of 50, we need to split the list into batches of 50
track_info = {}
for i in range(0, len(tracks_uri_list), 50):
    track_batch = tracks_uri_list[i:i+50]
    sapi = sp.tracks(track_batch)
    for track in sapi['tracks']:
        track_info[track['id']] = {'name':track['name'], 'artistName':track['artists'][0]['name'], 'release_date':track['album']['release_date'], 'popularity':track['popularity'], 'duration_ms':track['duration_ms']}
    time.sleep(1)
    

In [87]:
len(track_info)

5172

In [68]:
track_info

{'6tAM5c0bJOwRqGAEgiNMpI': {'name': 'Solace Album Mix',
  'artistName': 'Monstercat',
  'release_date': '2012-06-06',
  'popularity': 0,
  'duration_ms': 3538579},
 '4mjgNE8R31AzxWfPNGtVMf': {'name': 'Best of 2015 (Album Mix)',
  'artistName': 'Monstercat',
  'release_date': '2016-01-22',
  'popularity': 0,
  'duration_ms': 9158194},
 '5Q0P0cX3e42PgKd8LLS3ms': {'name': 'Horizon Album Mix',
  'artistName': 'Monstercat',
  'release_date': '2014-08-06',
  'popularity': 0,
  'duration_ms': 3623121},
 '6jvMmRtSzoEibQGrQkSISQ': {'name': 'Monstercat Best of 2012',
  'artistName': 'Monstercat',
  'release_date': '2013-02-04',
  'popularity': 0,
  'duration_ms': 6348017},
 '1KzLyjpjIRHuuj4iX8QsC2': {'name': 'Monstercat Podcast EP. 100',
  'artistName': 'Monstercat',
  'release_date': '2016-04-05',
  'popularity': 0,
  'duration_ms': 9744610},
 '4VrdksXJVhAOLW49qV0VTQ': {'name': 'Best of 2014 (Album Mix - Part 1)',
  'artistName': 'Monstercat',
  'release_date': '2015-01-26',
  'popularity': 0,


In [69]:
track_info_df = pd.DataFrame.from_dict(track_info, orient='index')

In [72]:
track_info_df.to_csv('datasets/tracks_info_df.csv', index=False)

## get track features

In [83]:
track_features = {}
for i in range(0, len(tracks_uri_list), 50):
    track_batch = tracks_uri_list[i:i+50]
    sapi = sp.audio_features(track_batch)
    for track in sapi:
        if track is not None:
            track_features[track['id']] = {'danceability':track['danceability'], 'energy':track['energy'], 'key':track['key'], 'loudness':track['loudness'], 'mode':track['mode'], 'speechiness':track['speechiness'], 'acousticness':track['acousticness'], 'instrumentalness':track['instrumentalness'], 'liveness':track['liveness'], 'valence':track['valence'], 'tempo':track['tempo'], 'time_signature':track['time_signature']}
        # we'll skip it if the track has no audio features data
    time.sleep(1)

In [85]:
len(track_features)

5165

In [84]:
track_features

{'6tAM5c0bJOwRqGAEgiNMpI': {'danceability': 0.446,
  'energy': 0.823,
  'key': 11,
  'loudness': -5.279,
  'mode': 0,
  'speechiness': 0.0859,
  'acousticness': 0.00256,
  'instrumentalness': 0.0748,
  'liveness': 0.117,
  'valence': 0.263,
  'tempo': 128.198,
  'time_signature': 4},
 '5Q0P0cX3e42PgKd8LLS3ms': {'danceability': 0.402,
  'energy': 0.856,
  'key': 1,
  'loudness': -4.256,
  'mode': 1,
  'speechiness': 0.0659,
  'acousticness': 0.00504,
  'instrumentalness': 0.0164,
  'liveness': 0.253,
  'valence': 0.247,
  'tempo': 140.028,
  'time_signature': 4},
 '4VrdksXJVhAOLW49qV0VTQ': {'danceability': 0.426,
  'energy': 0.915,
  'key': 11,
  'loudness': -3.881,
  'mode': 0,
  'speechiness': 0.143,
  'acousticness': 0.000899,
  'instrumentalness': 0.108,
  'liveness': 0.495,
  'valence': 0.271,
  'tempo': 130.405,
  'time_signature': 4},
 '0ng42pTjKgskmobNzhnEUa': {'danceability': 0.434,
  'energy': 0.839,
  'key': 8,
  'loudness': -4.493,
  'mode': 1,
  'speechiness': 0.119,
  'aco

In [89]:
track_features_df = pd.DataFrame.from_dict(track_features, orient='index')
track_features_df.to_csv('datasets/tracks_features_df.csv', index=False)