In [5]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from collections import Counter
from tqdm.notebook import tqdm
import json
import os
import time
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import matplotlib.pyplot as plt
plt.rc('font', size=14)
import seaborn as sns
sns.set(style='whitegrid', color_codes=True, rc={'figure.figsize':(11,8)}, font_scale=2)

In [3]:
#os.chdir('/c/Users/matth/Documents/Coding/spotify/')

In [4]:
#!pwd

## Loading and Tidying Streaming Data

### Loading

In [8]:
# read streaming data and concat rows
strm_base = pd.DataFrame()
for file in os.listdir("./data"):
    if file.startswith("Streaming"):
        file_path = "./data/" + file
        temp = pd.read_json(file_path)
        strm_base = pd.concat([strm_base,temp])

In [9]:
strm_base.head()

Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2021-11-05 01:35,Grant,Weapon,980
1,2021-11-05 01:35,LVTHER,This Love,0
2,2021-11-05 01:35,Hellberg,Synchronize - VIP,670
3,2021-11-05 01:35,Vicetone,I Hear You,570
4,2021-11-05 01:35,Tritonal,Getaway,0


### Tidying

In [10]:
strm_base.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20586 entries, 0 to 585
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   endTime     20586 non-null  object
 1   artistName  20586 non-null  object
 2   trackName   20586 non-null  object
 3   msPlayed    20586 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 804.1+ KB


In [11]:
strm_base.index = range(0,strm_base.shape[0])
strm_base = strm_base.astype({'endTime': 'datetime64','artistName':'string','trackName':'string'})

In [12]:
strm_base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20586 entries, 0 to 20585
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   endTime     20586 non-null  datetime64[ns]
 1   artistName  20586 non-null  string        
 2   trackName   20586 non-null  string        
 3   msPlayed    20586 non-null  int64         
dtypes: datetime64[ns](1), int64(1), string(2)
memory usage: 643.4 KB


## Spotify API
### Initial Exploration
#### Initializing spotify object

In [6]:
# add your own credentials in dev_creds.py
from dev_creds import get_creds, alt_creds
cid, secret = get_creds()

In [7]:
client_credentials_manager = SpotifyClientCredentials(client_id=cid,client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [8]:
print(cid)
print(secret)

29548aaefa2b41f695b01501f57a56a9
3f15d47c280642a98fdd3db7b7648e50


In [9]:
# test code
playlist_link = "https://open.spotify.com/playlist/37i9dQZEVXbNG2KDcFcKOF?si=1333723a6eff4b7f"
playlist_URI = playlist_link.split("/")[-1].split("?")[0]
track_uris = [x["track"]["uri"] for x in sp.playlist_tracks(playlist_URI)["items"]]

### Exploring sp.search query

In [17]:
import time
start_time = time.time()

test = sp.search('move brb')

print("My program took", time.time() - start_time, "to run")

My program took 0.09958195686340332 to run


In [18]:
type(test)

dict

In [19]:
test.keys()

dict_keys(['tracks'])

In [20]:
test['tracks'].keys()

dict_keys(['href', 'items', 'limit', 'next', 'offset', 'previous', 'total'])

In [21]:
test['tracks']['total']

93

In [22]:
len(test['tracks']['items'])

10

In [23]:
test['tracks']['items'][0].keys()

dict_keys(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'name', 'popularity', 'preview_url', 'track_number', 'type', 'uri'])

In [24]:
test['tracks']['items'][0]['id']

'2Ryp5LkAWyJwRqoFd8N7Kk'

In [25]:
test['tracks']['items'][0]['artists']

[{'external_urls': {'spotify': 'https://open.spotify.com/artist/2XBiI8PjCnjJ3XKWtiKcvc'},
  'href': 'https://api.spotify.com/v1/artists/2XBiI8PjCnjJ3XKWtiKcvc',
  'id': '2XBiI8PjCnjJ3XKWtiKcvc',
  'name': 'brb.',
  'type': 'artist',
  'uri': 'spotify:artist:2XBiI8PjCnjJ3XKWtiKcvc'}]

In [26]:
test['tracks']['items'][0]['name']

'move'

#### Findings
The format of the search return appears to be a dictionary which starts with 'tracks' since we search tracks (default) and then the parameters of the search where 'items' are the songs returned. Items is a list of 10 since default limit=10 where each entry in the list is a dictionary. We want to extract the name and artist to match with our streaming data and then retrieve the spotify song_id if they match (we don't need to worry about duplicate songs e.g. songs released by an artist first as a single and then in an album since the songs should have the same features/genre etc)

### Song_artist Object
#### Creating object which stores song and artist data for later extraction


In [1]:
from objects import song_artist

# testing song_artist object
sa = song_artist('move','brb')

In [2]:
print(sa.song)
print(sa.artist)

move
brb


### Search-match function
#### Creating a function which returns track_id
This function will take in a `song_artist` object as a parameter and perform a search using the song and artist names. It then iterates through the returned tracks and matches our name-artist pair with one of the search outputs and returns the spotify `track_id` for that track. 

In [13]:
def search_getid(song_artist):
    pair = song_artist.song + ' ' + song_artist.artist
    temp = sp.search(pair)
    end = len(temp['tracks']['items'])
    for x in range(0,end):
        #print(x)
        #print(len(temp['tracks']['items']))
        #print(temp['tracks']['items'])
        out_track = temp['tracks']['items'][x]['name']
        out_artist = temp['tracks']['items'][x]['artists'][0]['name']
        track_id = temp['tracks']['items'][x]['id']
        if ((song_artist.song == out_track) and (song_artist.artist == out_artist)):
            return(track_id)

In [14]:
start_time = time.time()

temp = song_artist('The Weekend (with 347aidan) - Remix','88rising')

print(search_getid(temp)) #check passed 

print("My program took", time.time() - start_time, "to run")

0G7xOaJtStqoAEyLKNuRA3
My program took 0.10372376441955566 to run


#### Adding Track IDs to Streaming Data

My initial attempt to complete this task used .apply to apply the function to the entire dataframe. However, this resulted in time out error where the cell would be stuck running but the api was not responsive. As a result, I believe it is not possible to apply the function to the entire dataframe. My next attempt is to split the dataframe up and apply the function to each part, changing the client_id and client_secret as needed. 

**Attempt 1**

In [25]:
#import time
#start_time = time.time()

#df_ids = strm_base
#df_ids['trackIDs'] = df_ids.apply(lambda x: search_getid(x['trackName'], x['artistName']), axis=1)

#print("My program took", time.time() - start_time, "to run")

In [26]:
#df_ids.info()

**Attempt 2**

In [27]:
#start_time = time.time()

#df_1 = strm_base.iloc[0:5000,:]
#df_1['trackIDs'] = df_1.apply(lambda x: search_getid(x['trackName'], x['artistName']), axis=1)

#print("My program took", time.time() - start_time, "to run") # My program took 661.2164733409882 to run
# approx 11 min runtime

In [28]:
#df_1.to_csv(path_or_buf='data/ids_1.csv')

In [30]:
from dev_creds import alt_creds
import math

start_time = time.time()
print(start_time)
'''
for alt in range(0,5):
    cid, secret = alt_creds(alt)
    client_credentials_manager = SpotifyClientCredentials(client_id=cid,client_secret=secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
    
    if (alt == 0):
        start = (alt) * (strm_base.shape[0]/5)
    else:
        start = (alt) * (strm_base.shape[0]/5) + 1
    if (alt ==4):
        end = strm_base.shape[0]
    else:
        end = math.floor((alt+1) * (strm_base.shape[0]/5))
    
    df=strm_base.iloc[int(start):int(end),:]
    df['trackIDs'] = df.apply(lambda x: search_getid(x['trackName'], x['artistName']), axis=1)
    
    path = 'data/ids_' + str(alt) + '.csv'
    df.to_csv(path_or_buf=path)

print("My program took", time.time() - start_time, "to run")''' # full loop takes around an hour to run, each iteration takes around 9-11 min

1656376569.9426615


'\nfor alt in range(0,5):\n    cid, secret = alt_creds(alt)\n    client_credentials_manager = SpotifyClientCredentials(client_id=cid,client_secret=secret)\n    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)\n    \n    if (alt == 0):\n        start = (alt) * (strm_base.shape[0]/5)\n    else:\n        start = (alt) * (strm_base.shape[0]/5) + 1\n    if (alt ==4):\n        end = strm_base.shape[0]\n    else:\n        end = math.floor((alt+1) * (strm_base.shape[0]/5))\n    \n    df=strm_base.iloc[int(start):int(end),:]\n    df[\'trackIDs\'] = df.apply(lambda x: search_getid(x[\'trackName\'], x[\'artistName\']), axis=1)\n    \n    path = \'data/ids_\' + str(alt) + \'.csv\'\n    df.to_csv(path_or_buf=path)\n\nprint("My program took", time.time() - start_time, "to run")'

In [31]:
full_ids = pd.DataFrame()
for file in os.listdir("./data"):
    if file.startswith("ids_"):
        file_path = "./data/" + file
        temp = pd.read_csv(file_path)
        full_ids = pd.concat([full_ids,temp])

In [32]:
full_ids.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20582 entries, 0 to 4115
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  20582 non-null  int64 
 1   endTime     20582 non-null  object
 2   artistName  20582 non-null  object
 3   trackName   20582 non-null  object
 4   msPlayed    20582 non-null  int64 
 5   trackIDs    20360 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.1+ MB


In [6]:
full_ids[pd.isnull(full_ids['trackIDs'])]

Unnamed: 0.1,Unnamed: 0,endTime,artistName,trackName,msPlayed,trackIDs
6,16475,2021-08-20 03:49:00,SURL,Dancing on the Rooftop 옥상에서 춤을,6140,
10,16479,2021-08-20 03:53:00,SURL,Dancing on the Rooftop 옥상에서 춤을,5930,
20,16489,2021-08-20 03:58:00,Epik High,End of the World,18940,
262,16731,2021-08-24 23:05:00,All The Credit,Making Sense of the Booms and Busts in Global ...,1335920,
349,16818,2021-08-26 23:08:00,All The Credit,Making Sense of the Booms and Busts in Global ...,758230,
...,...,...,...,...,...,...
3972,16324,2021-08-17 23:19:00,Macro Musings with David Beckworth,"Skanda Amarnath on Maximum Employment, Inflati...",1452716,
3973,16325,2021-08-17 23:22:00,Macro Musings with David Beckworth,Jerusalem Demsas on Problems in the US Housing...,186823,
3974,16326,2021-08-17 23:56:00,Macro Musings with David Beckworth,Scott Sumner on What Milton Friedman Would Thi...,1111500,
4022,16374,2021-08-18 23:33:00,Macro Musings with David Beckworth,Scott Sumner on What Milton Friedman Would Thi...,2059390,


In [None]:
#start_time = time.time()

#df_2 = strm_base.iloc[5001:10000,:]
#df_2['trackIDs'] = df_2.apply(lambda x: search_getid(x['trackName'], x['artistName']), axis=1)

#print("My program took", time.time() - start_time, "to run")

In [None]:
#start_time = time.time()

#df_3 = strm_base.iloc[10001:15000,:]
#df_3['trackIDs'] = df_3.apply(lambda x: search_getid(x['trackName'], x['artistName']), axis=1)

#print("My program took", time.time() - start_time, "to run")

In [None]:
#start_time = time.time()

#df_4 = strm_base.iloc[15000:strm_base.shape[0],:]
#df_4['trackIDs'] = df_4.apply(lambda x: search_getid(x['trackName'], x['artistName']), axis=1)

#print("My program took", time.time() - start_time, "to run")

**Attempt 5**

Parallelization of the querying

### Adding Genre and Feature Data