In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from collections import Counter
from tqdm.notebook import tqdm
import json
import os
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import matplotlib.pyplot as plt
plt.rc('font', size=14)
import seaborn as sns
sns.set(style='whitegrid', color_codes=True, rc={'figure.figsize':(11,8)}, font_scale=2)

## Loading and Tidying Streaming Data

### Loading

In [2]:
# read streaming data and concat rows
# The directory listing is sorted so the row order is reproducible (os.listdir
# returns entries in arbitrary order; note the sort is lexicographic, so a
# file numbered 10 sorts before 2). The frames are collected first and
# concatenated once instead of growing a DataFrame inside the loop.
strm_frames = [
    pd.read_json(os.path.join("./data", file))
    for file in sorted(os.listdir("./data"))
    if file.startswith("Streaming")
]
# Each file keeps its own 0..n-1 index here, exactly as before; the index is
# rebuilt in the tidying step. Fall back to an empty frame when no file matches.
strm_base = pd.concat(strm_frames) if strm_frames else pd.DataFrame()

In [3]:
# Preview the first rows of the concatenated streaming history.
strm_base.head()

Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2021-11-05 01:35,Grant,Weapon,980
1,2021-11-05 01:35,LVTHER,This Love,0
2,2021-11-05 01:35,Hellberg,Synchronize - VIP,670
3,2021-11-05 01:35,Vicetone,I Hear You,570
4,2021-11-05 01:35,Tritonal,Getaway,0


### Tidying

In [4]:
# Check the index, column dtypes and non-null counts before any type conversion.
strm_base.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20586 entries, 0 to 585
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   endTime     20586 non-null  object
 1   artistName  20586 non-null  object
 2   trackName   20586 non-null  object
 3   msPlayed    20586 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 804.1+ KB


In [5]:
# Replace the per-file indices left over from concatenation with a single
# continuous 0..n-1 index, then give the columns explicit dtypes.
# The datetime dtype is spelled with its unit ('datetime64[ns]') because
# pandas >= 2.0 raises on the unit-less 'datetime64' alias in astype().
strm_base = (
    strm_base
    .reset_index(drop=True)
    .astype({'endTime': 'datetime64[ns]', 'artistName': 'string', 'trackName': 'string'})
)

In [6]:
# Re-check the index and dtypes after the reindex/cast in the previous cell.
strm_base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20586 entries, 0 to 20585
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   endTime     20586 non-null  datetime64[ns]
 1   artistName  20586 non-null  string        
 2   trackName   20586 non-null  string        
 3   msPlayed    20586 non-null  int64         
dtypes: datetime64[ns](1), int64(1), string(2)
memory usage: 643.4 KB


In [7]:
# Show every play whose artistName is exactly '88rising'.
strm_base.loc[strm_base['artistName'].eq('88rising')]

Unnamed: 0,endTime,artistName,trackName,msPlayed
175,2021-11-09 06:01:00,88rising,California (feat. Warren Hue),33216
177,2021-11-09 06:02:00,88rising,California (feat. Warren Hue),5141
178,2021-11-09 06:02:00,88rising,California (feat. Warren Hue),45525
179,2021-11-09 06:05:00,88rising,California (feat. Warren Hue),143912
1760,2021-12-09 21:11:00,88rising,Freaks,9420
...,...,...,...,...
20174,2022-05-03 03:19:00,88rising,The Weekend (with 347aidan) - Remix,167661
20175,2022-05-03 03:23:00,88rising,California (feat. Warren Hue),230395
20355,2022-05-08 03:34:00,88rising,T,249033
20357,2022-05-08 03:37:00,88rising,Best Lover,152910


## Spotify API
### Initial Exploration
#### Initializing Spotify object

In [8]:
# add your own credentials in dev_creds.py
from dev_creds import get_creds
cid, secret = get_creds()

In [9]:
# Build the credentials manager from the client id/secret, then the API
# client `sp` that the search cells below call.
client_credentials_manager = SpotifyClientCredentials(
    client_id=cid,
    client_secret=secret,
)
sp = spotipy.Spotify(
    client_credentials_manager=client_credentials_manager,
)

### Exploring sp.search query

In [None]:
import time

# Time one search request to get a feel for the per-call latency.
# perf_counter() is the high-resolution clock intended for measuring elapsed
# time; time.time() follows the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()

test = sp.search('move brb',limit=1)

print("My program took", time.perf_counter() - start_time, "to run")

In [None]:
# Confirm the container type of the search response.
type(test)

In [None]:
# List the top-level keys of the response.
test.keys()

In [None]:
# List the fields nested under 'tracks'.
test['tracks'].keys()

In [None]:
# NOTE(review): 'total' is presumably the number of matches Spotify reports
# for the query, not the number of items returned - confirm against the API docs.
test['tracks']['total']

In [None]:
# Number of items actually returned; the query above asked for limit=1.
len(test['tracks']['items'])

In [None]:
# Fields available on a single returned item.
test['tracks']['items'][0].keys()

In [None]:
# The item's 'id' value - the field search_getid returns as the track id.
test['tracks']['items'][0]['id']

In [None]:
# 'artists' of the first item; search_getid reads [0]['name'] from this value.
test['tracks']['items'][0]['artists']

In [None]:
# Name of the first returned item, to compare against trackName in the streaming data.
test['tracks']['items'][0]['name']

In [None]:
# NOTE(review): identical to the previous cell - possibly meant to inspect a
# different field; otherwise this cell can be removed.
test['tracks']['items'][0]['name']

#### Findings
The search returns a dictionary whose top-level key is 'tracks', since we search for tracks (the default search type). Under 'tracks' are the parameters of the search, plus 'items', which holds the songs returned. 'items' is a list with up to `limit` entries (10 by default; the test query above used limit=1, so it returned a single item), and each entry in the list is a dictionary. We want to extract the name and artist to match against our streaming data and then retrieve the Spotify song_id if they match. (We don't need to worry about duplicate songs, e.g. songs released by an artist first as a single and then on an album, since the copies should have the same features, genre, etc.)

### Search-match function
#### Creating a function which returns track_id
This function takes `in_track` and `in_artist` parameters (the name of the track and the name of the artist) and performs a search. It then iterates through the returned tracks, compares each result against our name-artist pair, and returns the Spotify song_id of the first result that matches.

In [None]:
def search_getid(in_track,in_artist):
    """Return the Spotify track id for a track-name / artist-name pair.

    Runs ``sp.search("<in_track> <in_artist>")`` using the module-level client
    ``sp`` and returns the id of the first result whose track name and first
    listed artist both equal the inputs exactly (the comparison is
    case-sensitive and only looks at the first artist of each result).

    Parameters
    ----------
    in_track : str
        Track name to look up.
    in_artist : str
        Artist name to look up.

    Returns
    -------
    str or None
        The id of the first exact match, or None when no result matches.
    """
    response = sp.search(in_track + ' ' + in_artist)
    # Tolerate a missing or empty result list instead of raising.
    items = ((response or {}).get('tracks') or {}).get('items') or []
    for item in items:
        # Skip null entries and results without any artist, which would
        # otherwise raise TypeError / IndexError on the lookups below.
        if not item:
            continue
        artists = item.get('artists') or []
        if not artists:
            continue
        if item.get('name') == in_track and artists[0].get('name') == in_artist:
            return item.get('id')
    # Make the "no match" outcome explicit for callers.
    return None

In [None]:
# Sanity check on a track/artist pair that appears in the streaming history
# (see the 88rising rows displayed earlier).
search_getid('The Weekend (with 347aidan) - Remix','88rising') #check passed

#### Adding Track IDs to Streaming Data

In [None]:
import time

# perf_counter() is the clock intended for measuring elapsed time.
start_time = time.perf_counter()

# Work on a copy: `df_ids = strm_base` would only alias the frame, so adding
# the column below would silently modify strm_base as well.
df_ids = strm_base.copy()

# The history repeats the same (track, artist) pair many times, so query the
# API once per distinct pair and fan the result back out to every row.
unique_pairs = df_ids[['trackName', 'artistName']].drop_duplicates()
id_by_pair = {
    (track, artist): search_getid(track, artist)
    for track, artist in tqdm(
        unique_pairs.itertuples(index=False, name=None),
        total=len(unique_pairs),
        desc='Spotify lookups',
    )
}
df_ids['trackIDs'] = [
    id_by_pair[pair_key]
    for pair_key in zip(df_ids['trackName'], df_ids['artistName'])
]

print("My program took", time.perf_counter() - start_time, "to run")

In [None]:
# The non-null count of trackIDs shows how many rows received a Spotify id
# (search_getid yields None when nothing matches).
df_ids.info()

In [None]:
# Manual spot-check: print the id of every search result that exactly matches
# one track/artist pair (all matches, not only the first one).
in_name = 'Weapon'
in_artist = 'Grant'
lim = 10
pair = in_name + ' ' + in_artist
temp = sp.search(pair,limit=lim)
# Iterate over the items actually returned: the search may yield fewer than
# `lim` results, and indexing with range(lim) would then raise IndexError.
for item in temp['tracks']['items']:
    out_name = item['name']
    out_artist = item['artists'][0]['name']
    track_id = item['id']
    if ((in_name == out_name) and (in_artist == out_artist)):
        print(track_id)
