In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from collections import Counter
from tqdm.notebook import tqdm


import json
import os
import time
import csv

import multiprocessing
from joblib import Parallel, delayed

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import matplotlib.pyplot as plt
plt.rc('font', size=14)
import seaborn as sns
sns.set(style='whitegrid', color_codes=True, rc={'figure.figsize':(11,8)}, font_scale=2)

In [2]:
#os.chdir('/c/Users/matth/Documents/Coding/spotify/')

In [3]:
#!pwd

## Loading and Tidying Streaming Data

### Loading

In [4]:
# read streaming data and concat rows
# Read every StreamingHistory export first and concatenate once at the end:
# calling pd.concat inside the loop re-copies all accumulated rows per file.
# sorted() makes the row order reproducible, because os.listdir returns
# directory entries in arbitrary order.
strm_frames = [
    pd.read_json(os.path.join("./data", file))
    for file in sorted(os.listdir("./data"))
    if file.startswith("Streaming")
]
# Per-file row labels are kept as they are (no ignore_index). Fall back to an
# empty frame when no export is present, which is what the original loop
# produced in that case.
strm_base = pd.concat(strm_frames) if strm_frames else pd.DataFrame()

In [5]:
strm_base.head()

Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2020-06-23 22:28,Aiobahn,過ぎゆく日と君へ,6290
1,2021-05-08 05:06,Smallpools,Insincere,125825
2,2021-05-10 00:25,Smallpools,cycle,149040
3,2021-05-10 02:22,Nice White Parents,"3: ‘This Is Our School, How Dare You?’",331310
4,2021-05-10 02:22,COIN,Turnaround,14470


### Tidying

In [6]:
strm_base.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20586 entries, 0 to 585
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   endTime     20586 non-null  object
 1   artistName  20586 non-null  object
 2   trackName   20586 non-null  object
 3   msPlayed    20586 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 804.1+ KB


In [7]:
# Replace the per-file row labels left over from concatenation with a clean
# 0..n-1 index.
strm_base = strm_base.reset_index(drop=True)
# Parse timestamps with pd.to_datetime: pandas 2.x rejects the unit-less
# 'datetime64' dtype string when it is passed to astype.
strm_base['endTime'] = pd.to_datetime(strm_base['endTime'])
strm_base = strm_base.astype({'artistName': 'string', 'trackName': 'string'})

In [8]:
strm_base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20586 entries, 0 to 20585
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   endTime     20586 non-null  datetime64[ns]
 1   artistName  20586 non-null  string        
 2   trackName   20586 non-null  string        
 3   msPlayed    20586 non-null  int64         
dtypes: datetime64[ns](1), int64(1), string(2)
memory usage: 643.4 KB


## Spotify API
### Initial Exploration
#### Initializing spotify object

In [9]:
# add your own credentials in dev_creds.py
from dev_creds import get_creds, alt_creds
cid, secret = get_creds()

In [10]:
client_credentials_manager = SpotifyClientCredentials(client_id=cid,client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [11]:
# Confirm the credentials were loaded without echoing them: anything printed
# here is stored in the notebook's saved output.
print("client id loaded:", bool(cid))
print("client secret loaded:", bool(secret))

<client id redacted>
<client secret redacted - rotate the credentials that were committed here>


In [12]:
# test code
playlist_link = "https://open.spotify.com/playlist/37i9dQZEVXbNG2KDcFcKOF?si=1333723a6eff4b7f"
playlist_URI = playlist_link.split("/")[-1].split("?")[0]
track_uris = [x["track"]["uri"] for x in sp.playlist_tracks(playlist_URI)["items"]]

### Exploring sp.search query

In [13]:
import time
start_time = time.time()

test = sp.search('move brb')

print("My program took", time.time() - start_time, "to run")

My program took 0.14682388305664062 to run


In [14]:
type(test)

dict

In [15]:
test.keys()

dict_keys(['tracks'])

In [16]:
test['tracks'].keys()

dict_keys(['href', 'items', 'limit', 'next', 'offset', 'previous', 'total'])

In [17]:
test['tracks']['total']

117

In [18]:
len(test['tracks']['items'])

10

In [19]:
test['tracks']['items'][0].keys()

dict_keys(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'name', 'popularity', 'preview_url', 'track_number', 'type', 'uri'])

In [20]:
test['tracks']['items'][0]['id']

'2Ryp5LkAWyJwRqoFd8N7Kk'

In [21]:
test['tracks']['items'][0]['album']

{'album_type': 'single',
 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/2XBiI8PjCnjJ3XKWtiKcvc'},
   'href': 'https://api.spotify.com/v1/artists/2XBiI8PjCnjJ3XKWtiKcvc',
   'id': '2XBiI8PjCnjJ3XKWtiKcvc',
   'name': 'brb.',
   'type': 'artist',
   'uri': 'spotify:artist:2XBiI8PjCnjJ3XKWtiKcvc'}],
 'available_markets': ['AD',
  'AE',
  'AG',
  'AL',
  'AM',
  'AO',
  'AR',
  'AT',
  'AU',
  'AZ',
  'BA',
  'BB',
  'BD',
  'BE',
  'BF',
  'BG',
  'BH',
  'BI',
  'BJ',
  'BN',
  'BO',
  'BR',
  'BS',
  'BT',
  'BW',
  'BY',
  'BZ',
  'CA',
  'CD',
  'CG',
  'CH',
  'CI',
  'CL',
  'CM',
  'CO',
  'CR',
  'CV',
  'CW',
  'CY',
  'CZ',
  'DE',
  'DJ',
  'DK',
  'DM',
  'DO',
  'DZ',
  'EC',
  'EE',
  'EG',
  'ES',
  'FI',
  'FJ',
  'FM',
  'FR',
  'GA',
  'GB',
  'GD',
  'GE',
  'GH',
  'GM',
  'GN',
  'GQ',
  'GR',
  'GT',
  'GW',
  'GY',
  'HK',
  'HN',
  'HR',
  'HT',
  'HU',
  'ID',
  'IE',
  'IL',
  'IN',
  'IQ',
  'IS',
  'IT',
  'JM',
  'JO',
  'JP',
  'K

In [22]:
test['tracks']['items'][0]['album']['id']

'1gaYhlmZa4fT0NfH1IiSQ4'

In [23]:
test['tracks']['items'][0]['artists']

[{'external_urls': {'spotify': 'https://open.spotify.com/artist/2XBiI8PjCnjJ3XKWtiKcvc'},
  'href': 'https://api.spotify.com/v1/artists/2XBiI8PjCnjJ3XKWtiKcvc',
  'id': '2XBiI8PjCnjJ3XKWtiKcvc',
  'name': 'brb.',
  'type': 'artist',
  'uri': 'spotify:artist:2XBiI8PjCnjJ3XKWtiKcvc'}]

In [24]:
test['tracks']['items'][0]['name']

'move'

#### Findings
The format of the search return appears to be a dictionary which starts with 'tracks' since we search tracks (default) and then the parameters of the search where 'items' are the songs returned. Items is a list of 10 since default limit=10 where each entry in the list is a dictionary. We want to extract the name and artist to match with our streaming data and then retrieve the spotify song_id if they match (we don't need to worry about duplicate songs e.g. songs released by an artist first as a single and then in an album since the songs should have the same features/genre etc).

Additionally, we can extract the album id while we extract song id. This can potentially be useful for getting genre information. 

### Song_artist Object
#### Creating object which stores song and artist data for later extraction


In [25]:
from objects import song_artist

# testing song_artist object
sa = song_artist('move','brb')

In [26]:
print(sa.song)
print(sa.artist)

move
brb


### Search-match function
#### Creating a function which returns track_id
This function will take in a `song_artist` object as a parameter and perform a search using the song and artist names. It then iterates through the returned tracks and matches our name-artist pair with one of the search outputs and returns the spotify `track_id` for that track. 

In [27]:
def search_getid(worker, song_artist):
    """Return the Spotify track id that exactly matches a song/artist pair.

    Parameters
    ----------
    worker : object exposing ``search(query)``; in this notebook a
        ``spotipy.Spotify`` client.
    song_artist : object with ``song`` and ``artist`` string attributes.

    Returns
    -------
    The ``id`` of the first search hit whose track name equals
    ``song_artist.song`` and whose first listed artist equals
    ``song_artist.artist``; ``None`` when no hit matches exactly.
    """
    pair = song_artist.song + ' ' + song_artist.artist
    hits = worker.search(pair)['tracks']['items']
    for hit in hits:
        # Skip malformed hits (null entry or empty artist list) instead of
        # raising, so one bad result cannot abort a long batch run.
        if not hit or not hit.get('artists'):
            continue
        # Only the first listed artist is compared, and the comparison is
        # exact (case- and punctuation-sensitive).
        if hit['name'] == song_artist.song and hit['artists'][0]['name'] == song_artist.artist:
            return hit['id']
    return None

In [28]:
start_time = time.time()

temp = song_artist('The Weekend (with 347aidan) - Remix','88rising')

print(search_getid(sp,temp)) #check passed 

print("My program took", time.time() - start_time, "to run")

0G7xOaJtStqoAEyLKNuRA3
My program took 0.31278347969055176 to run


#### Adding Track IDs to Streaming Data

My initial attempt at this task used `.apply` to run the function over the entire dataframe. However, this resulted in a timeout: the cell stayed stuck running while the API stopped responding. As a result, I believe it is not feasible to apply the function to the entire dataframe in a single pass. My second attempt split the dataframe into parts and applied the function to each part, changing the client_id and client_secret as needed. Finally, I decided to parallelize the operation while simultaneously extracting audio_features and genre information.

**Attempt 1**

In [29]:
#import time
#start_time = time.time()

#df_ids = strm_base
#df_ids['trackIDs'] = df_ids.apply(lambda x: search_getid(x['trackName'], x['artistName']), axis=1)

#print("My program took", time.time() - start_time, "to run")

In [30]:
#df_ids.info()

**Attempt 2**

In [31]:
#start_time = time.time()

#df_1 = strm_base.iloc[0:5000,:]
#df_1['trackIDs'] = df_1.apply(lambda x: search_getid(x['trackName'], x['artistName']), axis=1)

#print("My program took", time.time() - start_time, "to run") # My program took 661.2164733409882 to run
# approx 11 min runtime

In [32]:
#df_1.to_csv(path_or_buf='data/ids_1.csv')

In [33]:
from dev_creds import alt_creds
import math

start_time = time.time()
print(start_time)
# NOTE(review): the loop below is disabled by wrapping it in a string literal.
# If it is revived: (1) .iloc's end bound is exclusive, so the `+ 1` on `start`
# skips one row at each of the four chunk boundaries (full_ids later shows
# 20582 rows against 20586 in strm_base); (2) search_getid is called with
# (trackName, artistName) although the function defined in this notebook takes
# (worker, song_artist).
'''
for alt in range(0,5):
    cid, secret = alt_creds(alt)
    client_credentials_manager = SpotifyClientCredentials(client_id=cid,client_secret=secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
    
    if (alt == 0):
        start = (alt) * (strm_base.shape[0]/5)
    else:
        start = (alt) * (strm_base.shape[0]/5) + 1
    if (alt ==4):
        end = strm_base.shape[0]
    else:
        end = math.floor((alt+1) * (strm_base.shape[0]/5))
    
    df=strm_base.iloc[int(start):int(end),:]
    df['trackIDs'] = df.apply(lambda x: search_getid(x['trackName'], x['artistName']), axis=1)
    
    path = 'data/ids_' + str(alt) + '.csv'
    df.to_csv(path_or_buf=path)

print("My program took", time.time() - start_time, "to run")''' # full loop takes around an hour to run, each iteration takes around 9-11 min

1657764701.4609902


'\nfor alt in range(0,5):\n    cid, secret = alt_creds(alt)\n    client_credentials_manager = SpotifyClientCredentials(client_id=cid,client_secret=secret)\n    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)\n    \n    if (alt == 0):\n        start = (alt) * (strm_base.shape[0]/5)\n    else:\n        start = (alt) * (strm_base.shape[0]/5) + 1\n    if (alt ==4):\n        end = strm_base.shape[0]\n    else:\n        end = math.floor((alt+1) * (strm_base.shape[0]/5))\n    \n    df=strm_base.iloc[int(start):int(end),:]\n    df[\'trackIDs\'] = df.apply(lambda x: search_getid(x[\'trackName\'], x[\'artistName\']), axis=1)\n    \n    path = \'data/ids_\' + str(alt) + \'.csv\'\n    df.to_csv(path_or_buf=path)\n\nprint("My program took", time.time() - start_time, "to run")'

In [34]:
# Read every ids_*.csv chunk first and concatenate once; sorted() keeps the
# chunk order reproducible because os.listdir returns entries in arbitrary
# order.
id_frames = [
    pd.read_csv(os.path.join("./data", file))
    for file in sorted(os.listdir("./data"))
    if file.startswith("ids_")
]
# Per-file row labels are kept (no ignore_index); an empty frame is the
# fallback when no chunk file exists, as in the original loop.
full_ids = pd.concat(id_frames) if id_frames else pd.DataFrame()

In [35]:
full_ids.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20582 entries, 0 to 4116
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  20582 non-null  int64 
 1   endTime     20582 non-null  object
 2   artistName  20582 non-null  object
 3   trackName   20582 non-null  object
 4   msPlayed    20582 non-null  int64 
 5   trackIDs    20360 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.1+ MB


In [36]:
full_ids[pd.isnull(full_ids['trackIDs'])]

Unnamed: 0.1,Unnamed: 0,endTime,artistName,trackName,msPlayed,trackIDs
67,67,2021-11-08 00:28:00,Life Kit,"How To Wake Up Early, Even If You're Not A Mor...",1291310,
126,126,2021-11-09 01:38:00,Alonestar,Raise Em Up (House Remix) (feat. Ed Sheeran),1990,
310,310,2021-11-17 03:25:00,Macro Musings with David Beckworth,"03 - John Cochrane on Finance, the Fiscal Theo...",1287330,
411,411,2021-11-22 22:12:00,Ben Rosett,Shadow Galaxy,157959,
441,441,2021-11-23 08:04:00,Third Party,We Found Love,205714,
...,...,...,...,...,...,...
3434,19903,2021-11-02 03:23:00,Alonestar,Raise Em Up (House Remix) (feat. Ed Sheeran),45518,
3535,20004,2022-04-29 21:38:00,nzev,Lay It Down,640,
3565,20034,2022-04-29 23:16:00,DAWNBRINGERS,wishing,1400,
3566,20035,2022-04-29 23:16:00,DAWNBRINGERS,wishing,26180,


In [37]:
#start_time = time.time()

#df_2 = strm_base.iloc[5001:10000,:]
#df_2['trackIDs'] = df_2.apply(lambda x: search_getid(x['trackName'], x['artistName']), axis=1)

#print("My program took", time.time() - start_time, "to run")

In [38]:
#start_time = time.time()

#df_3 = strm_base.iloc[10001:15000,:]
#df_3['trackIDs'] = df_3.apply(lambda x: search_getid(x['trackName'], x['artistName']), axis=1)

#print("My program took", time.time() - start_time, "to run")

In [39]:
#start_time = time.time()

#df_4 = strm_base.iloc[15000:strm_base.shape[0],:]
#df_4['trackIDs'] = df_4.apply(lambda x: search_getid(x['trackName'], x['artistName']), axis=1)

#print("My program took", time.time() - start_time, "to run")

### Parallelization of Operations

Here, we adapt the methods used in parallelization of selenium webdriver scraping operations to matching track ids and adding features to each song. We first explore how features are added to determine how to best add it to the dataframe and then add this to the parallel operation.

#### Exploring Adding Features 

In [40]:
sa = song_artist('free love','HONNE')

In [41]:
from methods import search_getid

song_id = search_getid(sp,sa)
print(song_id)

0GPJSHYaXh8rZSSJoUMgyl


In [42]:
feat = sp.audio_features(song_id)

In [43]:
print(feat)
print(len(feat[0]))

[{'danceability': 0.708, 'energy': 0.68, 'key': 9, 'loudness': -8.203, 'mode': 1, 'speechiness': 0.0519, 'acousticness': 0.0625, 'instrumentalness': 0, 'liveness': 0.152, 'valence': 0.7, 'tempo': 133.947, 'type': 'audio_features', 'id': '0GPJSHYaXh8rZSSJoUMgyl', 'uri': 'spotify:track:0GPJSHYaXh8rZSSJoUMgyl', 'track_href': 'https://api.spotify.com/v1/tracks/0GPJSHYaXh8rZSSJoUMgyl', 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/0GPJSHYaXh8rZSSJoUMgyl', 'duration_ms': 209280, 'time_signature': 4}]
18


In [44]:
list(feat[0].values())

[0.708,
 0.68,
 9,
 -8.203,
 1,
 0.0519,
 0.0625,
 0,
 0.152,
 0.7,
 133.947,
 'audio_features',
 '0GPJSHYaXh8rZSSJoUMgyl',
 'spotify:track:0GPJSHYaXh8rZSSJoUMgyl',
 'https://api.spotify.com/v1/tracks/0GPJSHYaXh8rZSSJoUMgyl',
 'https://api.spotify.com/v1/audio-analysis/0GPJSHYaXh8rZSSJoUMgyl',
 209280,
 4]

In [45]:
list(feat[0].keys())

['danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'type',
 'id',
 'uri',
 'track_href',
 'analysis_url',
 'duration_ms',
 'time_signature']

In [46]:
temp = list(feat[0].keys())
del temp[-7:-2]
print(temp)
print(len(temp))

['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']
13


**Findings**:

It appears that `audio_features` returns a list of length one containing a dictionary of the features of the track given a track id. We can extract and get the relevant features by converting the dictionary values into a list for writing into a csv. The column names of the csv can also be created by the same method.

#### Exploring Getting Genre Info

*via album*

In [47]:
s = sp.search('gods plan drake')
s['tracks']['items'][0]['album']['id']

'1ATL5GLyefJaxhQzSPVrLX'

In [48]:
alb = s['tracks']['items'][0]['album']['id']

In [49]:
alb_feat = sp.album(alb)

In [50]:
alb_feat.keys()

dict_keys(['album_type', 'artists', 'available_markets', 'copyrights', 'external_ids', 'external_urls', 'genres', 'href', 'id', 'images', 'label', 'name', 'popularity', 'release_date', 'release_date_precision', 'total_tracks', 'tracks', 'type', 'uri'])

In [51]:
alb_feat['genres']

[]

*via artist*

In [52]:
s['tracks']['items'][0]['artists']

[{'external_urls': {'spotify': 'https://open.spotify.com/artist/3TVXtAsR1Inumwj472S9r4'},
  'href': 'https://api.spotify.com/v1/artists/3TVXtAsR1Inumwj472S9r4',
  'id': '3TVXtAsR1Inumwj472S9r4',
  'name': 'Drake',
  'type': 'artist',
  'uri': 'spotify:artist:3TVXtAsR1Inumwj472S9r4'}]

In [53]:
artist_id = s['tracks']['items'][0]['artists'][0]['id']

In [54]:
art_info = sp.artist(artist_id)

In [55]:
art_info.keys()

dict_keys(['external_urls', 'followers', 'genres', 'href', 'id', 'images', 'name', 'popularity', 'type', 'uri'])

In [56]:
art_info['genres']

['canadian hip hop', 'canadian pop', 'hip hop', 'rap', 'toronto rap']

In [57]:
len(sp.recommendation_genre_seeds()['genres'])

126

**Findings**:

Strangely, it appears that using albums returns no genre for a number of songs. As a result, this method of obtaining genre information may be unreliable. We will test this further on the rest of the dataset. Additionally, using artist to get genre information appears to be too broad of a measure, with a number of genres listed that may not be indicative of the genre of the given song. 

Depending on how further testing of the album method goes, one potential solution could be to supplement spotify data with genre information from another website(s). This would require implementing some kind of webscraping algorithm to iterate through the list of song+artist pairs and extract genre information. 

Upon further research [online](https://community.spotify.com/t5/Spotify-for-Developers/Getting-album-not-getting-genre/td-p/5093156), it appears that neither tracks nor albums have genre data populated, and assigning aggregate artist genre tags to individual tracks would most likely result in inaccurate analyses. As a result, I plan on exploring workarounds, e.g. webscraping, so that I can add genre data to my dataset.

#### Parallelization 

**Creating Method for Audio Feature Extraction**

This method takes in a spotipy worker/object and a track id and returns a list of the relevant features

In [58]:
def get_features(worker, track_id):
    """Return a track's numeric audio features as a fixed-order list.

    Parameters
    ----------
    worker : object exposing ``audio_features(track_id)``; in this notebook a
        ``spotipy.Spotify`` client, whose call returns a one-element list
        holding a dict of features.
    track_id : Spotify track id to look up.

    Returns
    -------
    A list of 13 values in the order danceability, energy, key, loudness,
    mode, speechiness, acousticness, instrumentalness, liveness, valence,
    tempo, duration_ms, time_signature. Every position is ``None`` when the
    lookup yields no feature record, so CSV rows keep their width.
    """
    # Select by name rather than deleting a positional slice: the result no
    # longer depends on the key order of the API response.
    wanted = (
        'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'duration_ms', 'time_signature',
    )
    ft = worker.audio_features(track_id)
    record = ft[0] if ft else None
    if not record:
        return [None] * len(wanted)
    feats = [record.get(name) for name in wanted]

    return feats

In [59]:
get_features(sp,song_id)

[0.708, 0.68, 9, -8.203, 1, 0.0519, 0.0625, 0, 0.152, 0.7, 133.947, 209280, 4]

**Initializing Table**

In [60]:
col_og = ['song_artist','track_id']
# Spell the feature columns out instead of deleting a positional slice from
# the keys of an earlier API response: the header no longer depends on the
# exploratory `feat` variable or on the response's key order.
col_feat = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms', 'time_signature',
]
col_names = col_og + col_feat

In [61]:
# newline='' is what the csv module expects of the file object (without it,
# Windows writes an extra blank line after every row). The with-block closes
# the file even if the write fails.
with open('./modified/id_feats.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(col_names)

**Performing Operation**

The `parallel` function takes in a list of data, here song_artist objects, and a function name, here `id_feats`. It will iterate through the song_artist objects utilizing each spotipy worker and return a completed link table which uses song_artist objects as the key.

First we must create the column of song_artist objects that will be the input for this operation.

In [62]:
from objects import song_artist

strm_base['song_artist'] = strm_base.apply(lambda x: song_artist(x['trackName'], x['artistName']), axis=1)

In [63]:
data = strm_base['song_artist'].tolist()

In [64]:
'''
from parallelization import parallel
from methods import id_feats
start_time = time.time()

parallel(data,id_feats)

print("Table creation took", time.time()-start_time, "to run") # 1996.9691202640533 (33 min) for entire run
'''

'\nfrom parallelization import parallel\nfrom methods import id_feats\nstart_time = time.time()\n\nparallel(data,id_feats)\n\nprint("Table creation took", time.time()-start_time, "to run") # 1996.9691202640533 (33 min) for entire run\n'

### Exploring Supplementary Genre Data

#### Webscraping

In this section, we explore using webscraping as a method to obtain track-level genre data.

*Website 1: [last.fm](https://www.last.fm/)*

Initial inspection of last.fm reveals fairly comprehensive genre data. However, you cannot directly scrape this from the search feature. You must navigate to the song page itself in order to get this information. As a result, there may be significant trial and error when attempting to create a standard query to get track info from lastfm.

With a simple song e.g. free love by HONNE, the query is formatted like this: https://www.last.fm/music/HONNE/_/free+love

For an artist with a two-word name e.g. Jet Fuel by Mac Miller: https://www.last.fm/music/Mac+Miller/_/Jet+Fuel

Track with multiple artists e.g. Nerdy Love by pH-1 and Yerin Baek: https://www.last.fm/music/pH-1,+Yerin+Baek/_/Nerdy+Love (interestingly, this is not actually the first song that comes up when you search 'Nerdy Love pH-1 Yerin Baek', it is actually https://www.last.fm/music/pH-1/_/Nerdy+Love+(feat.+Yerin+Baek))

For our dataset, we only get the first artist from Spotify when there is more than one artist on the song. The last.fm URLs appear to be built as `https://www.last.fm/music/` + artist + `/_/` + track, where spaces in the artist and track names are replaced with `+` and capitalization is maintained.

#### *last.fm* Query and Extraction:

We create a function which makes a formatted artist and song query to feed into last.fm and scrape the page to extract genre information. While last.fm provides a number of tags/genres, we will only select the first one since it appears sorted by relevance as opposed to alphabetical order. This function takes in a `song_artist` object and returns the genre of the track.

In [65]:
import requests
from bs4 import BeautifulSoup

song = sa.song
artist = sa.artist
    
artist_query = "+".join(artist.split(" "))
song_query = "+".join(song.split(" "))
    
query = 'https://www.last.fm/music/' + artist_query + '/_/' + song_query
    
req = requests.get(query)
sample = BeautifulSoup(req.content, 'html.parser')

In [66]:
section = sample.find(name='section', class_ = 'catalogue-tags')
section.find(name='li').string

'alternative rnb'

In [67]:
def get_genre(song_artist):
    """Scrape the first last.fm tag listed for a track.

    Builds ``https://www.last.fm/music/<artist>/_/<song>`` from the
    ``artist`` and ``song`` attributes of ``song_artist``, fetches the page
    and returns the text of the first ``<li>`` inside the
    ``<section class="catalogue-tags">`` element.

    Returns ``np.nan`` when the page has no tag section or no tag entry.
    Errors raised by ``requests`` (including the timeout) propagate to the
    caller.
    """
    from urllib.parse import quote_plus

    # quote_plus turns spaces into '+' (the form used in last.fm URLs) and
    # also percent-encodes characters such as '/', '?' and '#', which would
    # otherwise alter the URL path or be read as a query string / fragment.
    artist_query = quote_plus(song_artist.artist)
    song_query = quote_plus(song_artist.song)

    query = 'https://www.last.fm/music/' + artist_query + '/_/' + song_query

    # Bound the wait so a single stalled request cannot hang a long scrape.
    req = requests.get(query, timeout=10)
    sample = BeautifulSoup(req.content, 'html.parser')

    # Handle the two "not found" cases explicitly instead of a bare except,
    # so unrelated errors are no longer silently turned into NaN.
    section = sample.find(name='section', class_='catalogue-tags')
    first_tag = section.find(name='li') if section is not None else None
    if first_tag is None:
        return np.nan

    return first_tag.string

In [68]:
get_genre(sa)

'alternative rnb'

#### *last.fm* genre extraction test

Here we test our function on a subset of the data (100 entries).

In [69]:
'''

start_time = time.time()

genre_test = strm_base.iloc[0:100,:].apply(lambda x: get_genre(x['song_object']), axis=1)

print("My program took", time.time() - start_time, "to run") # old code took 42s to scrape 100 genres

'''

'\n\nstart_time = time.time()\n\ngenre_test = strm_base.iloc[0:100,:].apply(lambda x: get_genre(x[\'song_object\']), axis=1)\n\nprint("My program took", time.time() - start_time, "to run") # old code took 42s to scrape 100 genres\n\n'

In [70]:
#len(genre_test[pd.isnull(genre_test)])

In [71]:
#print(genre_test)

In [72]:
#strings = [~isinstance(n, str) for n in genre_test]

In [73]:
#len(strings)

In [74]:
#for x in genre_test:
#    print(type(x))

### Adding Genre Data to Dataset

#### *last.fm* Run

In [75]:
'''
from methods import get_genre
import threading

file = open('./modified/genres.csv', 'w')
writer = csv.writer(file)
writer.writerow(['song_artist','genre'])
file.close()

start_time = time.time()

strm_base.apply(lambda x: get_genre(x['song_artist']), axis=1)

print("My program took", time.time() - start_time, "to run") # 51.85 for 100 genres writing to csv
# 6545.196605205536 (1hr 49min) for full 20586 to csv
'''

'\nfrom methods import get_genre\nimport threading\n\nfile = open(\'./modified/genres.csv\', \'w\')\nwriter = csv.writer(file)\nwriter.writerow([\'song_artist\',\'genre\'])\nfile.close()\n\nstart_time = time.time()\n\nstrm_base.apply(lambda x: get_genre(x[\'song_artist\']), axis=1)\n\nprint("My program took", time.time() - start_time, "to run") # 51.85 for 100 genres writing to csv\n# 6545.196605205536 (1hr 49min) for full 20586 to csv\n'

### Combining Methods

#### Creating new table and merging with strm_base

In [76]:
strm_base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20586 entries, 0 to 20585
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   endTime      20586 non-null  datetime64[ns]
 1   artistName   20586 non-null  string        
 2   trackName    20586 non-null  string        
 3   msPlayed     20586 non-null  int64         
 4   song_artist  20586 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(1), string(2)
memory usage: 804.3+ KB


In [77]:
len(data)

20586

In [85]:
len(col_feat)

13

In [78]:
columns = ['track_id'] + col_feat + ['genre']

In [88]:
len(columns)

15

In [79]:
# newline='' is what the csv module expects of the file object (without it,
# Windows writes an extra blank line after every row). The with-block closes
# the file even if the write fails.
with open('./modified/full_table.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(columns)

In [80]:

from parallelization import parallel
from methods import create_table
start_time = time.time()

parallel(data,create_table)

print("Table creation took", time.time()-start_time, "to run") # full 5117.0232944488525 (1hr 25m)


HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'The Restructuring of the LCS Players Association feat. Jacob Wolf FTW with Imad Khan: An Esports and Competitive Gaming Podcast', 'limit': 10, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.
HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'The Significance of the Sinatraa Suspension feat. Hunter Cooke FTW with Imad Khan: An Esports and Competitive Gaming Podcast', 'limit': 10, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.
HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'The Restructuring of the LCS Players Association feat. Jacob Wolf FTW with Imad Khan: An Esports and Competitive Gaming Podcast', 'limit': 10, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.
HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'Crazy Noisy Bizarre Town (From "Jojo\'s

Table creation took 5117.0232944488525 to run


Traceback (most recent call last):
  File "C:\Users\matth\anaconda3\lib\multiprocessing\queues.py", line 238, in _feed
    send_bytes(obj)
  File "C:\Users\matth\anaconda3\lib\multiprocessing\connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "C:\Users\matth\anaconda3\lib\multiprocessing\connection.py", line 280, in _send_bytes
    ov, err = _winapi.WriteFile(self._handle, buf, overlapped=True)
BrokenPipeError: [WinError 232] The pipe is being closed


In [81]:
add_table = pd.read_csv('./modified/full_table.csv')

In [83]:
add_table.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20576 entries, nan to 648sjoR2ga0PXixXttyqzJ
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          20300 non-null  float64
 1   danceability      20300 non-null  object 
 2   energy            20299 non-null  float64
 3   key               20299 non-null  float64
 4   loudness          20299 non-null  float64
 5   mode              20299 non-null  object 
 6   speechiness       20298 non-null  float64
 7   acousticness      20298 non-null  float64
 8   instrumentalness  20298 non-null  float64
 9   liveness          20298 non-null  float64
 10  valence           20298 non-null  float64
 11  tempo             20298 non-null  float64
 12  duration_ms       20298 non-null  float64
 13  time_signature    14061 non-null  object 
 14  genre             98 non-null     object 
dtypes: float64(11), object(4)
memory usage: 3.1+ MB


In [82]:
# NOTE(review): this cell raised the ValueError shown below. add_table.info()
# reports a label Index ("nan to 648sjoR2ga0PXixXttyqzJ") with 20576 rows,
# while strm_base has a RangeIndex of 20586 rows, so the two frames do not
# share row labels. Presumably the CSV rows carry more fields than the header,
# which makes read_csv use the first field as the index - TODO confirm and fix
# the header/row layout before concatenating.
comb = pd.concat([strm_base[['endTime','msPlayed']],add_table], axis = 1)

ValueError: Shape of passed values is (656686, 17), indices imply (41162, 17)

In [None]:
comb.info()

In [None]:
#comb.to_csv(path_or_buf = './modified/temp_post.csv')

Checking songs that exist (have genre) but did not get a track id:

#### Validating full table

In [None]:
test = comb.copy()

In [None]:
test.info()

In [None]:
# .copy() gives no_ids its own data, so assigning a new column to it does not
# act on a filtered view of `test` (which triggers SettingWithCopyWarning).
no_ids = test[pd.isnull(test['track_id']) & ~pd.isnull(test['genre'])].copy()

In [None]:
no_ids

In [None]:
no_ids['song_artist'] = no_ids.apply(lambda x: song_artist(x['trackName'], x['artistName']), axis=1)

In [None]:
rerun = no_ids['song_artist'].tolist()

In [None]:
# newline='' is what the csv module expects of the file object (without it,
# Windows writes an extra blank line after every row). The with-block closes
# the file even if the write fails.
with open('./modified/rerun.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(columns)

In [None]:
start_time = time.time()

parallel(rerun,create_table)

print("Rerun took", time.time()-start_time) # 

In [None]:
rerun_res = pd.read_csv('./modified/rerun.csv')

In [None]:
rerun_res.info()

Checking songs that did not receive a genre:

In [None]:
test = comb.copy()

In [None]:
genre_nas = test[pd.isnull(test['genre'])]

In [None]:
genre_nas[['artistName','trackName']]

*Note*: There are a significant number of NAs after scraping genres. As a result, there is a need to either revise our method to get genre information or devise a method for imputation. 

**Possible Solution using Imputation**: Since it appears that there are artists who receive genre data for some songs and not others, one possible imputation method would be to create a dictionary where each artist corresponds to their top genre (can also scrape this from last.fm).

#### Imputing Genres

In [None]:
from methods import table_artGen

test_artists = add_table['artistName'][0:100].tolist()

start_time = time.time()

# dict.fromkeys de-duplicates while keeping first-seen order, so table_artGen
# is called once per distinct artist instead of once per row. The resulting
# dictionary has the same keys as before.
test_dict = {artist: table_artGen(artist) for artist in dict.fromkeys(test_artists)}

print("My program took", time.time() - start_time, "to run")


In [None]:
len(test_dict) == len(add_table.iloc[0:100,:]['artistName'].unique()) 
# using a dictionary means that duplicate artists are not created

**Potential Idea**: replace all nas in genre column with corresponding value in artist column and then replace the filled artist values with genres using the dictionary mentioned above

**Testing replacing 'artist' values with genre**

In [None]:
test = add_table.copy()

In [None]:
# Assign through a single .iloc call. The chained form
# `.iloc[...]['artistName'].replace(..., inplace=True)` acts on an intermediate
# object and is not guaranteed to write back to `test` (it does not under
# pandas Copy-on-Write).
artist_col = test.columns.get_loc('artistName')
test.iloc[0:100, artist_col] = test.iloc[0:100, artist_col].replace(test_dict)

In [None]:
test.iloc[0:100,:]

**Making Artist:Genre Dictionary**

In [None]:


art_names = add_table['artistName'].unique().tolist()

start_time = time.time()

artGen_dict = {artist: table_artGen(artist) for artist in art_names}

print("My program took", time.time() - start_time, "to run")


**Imputing Genres**

In [None]:
# Assign the result back instead of using inplace=True on a column selection,
# which is not guaranteed to modify add_table (it does not under pandas
# Copy-on-Write).
add_table['genre'] = add_table['genre'].fillna(add_table['artistName'])

In [None]:
# Assign the result back instead of using inplace=True on a column selection,
# which is not guaranteed to modify add_table (it does not under pandas
# Copy-on-Write).
add_table['genre'] = add_table['genre'].replace(artGen_dict)

### Processing for Visualization and Analysis

#### Data Cleaning + Tidying