In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from collections import Counter
from tqdm.notebook import tqdm


import json
import os
import time
import csv

from objects import song_artist
from parallelization import parallel
from methods import table_noGen
from methods import table_artGen

import multiprocessing
from joblib import Parallel, delayed

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import matplotlib.pyplot as plt
plt.rc('font', size=14)
import seaborn as sns
sns.set(style='whitegrid', color_codes=True, rc={'figure.figsize':(11,8)}, font_scale=2)

In [2]:
#os.chdir('/c/Users/matth/Documents/Coding/spotify/')

In [3]:
#!pwd

## Data Generation Process for Spotify Genres Project
### Initial Data Importation and Prep

In [4]:
# Read every "Streaming*" JSON file in ./data and stack them into one frame.
# File names are sorted so the resulting 0..n-1 row index is reproducible
# between runs (os.listdir returns entries in arbitrary order). The row index
# is later exported as the 'idx' key and joined back against feats_table.csv,
# so a different file order would silently misalign a previously written table.
stream_files = sorted(
    file for file in os.listdir("./data") if file.startswith("Streaming")
)
if not stream_files:
    raise FileNotFoundError("no 'Streaming*' files found in ./data")

# Collect the frames first and concatenate once: growing a DataFrame with
# pd.concat inside the loop re-copies all previously read rows every iteration.
strm_base = pd.concat(
    [pd.read_json("./data/" + file) for file in stream_files],
    ignore_index=True,  # fresh 0..n-1 index, same as the manual range() reset
)

# 'datetime64[ns]' instead of the unit-less 'datetime64', which pandas >= 2.0
# refuses in astype.
strm_base = strm_base.astype(
    {'endTime': 'datetime64[ns]', 'artistName': 'string', 'trackName': 'string'}
)

In [5]:
# Build one song_artist object per streamed row (track name first, then artist
# name) and keep it in its own column for the parallel feature step.
def _to_song_artist(row):
    """Wrap a single row's trackName/artistName in a song_artist object."""
    return song_artist(row['trackName'], row['artistName'])


strm_base['song_artist'] = strm_base.apply(_to_song_artist, axis=1)

In [6]:
# Expose the row index as a regular 'index' column, then flatten the frame
# into a list of [index, song_artist] pairs.
temp = strm_base.reset_index(drop=False)
data = temp.loc[:, ['index', 'song_artist']].to_numpy().tolist()

In [7]:
# number of [index, song_artist] pairs held in data
len(data)

20586

In [8]:
# Header row for feats_table.csv: the row key, the track id, then one column
# per audio feature.
cols = [
    'idx',
    'track_id',
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
    'time_signature',
]

In [9]:
# Create (or truncate) ./modified/feats_table.csv and write only its header
# row. NOTE(review): the data rows are presumably appended by the parallel
# step that follows -- its writer is not visible in this notebook.
os.makedirs('./modified', exist_ok=True)  # open() does not create directories

# newline='' is what the csv module requires of its file objects; without it
# Windows text mode turns the writer's '\r\n' into '\r\r\n'.
# The context manager closes the handle even if the write raises.
with open('./modified/feats_table.csv', 'w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(cols)

### Parallelized Audio Features

In [10]:
# Time the project helper parallel() applied to table_noGen over all of data.
# NOTE(review): presumably this is what fills feats_table.csv -- confirm in
# the parallelization / methods modules.
start_time = time.time()
parallel(data, table_noGen)
elapsed = time.time() - start_time

# Recorded wall-clock timings:
#   2132.3920452594757s (35 min)
#   with indexing 1004.4579422473907s (17 min)
print("Table creation took", elapsed, "to run")

HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'The Significance of the Sinatraa Suspension feat. Hunter Cooke FTW with Imad Khan: An Esports and Competitive Gaming Podcast', 'limit': 10, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.
HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'The Restructuring of the LCS Players Association feat. Jacob Wolf FTW with Imad Khan: An Esports and Competitive Gaming Podcast', 'limit': 10, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.
HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'The Restructuring of the LCS Players Association feat. Jacob Wolf FTW with Imad Khan: An Esports and Competitive Gaming Podcast', 'limit': 10, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.
HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'Crazy Noisy Bizarre Town (From "Jojo\'s

Table creation took 1004.4579422473907 to run


Traceback (most recent call last):
  File "C:\Users\matth\anaconda3\lib\multiprocessing\queues.py", line 238, in _feed
    send_bytes(obj)
  File "C:\Users\matth\anaconda3\lib\multiprocessing\connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "C:\Users\matth\anaconda3\lib\multiprocessing\connection.py", line 280, in _send_bytes
    ov, err = _winapi.WriteFile(self._handle, buf, overlapped=True)
BrokenPipeError: [WinError 232] The pipe is being closed


In [11]:
# load the audio-feature table written to disk earlier in the notebook
feats_table = pd.read_csv(filepath_or_buffer='./modified/feats_table.csv')

In [29]:
# discard every row that has at least one missing value
feats_table = feats_table.dropna()

In [31]:
# order the rows by their 'idx' key
feats_table = feats_table.sort_values(by='idx')

In [32]:
# promote 'idx' to the index so it can line up with strm_base's row index
feats_table = feats_table.set_index('idx')

In [38]:
# index-on-index left join: every streaming row is kept, feature columns are
# NaN where feats_table has no row for that index
temp = strm_base.join(other=feats_table, how='left')

In [44]:
# keep only the rows where no column is missing
temp = temp.dropna()

In [45]:
# checkpoint the joined table to disk; the row index is not written
temp.to_csv('./modified/intermediate_table.csv', index=False)

### Imputing Artist Genres

In [46]:
# reload the checkpointed table from disk
# NOTE(review): after this CSV round trip endTime is read back as a plain
# object column (see the info() output further down), not datetime64.
temp = pd.read_csv(filepath_or_buffer='./modified/intermediate_table.csv')

In [49]:
# seed 'genre' with a copy of the artist names
temp['genre'] = temp['artistName']
# distinct artist names, in order of first appearance
art_names = temp['artistName'].drop_duplicates().tolist()

In [50]:
# Call table_artGen once per distinct artist, in art_names order, and key the
# results by artist name; time the whole pass.
start_time = time.time()
artGen_dict = dict(zip(art_names, map(table_artGen, art_names)))
elapsed = time.time() - start_time

print("My program took", elapsed, "to run")  # 1414.6461627483368s (25 min)

My program took 1414.6461627483368 to run


In [51]:
# Swap each value in 'genre' for its entry in artGen_dict.
# The result is assigned back instead of calling replace(inplace=True) on
# temp['genre']: that is an in-place method applied to a column selection,
# which pandas Copy-on-Write treats as a temporary object, leaving temp itself
# unchanged -- and this notebook suppresses the warning that would say so.
temp['genre'] = temp['genre'].replace(artGen_dict)

In [52]:
# print column dtypes and non-null counts for the table, genre column included
temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20290 entries, 0 to 20289
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   endTime           20290 non-null  object 
 1   artistName        20290 non-null  object 
 2   trackName         20290 non-null  object 
 3   msPlayed          20290 non-null  int64  
 4   song_artist       20290 non-null  object 
 5   track_id          20290 non-null  object 
 6   danceability      20290 non-null  float64
 7   energy            20290 non-null  float64
 8   key               20290 non-null  float64
 9   loudness          20290 non-null  float64
 10  mode              20290 non-null  float64
 11  speechiness       20290 non-null  float64
 12  acousticness      20290 non-null  float64
 13  instrumentalness  20290 non-null  float64
 14  liveness          20290 non-null  float64
 15  valence           20290 non-null  float64
 16  tempo             20290 non-null  float6

### Exporting Final Table

In [None]:
# final table: only rows where every column has a value
final = temp.dropna(axis=0, how='any')

In [None]:
# write the finished table to disk without the row index
final.to_csv('./final/final_table.csv', index=False)