# Importing the libraries

In [164]:
import pandas as pd
import numpy as np
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import yaml
from tqdm import tqdm
import time
import random
import datetime

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [165]:
stream= open("spotify/spotify.yaml")
spotify_details = yaml.safe_load(stream)
auth_manager = SpotifyClientCredentials(client_id=spotify_details['Client_id'],
                                        client_secret=spotify_details['client_secret'])
sp = spotipy.client.Spotify(auth_manager=auth_manager)

# Importing the dataset

In [166]:
df=pd.read_csv('data/unique_tracks_mpd_drop.csv')
artist_features=pd.read_csv('data/artist_features.csv')
audio_features=pd.read_csv('data/audio_features.csv')


In [167]:
track_features=pd.read_csv('data/track_features.csv')

In [168]:
print(df.head())
print(df.count())

   Unnamed: 0               track_uri              artist_uri  \
0           0  0UaMYEvWZi0ZqiDOoHU3YI  2wIVse2owClT7go1WT98tk   
1           1  6I9VzXrHxO9rA9A5euc8Ak  26dSoYclwsYLMAKD3tpOr4   
2           2  0WqIKmW4BTrj3eJFmnCKMv  6vWDO969PvNqNYHIOW5v0m   
3           3  1AWQoqb9bSvzTjaLralEkT  31TPClRtHm23RisEBtV3X7   
4           4  1lzr43nnXAijIGYnCT8M8H  5EvFsr3kj42KNv97ZEnqij   

                album_uri  
0  6vV5UrXcfyQD1wu4Qo2I9K  
1  0z7pVBGOD7HCIB7S8eLkLI  
2  25hVFAxTlDvXbx2X2QkUkE  
3  6QPkyl04rXwTGlGlcYaRoW  
4  6NmFmPX56pcLBOFMhIiKvF  
Unnamed: 0    337684
track_uri     337684
artist_uri    337684
album_uri     337684
dtype: int64


In [169]:
# print(artist_features.head())
print(artist_features.columns)
print(artist_features.count())
artist_features.head()


Index(['0', 'artist_pop', 'genres'], dtype='object')
0             62633
artist_pop    62633
genres        62633
dtype: int64


Unnamed: 0,0,artist_pop,genres
0,2wIVse2owClT7go1WT98tk,70,dance_pop hip_hop hip_pop neo_soul pop_rap r&b...
1,26dSoYclwsYLMAKD3tpOr4,80,dance_pop pop
2,6vWDO969PvNqNYHIOW5v0m,86,pop r&b
3,31TPClRtHm23RisEBtV3X7,82,dance_pop pop
4,5EvFsr3kj42KNv97ZEnqij,71,pop_rap reggae_fusion


In [170]:
print(audio_features.head())
print(audio_features.count())

   danceability  energy  key  loudness  mode  speechiness  acousticness  \
0         0.904   0.813    4    -7.105     0       0.1210       0.03110   
1         0.774   0.838    5    -3.914     0       0.1140       0.02490   
2         0.664   0.758    2    -6.583     0       0.2100       0.00238   
3         0.892   0.714    4    -6.055     0       0.1410       0.20100   
4         0.853   0.606    0    -4.596     1       0.0713       0.05610   

   instrumentalness  liveness  valence    tempo            type  \
0          0.006970    0.0471    0.810  125.461  audio_features   
1          0.025000    0.2420    0.924  143.040  audio_features   
2          0.000000    0.0598    0.701   99.259  audio_features   
3          0.000234    0.0521    0.817  100.972  audio_features   
4          0.000000    0.3130    0.654   94.759  audio_features   

                       id                                   uri  \
0  0UaMYEvWZi0ZqiDOoHU3YI  spotify:track:0UaMYEvWZi0ZqiDOoHU3YI   
1  6I9VzXrHx

In [171]:
print(track_features.head())
print(track_features.count())

                track_uri release_date pop
0  0UaMYEvWZi0ZqiDOoHU3YI   2005-07-04  70
1  6I9VzXrHxO9rA9A5euc8Ak   2003-11-13  86
2  0WqIKmW4BTrj3eJFmnCKMv   2003-06-23  21
3  1AWQoqb9bSvzTjaLralEkT   2002-11-04  83
4  1lzr43nnXAijIGYnCT8M8H         2000   4
track_uri       199480
release_date    199480
pop             199480
dtype: int64


# Merging all dataframes

In [172]:
df_main = pd.merge(df,audio_features, left_on = "track_uri", right_on= "id",how = 'inner')

In [173]:
df_main = pd.merge(df_main,track_features, left_on = "track_uri", right_on= "track_uri",how = 'outer')

In [174]:
df_main = pd.merge(df_main,artist_features, left_on = "artist_uri", right_on= "0",how = 'inner')

# Handling missing data 

In [175]:
df_main.isna().sum()

Unnamed: 0            0
track_uri             0
artist_uri            0
album_uri             0
danceability          0
energy                0
key                   0
loudness              0
mode                  0
speechiness           0
acousticness          0
instrumentalness      0
liveness              0
valence               0
tempo                 0
type                  0
id                    0
uri                   0
track_href            0
analysis_url          0
duration_ms           0
time_signature        0
release_date        521
pop                 521
0                     0
artist_pop            0
genres                0
dtype: int64

# Droping missing values

In [176]:
df_main.dropna(axis=0,inplace=True)

In [177]:
df_main.isna().sum().sum()

0

In [178]:
df_main.count()

Unnamed: 0          198492
track_uri           198492
artist_uri          198492
album_uri           198492
danceability        198492
energy              198492
key                 198492
loudness            198492
mode                198492
speechiness         198492
acousticness        198492
instrumentalness    198492
liveness            198492
valence             198492
tempo               198492
type                198492
id                  198492
uri                 198492
track_href          198492
analysis_url        198492
duration_ms         198492
time_signature      198492
release_date        198492
pop                 198492
0                   198492
artist_pop          198492
genres              198492
dtype: int64

In [179]:
df_main.value_counts()

Unnamed: 0  track_uri               artist_uri              album_uri               danceability  energy  key   loudness  mode  speechiness  acousticness  instrumentalness  liveness  valence  tempo    type            id                      uri                                   track_href                                                analysis_url                                                      duration_ms  time_signature  release_date  pop  0                       artist_pop  genres                                                                                
0.0         0UaMYEvWZi0ZqiDOoHU3YI  2wIVse2owClT7go1WT98tk  6vV5UrXcfyQD1wu4Qo2I9K  0.904         0.813   4.0   -7.105    0.0   0.1210       0.031100      0.006970          0.0471    0.810    125.461  audio_features  0UaMYEvWZi0ZqiDOoHU3YI  spotify:track:0UaMYEvWZi0ZqiDOoHU3YI  https://api.spotify.com/v1/tracks/0UaMYEvWZi0ZqiDOoHU3YI  https://api.spotify.com/v1/audio-analysis/0UaMYEvWZi0ZqiDOoHU3YI  226864.0     4.0       

In [180]:
df_main.nunique()

Unnamed: 0          198492
track_uri           198492
artist_uri           40912
album_uri            93986
danceability          1238
energy                2196
key                     12
loudness             22679
mode                     2
speechiness           1579
acousticness          4967
instrumentalness      5401
liveness              1752
valence               1698
tempo                77283
type                     1
id                  198492
uri                 198492
track_href          198492
analysis_url        198492
duration_ms          73129
time_signature           5
release_date          8827
pop                     95
0                    40912
artist_pop              98
genres               13935
dtype: int64

In [182]:
df_main.columns

Index(['Unnamed: 0', 'track_uri', 'artist_uri', 'album_uri', 'danceability',
       'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'type', 'id', 'uri',
       'track_href', 'analysis_url', 'duration_ms', 'time_signature',
       'release_date', 'pop', '0', 'artist_pop', 'genres'],
      dtype='object')

In [183]:
dfdropped=df_main.drop(columns=['Unnamed: 0','type', 'id', 'uri',
       'track_href', 'analysis_url','0'])

In [185]:
dfdropped.head()

Unnamed: 0,track_uri,artist_uri,album_uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,release_date,pop,artist_pop,genres
0,0UaMYEvWZi0ZqiDOoHU3YI,2wIVse2owClT7go1WT98tk,6vV5UrXcfyQD1wu4Qo2I9K,0.904,0.813,4.0,-7.105,0.0,0.121,0.0311,0.00697,0.0471,0.81,125.461,226864.0,4.0,2005-07-04,70,70,dance_pop hip_hop hip_pop neo_soul pop_rap r&b...
1,3jagJCUbdqhDSPuxP8cAqF,2wIVse2owClT7go1WT98tk,6DeU398qrJ1bLuryetSmup,0.884,0.677,1.0,-5.603,1.0,0.283,0.0778,0.0,0.0732,0.584,101.868,263227.0,4.0,2002-11-11,73,70,dance_pop hip_hop hip_pop neo_soul pop_rap r&b...
2,3XplJgPz8VjbDzbGwGgZdq,2wIVse2owClT7go1WT98tk,6epR3D622KWsnuHye7ApOl,0.794,0.805,0.0,-6.554,1.0,0.23,0.538,0.122,0.0952,0.658,177.799,236933.0,4.0,2006-09-04,43,70,dance_pop hip_hop hip_pop neo_soul pop_rap r&b...
3,0jG92AlXau21qgCQRxGLic,2wIVse2owClT7go1WT98tk,20t54K6C80QQH7vbcpfJcP,0.622,0.669,9.0,-8.419,1.0,0.329,0.0266,3e-06,0.152,0.57,93.839,252987.0,4.0,2001-05-14,58,70,dance_pop hip_hop hip_pop neo_soul pop_rap r&b...
4,6zsk6uF3MxfIeHPlubKBvR,2wIVse2owClT7go1WT98tk,20t54K6C80QQH7vbcpfJcP,0.797,0.75,0.0,-9.369,1.0,0.247,0.533,0.108,0.095,0.74,177.87,211120.0,4.0,2001-05-14,72,70,dance_pop hip_hop hip_pop neo_soul pop_rap r&b...


## Data Preprocessing

Dividing tracks in buckets of range 5 accoridng to track and artist popularity
Buckets of 50 for release data

In [194]:
dfdropped['Track_pop'] = dfdropped['pop'].apply(lambda x: int(int(x)/5))
dfdropped['Artist_pop'] = dfdropped['artist_pop'].apply(lambda x: int(int(x)/5))

In [198]:
dfdropped['Track_release_date'] = dfdropped['release_date'].apply(lambda x: x.split('-')[0])

In [199]:
dfdropped['Track_release_date']=dfdropped['Track_release_date'].astype('int16')

In [200]:

dfdropped['Track_release_date'] = dfdropped['Track_release_date'].apply(lambda x: int(x/50))

In [202]:
dfdropped.columns

Index(['track_uri', 'artist_uri', 'album_uri', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature',
       'release_date', 'pop', 'artist_pop', 'genres', 'Track_pop',
       'Artist_pop', 'Track_release_date'],
      dtype='object')

In [None]:
dfdropped.drop(columns=['pop','artist_pop','release_date'],inplace=True)

In [16]:
dfdropped.to_csv('data/final_processed_data.csv',index=False)

In [None]:
dfdropped.to_parquet('data/final_processed_data.parquet',index=False)