In [22]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
import pickle


In [55]:
df = pd.read_csv('/Users/aravindrajeshmenon/Documents/DataScienceProjects/Projects/Recommender-Model/data/spotify_data.csv')
df.head(5)


Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,2012,acoustic,0.483,0.303,4,-10.058,1,0.0429,0.694,0.0,0.115,0.139,133.406,240166,3
1,1,Jason Mraz,93 Million Miles,1s8tP3jP4GZcyHDsjvw218,50,2012,acoustic,0.572,0.454,3,-10.286,1,0.0258,0.477,1.4e-05,0.0974,0.515,140.182,216387,4
2,2,Joshua Hyslop,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F,57,2012,acoustic,0.409,0.234,3,-13.711,1,0.0323,0.338,5e-05,0.0895,0.145,139.832,158960,4
3,3,Boyce Avenue,Fast Car,63wsZUhUZLlh1OsyrZq7sz,58,2012,acoustic,0.392,0.251,10,-9.845,1,0.0363,0.807,0.0,0.0797,0.508,204.961,304293,4
4,4,Andrew Belle,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8,54,2012,acoustic,0.43,0.791,6,-5.419,0,0.0302,0.0726,0.0193,0.11,0.217,171.864,244320,4


In [56]:
df = df.drop(['Unnamed: 0'], axis = 1)


To build a recommender system for Indian songs alone, we first create a separate dataset that includes only Indian songs.

In [57]:
df_ind = df[df['genre'] == 'indian']
df_ind.shape

(20583, 19)

In [58]:
df_ind.isna().sum()

artist_name         0
track_name          0
track_id            0
popularity          0
year                0
genre               0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
duration_ms         0
time_signature      0
dtype: int64

To decide whether to include the artist name as a feature in the similarity matrix, we look at the total unique artist names in the Indian song dataset.

In [59]:
df_ind['artist_name'].nunique()

1331

It is seen that there are 1331 unique artist names, and encoding these many categories would require one-hot encoding where each data point would have 1330 0's and one 1. This is a huge waste and not recommended so we shall skip encoding the artist names into the similarity matrix. 

In [60]:
df_ind = df_ind.copy()
df_ind['track_name'] = df_ind['track_name'].str.lower()
scaler = StandardScaler()
features = ['popularity','danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms']
df_ind[features] = scaler.fit_transform(df_ind[features])
df_ind

Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
32874,Pritam,raabta,6FjbAnaPRPwiP3sciEYctO,4.941604,2012,indian,0.769001,0.060159,-1.497069,0.414601,0.708808,-0.586512,0.591972,-0.723446,-0.504356,-0.611404,-0.103279,-0.395121,4
32875,Pritam,kyon,1sYC1fG7s4p0pTykrJqmgW,3.787195,2012,indian,1.208622,0.429732,-0.921297,0.753899,0.708808,-0.552838,0.412265,-0.723496,-0.623506,1.467395,0.028290,-0.336118,4
32876,Pritam,phir le aya dil - reprise,7fpWJr5shT90KiCHXKHxch,3.787195,2012,indian,-1.906955,-0.109043,0.518133,0.222237,0.708808,-0.511000,0.988469,-0.723485,-0.625330,-0.640049,-1.598144,-0.232262,4
32877,Pritam,tu hi mera,1CVqr5LImdmJ1Upt4z08Pm,3.864156,2012,indian,0.539633,1.298005,1.669677,1.227356,0.708808,0.150236,0.155540,-0.723496,-0.540830,1.639264,-1.051179,-0.317951,4
32878,Pritam,aashiyan,7ttlemwytO21npSmLKqTBg,3.556313,2012,indian,1.864868,0.242719,1.381791,0.607438,0.708808,-0.362018,-0.566142,-0.723488,-0.194320,1.238236,0.368606,-0.414036,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1141897,Red Baraat,hey jamalo (live in brooklyn),7qW0nMYIPvlycw7jGMUa4y,-0.599558,2011,indian,0.138240,1.756632,0.518133,0.593478,0.708808,0.211462,-1.207098,-0.251752,3.234305,1.532869,-0.944773,-0.057046,4
1141898,Sanjeev Chimmalgi,ganesh bhujangprayatma,6xVf2qPr94Zyh7Mu4w3Bgf,-0.676519,2011,indian,-0.651804,-0.287151,1.093905,0.075065,0.708808,-0.123238,0.663284,-0.723496,-0.780956,-0.873300,0.641496,0.974628,3
1141899,Trajik,hitman,5E7RCTi3xMxJqJLrSxpSJd,-0.676519,2011,indian,1.858497,0.184834,-0.345525,-0.615836,-1.410819,1.752306,-1.362559,-0.723210,2.115746,1.119565,0.573798,-0.384799,4
1141900,Yearbook Committee,the weather,5Q7TrLKFpFVyNfIK6wgqys,-0.676519,2011,indian,0.361236,-0.634460,-1.497069,0.413418,0.708808,0.293096,-0.623191,-0.722656,-0.267270,-0.623680,-0.062572,-0.473751,4


In [61]:
df_ind.to_csv('../data/indian_songs.csv')
