## Import libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity

In [2]:
# Load the per-track audio features; the path is relative to the notebook's working directory.
df = pd.read_csv('./Data/no_multiples.csv')

In [3]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,index,uri,tempo,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,time_signature,duration_ms
0,"siren 042, lala lala",spotify:track:75nN4kH1uzSsUdMrdUVfrq,108.0,0.601,0.596,11,-7.373,1,0.0251,0.694,0.00579,0.107,0.457,4,161043
1,"make me a song, eleanor friedberger",spotify:track:71GBQ7iVnffAGkNuTDxCoH,129.993,0.727,0.575,2,-8.295,1,0.0292,0.0548,0.0131,0.123,0.622,4,332400
2,"uprising, muse",spotify:track:4VqPOruhp5EdPBeR92t6lQ,128.019,0.602,0.905,2,-4.046,1,0.0775,0.000202,0.064,0.117,0.411,4,304840
3,"time is running out, muse",spotify:track:2takcwOaAZWiXQijPHIx7B,118.211,0.585,0.842,9,-5.883,0,0.0556,0.00242,0.00686,0.0866,0.428,4,237040
4,"knights of cydonia, muse",spotify:track:7ouMYWpwJ422jRcDASZB7P,137.114,0.366,0.963,11,-5.301,0,0.142,0.000273,0.0122,0.115,0.211,4,366213


In [4]:
# (rows, columns) of the raw table.
df.shape

(1734, 15)

In [5]:
# Every column except `index` and `uri` must be numeric for the scaling step further down.
df.dtypes

index                object
uri                  object
tempo               float64
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
time_signature        int64
duration_ms           int64
dtype: object

In [6]:
# Column names, for reference when selecting or dropping columns below.
df.columns

Index(['index', 'uri', 'tempo', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'time_signature', 'duration_ms'],
      dtype='object')

### I will save the `uri` column, since I need the URIs to search for the songs and add them to a playlist on Spotify

In [7]:
# Keep just the two columns needed to look a song up on Spotify. Selecting them by
# name (instead of listing every audio feature to drop) keeps this cell correct if
# the feature columns in the CSV are ever renamed, removed or extended.
# `.copy()` makes df_id independent of df, exactly as the frame returned by `drop` was.
df_id = df[['index', 'uri']].copy()


In [8]:
# Index the lookup table by song label so a URI can be fetched by name.
df_id = df_id.set_index('index')

In [9]:
# Check the label -> URI lookup table.
df_id.head()

Unnamed: 0_level_0,uri
index,Unnamed: 1_level_1
"siren 042, lala lala",spotify:track:75nN4kH1uzSsUdMrdUVfrq
"make me a song, eleanor friedberger",spotify:track:71GBQ7iVnffAGkNuTDxCoH
"uprising, muse",spotify:track:4VqPOruhp5EdPBeR92t6lQ
"time is running out, muse",spotify:track:2takcwOaAZWiXQijPHIx7B
"knights of cydonia, muse",spotify:track:7ouMYWpwJ422jRcDASZB7P


In [10]:
# Persist the lookup table; index=True writes the song label as the first CSV column.
df_id.to_csv('./Data/uri.csv', index= True)

### On my main dataframe I will set the `index` column, which I had created earlier, as my index, and drop the `uri` column so I only have numeric features

***Once I only have numeric features I can standard-scale my data. This ensures that each column will have μ = 0 and σ = 1; in simple English, it means they are all on the same scale!***

In [11]:
# Use the song label as the row index and discard the URI so that only
# numeric audio features remain.
df = df.set_index('index').drop(columns=['uri'])

In [12]:
# Confirm the frame is now indexed by song label and holds only feature columns.
df.head()

Unnamed: 0_level_0,tempo,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,time_signature,duration_ms
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
"siren 042, lala lala",108.0,0.601,0.596,11,-7.373,1,0.0251,0.694,0.00579,0.107,0.457,4,161043
"make me a song, eleanor friedberger",129.993,0.727,0.575,2,-8.295,1,0.0292,0.0548,0.0131,0.123,0.622,4,332400
"uprising, muse",128.019,0.602,0.905,2,-4.046,1,0.0775,0.000202,0.064,0.117,0.411,4,304840
"time is running out, muse",118.211,0.585,0.842,9,-5.883,0,0.0556,0.00242,0.00686,0.0866,0.428,4,237040
"knights of cydonia, muse",137.114,0.366,0.963,11,-5.301,0,0.142,0.000273,0.0122,0.115,0.211,4,366213


In [13]:
# Learn each column's mean and standard deviation, then apply the scaling.
# `data` is a plain array: the row and column labels are re-attached in the next cell.
ss = StandardScaler().fit(df)
data = ss.transform(df)

#### Now I will create a dataframe with the transformed features

In [14]:
# Re-attach the original row and column labels to the scaled array.
# Note that `df` is rebound here: from this cell on it holds scaled values.
df = pd.DataFrame(data=data, index=df.index, columns=df.columns)
df.shape

(1734, 13)

In [15]:
# Spot-check one scaled row.
df.head(1)

Unnamed: 0_level_0,tempo,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,time_signature,duration_ms
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
"siren 042, lala lala",-0.50495,0.45834,-0.112834,1.722145,0.482366,0.570691,-0.546041,1.386794,-0.389266,-0.545219,-0.481937,0.231248,-0.882392


### I will save this dataframe as well, since this is where I will be looking when I create my playlist based on tempo

In [16]:
# Persist the scaled features; index=True writes the song label as the first CSV column.
df.to_csv('./Data/identifier.csv', index= True)

#### Now I'm going to use cosine similarity from sklearn. This is a measure of similarity between two non-zero vectors, obtained by measuring the cosine of the angle between them. As we know, the cosines of 0° and 180° are 1 and -1 respectively, the cosine of any other angle falls between the two, and the cosines of 90° and 270° are both 0. Therefore, we are comparing the position of each item around the circle (its orientation), not its magnitude. This means two items that are most similar will have an identical orientation and a cosine similarity of 1, two items that have a 90° angle between their orientations will have a cosine similarity of 0, and if they have a cosine similarity of -1 then they are on opposite sides of the spectrum, and will therefore have a 180° angle between them.
#### For this recommendation system, we are assuming that each song is its own vector, with unique features. We will create a recommendation system by comparing each song (vector) to every other song in a big matrix.

In [17]:
# Pairwise cosine similarity between every pair of songs (rows of df).
# With a single argument, scikit-learn compares the rows against themselves.
cs = cosine_similarity(df)

In [18]:
# Label both axes with the song names so similarities can be looked up by label.
song_labels = df.index
matrix = pd.DataFrame(data=cs, index=song_labels, columns=song_labels)

In [19]:
# Show one row of the similarity matrix: the first song compared with every song.
matrix.head(1)

index,"siren 042, lala lala","make me a song, eleanor friedberger","uprising, muse","time is running out, muse","knights of cydonia, muse","vicious, lou reed","like a hurricane - 2017 remaster, neil young","smile like you mean it, the killers","season of the witch, donovan","harvest moon, neil young",...,"riders on the storm, the doors","you never can tell, chuck berry","you really got me - mono mix, the kinks","whole lotta love - 1990 remaster, led zeppelin","water under the bridge, adele","mannish boy, muddy waters","white rabbit, jefferson airplane","people are strange, the doors","valerie - live at bbc radio 1 live lounge, london / 2007, amy winehouse","fortunate son, creedence clearwater revival"
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"siren 042, lala lala",1.0,-0.311383,-0.286814,0.111002,-0.083149,-0.241955,-0.187183,-0.350366,-0.26267,0.098482,...,0.020727,0.183704,0.161542,-0.20714,-0.121297,-0.297464,0.274764,0.217554,0.35286,-0.142514


***As you can see, the first song in my dataframe has a cosine similarity of 1 with itself, which is exactly what we expect: every vector points in the same direction as itself!***

### Now I need a function to take any song of choice and return the 20 closest songs. I will sort them by tempo, because it just makes sense, and I will have the first 10 rising in tempo and the next 10 descending, because a playlist is like a roller coaster: if it goes up, it must come down, or I'm not riding!

In [20]:
def play_list(choice, features=None, similarity=None, size=20):
    """Return the songs most similar to ``choice``, arranged as a playlist.

    Parameters
    ----------
    choice : str
        Song label to seed the playlist with. It is matched literally (not as
        a regular expression) against the index of ``features``. An exact
        label is preferred; otherwise the lowest-tempo label that contains
        ``choice`` is used as the seed.
    features : pandas.DataFrame, optional
        Song-indexed feature table with a ``tempo`` column. Defaults to the
        notebook-level ``df``.
    similarity : pandas.DataFrame, optional
        Square song-by-song similarity table. Defaults to the notebook-level
        ``matrix``.
    size : int, optional
        Maximum number of songs to return (default 20).

    Returns
    -------
    pandas.DataFrame
        A frame with no columns whose index lists the selected song labels.
        The first ``size // 2`` entries are the closest songs in descending
        similarity; the remaining entries are the next-closest songs in
        ascending similarity. Ordering is by similarity only; tempo is used
        solely to choose between several partial matches. The seed itself is
        part of the ranking and is not removed.

    Raises
    ------
    ValueError
        If ``size`` is negative or no song label contains ``choice``.
    """
    # Fall back to the notebook-level frames so one-argument calls keep working.
    features = df if features is None else features
    similarity = matrix if similarity is None else similarity
    if size < 0:
        raise ValueError("size must be non-negative")

    # regex=False: labels contain characters such as '(' and '.' that must match literally.
    is_hit = features.index.str.contains(choice, regex=False, na=False)
    candidates = features.loc[is_hit]
    if len(candidates) == 0:
        raise ValueError(f"no song label contains {choice!r}")

    if choice in candidates.index:
        seed = choice
    else:
        # Partial match: keep the historical tie-break of taking the lowest tempo.
        seed = candidates.sort_values(by='tempo').index[0]

    # Sort once and slice twice instead of sorting the same column twice.
    ranked = similarity[seed].sort_values(ascending=False)
    half = size // 2
    closest = ranked.iloc[:half]
    next_closest_reversed = ranked.iloc[half:size].iloc[::-1]
    return pd.DataFrame(index=closest.index.append(next_closest_reversed.index))

In [21]:
# Example: seed a playlist with a label copied exactly from the index.
play_list('harvest moon, neil young')

"harvest moon, neil young"
"everyday i have the blues, b.b. king"
"season of the witch, lana del rey"
"everyday people, jeff buckley"
"courage, villagers"
"stuck inside of mobile with the memphis blues again - take 13, alternate take, bob dylan"
"pale blue eyes - closet mix, the velvet underground"
"pale blue eyes, the velvet underground"
"le naufragé, voyou"
"changes - 2015 remaster, david bowie"
"ten years gone - 1990 remaster, led zeppelin"


#### Lastly, I will save my recommender matrix to a CSV file so I can create playlists on top of it

In [22]:
# Persist the full song-by-song similarity matrix; index=True writes the song labels as the first CSV column.
matrix.to_csv('./Data/recommender.csv', index= True)