# Using Content Based Filtering Recommended System

In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

<p>Creating dataframes from following csv files</p>

In [74]:
songs_playlist_df = pd.read_csv('/Users/nitanshjain/Documents/Data_Science_Learning/Spotify_Recommender_System/csv/songs_playlist_df.csv')
songs_50_df = pd.read_csv('/Users/nitanshjain/Documents/Data_Science_Learning/Spotify_Recommender_System/csv/songs_50_df.csv')

In [75]:
# dropping columns
songs_playlist_df.drop(columns=["album", "artist_genres","time_signature","artist_name"], axis=1, inplace=True) 
songs_50_df.drop(columns=["artist_genres","time_signature","artist_name"], axis=1, inplace=True)

In [76]:
songs_playlist_df.drop_duplicates(subset='track_name', inplace=True) # removing duplicate songs based on track name
songs_50_df.drop_duplicates(subset='track_name', inplace=True)

In [77]:
songs_playlist_df.head()

Unnamed: 0,track_name,artist_pop,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,...,sad lo-fi,sad rap,a cappella,world worship,roots worship,cartoon,assamese pop,children's music,hi-nrg,track_pop
0,You Right,88,0.828,0.621,-6.414,0.0565,0.0164,0.00233,0.0845,0.436,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,86
1,Wild Side (feat. Cardi B),68,0.74,0.576,-6.744,0.146,0.0249,0.0,0.104,0.315,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
2,Peaches (feat. Daniel Caesar & Giveon),90,0.677,0.696,-6.181,0.119,0.321,0.0,0.42,0.464,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,87
3,Chicken Lemon Rice,42,0.516,0.895,-5.77,0.0919,0.000376,0.54,0.492,0.694,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38
4,LALA (Unlocked) (feat. Swae Lee),76,0.615,0.56,-4.938,0.184,0.0889,0.000114,0.314,0.33,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48


In [78]:
songs_50_df.head()

Unnamed: 0,track_name,artist_pop,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,genre_score,track_pop
0,Cheerleader - Felix Jaehn Remix Radio Edit,64,0.782,0.685,-6.237,0.0309,0.166,1.2e-05,0.16,0.603,118.016,180566,13.0,79
1,Here Comes The Sun - Remastered 2009,82,0.557,0.54,-10.484,0.0347,0.0339,0.00248,0.179,0.394,129.171,185733,15.0,82
2,Shallow,83,0.572,0.385,-6.362,0.0308,0.371,0.0,0.231,0.323,95.799,215733,34.0,84
3,Sweet but Psycho,80,0.72,0.706,-4.719,0.0473,0.0684,0.0,0.166,0.62,133.002,187436,32.0,81
4,Tacones Rojos,80,0.748,0.856,-3.517,0.0348,0.0824,0.0,0.142,0.927,123.031,189320,14.0,84


<p>Scaling the data using Standard Scaler</p>

In [79]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
songs_playlist_df.loc[:,"artist_pop":"track_pop"] = scaler.fit_transform(songs_playlist_df.loc[:,"artist_pop":"track_pop"])

In [80]:
songs_playlist_np = songs_playlist_df.copy() #creating a copy
songs_playlist_np.drop("track_name", axis=1, inplace=True) #dropping column track name as it is the only column of type object
songs_playlist_np = songs_playlist_np.to_numpy() #converting to numpy 2d array


<p>Following code is based on the recommeder-system-python by datacamp link provided below. I haven't yet properly grasped as to why we are using linear_kernel and finding cosine values. Will update this block once I have understood it</p>

In [81]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim_playlist = linear_kernel(songs_playlist_np, songs_playlist_np)

In [82]:
print(cosine_sim_playlist.shape)

(2179, 2179)


In [83]:
songs_playlist_indices = pd.Series(songs_playlist_df.index, index=songs_playlist_df["track_name"]).drop_duplicates() 
#creating series that includes the all the unique songs in the csv and assigns them an index value

In [84]:
def get_recommendations(title):
    
    index = songs_playlist_indices[title] # getting index value of the song from the series based on track_name
    sim_scores = list(enumerate(cosine_sim_playlist[index])) # Get the pairwsie similarity scores of all songs with that song
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) # Sort the songs based on the similarity scores
    sim_scores = sim_scores[1:11] # Get the scores of the 10 most similar songs
    print(sim_scores)
    recommended_songs_indices = [i[0] for i in sim_scores]  # Get the songs indices
    print(songs_playlist_df["track_name"].iloc[recommended_songs_indices]) # Return the top 10 most similar songs

In [85]:
# used try and except because there is some error occuring for some songs that I can't understand why
for song in songs_50_df.track_name:
    try:
        print("The following songs were recommended for \n{}".format(song))
        get_recommendations(song)
        print("\n")
    except:
        print("\n")

The following songs were recommended for 
Cheerleader - Felix Jaehn Remix Radio Edit
[(464, 14.243787060411849), (11, 13.963377109667576), (1684, 13.11860396673769), (2045, 13.063186131641514), (0, 12.982218727168258), (1516, 12.910609093364208), (1518, 12.496931341189702), (1245, 12.315450174296322), (1384, 12.123287763129227), (265, 12.078749238785973)]
464     Cheerleader - Felix Jaehn Remix Radio Edit
11                              That's What I Like
1684                                 The Lazy Song
2045                          Locked out of Heaven
0                                        You Right
1516                                     24K Magic
1518                              Sweet but Psycho
1245                    We're All In This Together
1384                               Don't Start Now
265                 All 4 Nothing (I'm So In Love)
Name: track_name, dtype: object


The following songs were recommended for 
Here Comes The Sun - Remastered 2009
[(775, 888.62452148

# References
<a href="https://towardsdatascience.com/a-one-stop-shop-for-principal-component-analysis-5582fb7e0a9c">https://towardsdatascience.com/a-one-stop-shop-for-principal-component-analysis-5582fb7e0a9c</a>
<br>
<a href="https://www.datacamp.com/tutorial/recommender-systems-python">https://www.datacamp.com/tutorial/recommender-systems-python</a>