# Content Based Recommendations System

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [9]:
songs = pd.read_csv("data.csv")
songs.drop(songs.columns[songs.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
songs.head(4)

Unnamed: 0,valence,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,speechiness,tempo
0,0.0594,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,0.0366,80.954
1,0.963,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,0.415,60.936
2,0.0394,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,0.0339,110.339
3,0.165,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,0.0354,100.109


In [None]:
songs.shape
songs.isna().sum()

In [None]:
viz_songs=songs.drop(columns=['id', 'name', 'artists'])
viz_songs.head()

In [None]:
#EDA
plt.subplots(figsize=(12, 8))
sns.heatmap(viz_songs.corr(), annot=True, square=True)
plt.show()

In [10]:
def normalize_column(col):
    songs[col] = (songs[col] - songs[col].min()) / (songs[col].max() - songs[col].min())    

num_types = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
num = songs.select_dtypes(include=num_types)
num.fillna(value = 0,inplace = True)
for col in num.columns:
  normalize_column(col)

# num_2 = viz_songs.select_dtypes(include=num_types)

# for col in num_2.columns:
#   normalize_column(col)
# viz_songs.head()

In [11]:
#K-Means clustering for genre classification
from sklearn.cluster import KMeans

km = KMeans(n_clusters=7)
cat = km.fit_predict(num)
songs['cat'] = cat
# viz_songs['cat'] = cat
normalize_column('cat')

In [None]:
songs.cat[:10]
# print(viz_songs.loc[viz_songs['id']=='7lmeHLHBe4nmXzuXc0HDjk'].shape)

In [13]:


def find_songVector(viz_songs,id):
    return viz_songs.loc[songs['id']==id]
    
#finds reccomendations using a given song title
def find_recommendations(id):

    viz_songs=songs.drop(columns=['id', 'name', 'artists'])
    song_vec=find_songVector(viz_songs,id)
    sim_viz_songs=viz_songs[viz_songs.cat==song_vec.cat.values[0]]
    sim_viz_songs.fillna(value = 0,inplace = True)
    sim=cosine_similarity(sim_viz_songs,song_vec)
    scores=list(enumerate(sim))
    sorted_scores=sorted(scores,key=lambda x:x[1],reverse=True)  #sorts all the songs in the list in reverse order (decreasing order)
    sorted_scores=sorted_scores[1:]                               #skips the first index as it is the same song with highest similarity
    # print(len(sorted_scores))
    # print(scores)
    rec_songs=pd.DataFrame()
    for i in range(0,5):
        indx=sorted_scores[i][0]
        rec_songs=rec_songs.append(songs.loc[indx])       #adds all song title according to the scores found
    return rec_songs #returns the songs


find_recommendations('7lmeHLHBe4nmXzuXc0HDjk')

Unnamed: 0,valence,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,speechiness,tempo,cat
8197,0.97,0.509036,['Dee Dee Sharp'],0.802632,0.022782,0.798,0.0,2ZxzsmJ093WPTBbsoCf6CN,2e-06,0.181818,0.203,0.851617,1.0,Gravy (For My Mashed Potatoes),0.35,0.034536,0.533718,1.0
7772,0.745,0.763052,['Sonny Stitt'],0.657895,0.033733,0.371,0.0,3p4C7vCwkqOD3bSlh96pls,0.0389,0.0,0.159,0.757341,0.0,I Got Rhythm,0.3,0.060928,0.587203,0.0
61265,0.289,0.922691,"['Stephen Douglass', 'Make a Wish Ensemble']",0.393725,0.056357,0.312,0.0,002dh6a4LfxfGGnhPZY4fG,4e-06,0.636364,0.425,0.785545,1.0,"Paris, France / When Does This Feeling Go Away?",0.0,0.041237,0.490659,0.833333
33356,0.964,0.673695,['Banda El Recodo'],0.61336,0.03174,0.402,0.0,6oiKbZIaQ8A0Rnsq9jMJ1E,0.0,0.909091,0.102,0.797009,1.0,Seis Pies Abajo,0.58,0.032268,0.550477,0.0
47356,0.0996,0.98494,['Linda Perhacs'],0.312753,0.036986,0.0676,0.0,0PSxgms5q4pEtRGXRw2oWK,0.234,0.0,0.0962,0.623271,1.0,Chimacum Rain,0.38,0.036392,0.460032,0.0


In [None]:
class SpotifyRecommender():
    def __init__(self, rec_data):
        #our class should understand which data to work with
        self.rec_data_ = rec_data
    
    #if we need to change data
    def change_data(self, rec_data):
        self.rec_data_ = rec_data
    # num_types = ['string']
    # non_num_col=songs.select_dtypes(include=num_types)

    #function which returns recommendations, we can also choose the amount of songs to be recommended
    def get_recommendations(self, song_name, amount=1):
        distances = []
        #choosing the data for our song
        song = self.rec_data_[(self.rec_data_.name.str.lower() == song_name.lower())].head(1).values[0]
        #dropping the data with our song
        res_data = self.rec_data_[self.rec_data_.name.str.lower() != song_name.lower()]
        for r_song in tqdm(res_data.values):
            dist = 0
            for col in np.arange(len(res_data.columns)):
                #indeces of non-numerical columns
                if not col in [3, 8, 14,16]:
                    #calculating the manhettan distances for each numerical feature
                    dist = dist + np.absolute(float(song[col]) - float(r_song[col]))
            distances.append(dist)
        res_data['distance'] = distances
        #sorting our data to be ascending by 'distance' feature
        res_data = res_data.sort_values('distance')
        columns = ['artists', 'name']
        return res_data[columns][:amount]

In [None]:
recommender = SpotifyRecommender(songs)

In [None]:
recommender.get_recommendations("7lmeHLHBe4nmXzuXc0HDjk", 5)