<div class="alert alert-success alertsuccess"> <center>
    <h1> Music Recommendation System </h1>
    </center>

-------------------------------------------
------------------------------------------

<h3> Details of Dataset: </h3>
<br>
<ul>
    <li>The file <code>tracks_features.csv</code> was obtained from the Kaggle dataset <a href = "https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs/data"> Spotify 1.2M+ Songs</a>.</li>
    <li>It contains audio features for over 1.2 million songs, obtained with the Spotify API. </li>
    <li>Reference for these audio features can be found <a href = "https://developer.spotify.com/documentation/web-api/reference/get-audio-features">here</a></li>
</ul>
    

In [44]:
# Required imports

In [62]:
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Notebook-wide settings: silence every warning and let pandas show all columns
warnings.simplefilter("ignore")

pd.set_option("display.max_columns", None)

In [46]:
# Load tracks_features.csv.
# The location can be overridden with the TRACKS_FEATURES_CSV environment
# variable so the notebook also runs on other machines; when the variable is
# not set, the original local path is used unchanged.
TRACKS_CSV_PATH = os.environ.get(
    "TRACKS_FEATURES_CSV",
    r"C:\Users\maner\OneDrive\Desktop\New folder\Pandas dataset\proj datasets\music\tracks_features.csv\tracks_features.csv",
)
df = pd.read_csv(TRACKS_CSV_PATH)

In [47]:
# Overview of the imported file: first rows, dimensions, and column summary
display(df.head())
print("-"*50)
print(f"shape dataframe is {df.shape}")
print("-"*50)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
df.info()

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],1,1,False,0.47,0.978,7,-5.399,1,0.0727,0.0261,1.1e-05,0.356,0.503,117.906,210133,4.0,1999,1999-11-02
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],2,1,True,0.599,0.957,11,-5.764,1,0.188,0.0129,7.1e-05,0.155,0.489,103.68,206200,4.0,1999,1999-11-02
2,1hR0fIFK2qRG3f3RF70pb7,Calm Like a Bomb,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],3,1,False,0.315,0.97,7,-5.424,1,0.483,0.0234,2e-06,0.122,0.37,149.749,298893,4.0,1999,1999-11-02
3,2lbASgTSoDO7MTuLAXlTW0,Mic Check,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],4,1,True,0.44,0.967,11,-5.83,0,0.237,0.163,4e-06,0.121,0.574,96.752,213640,4.0,1999,1999-11-02
4,1MQTmpYOZ6fcMQc56Hdo7T,Sleep Now In the Fire,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],5,1,False,0.426,0.929,2,-6.729,1,0.0701,0.00162,0.105,0.0789,0.539,127.059,205600,4.0,1999,1999-11-02


--------------------------------------------------
shape dataframe is (1204025, 24)
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1204025 entries, 0 to 1204024
Data columns (total 24 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   id                1204025 non-null  object 
 1   name              1204025 non-null  object 
 2   album             1204025 non-null  object 
 3   album_id          1204025 non-null  object 
 4   artists           1204025 non-null  object 
 5   artist_ids        1204025 non-null  object 
 6   track_number      1204025 non-null  int64  
 7   disc_number       1204025 non-null  int64  
 8   explicit          1204025 non-null  bool   
 9   danceability      1204025 non-null  float64
 10  energy            1204025 non-null  float64
 11  key               1204025 non-null  int64  
 12  loudness          1204025 non-null  float64
 13  mode        

In [48]:
# id column is unique identifier
# Name, album, artists are required for the output
# The following features are used for recommending music:
#          'explicit', 'danceability', 'energy','key', 'loudness', 'mode', 'speechiness', 'acousticness',
#          'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms','time_signature'
# year and release_date (quarters) are used to filter the music recommendation

In [49]:
# release_date is stored as strings (object dtype); parse it into datetimes.
# errors="coerce" turns values that cannot be parsed into NaT instead of raising.
df["release_date"] = pd.to_datetime(df["release_date"], errors="coerce")

In [50]:
# Drop the rows whose release_date could not be parsed (NaT after coercion).
# Note: no subset is given, so a row with a null in ANY column is removed.
df = df.dropna(axis=0, how="any")

In [51]:
# Derive the calendar quarter (1-4) of each track from its release date
df["quarter"] = df["release_date"].dt.quarter

In [52]:
# Turn the boolean explicit flag into an integer (False -> 0, True -> 1)
df["explicit"] = df["explicit"].astype("int")

In [53]:
# Preview the first five rows after the preprocessing steps
df.head(n=5)

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date,quarter
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],1,1,0,0.47,0.978,7,-5.399,1,0.0727,0.0261,1.1e-05,0.356,0.503,117.906,210133,4.0,1999,1999-11-02,4
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],2,1,1,0.599,0.957,11,-5.764,1,0.188,0.0129,7.1e-05,0.155,0.489,103.68,206200,4.0,1999,1999-11-02,4
2,1hR0fIFK2qRG3f3RF70pb7,Calm Like a Bomb,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],3,1,0,0.315,0.97,7,-5.424,1,0.483,0.0234,2e-06,0.122,0.37,149.749,298893,4.0,1999,1999-11-02,4
3,2lbASgTSoDO7MTuLAXlTW0,Mic Check,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],4,1,1,0.44,0.967,11,-5.83,0,0.237,0.163,4e-06,0.121,0.574,96.752,213640,4.0,1999,1999-11-02,4
4,1MQTmpYOZ6fcMQc56Hdo7T,Sleep Now In the Fire,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],5,1,0,0.426,0.929,2,-6.729,1,0.0701,0.00162,0.105,0.0789,0.539,127.059,205600,4.0,1999,1999-11-02,4


In [54]:
# Build training_df from the feature columns used for clustering
feature_columns = [
    'explicit', 'danceability', 'energy', 'key', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness', 'liveness',
    'valence', 'tempo', 'duration_ms', 'time_signature',
]
training_df = df[feature_columns]

In [55]:
# Scaler that maps every feature onto the [0, 1] range (MinMaxScaler's default)
scale = MinMaxScaler(feature_range=(0, 1))

In [56]:
# Learn the per-column minimum and maximum from the unscaled features
scale.fit(X=training_df)

In [57]:
# scale.transform returns a plain array, so the result is wrapped back into a
# DataFrame that keeps the feature column names and uses the track id as index.
scaled_values = scale.transform(training_df)
training_df = pd.DataFrame(
    scaled_values,
    index=df["id"],
    columns=training_df.columns,
)

In [58]:
# Overview of training_df before clustering: first rows, dimensions, column summary
display(training_df.head())
print("-"*50)
print("Shape of training_df is ",training_df.shape)
print("-"*50)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
training_df.info()

Unnamed: 0_level_0,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
7lmeHLHBe4nmXzuXc0HDjk,0.0,0.47,0.978,0.636364,0.812104,1.0,0.075026,0.026205,1.1e-05,0.356,0.503,0.473644,0.03451,0.8
1wsRitfRRtWyEapl0q22o8,1.0,0.599,0.957,1.0,0.806675,1.0,0.194014,0.012952,7.1e-05,0.155,0.489,0.416496,0.033861,0.8
1hR0fIFK2qRG3f3RF70pb7,0.0,0.315,0.97,0.636364,0.811732,1.0,0.498452,0.023494,2e-06,0.122,0.37,0.601561,0.049157,0.8
2lbASgTSoDO7MTuLAXlTW0,1.0,0.44,0.967,1.0,0.805694,0.0,0.244582,0.163655,4e-06,0.121,0.574,0.388665,0.035089,0.8
1MQTmpYOZ6fcMQc56Hdo7T,0.0,0.426,0.929,0.181818,0.792322,1.0,0.072343,0.001627,0.105,0.0789,0.539,0.510412,0.033762,0.8


--------------------------------------------------
Shape of training_df is  (1204015, 14)
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
Index: 1204015 entries, 7lmeHLHBe4nmXzuXc0HDjk to 3GgQmOxxLyRoAb4j86zOBX
Data columns (total 14 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   explicit          1204015 non-null  float64
 1   danceability      1204015 non-null  float64
 2   energy            1204015 non-null  float64
 3   key               1204015 non-null  float64
 4   loudness          1204015 non-null  float64
 5   mode              1204015 non-null  float64
 6   speechiness       1204015 non-null  float64
 7   acousticness      1204015 non-null  float64
 8   instrumentalness  1204015 non-null  float64
 9   liveness          1204015 non-null  float64
 10  valence           1204015 non-null  float64
 11  tempo             1204015 non-null  float64
 12  duration_ms       120401

In [59]:
# Convert every column of training_df to float32
# (presumably to reduce memory use before clustering — confirm)
training_df = training_df.astype('float32')

In [60]:
# Cluster the scaled tracks in training_df into 10 groups.
# random_state is fixed because KMeans picks its initial centroids randomly;
# without a seed every run of the notebook can assign tracks to different
# clusters, which makes the recommendations irreproducible.
kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(training_df)

In [63]:
# Persist the fitted KMeans model to the file 'music_r_final' (binary write mode)
with open('music_r_final', mode='wb') as model_file:
    pickle.dump(kmeans, model_file)

In [64]:
# Load the saved model back from 'music_r_final' (binary read mode).
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
with open('music_r_final', mode='rb') as model_file:
    music_r_final = pickle.load(model_file)

In [80]:
# Attach the cluster label of every track to both the feature frame and the
# full track frame (labels_ is in the same row order as the fitted data).
cluster_labels = music_r_final.labels_
for frame in (df, training_df):
    frame['cluster'] = cluster_labels

In [165]:
def music_recomm():
    """Interactively recommend songs that resemble a song named by the user.

    Prompts on stdin for a song name (and an album name when several tracks
    share that title), the number of recommendations, and whether the results
    should be limited to one release year and quarter. Reads the module-level
    ``df`` (tracks with ``cluster``, ``year`` and ``quarter`` columns) and the
    fitted ``scale`` object.

    * Without a period filter, a random sample of other songs from the chosen
      song's cluster is returned.
    * With a period filter, the songs of the same cluster released in that
      year and quarter are ranked by cosine similarity to the chosen song on
      the scaled audio features, most similar first.

    Returns:
        pandas.DataFrame with the columns ``id``, ``name``, ``album`` and
        ``artists`` holding at most the requested number of rows (possibly
        empty), or ``None`` when the song is not found or a numeric answer is
        invalid (a message is printed in that case).
    """
    feature_cols = ['explicit', 'danceability', 'energy', 'key', 'loudness', 'mode',
                    'speechiness', 'acousticness', 'instrumentalness', 'liveness',
                    'valence', 'tempo', 'duration_ms', 'time_signature']
    output_cols = ['id', 'name', 'album', 'artists']

    ip = input("Enter the Song Name --> ")
    matches = df[df.name.str.lower() == ip.lower()]
    if matches.shape[0] > 1:
        print(f"Many matches found for song {ip}")
        album = input("Please Enter the album name --> ")
        # Narrow down with the album the user just typed.
        matches = matches[matches.album.str.lower() == album.lower()]
    if matches.empty:
        print("sorry! input song is not in our database")
        return None

    # Positional access: the filtered frame keeps the original row labels, so a
    # label lookup with [0] would fail for every song that is not row 0.
    idx = matches['id'].iloc[0]
    clstr = matches['cluster'].iloc[0]

    try:
        i = int(input("How many simillar songs are needed --> "))
        if i < 0:
            raise ValueError("negative count")
    except ValueError:
        print("sorry! the number of songs must be a non-negative whole number")
        return None

    q = input("Do you need recommendation of specific period (y/n) --> ")

    # Every other song of the same cluster; the chosen song itself is excluded
    # so it can never be recommended back to the user.
    same_cluster = df[(df.cluster == clstr) & (df.id != idx)]

    if q.lower() == 'n':
        # Never ask for more rows than exist, which would make sample() raise.
        return same_cluster[output_cols].sample(min(i, len(same_cluster)))

    try:
        year = int(input("enter the year --> "))
        qtr = int(input("enter the quarter (1/2/3/4) --> "))
    except ValueError:
        print("sorry! year and quarter must be whole numbers")
        return None

    candidates = same_cluster[(same_cluster.year == year) & (same_cluster.quarter == qtr)]
    if candidates.empty:
        return candidates[output_cols]

    # Scale with the already fitted scaler and compare each candidate with the
    # chosen song only (n x 1 similarities instead of a full n x n matrix).
    query_vector = scale.transform(matches.iloc[[0]][feature_cols])
    candidate_matrix = scale.transform(candidates[feature_cols])
    similarity = cosine_similarity(candidate_matrix, query_vector).ravel()

    # Most similar first; a stable sort keeps the original order among ties.
    order = np.argsort(-similarity, kind="stable")[:i]
    return candidates.iloc[order][output_cols]

In [76]:
# Sample song title and album name (presumably kept for manual checks — confirm)
ip, al = 'Testify', 'The Battle Of Los Angeles'

'7lmeHLHBe4nmXzuXc0HDjk'

In [166]:
# Run the interactive recommender; it asks for its inputs through input() prompts
music_recomm()

Enter the Song Name --> zzzzzzzzzz
sorry! input song is not in our database
