#### Song Recommendation Project 
 Nabila Fakhruddin
 


In [None]:
# importing important libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model  # will be using for plotting trend line
from sklearn.preprocessing import MinMaxScaler # for normalizing data
from sklearn.cluster import KMeans 
from sklearn import preprocessing
%matplotlib inline
import seaborn as sns
import plotly.express as px 

In [None]:
# Load the track-level dataset and report its (rows, columns) shape.
df_song = pd.read_csv('data.csv')
print(df_song.shape)

# De-duplicate songs released more than once by the same artist(s):
# after sorting by descending popularity, drop_duplicates keeps the first
# (most popular) row of every (name, artists) pair; sort_index then puts
# the surviving rows back in their original order.
df_song = (
    df_song
    .sort_values('popularity', ascending=False)
    .drop_duplicates(subset=['name', 'artists'])
    .sort_index()
)
print(df_song.shape)

In [None]:
# Preview the first three rows of the song table
df_song.head(3)

In [None]:
# preprocessing the data of songs
import ast


def _format_artists(raw):
    """Turn a stringified list such as "['A', 'B']" into "A, B".

    The value is parsed as a Python literal instead of being sliced and
    stripped of quote characters, so apostrophes that belong to an artist
    name survive and double-quoted entries are unquoted too. Anything that
    does not parse to a list/tuple (for example a value that was already
    cleaned by an earlier run of this cell) is returned unchanged, which
    makes the cell safe to re-run.
    """
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        return raw
    if isinstance(parsed, (list, tuple)):
        return ', '.join(str(artist) for artist in parsed)
    return raw


df_song['artists'] = df_song['artists'].apply(_format_artists)
df_song['release_date'] = pd.to_datetime(df_song['release_date'])

In [None]:
# Print the column dtypes and non-null counts of the song table
df_song.info()

In [None]:
# Pairwise correlation of the numeric (and boolean) columns only.
# The frame also holds text columns such as name/artists; since pandas 2.0
# DataFrame.corr() no longer drops those silently and raises instead, so the
# numeric subset is selected explicitly (works on every pandas version).
corr = df_song.select_dtypes(include=['number', 'bool']).corr()

# Heatmap of the correlation matrix on a fixed [-1, 1] colour scale centred at 0.
plt.figure(figsize=(20, 8))
sns.heatmap(
    corr,
    vmin=-1,
    vmax=1,
    center=0,
    linewidths=.5,  # seaborn's documented name for the cell-border width
    square=True,
    annot=True,
    annot_kws={'size': 8},
    fmt='.1f',
    cmap='Spectral',
)
plt.title('Correlation')
plt.show()

##### There is a strong positive correlation between valence & danceability, year & loudness, year & energy, year & popularity, and energy & loudness.
##### There is a strong negative correlation between year & acousticness, acousticness & energy, acousticness & loudness, and acousticness & popularity.

##### Over time, loudness & energy have taken the front seat in music choices.
##### Over the years acousticness has decreased as more instruments have become available in the market; as acousticness decreased, loudness and energy increased.

In [None]:
# Load the aggregated tables: one CSV each for genres, years and artists
df_genre = pd.read_csv('data_by_genres.csv')
data_by_year = pd.read_csv('data_by_year.csv')
df_artists = pd.read_csv('data_by_artist.csv')

In [None]:
# Print a banner followed by the dtype / non-null summary of each aggregated table.
for banner, frame in (
    ("----------------------------------------By Genre-------------------------------------------", df_genre),
    ("\n----------------------------------------By Year-------------------------------------------", data_by_year),
    ("\n----------------------------------------By Artist-------------------------------------------", df_artists),
):
    print(banner)
    frame.info()

In [None]:
# Preview the first three rows of the genre table
df_genre.head(3)

In [None]:
# Preview the first three rows of the per-year table
data_by_year.head(3)

In [None]:
# Preview the first three rows of the artist table
df_artists.head(3)

In [None]:
#pip install yellowbrick

In [None]:
# Map a calendar year onto its decade label for the decade-level analysis
def get_decade(year):
    """Return the decade that contains ``year`` as a label such as ``'1990s'``.

    The year is divided by ten, truncated to an integer and multiplied back
    by ten to obtain the first year of the decade, then suffixed with ``s``.
    """
    return '{}s'.format(int(year / 10) * 10)

# Label every song with the decade of its release year.
df_song['decade'] = df_song['year'].apply(get_decade)

# Bar chart of the number of songs per decade.
# The column is passed by keyword: on seaborn >= 0.12 the first positional
# argument of countplot is `data`, so a bare Series is no longer read as x.
# `order` sorts the labels so the bars run chronologically rather than in
# order of first appearance in the frame.
sns.set(rc={'figure.figsize': (11, 6)})
sns.countplot(x='decade', data=df_song, order=sorted(df_song['decade'].unique()))

##### The number of songs per decade is fairly consistent from the 1950s onward, so there is very little temporal bias in the dataset.

In [None]:
# Trend of the audio features over the years: one line per feature,
# plotted against the 'year' column of the per-year table
sound_features = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'valence','speechiness']
fig = px.line(data_by_year, x='year', y=sound_features)
fig.show()

##### Over time, energy has increased while acousticness has decreased drastically. Between 1930 and 1950 speechiness varied a lot, but after that it remained stagnant.

In [None]:
# Pairwise scatter plots of the per-year audio features listed in sound_features
sns.pairplot(data_by_year[sound_features])

# Show plots
plt.show()

##### There is a strong negative correlation between acousticness and energy.

In [None]:
# The 10 genres with the highest popularity, shown as grouped bars of four audio features
top10_genres = df_genre.nlargest(10, 'popularity')
fig = px.bar(top10_genres, x='genres', y=['valence', 'energy', 'danceability', 'acousticness'], barmode='group')
fig.show()

In [None]:
# The 10 genres with the lowest popularity, shown as grouped bars of the same four audio features
bottem10_genres = df_genre.nsmallest(10, 'popularity')
fig = px.bar(bottem10_genres, x='genres', y=['valence', 'energy', 'danceability', 'acousticness'], barmode='group')
fig.show()

##### In the top 10 genres by popularity, danceability dominates, whereas in the bottom 10, acousticness dominates.

### Grouping on the basis of Genre using k-means clustering technique

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Standardise the features, then group the genres into 10 k-means clusters.
# random_state pins the centroid initialisation so the cluster ids are the
# same on every run; n_init is stated explicitly because its default differs
# between scikit-learn releases.
cluster_pipeline_genre = Pipeline([
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=10, n_init=10, random_state=42)),
])

# Numeric genre features. A 'cluster' column left over from an earlier run of
# this cell is excluded so the previous labels never become an input feature.
X_genre = df_genre.select_dtypes(np.number).drop(columns=['cluster'], errors='ignore')

# Fit the pipeline and store each genre's cluster id on the genre table.
cluster_pipeline_genre.fit(X_genre)
df_genre['cluster'] = cluster_pipeline_genre.predict(X_genre)

In [None]:
from sklearn.manifold import TSNE

# Standardise the genre features and embed them in two dimensions with t-SNE.
# random_state makes the (otherwise stochastic) embedding reproducible.
tsne_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('tsne', TSNE(n_components=2, verbose=2, random_state=42)),
])

# 2-D coordinates, one row per row of X_genre.
genre_embedding = tsne_pipeline.fit_transform(X_genre)

# Wrap the coordinates in a frame that shares X_genre's index, so the
# index-aligned column assignments below attach each genre name and cluster
# id to its own embedding row.
projection_genre = pd.DataFrame(genre_embedding, columns=['x', 'y'], index=X_genre.index)
projection_genre['genres'] = df_genre['genres']
projection_genre['cluster'] = df_genre['cluster']

In [None]:
# Interactive 2-D scatter of the genre projection, coloured by cluster,
# with the coordinates and genre name shown on hover
import plotly
import plotly.express as px
fig = px.scatter(projection_genre, x='x', y='y', color='cluster', hover_data=['x', 'y','genres']) 
fig.show()

In [None]:
# Reduce the genre features to three principal components for a 3-D view
from sklearn.decomposition import PCA

# Standardise first, then project onto the first three principal components.
pca_pipeline_genre = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('PCA', PCA(n_components=3)),
])
genre_embedding = pca_pipeline_genre.fit_transform(X_genre)

# Frame with the three component scores as x / y / z, plus the genre name
# and cluster id copied over from df_genre.
projection_genre = pd.DataFrame(genre_embedding, columns=['x', 'y', 'z']).assign(
    genres=df_genre['genres'],
    cluster=df_genre['cluster'],
)

# Interactive 3-D scatter coloured by cluster.
fig = px.scatter_3d(
    projection_genre,
    x='x',
    y='y',
    z='z',
    color='cluster',
    hover_data=['x', 'y', 'z', 'genres'],
)
fig.show()

#### Grouping on the basis of Song using k-means clustering technique

In [None]:
# Standardise the song features, then group the songs into 20 k-means clusters.
# random_state pins the centroid initialisation so cluster ids are stable
# between runs; n_init is stated explicitly because its default differs
# between scikit-learn releases.
song_cluster_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=20, n_init=10, random_state=42, verbose=False)),
], verbose=False)

# Numeric song features. A 'cluster_label' column left over from an earlier
# run of this cell is excluded so old labels never become an input feature.
X = df_song.select_dtypes(np.number).drop(columns=['cluster_label'], errors='ignore')

# Fit the pipeline and store each song's cluster id on the song table.
number_cols = list(X.columns)
song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
df_song['cluster_label'] = song_cluster_labels

In [None]:
# Visualizing the Clusters with PCA
from sklearn.decomposition import PCA

# Standardise the song features and project them onto two principal components.
pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components=2))])
song_embedding = pca_pipeline.fit_transform(X)

# The embedding is a plain array whose rows follow X row-for-row. The frame
# must carry X's index: df_song's index has gaps after de-duplication, and the
# column assignments below align on index labels, so a default 0..n-1 index
# would attach titles/clusters to the wrong points (or leave them NaN).
projection = pd.DataFrame(song_embedding, columns=['x', 'y'], index=X.index)
projection['title'] = df_song['name']
projection['cluster'] = df_song['cluster_label']

# plotting the 2D graph
fig = px.scatter(projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'title'])
fig.show()

In [None]:
# Numeric song columns with the k-means label removed, so the label is not treated as a feature
X_temp = df_song.select_dtypes(np.number).drop(columns = ['cluster_label'])
X_temp.head()

In [None]:
# Standardise X_temp and fit a PCA with no limit on the number of components
scaler = StandardScaler()

scaler.fit(X_temp)
# X_temp is rebound to the scaler's output from here on
X_temp=scaler.transform(X_temp)    
pca = PCA()
# x_new holds the component scores of every row
x_new = pca.fit_transform(X_temp)

In [None]:
def myplot(score, coeff, labels=None, colors=None):
    """Draw a PCA biplot: scaled scores as points, feature loadings as arrows.

    Parameters
    ----------
    score : 2-D array
        Component scores; column 0 is plotted on x and column 1 on y, each
        divided by its own range (max - min) before plotting.
    coeff : 2-D array
        One row per feature; columns 0 and 1 are the arrow end points.
    labels : sequence of str, optional
        Text drawn at each arrow tip. Defaults to "Var1", "Var2", ...
    colors : array-like, optional
        Per-point colour values passed to ``plt.scatter``. Defaults to
        ``df_song['cluster_label']``, which is what the function used before
        this parameter existed.

    The axis limits, axis labels and grid are configured here as part of the
    plot, instead of by loose statements that had to run before the call.
    """
    if colors is None:
        colors = df_song['cluster_label']

    xs = score[:, 0]
    ys = score[:, 1]
    scalex = 1.0 / (xs.max() - xs.min())
    scaley = 1.0 / (ys.max() - ys.min())
    plt.scatter(xs * scalex, ys * scaley, c=colors)

    for i in range(coeff.shape[0]):
        tip_x, tip_y = coeff[i, 0], coeff[i, 1]
        plt.arrow(0, 0, tip_x, tip_y, color='r', alpha=0.5)
        tag = "Var" + str(i + 1) if labels is None else labels[i]
        # The label sits slightly beyond the arrow tip so it does not overlap it.
        plt.text(tip_x * 1.15, tip_y * 1.15, tag, color='b', ha='center', va='center')

    plt.xlim(-0.6, 0.6)
    plt.ylim(-0.75, 0.75)
    plt.xlabel("PC{}".format(1))
    plt.ylabel("PC{}".format(2))
    plt.grid()

# Call the function with only the first two principal components: the first
# two score columns, and the first two rows of pca.components_ transposed so
# that each row holds one feature's pair of loadings.
myplot(x_new[:,0:2],np.transpose(pca.components_[0:2, :]))
plt.show()


##### Looking at the biplot of the first 2 principal components along with the contributing variables.

In [None]:
# Share of the total variance explained by each principal component
pca.explained_variance_ratio_

In [None]:
# Absolute loadings: one row per principal component, one column per input feature
print(abs( pca.components_ ))

In [None]:
# Numeric columns of df_song, previewed for reference
var_imp = df_song.select_dtypes(np.number)
var_imp.head()

In [None]:
# Fit a 7-component PCA on X_temp and project the data onto it.
model = PCA(n_components=7).fit(X_temp)
X_pc = model.transform(X_temp)

# number of components
n_pcs = model.components_.shape[0]

# Index of the feature with the largest absolute loading on each component.
most_important = np.abs(model.components_).argmax(axis=1).tolist()

# Feature names are read from the frame instead of a hand-typed list, so they
# cannot drift out of step with the columns. This is the same selection used
# to build X_temp (numeric columns of df_song minus the k-means label).
initial_feature_names = list(
    df_song.select_dtypes(np.number).columns.drop('cluster_label', errors='ignore')
)
most_important_names = [initial_feature_names[idx] for idx in most_important]

# Components are numbered from 1, matching the PC1 / PC2 axis labels of the biplot.
dic = {'PC{}'.format(i + 1): name for i, name in enumerate(most_important_names)}

# Two-column table: component label, name of its dominant feature.
df_car_imp = pd.DataFrame(list(dic.items()))
df_car_imp

##### The table shows which music feature contributes the most to each of the principal components.

In [None]:
# #trying to show the clusters of songs in 3D
# import plotly.express as px

# pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components=3))])
# song_embedding = pca_pipeline.fit_transform(X)
# projection = pd.DataFrame(columns=['x', 'y','z'], data=song_embedding)
# projection['title'] = df_song['name']
# projection['cluster'] = df_song['cluster_label']

# fig = px.scatter_3d(projection, x='x', y='y',z='z', color='cluster', hover_data=['x', 'y','z', 'title'])
# fig.show()



#### Recommendation System using Cosine Similarity
###### Since we did not have user ratings or song ratings, we chose to calculate cosine similarity over a limited number of songs (the top 50,000 most popular songs) to demo our recommendation system.

In [None]:
# tracks holds the (at most) 50,000 most popular songs, taken from df_song
# sorted by descending popularity; it is the candidate pool for recommendations
from sklearn.metrics.pairwise import cosine_similarity
tracks = df_song.sort_values(by=['popularity'], ascending=False).head(50000)

In [None]:
# finding similarity of test song with all the songs including itself
def get_similarities(song_name, data):
    """Return the cosine similarity of every row of ``data`` to ``song_name``.

    Parameters
    ----------
    song_name : str
        Value to look up in ``data['name']``. If several rows share the name,
        the first one is used as the query vector.
    data : pandas.DataFrame
        Must contain a ``name`` column. All of its numeric columns, as
        selected by ``select_dtypes(include=np.number)``, form the feature
        vectors.

    Returns
    -------
    list of float
        One similarity per row of ``data``, in row order. A row whose feature
        vector has zero length gets 0.0. An empty ``data`` yields ``[]``.

    Raises
    ------
    ValueError
        If ``data`` is non-empty and no row is named ``song_name``.

    Each row is compared using its own feature vector. The earlier version
    looked the vector up again by name for every row, so rows sharing a name
    all received the first such row's similarity, and the repeated filtering
    made the whole computation quadratic in the number of rows.
    """
    if len(data) == 0:
        return []

    is_query = (data['name'] == song_name).to_numpy()
    if not is_query.any():
        raise ValueError('No song named {!r} in data'.format(song_name))

    features = data.select_dtypes(include=np.number).to_numpy(dtype=float)
    query = features[is_query.argmax()]  # argmax on booleans -> first matching row

    dots = features.dot(query)
    denom = np.linalg.norm(features, axis=1) * np.linalg.norm(query)

    # Divide only where the denominator is non-zero; zero vectors score 0.0.
    sims = np.zeros_like(dots)
    np.divide(dots, denom, out=sims, where=denom != 0)
    return sims.tolist()

In [None]:
# Attach the df_song cluster label to the test song and its recommendations
def compare_cluster(recommended_songs):
    """Return name, artists and df_song's cluster label for the given songs.

    ``recommended_songs`` is inner-joined to ``df_song`` on ``name`` and
    ``artists``. The selected ``cluster_label_y`` column is the name pandas
    gives to the right-hand (``df_song``) ``cluster_label`` when both frames
    carry a column of that name, so ``recommended_songs`` is expected to have
    a ``cluster_label`` column as well.
    """
    wanted = ['name', 'artists', 'cluster_label_y']
    joined = pd.merge(recommended_songs, df_song, how='inner', on=['name', 'artists'])
    return joined.loc[:, wanted]

In [None]:
def recommend_songs(song_name, data=tracks):
    """Recommend the songs in ``data`` most similar to ``song_name``.

    Parameters
    ----------
    song_name : str
        Name of the song the user is currently listening to.
    data : pandas.DataFrame, optional
        Candidate pool; defaults to ``tracks``. It is not modified.

    Returns
    -------
    pandas.DataFrame or None
        The result of ``compare_cluster`` for the 11 rows of ``data`` ranked
        highest by similarity and then popularity. The query song normally
        ranks first, since its similarity to itself is the highest. If
        ``song_name`` is not in ``data``, up to five random song names are
        printed and ``None`` is returned.
    """
    # Base case: the song is not in the candidate pool. The check runs against
    # `data`, the frame that is actually searched, not the global `tracks`.
    if not (data['name'] == song_name).any():
        print('This song is either not so popular or you have entered invalid_name.\n Some songs you may like:\n')
        # Suggest random songs; capped so a pool of fewer than 5 rows works too.
        for song in data.sample(n=min(5, len(data)))['name'].values:
            print(song)
        return None

    # Work on a new frame so the shared default `tracks` is never mutated or
    # re-sorted. A 'similarity_factor' column from an earlier call is dropped
    # first: it is numeric and would otherwise be used as a feature.
    ranked = data.drop(columns=['similarity_factor'], errors='ignore')
    ranked['similarity_factor'] = get_similarities(song_name, ranked)

    # Most similar first; popularity breaks ties.
    ranked = ranked.sort_values(by=['similarity_factor', 'popularity'],
                                ascending=[False, False])

    # Top 11 rows = the query song plus ten recommendations, returned together
    # with the k-means cluster label each of them has in df_song.
    return compare_cluster(ranked.head(11))

In [None]:
# Print the names of five randomly sampled songs from tracks
print(tracks.sample(n=5)['name'].values)

In [None]:
# Assume the user is currently listening to "Lemon": fetch the most similar
# songs (up to 11 rows, the query song included) with their cluster labels
reco_check = recommend_songs('Lemon')
reco_check

#### Test song 'Lemon' by N.E.R.D. -> 7 out of 10 recommended songs belong to the same cluster (2) as the test song
#### Test song 'Red Eye' by YoungBoy Never Broke Again -> 7 out of 10 recommended songs belong to the same cluster (3) as the test song
