## Author: Technocolabs Sofwares
## Credit: Mahmoud Yaser Salman
## Programmed: Spotify Recommendation System Project 

<hr>

<center> <h1> Feature Engineering and Content Based Recommendation </h1> </center>

<hr>

<hr>

## Import Packages

In [1]:
import pandas as pd
# Import Pandas to use DataFrames
import swifter
# To efficiently apply any function to a Pandas Data Frame or Series object in the quickest available method
import numpy as np
# Import Numpy which will hel in mathematical calculations 
import matplotlib.pyplot as plt
import seaborn as sns
# To Visualize Data
import os
# To Access system files/options
import glob
# To return all file paths that match a specific pattern
from tqdm import tqdm
# To Display progress when using loops


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# For creating vectors from text and determining similarity



pd.set_option('display.max_columns', None)
pd.set_option("display.precision", 2)
# Tune some options in pandas 


%matplotlib inline
# To make any plot appear in jupyter notebook

## Import our dataset

In [2]:
DATASETS_PATH = 'A:\Spotify Data\JsonData' # change to the path of your csv file
df_main = pd.read_csv(os.path.join(DATASETS_PATH, 'feature_engineering_dataset.csv'))

In [3]:
df_main.head()

Unnamed: 0,pid,name,description,modified_at,num_artists,num_albums,num_tracks,num_followers,num_edits,playlist_duration_ms,collaborative,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,uri,track_href,analysis_url,time_signature,playlist_followers_artist,playlist_followers_album,playlist_followers_track
0,0,Throwbacks,,1493424000,37,47,52,1,6,11532414,0,0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,0.9,0.81,4,-7.11,0,0.12,0.03,0.00697,0.05,0.81,125.46,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,4,126,22,21
1,123,w o r k o u t,,1500681600,171,225,245,1,26,55676545,0,73,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,0.9,0.81,4,-7.11,0,0.12,0.03,0.00697,0.05,0.81,125.46,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,4,126,22,21
2,218,party playlist,,1504310400,74,93,98,1,4,21299385,0,14,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,0.9,0.81,4,-7.11,0,0.12,0.03,0.00697,0.05,0.81,125.46,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,4,126,22,21
3,342,Dance mix,,1473724800,111,127,167,1,15,43330767,0,42,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,0.9,0.81,4,-7.11,0,0.12,0.03,0.00697,0.05,0.81,125.46,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,4,126,22,21
4,382,spin,,1505347200,13,14,14,2,5,3617477,0,1,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,0.9,0.81,4,-7.11,0,0.12,0.03,0.00697,0.05,0.81,125.46,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,4,126,22,21


# Create New Features by Features Interaction

**I conducted sentiment analysis finding the polarity and subjectivity of the track name.**

- Subjectivity (0,1): The amount of personal opinion and factual information contained in the text.
- Polarity (-1,1): The degree of strong or clearly defined sentiment accounting for negation.
    
"The goal of the sentiment analysis is to extract additional features from the tracks.
By doing so, we can extract sentiment data other audio features via textual information.
For example, if the general mood of the song titles of a playlist is positive,
then this can be utilized to recommend positive songs. However, due to the short length of the titles,
the two metrics cannot produce optimal results."

In [4]:
df_add = df_main.copy()

## One Hot Encoding Manually  

In [5]:
happy_yes = df_add['valence'].copy()
happy_yes[df_add['valence']>=0.5] = 1
happy_yes[df_add['valence']<0.5] = 0
happy_yes.name = "happy_yes"
happy_yes

0         1.0
1         1.0
2         1.0
3         1.0
4         1.0
         ... 
266358    1.0
266359    1.0
266360    1.0
266361    1.0
266362    0.0
Name: happy_yes, Length: 266363, dtype: float64

In [6]:
happy_no = df_add['valence'].copy()
happy_no[df_add['valence'] >= 0.5] = 0
happy_no[df_add['valence'] < 0.5] = 1
happy_no.name = "happy_no"
happy_no

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
266358    0.0
266359    0.0
266360    0.0
266361    0.0
266362    1.0
Name: happy_no, Length: 266363, dtype: float64

In [7]:
speech_yes = df_add['speechiness'].copy()
speech_yes.name = "speech_yes"
speech_yes[(df_add['speechiness']>=0.66)] = 1
speech_yes[(df_add['speechiness']<0.66)] = 0
speech_yes

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
266358    0.0
266359    0.0
266360    0.0
266361    0.0
266362    0.0
Name: speech_yes, Length: 266363, dtype: float64

In [8]:
speech_no = df_add['speechiness'].copy()
speech_no.name = "speech_no"
speech_no[(df_add['speechiness']>=0.66)] = 0
speech_no[(df_add['speechiness']<0.66)] = 1
speech_no

0         1.0
1         1.0
2         1.0
3         1.0
4         1.0
         ... 
266358    1.0
266359    1.0
266360    1.0
266361    1.0
266362    1.0
Name: speech_no, Length: 266363, dtype: float64

In [9]:
collaborative_yes = df_add['collaborative'].copy()
collaborative_yes.name = "collaborative_yes"
collaborative_yes[(df_add['collaborative'] == 1)] = 1
collaborative_yes[(df_add['collaborative'] == 0)] = 0
collaborative_yes

0         0
1         0
2         0
3         0
4         0
         ..
266358    0
266359    0
266360    0
266361    0
266362    0
Name: collaborative_yes, Length: 266363, dtype: int64

In [10]:
collaborative_no = df_add['collaborative'].copy()
collaborative_no.name = "collaborative_no"
collaborative_no[(df_add['collaborative'] == 1)] = 0
collaborative_no[(df_add['collaborative'] == 0)] = 1
collaborative_no

0         1
1         1
2         1
3         1
4         1
         ..
266358    1
266359    1
266360    1
266361    1
266362    1
Name: collaborative_no, Length: 266363, dtype: int64

In [11]:
mode_yes = df_add['mode'].copy()
mode_yes.name = "mode_yes"
mode_yes[(df_add['mode']>=0.66)] = 1
mode_yes[(df_add['mode']<0.66)] = 0
mode_yes

0         0
1         0
2         0
3         0
4         0
         ..
266358    1
266359    1
266360    1
266361    1
266362    1
Name: mode_yes, Length: 266363, dtype: int64

In [12]:
mode_no = df_add['mode'].copy()
mode_no.name = "mode_no"
mode_no[(df_add['mode'] == 1)] = 0
mode_no[(df_add['mode'] == 0)] = 1
mode_no

0         1
1         1
2         1
3         1
4         1
         ..
266358    0
266359    0
266360    0
266361    0
266362    0
Name: mode_no, Length: 266363, dtype: int64

In [13]:
# We have to drop original boolean columns
df_add = df_add.drop(columns = ["collaborative", "mode"], axis = 1)
# Add all the newly created columns by ONE HOT ENCODING technique to our dataframe
df_add = pd.concat([df_add, mode_no, mode_yes, collaborative_no, collaborative_yes, speech_no, speech_yes, happy_no, happy_yes], axis=1)
df_add

Unnamed: 0,pid,name,description,modified_at,num_artists,num_albums,num_tracks,num_followers,num_edits,playlist_duration_ms,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,uri,track_href,analysis_url,time_signature,playlist_followers_artist,playlist_followers_album,playlist_followers_track,mode_no,mode_yes,collaborative_no,collaborative_yes,speech_no,speech_yes,happy_no,happy_yes
0,0,Throwbacks,,1493424000,37,47,52,1,6,11532414,0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,0.90,0.81,4,-7.11,0.12,3.11e-02,6.97e-03,0.05,0.81,125.46,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,4,126,22,21,1,0,1,0,1.0,0.0,0.0,1.0
1,123,w o r k o u t,,1500681600,171,225,245,1,26,55676545,73,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,0.90,0.81,4,-7.11,0.12,3.11e-02,6.97e-03,0.05,0.81,125.46,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,4,126,22,21,1,0,1,0,1.0,0.0,0.0,1.0
2,218,party playlist,,1504310400,74,93,98,1,4,21299385,14,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,0.90,0.81,4,-7.11,0.12,3.11e-02,6.97e-03,0.05,0.81,125.46,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,4,126,22,21,1,0,1,0,1.0,0.0,0.0,1.0
3,342,Dance mix,,1473724800,111,127,167,1,15,43330767,42,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,0.90,0.81,4,-7.11,0.12,3.11e-02,6.97e-03,0.05,0.81,125.46,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,4,126,22,21,1,0,1,0,1.0,0.0,0.0,1.0
4,382,spin,,1505347200,13,14,14,2,5,3617477,1,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,0.90,0.81,4,-7.11,0.12,3.11e-02,6.97e-03,0.05,0.81,125.46,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,4,126,22,21,1,0,1,0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266358,3999,other,,1420848000,17,56,97,1,9,22548683,28,Jason Mraz,spotify:track:3ce7k1L4EkZppZPz1EJWTS,spotify:artist:4phGZZrJZRo4ElhRtViYdl,Living In The Moment,spotify:album:64SChsjQBiEe3aVTFYnW1b,235080,Love Is A Four Letter Word,0.65,0.63,7,-7.16,0.02,4.83e-02,0.00e+00,0.12,0.70,84.14,3ce7k1L4EkZppZPz1EJWTS,spotify:track:3ce7k1L4EkZppZPz1EJWTS,https://api.spotify.com/v1/tracks/3ce7k1L4EkZp...,https://api.spotify.com/v1/audio-analysis/3ce7...,4,248,79,1,0,1,1,0,1.0,0.0,0.0,1.0
266359,3999,other,,1420848000,17,56,97,1,9,22548683,36,The Cure,spotify:track:6DW7MN6DCVvTDjtcL5IM9a,spotify:artist:7bu3H8JO7d0UbMoVzbo70s,Pictures Of You - Remix,spotify:album:3jRLyc2t1tExfVpdB88EUm,288106,Galore - The Singles 1987-1997,0.46,0.74,9,-6.10,0.03,4.48e-03,1.02e-02,0.12,0.63,172.72,6DW7MN6DCVvTDjtcL5IM9a,spotify:track:6DW7MN6DCVvTDjtcL5IM9a,https://api.spotify.com/v1/tracks/6DW7MN6DCVvT...,https://api.spotify.com/v1/audio-analysis/6DW7...,4,79,1,1,0,1,1,0,1.0,0.0,0.0,1.0
266360,3999,other,,1420848000,17,56,97,1,9,22548683,37,The Cure,spotify:track:2JtJruenT9hgZE4cnMtBnt,spotify:artist:7bu3H8JO7d0UbMoVzbo70s,Close To Me - Closest Mix,spotify:album:7atCFN41kB5N0zivS2Pwtd,261560,Mixed Up,0.71,0.68,11,-8.75,0.05,5.24e-02,8.97e-02,0.06,0.78,92.40,2JtJruenT9hgZE4cnMtBnt,spotify:track:2JtJruenT9hgZE4cnMtBnt,https://api.spotify.com/v1/tracks/2JtJruenT9hg...,https://api.spotify.com/v1/audio-analysis/2JtJ...,4,79,1,1,0,1,1,0,1.0,0.0,0.0,1.0
266361,3999,other,,1420848000,17,56,97,1,9,22548683,39,The Cure,spotify:track:5qgBvugGSAnKWCpWAiwusi,spotify:artist:7bu3H8JO7d0UbMoVzbo70s,Lullaby,spotify:album:34ozkv3AkFksZD8srOmOrX,247613,Greatest Hits,0.72,0.69,9,-7.42,0.04,1.51e-01,3.69e-01,0.08,0.60,94.02,5qgBvugGSAnKWCpWAiwusi,spotify:track:5qgBvugGSAnKWCpWAiwusi,https://api.spotify.com/v1/tracks/5qgBvugGSAnK...,https://api.spotify.com/v1/audio-analysis/5qgB...,4,79,568,12,0,1,1,0,1.0,0.0,0.0,1.0


In [14]:
df_add.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266363 entries, 0 to 266362
Data columns (total 44 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   pid                        266363 non-null  int64  
 1   name                       266363 non-null  object 
 2   description                5333 non-null    object 
 3   modified_at                266363 non-null  int64  
 4   num_artists                266363 non-null  int64  
 5   num_albums                 266363 non-null  int64  
 6   num_tracks                 266363 non-null  int64  
 7   num_followers              266363 non-null  int64  
 8   num_edits                  266363 non-null  int64  
 9   playlist_duration_ms       266363 non-null  int64  
 10  pos                        266363 non-null  int64  
 11  artist_name                266363 non-null  object 
 12  track_uri                  266363 non-null  object 
 13  artist_uri                 26

In [15]:
playlist = df_add[["pid", "name", "description", "playlist_duration_ms", "num_edits", "num_followers", "num_tracks", "num_albums", "num_artists", "modified_at", "collaborative_yes", "collaborative_no"]]
playlist = playlist.drop(playlist[playlist.duplicated()].index)

# Reset index for DataFrame
playlist = playlist.reset_index(drop=True)
playlist

Unnamed: 0,pid,name,description,playlist_duration_ms,num_edits,num_followers,num_tracks,num_albums,num_artists,modified_at,collaborative_yes,collaborative_no
0,0,Throwbacks,,11532414,6,1,52,47,37,1493424000,0,1
1,123,w o r k o u t,,55676545,26,1,245,225,171,1500681600,0,1
2,218,party playlist,,21299385,4,1,98,93,74,1504310400,0,1
3,342,Dance mix,,43330767,15,1,167,127,111,1473724800,0,1
4,382,spin,,3617477,5,2,14,14,13,1505347200,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
3995,3239,potential,,5619189,12,1,16,16,14,1365120000,0,1
3996,3315,Jethro Tull,,12438266,6,1,35,15,5,1385683200,0,1
3997,3347,undertale,,1341597,2,1,6,6,5,1503360000,0,1
3998,3442,Dark Roast Playlist,,4480169,2,1,20,20,20,1413763200,0,1


In [16]:
track = df_add[["track_name","artist_name", "album_name", "duration_ms", "danceability", "energy", "key", "loudness", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo", "mode_no", "mode_yes", "speech_no", "speech_yes", "happy_no", "happy_yes"]]
track["popularity"] = (0.2*df_add["playlist_followers_track"])+(2*df_add["playlist_followers_artist"])+(1.5*df_add["playlist_followers_album"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  track["popularity"] = (0.2*df_add["playlist_followers_track"])+(2*df_add["playlist_followers_artist"])+(1.5*df_add["playlist_followers_album"])


In [17]:
# Drop duplicates
track = track.drop(track[track.duplicated()].index)

# Reset index for DataFrame
track = track.reset_index(drop=True)

# Display first 5 rows
track.head()

Unnamed: 0,track_name,artist_name,album_name,duration_ms,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,mode_no,mode_yes,speech_no,speech_yes,happy_no,happy_yes,popularity
0,Lose Control (feat. Ciara & Fat Man Scoop),Missy Elliott,The Cookbook,226863,0.9,0.81,4,-7.11,0.12,0.0311,0.00697,0.05,0.81,125.46,1,0,1.0,0.0,0.0,1.0,289.2
1,Toxic,Britney Spears,In The Zone,198800,0.77,0.84,5,-3.91,0.11,0.0249,0.025,0.24,0.92,143.04,1,0,1.0,0.0,0.0,1.0,617.5
2,Crazy In Love,Beyoncé,Dangerously In Love (Alben für die Ewigkeit),235933,0.66,0.76,2,-6.58,0.21,0.00238,0.0,0.06,0.7,99.26,1,0,1.0,0.0,0.0,1.0,2102.5
3,Rock Your Body,Justin Timberlake,Justified,267266,0.89,0.71,4,-6.05,0.14,0.201,0.000234,0.05,0.82,100.97,1,0,1.0,0.0,0.0,1.0,945.3
4,It Wasn't Me,Shaggy,Hot Shot,227600,0.85,0.61,0,-4.6,0.07,0.0561,0.0,0.31,0.65,94.76,0,1,1.0,0.0,0.0,1.0,544.7


In [18]:
track.shape[0]

93123

In [19]:
features = [track.columns[0], track.columns[1], track.columns[2]]
temp = track.copy()
for feature in tqdm(features):
    for i in range(track.shape[0]):
        temp[feature].iloc[i] = str.lower((track[feature].iloc[i]).replace(" ",""))

track["metadata"] = temp[temp.columns[0]] + " " + temp[temp.columns[1]] + " " + temp[temp.columns[2]]
track

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [03:16<00:00, 65.37s/it]


Unnamed: 0,track_name,artist_name,album_name,duration_ms,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,mode_no,mode_yes,speech_no,speech_yes,happy_no,happy_yes,popularity,metadata
0,Lose Control (feat. Ciara & Fat Man Scoop),Missy Elliott,The Cookbook,226863,0.90,0.81,4,-7.11,0.12,3.11e-02,6.97e-03,0.05,0.81,125.46,1,0,1.0,0.0,0.0,1.0,289.2,losecontrol(feat.ciara&fatmanscoop) missyellio...
1,Toxic,Britney Spears,In The Zone,198800,0.77,0.84,5,-3.91,0.11,2.49e-02,2.50e-02,0.24,0.92,143.04,1,0,1.0,0.0,0.0,1.0,617.5,toxic britneyspears inthezone
2,Crazy In Love,Beyoncé,Dangerously In Love (Alben für die Ewigkeit),235933,0.66,0.76,2,-6.58,0.21,2.38e-03,0.00e+00,0.06,0.70,99.26,1,0,1.0,0.0,0.0,1.0,2102.5,crazyinlove beyoncé dangerouslyinlove(albenfür...
3,Rock Your Body,Justin Timberlake,Justified,267266,0.89,0.71,4,-6.05,0.14,2.01e-01,2.34e-04,0.05,0.82,100.97,1,0,1.0,0.0,0.0,1.0,945.3,rockyourbody justintimberlake justified
4,It Wasn't Me,Shaggy,Hot Shot,227600,0.85,0.61,0,-4.60,0.07,5.61e-02,0.00e+00,0.31,0.65,94.76,0,1,1.0,0.0,0.0,1.0,544.7,itwasn'tme shaggy hotshot
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93118,Living In The Moment,Jason Mraz,Love Is A Four Letter Word,235080,0.65,0.63,7,-7.16,0.02,4.83e-02,0.00e+00,0.12,0.70,84.14,0,1,1.0,0.0,0.0,1.0,614.7,livinginthemoment jasonmraz loveisafourletterword
93119,Pictures Of You - Remix,The Cure,Galore - The Singles 1987-1997,288106,0.46,0.74,9,-6.10,0.03,4.48e-03,1.02e-02,0.12,0.63,172.72,0,1,1.0,0.0,0.0,1.0,159.7,picturesofyou-remix thecure galore-thesingles1...
93120,Close To Me - Closest Mix,The Cure,Mixed Up,261560,0.71,0.68,11,-8.75,0.05,5.24e-02,8.97e-02,0.06,0.78,92.40,0,1,1.0,0.0,0.0,1.0,159.7,closetome-closestmix thecure mixedup
93121,Lullaby,The Cure,Greatest Hits,247613,0.72,0.69,9,-7.42,0.04,1.51e-01,3.69e-01,0.08,0.60,94.02,0,1,1.0,0.0,0.0,1.0,1012.4,lullaby thecure greatesthits


In [20]:
# Create CountVectorizer object to transform text into vector
track_vectorizer = CountVectorizer()

# Fit the vectorizer on "metadata" field of song_library DataFrame
track_vectorizer.fit(track['metadata'])

In [21]:
# We selected the most popular 5000 songs only to make our algo work faster
tracks = track.sort_values("popularity", ascending = False)[:5000]
tracks

Unnamed: 0,track_name,artist_name,album_name,duration_ms,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,mode_no,mode_yes,speech_no,speech_yes,happy_no,happy_yes,popularity,metadata
3713,One Dance,Drake,Views,173986,0.79,0.62,1,-5.89,0.05,7.84e-03,4.23e-03,0.35,0.37,103.99,0,1,1.0,0.0,1.0,0.0,8160.5,onedance drake views
6401,Hotline Bling,Drake,Views,267066,0.90,0.62,2,-8.09,0.06,3.47e-03,1.19e-04,0.05,0.54,134.96,0,1,1.0,0.0,0.0,1.0,8149.3,hotlinebling drake views
8343,With You,Drake,Views,195053,0.88,0.41,1,-9.96,0.25,1.34e-01,2.34e-04,0.11,0.78,128.01,0,1,1.0,0.0,0.0,1.0,8143.9,withyou drake views
3524,Too Good,Drake,Views,263373,0.80,0.65,7,-7.80,0.12,5.73e-02,3.49e-05,0.10,0.39,117.98,0,1,1.0,0.0,1.0,0.0,8143.1,toogood drake views
2080,Controlla,Drake,Views,245226,0.61,0.48,10,-11.08,0.25,7.73e-02,0.00e+00,0.11,0.35,122.98,1,0,1.0,0.0,1.0,0.0,8142.1,controlla drake views
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43121,Brooklyn Baby,Lana Del Rey,Ultraviolence,351640,0.40,0.66,1,-6.70,0.04,5.42e-01,4.02e-03,0.11,0.09,107.75,0,1,1.0,0.0,1.0,0.0,1016.3,brooklynbaby lanadelrey ultraviolence
40381,Florida Kilos,Lana Del Rey,Ultraviolence,256040,0.27,0.51,0,-7.85,0.04,2.80e-01,5.98e-05,0.11,0.58,200.78,0,1,1.0,0.0,0.0,1.0,1016.3,floridakilos lanadelrey ultraviolence
2434,Boys 'Round Here (feat. Pistol Annies & Friends),Blake Shelton,Based on a True Story...,288760,0.61,0.69,2,-6.37,0.05,2.51e-01,2.35e-06,0.25,0.65,169.90,0,1,1.0,0.0,0.0,1.0,1016.2,boys'roundhere(feat.pistolannies&friends) blak...
36415,Goodbye,Miley Cyrus,Breakout,230826,0.36,0.70,6,-4.75,0.03,5.60e-03,9.01e-05,0.16,0.18,174.91,0,1,1.0,0.0,1.0,0.0,1016.2,goodbye mileycyrus breakout


In [22]:
tracks.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration_ms,5000.0,232000.0,64930.11,4520.0,198990.0,227211.0,260000.0,883586.0
danceability,5000.0,0.607,0.15,0.0,0.51,0.61,0.716,0.97
energy,5000.0,0.654,0.2,0.0,0.53,0.68,0.809,1.0
key,5000.0,5.18,3.59,0.0,2.0,5.0,8.0,11.0
loudness,5000.0,-6.75,3.21,-60.0,-8.11,-6.23,-4.74,0.49
speechiness,5000.0,0.121,0.13,0.0,0.04,0.06,0.164,0.95
acousticness,5000.0,0.22,0.26,0.0,0.02,0.1,0.333,0.99
instrumentalness,5000.0,0.0186,0.1,0.0,0.0,0.0,8.94e-05,0.99
liveness,5000.0,0.218,0.19,0.0,0.1,0.14,0.284,0.99
valence,5000.0,0.468,0.22,0.0,0.29,0.46,0.641,0.97


In [23]:
# Function to recommend more songs based on given song name
def song_recommender(song_name, n = 5):
    try:
        # Numeric columns (audio features) in track DataFrame
        num_cols = ['duration_ms', 'danceability', 'energy', 'key','loudness','speechiness','acousticness','instrumentalness','liveness', 'valence', 'tempo', 'mode_no', 'mode_yes', 'speech_no', 'speech_yes', 'happy_no', 'happy_yes', 'popularity']

        # Create vector from "metadata" field (text data) for given song
        text_vec1 = track_vectorizer.transform(tracks[tracks['track_name']==str(song_name)]['metadata']).toarray()

        # Create vector from numerical columns for given song
        num_vec1 = tracks[tracks['track_name']==str(song_name)][num_cols].to_numpy()

        # Initialise empty list to store similarity scores
        sim_scores=  []

        # For every song/track in song library, determine cosine similarity with given song
        for index, row in tqdm(tracks.iterrows()):
            name = row['track_name']

            # Create vector from "metadata" field for other songs
            text_vec2 = track_vectorizer.transform(tracks[tracks['track_name']==name]['metadata']).toarray()

            # Create vector from numerical columns for other songs
            num_vec2 = tracks[tracks['track_name']==name][num_cols].to_numpy()

            # Calculate cosine similarity using text vectors
            text_sim = cosine_similarity(text_vec1, text_vec2)[0][0]

            # Calculate cosine similarity using numerical vectors
            num_sim = cosine_similarity(num_vec1, num_vec2)[0][0]

            # Take average of both similarity scores and add to list of similarity scores
            sim = (text_sim + num_sim)/2
            sim_scores.append(sim)
        
        # Add new column containing similarity scores to song_library DataFrame
        tracks['similarity'] = sim_scores

        # Sort DataFrame based on "similarity" column
        tracks.sort_values(by=['similarity', 'popularity'], ascending=[False, False], inplace=True)

        # Create DataFrame "recommended_songs" containing 5 songs that are most similar to the given song and return this DataFrame
        recommended_songs = tracks[['track_name', 'artist_name', 'album_name']][2:(2+n)]
        return recommended_songs
    except:
        # If given song is not found in song library then display message
        print('{} not found in songs library.'.format(song_name))

In [24]:
tracks.head()

Unnamed: 0,track_name,artist_name,album_name,duration_ms,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,mode_no,mode_yes,speech_no,speech_yes,happy_no,happy_yes,popularity,metadata
3713,One Dance,Drake,Views,173986,0.79,0.62,1,-5.89,0.05,0.00784,0.00423,0.35,0.37,103.99,0,1,1.0,0.0,1.0,0.0,8160.5,onedance drake views
6401,Hotline Bling,Drake,Views,267066,0.9,0.62,2,-8.09,0.06,0.00347,0.000119,0.05,0.54,134.96,0,1,1.0,0.0,0.0,1.0,8149.3,hotlinebling drake views
8343,With You,Drake,Views,195053,0.88,0.41,1,-9.96,0.25,0.134,0.000234,0.11,0.78,128.01,0,1,1.0,0.0,0.0,1.0,8143.9,withyou drake views
3524,Too Good,Drake,Views,263373,0.8,0.65,7,-7.8,0.12,0.0573,3.49e-05,0.1,0.39,117.98,0,1,1.0,0.0,1.0,0.0,8143.1,toogood drake views
2080,Controlla,Drake,Views,245226,0.61,0.48,10,-11.08,0.25,0.0773,0.0,0.11,0.35,122.98,1,0,1.0,0.0,1.0,0.0,8142.1,controlla drake views


In [25]:
song_recommender("Goodbye", 10)

5000it [00:37, 132.90it/s]


Unnamed: 0,track_name,artist_name,album_name
27005,This Plane,Wiz Khalifa,Deal Or No Deal
80394,Chewy,Wiz Khalifa,Deal Or No Deal
4849,Studio Lovin’,Wiz Khalifa,Deal Or No Deal
50221,Up In It,Wiz Khalifa,O.N.I.F.C.
61951,Paperbond,Wiz Khalifa,O.N.I.F.C.
88104,Fall Asleep,Wiz Khalifa,O.N.I.F.C.
50238,Time,Wiz Khalifa,O.N.I.F.C.
57831,Time,blink-182,Buddha
5533,We Dem Boyz,Wiz Khalifa,Blacc Hollywood
34356,Cabana,Wiz Khalifa,Live In Concert EP


## Export our Content Based Recommendation Dataset

In [26]:
# This csv file contains the all the tracks which we will use in our content based recommendation system
track.to_csv(os.path.join(DATASETS_PATH, 'content_recommend_dataset.csv'), index=False)