# Song Recommender Program Using Unsupervised Learning through the K-Nearest Neighbors Algorithm

First, run these commands in the terminal to install the needed packages:

pip install pandas

pip install scikit-learn

In [1]:
# import pandas
import pandas as pd
 
# scikit-learn imports
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

In general, our steps that we'll take are as follows:

1. Read in the dataset and convert it to a pandas dataframe.

2. Explore the dataset by looking at its dimensions, what columns it has, etc.

3. Clean the dataset by getting rid of any duplicates, missing values, and unneeded columns.

4. Convert categorical data into numerical data that can be used for our algorithm.

5. Scale the data to minimize biases.

6. Create the KNN model and ask the user for an input track.

In [2]:
# read in the csv file and create a dataframe
tracks = pd.read_csv('dataset.csv')
pd.set_option('display.max_columns', None)

In [3]:
# print out the first 5 rows
tracks.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [4]:
# print out dimensions of df (rows, columns)
tracks.shape

(114000, 21)

In [5]:
# print out the columns to see what attributes each track has
tracks.columns

Index(['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name',
       'popularity', 'duration_ms', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'track_genre'],
      dtype='object')

As you may have noticed, the quality of our machine learning model is heavily reliant on our dataset. Therefore, we need to make sure that our dataset is clean in order to produce high-quality results. This includes dealing with missing/NULL values, duplicate data, and irrelevant data in our dataset. 

In [6]:
# print out if we have any duplicates in our dataset. Looks like there's not!
print(tracks.duplicated())

# see if there's missing data in any of the columns
print(tracks.info())

0         False
1         False
2         False
3         False
4         False
          ...  
113995    False
113996    False
113997    False
113998    False
113999    False
Length: 114000, dtype: bool
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        114000 non-null  int64  
 6   duration_ms       114000 non-null  int64  
 7   explicit          114000 non-null  bool   
 8   danceability      114000 non-null  float64
 9   energy            114000 non-null  float64
 10  key               114000 non-null  int64  
 11  loudness          114000 non-null  float64
 12  mode    

In [7]:
# filter the columns into categorical and numerical data

# categorical columns
cat_col = [col for col in tracks.columns if tracks[col].dtype == 'object']
print('Categorical columns :',cat_col)
# numerical columns
num_col = [col for col in tracks.columns if tracks[col].dtype != 'object']
print('Numerical columns :',num_col)



Categorical columns : ['track_id', 'artists', 'album_name', 'track_name', 'track_genre']
Numerical columns : ['Unnamed: 0', 'popularity', 'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']


In [8]:
# we see that there are some rows with the same track ID. This means that we have duplicate songs!
print(tracks[cat_col].nunique())

# TODO: get rid of the duplicates and make sure that there are none left

# unique_tracks = tracks.drop_duplicates().reset_index(drop=True)
# duplicates = unique_tracks['track_id'].duplicated()
print("Rows with duplicates in 'track_id' column:")
print((unique_tracks[duplicates])['track_name'])

track_id       89741
artists        31437
album_name     46589
track_name     73608
track_genre      114
dtype: int64
Rows with duplicates in 'track_id' column:
1925                   Song for Rollins
2155                      Snow (Hey Oh)
3000                       Daddy Issues
3002                           Softcore
3003                    Sweater Weather
                      ...              
113572         Jesus We Love You - Live
113605    In The Ordinary - Spontaneous
113617                 King of My Heart
113619                           Simple
113641                           Closer
Name: track_name, Length: 24259, dtype: object


We still have some duplicates because some songs are remixes, so they have different track IDs. Thus, it makes it harder to find these remixes and get rid of them. You could probably utilize regex's to do this, but for the purposes of this workshop, we'll just leave in the remixes. They shouldn't affect the model that much since there are over 100,000 songs in the dataset.

In [9]:
# remove a feature that we aren't interested in
unique_tracks = unique_tracks.drop(['track_id'], axis = 1)

In order for our machine learning algorithm to work, we'll need each categorical variable to be converted into real numbers. We can do this by using label encoding, which assigns each unique genre a number.

In [10]:
# TODO: print the unique genres

# unique_genres = unique_tracks["track_genre"].unique()
# print(unique_genres)

# fit and transform the 'track_genre' column to numerical values
label_encoder = LabelEncoder()
unique_tracks['track_genre'] = label_encoder.fit_transform(unique_tracks['track_genre'])

['acoustic' 'afrobeat' 'alt-rock' 'alternative' 'ambient' 'anime'
 'black-metal' 'bluegrass' 'blues' 'brazil' 'breakbeat' 'british'
 'cantopop' 'chicago-house' 'children' 'chill' 'classical' 'club' 'comedy'
 'country' 'dance' 'dancehall' 'death-metal' 'deep-house' 'detroit-techno'
 'disco' 'disney' 'drum-and-bass' 'dub' 'dubstep' 'edm' 'electro'
 'electronic' 'emo' 'folk' 'forro' 'french' 'funk' 'garage' 'german'
 'gospel' 'goth' 'grindcore' 'groove' 'grunge' 'guitar' 'happy'
 'hard-rock' 'hardcore' 'hardstyle' 'heavy-metal' 'hip-hop' 'honky-tonk'
 'house' 'idm' 'indian' 'indie-pop' 'indie' 'industrial' 'iranian'
 'j-dance' 'j-idol' 'j-pop' 'j-rock' 'jazz' 'k-pop' 'kids' 'latin'
 'latino' 'malay' 'mandopop' 'metal' 'metalcore' 'minimal-techno' 'mpb'
 'new-age' 'opera' 'pagode' 'party' 'piano' 'pop-film' 'pop' 'power-pop'
 'progressive-house' 'psych-rock' 'punk-rock' 'punk' 'r-n-b' 'reggae'
 'reggaeton' 'rock-n-roll' 'rock' 'rockabilly' 'romance' 'sad' 'salsa'
 'samba' 'sertanejo' 'show

In [11]:
# show the dataframe with the encoded genre
print(unique_tracks["track_genre"].unique())
non_unique_rows = unique_tracks[unique_tracks.duplicated(keep=False)]
print(non_unique_rows.head())

[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  57  56  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  81  80  82  83  84  86  85  87  88  89
  91  90  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113]
Empty DataFrame
Columns: [Unnamed: 0, artists, album_name, track_name, popularity, duration_ms, explicit, danceability, energy, key, loudness, mode, speechiness, acousticness, instrumentalness, liveness, valence, tempo, time_signature, track_genre]
Index: []


In [12]:
# let's also get rid of songs with duplicate track names

# TODO: drop rows that have duplicate track names

# unique_tracks = unique_tracks.drop_duplicates(subset=['track_name'], keep='first').reset_index(drop=True)
# non_unique_rows = unique_tracks[unique_tracks.duplicated(keep=False)]

print(len(non_unique_rows))

0


As before with the 'track_genre' feature, we have another feature, the 'explicit' feature, which represents a boolean that is True if the song is explicit and False if not. Here, we can use one-hot encoding to convert each value to a 0 or a 1 to work with our algorithm.

In [13]:
# TODO: utilize one-hot encoding to convert the 'explicit' feature into a binary format

# unique_tracks['explicit'] = unique_tracks['explicit'].astype(int)
# print(unique_tracks['explicit'].unique())

[0 1]


In [14]:
def scale_columns():
    '''In this function, we'll be scaling the data by standardizing it, making sure that it's a dataframe,
    printing its head and shape, and then returning the scaled dataframe.'''
    
    # TODO: create a StandardScaler() object, fit the scaler to the numerical data and transform the data, 
    # convert to dataframe
    # scaler = StandardScaler()

    # # fit the scaler to the data and transform it
    # scaled_data = scaler.fit_transform(unique_tracks.drop(columns=['album_name', 'track_name', 'artists']))
    # scaled_df = pd.DataFrame(scaled_data, columns=unique_tracks.columns.drop(['album_name', 'track_name', 'artists']))

    print(scaled_df.head())
    print(scaled_df.shape)
    return scaled_df
    

In [15]:
def setup_knn(scaled_df):
    '''Initializes the K-Nearest Neighbors model and fits it on the given scaled dataframe.'''
    
    # TODO: create the model, letting k = 10 and the distance formula be 'euclidean'. Then, fit it to the data
    # knn = NearestNeighbors(n_neighbors=10, metric='euclidean')
    # knn.fit(scaled_df)
    return knn

In [16]:
def recommend_songs(track, knn, scaled_data):
    '''Given an input track, checks if the track is in the dataset. If so, it retrieves the row number of its first
    appearance and uses the K-Nearest Neighbors model to find the 10 neareset neighbors. Then, it prints the 10 tracks
    and corresponding artists.'''
    

    if track not in unique_tracks['track_name'].values:
        print("Track not in dataset")
    else:
        # TODO: find the index of the track in the dataframe, then find the indices of the 10 closest tracks
        # row_index = unique_tracks[unique_tracks['track_name'] == track].index[0]
        # print(unique_tracks.iloc[row_index])
        
        # distances, indices = knn.kneighbors(scaled_data.iloc[row_index].values.reshape(1, -1))

        ret_tracks = []
        # TODO: for each of the 10 closest tracks, append the 'track_name' and 'artists' features to the list
        # for i in indices:
        #     ret_tracks.append((unique_tracks.iloc[i]['track_name'], unique_tracks.iloc[i]['artists']))
        print(ret_tracks)

Now, let's call our functions and ask the user to input a track.

In [None]:
scaled_data = scale_columns()
knn = setup_knn(scaled_data)
input_track = input("Enter a track: ")
recommend_songs(input_track, knn, scaled_data)