# Song Recommender Program Using Unsupervised Learning through the K-Nearest Neighbors Algorithm

First, run these commands in the terminal to install the needed packages:

pip install pandas

pip install scikit-learn

In [1]:
# import pandas
import pandas as pd
 
# scikit-learn imports
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

In general, the steps we'll take are as follows:

1. Read in the dataset and convert it to a pandas dataframe.

2. Explore the dataset by looking at its dimensions, what columns it has, etc.

3. Clean the dataset by getting rid of any duplicates, missing values, and unneeded columns.

4. Convert categorical data into numerical data that can be used for our algorithm.

5. Scale the data to minimize biases.

6. Create the KNN model and ask the user for an input track.

In [2]:
# show every column when a dataframe is displayed (no column limit)
pd.set_option('display.max_columns', None)

# load the track dataset from the csv file into a pandas dataframe
tracks = pd.read_csv('dataset.csv')

In [None]:
# preview the top of the dataframe: the first 5 rows
tracks.head(n=5)

In [None]:
# display the dimensions of the dataframe as a (rows, columns) tuple;
# as the last expression in the cell, the notebook shows its value
tracks.shape

In [None]:
# display the column labels to see what attributes each track has;
# as the last expression in the cell, the notebook shows its value
tracks.columns

As you may have noticed, the quality of our machine learning model is heavily reliant on our dataset. Therefore, we need to make sure that our dataset is clean in order to produce high-quality results. This includes dealing with missing/NULL values, duplicate data, and irrelevant data in our dataset. 

In [None]:
# count the rows that are exact copies of an earlier row (every column identical).
# A bare `tracks.duplicated()` only builds a per-row boolean Series, and a notebook
# shows nothing for it unless it is the last expression in the cell, so we print
# a single summary number instead.
num_duplicate_rows = tracks.duplicated().sum()
print("Number of fully duplicated rows:", num_duplicate_rows)

# see if there's missing data in any of the columns: info() prints each column's
# non-null count and dtype
tracks.info()

In [None]:
# split the column names into categorical and numerical groups based on dtype

# one pass over the columns: remember which ones are stored with the 'object' dtype
is_object = {col: tracks[col].dtype == 'object' for col in tracks.columns}

# categorical columns are the 'object' ones
cat_col = [col for col, flag in is_object.items() if flag]
print('Categorical columns :', cat_col)
# every other column is treated as numerical
num_col = [col for col, flag in is_object.items() if not flag]
print('Numerical columns :', num_col)



In [None]:
# count the distinct values in each categorical column. Printed explicitly because
# a notebook only displays a cell's last expression. If 'track_id' has fewer
# distinct values than there are rows, some tracks appear more than once.
print(tracks[cat_col].nunique())

# drop rows that are exact copies of another row, then renumber the index from 0
# so that row labels and row positions stay in sync for the later steps
unique_tracks = tracks.drop_duplicates().reset_index(drop=True)

# flag every row whose track_id already appeared in an earlier row
duplicates = unique_tracks['track_id'].duplicated()
print("Rows with duplicates in 'track_id' column:")
unique_tracks.loc[duplicates, 'track_name']

We still have some duplicates because some songs are remixes of one another and therefore have different track IDs. This makes the remixes harder to find and remove. You could probably use regular expressions on the track names to do this, but for the purposes of this workshop, we'll just leave the remixes in. They shouldn't affect the model much, since there are over 100,000 songs in the dataset.

In [9]:
# 'track_id' is only an identifier, not a feature we want to model, so drop that column
unique_tracks = unique_tracks.drop(columns=['track_id'])

In order for our machine learning algorithm to work, we'll need each categorical variable to be converted into real numbers. We can do this by using label encoding, which assigns each unique genre a number.

In [None]:
# print the unique genres before encoding, while the column still holds the
# original labels (the next step overwrites them with integers)
unique_genres = unique_tracks["track_genre"].unique()
print(unique_genres)

# fit and transform the 'track_genre' column to numerical values; keep the fitted
# encoder so the integer codes can be mapped back to genre names later with
# label_encoder.inverse_transform(...)
label_encoder = LabelEncoder()
unique_tracks['track_genre'] = label_encoder.fit_transform(unique_tracks['track_genre'])

In [None]:
# show the encoded genre values (printed explicitly: a notebook only displays
# the last expression of a cell)
print(unique_tracks["track_genre"].unique())

# collect every row that is identical to at least one other row across all
# remaining columns (keep=False marks all copies, not just the later ones)
non_unique_rows = unique_tracks[unique_tracks.duplicated(keep=False)]
non_unique_rows.head()

In [None]:
# let's also get rid of songs with duplicate track names

# keep only the first row for each track name and renumber the index from 0, so
# row labels match row positions again.
# NOTE(review): this also merges different songs that happen to share a title;
# add 'artists' to `subset` if those should be kept apart.
unique_tracks = unique_tracks.drop_duplicates(subset=['track_name'], keep='first').reset_index(drop=True)

# recompute the fully-duplicated rows on the cleaned dataframe so the count
# printed below reflects the current data rather than the previous cell's result
non_unique_rows = unique_tracks[unique_tracks.duplicated(keep=False)]

print(len(non_unique_rows))

Like the 'track_genre' feature, the 'explicit' feature is not yet numeric: it is a boolean that is True if the song is explicit and False if not. Because it has only two possible values, we can encode it in binary form (a single-column special case of one-hot encoding), converting each value to a 0 or a 1 so that it works with our algorithm.

In [None]:
# convert the boolean 'explicit' feature into a binary 0/1 format:
# casting to int maps False -> 0 and True -> 1
unique_tracks['explicit'] = unique_tracks['explicit'].astype(int)

# confirm that only the encoded values remain in the column
print(unique_tracks['explicit'].unique())

In [14]:
def scale_columns():
    '''Standardize the non-text features of the global `unique_tracks` dataframe.

    The text columns ('album_name', 'track_name', 'artists') are left out; every
    remaining column is standardized with StandardScaler. The head and shape of
    the result are printed for inspection.

    Returns:
        A new dataframe of the scaled features. It keeps the feature column names
        and has a fresh 0..n-1 index, so row i corresponds to the row at
        position i of `unique_tracks`.
    '''
    text_columns = ['album_name', 'track_name', 'artists']
    features = unique_tracks.drop(columns=text_columns)

    # fit the scaler to the data and transform it; the result is a plain array,
    # so wrap it back into a dataframe to keep the column names
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(features)
    scaled_df = pd.DataFrame(scaled_data, columns=features.columns)

    # bare expressions show nothing inside a function, so print explicitly
    print(scaled_df.head())
    print(scaled_df.shape)
    return scaled_df
    

In [15]:
def setup_knn(scaled_df):
    '''Create a nearest-neighbors model and fit it on the given scaled dataframe.

    Args:
        scaled_df: the scaled feature data to fit the model on.

    Returns:
        The fitted NearestNeighbors model, configured with k = 10 neighbors and
        the 'euclidean' distance metric.
    '''
    knn = NearestNeighbors(n_neighbors=10, metric='euclidean')
    knn.fit(scaled_df)
    return knn

In [16]:
def recommend_songs(track, knn, scaled_data):
    '''Print the tracks closest to the given input track.

    Looks the track up by name in the global `unique_tracks` dataframe. If it is
    missing, prints "Track not in dataset". Otherwise it takes the row position of
    the track's first appearance, asks the fitted model for that row's nearest
    neighbors, and prints a list of (track_name, artists) tuples.

    Args:
        track: name of the track to base the recommendations on.
        knn: fitted nearest-neighbors model (see setup_knn).
        scaled_data: dataframe of scaled features whose rows are in the same
            order as the rows of `unique_tracks`.
    '''
    # boolean mask of the rows whose name matches the input track
    is_match = (unique_tracks['track_name'] == track).to_numpy()
    if not is_match.any():
        print("Track not in dataset")
        return

    # position (not index label) of the first match, so the lookup stays correct
    # even if the dataframe's index is not 0..n-1
    row_position = int(is_match.argmax())

    # select with a list to get a one-row dataframe: the model expects 2-D input,
    # and keeping the column names matches the data it was fitted on
    query = scaled_data.iloc[[row_position]]
    indices = knn.kneighbors(query, return_distance=False)

    # `indices` has one row per query point, so indices[0] holds the neighbors of
    # our single query. NOTE(review): the input track itself is normally among
    # them, since its distance to itself is 0.
    ret_tracks = [
        (unique_tracks.iloc[i]['track_name'], unique_tracks.iloc[i]['artists'])
        for i in indices[0]
    ]
    print(ret_tracks)

Now, let's call our functions and ask the user to input a track.

In [None]:
# build the scaled feature dataframe
scaled_data = scale_columns()
# set up the nearest-neighbors model on the scaled features
knn = setup_knn(scaled_data)
# ask the user which track the recommendations should be based on
input_track = input("Enter a track: ")
# look the track up and print the recommended tracks
recommend_songs(input_track, knn, scaled_data)