# Content based recommendation system

## Basic inception

14 numeric values found. All will be used in the analysis except for "time_signature". ["album_name", "track_name", "artists", "track_id"] are kept in the DS due to user friendliness; but not used. "explicit" column is going to be used as a filter only.

In [1]:
import pandas as pd
df = pd.read_csv("dataset.csv", index_col=0)
# check dtypes, cols etc.
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 114000 entries, 0 to 113999
Data columns (total 20 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   track_id          114000 non-null  object 
 1   artists           113999 non-null  object 
 2   album_name        113999 non-null  object 
 3   track_name        113999 non-null  object 
 4   popularity        114000 non-null  int64  
 5   duration_ms       114000 non-null  int64  
 6   explicit          114000 non-null  bool   
 7   danceability      114000 non-null  float64
 8   energy            114000 non-null  float64
 9   key               114000 non-null  int64  
 10  loudness          114000 non-null  float64
 11  mode              114000 non-null  int64  
 12  speechiness       114000 non-null  float64
 13  acousticness      114000 non-null  float64
 14  instrumentalness  114000 non-null  float64
 15  liveness          114000 non-null  float64
 16  valence           114000 

check number of missing values

In [2]:
print(df.isna().sum())

track_id            0
artists             1
album_name          1
track_name          1
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64


check number of duplicates

In [3]:
duplicated = df.duplicated()
print(duplicated.sum())

450


In [4]:
df = df.drop_duplicates()
df

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.4610,1,-6.746,0,0.1430,0.0322,0.000001,0.3580,0.7150,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.420,0.1660,1,-17.235,1,0.0763,0.9240,0.000006,0.1010,0.2670,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.3590,0,-9.734,1,0.0557,0.2100,0.000000,0.1170,0.1200,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.9050,0.000071,0.1320,0.1430,181.740,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.4430,2,-9.681,1,0.0526,0.4690,0.000000,0.0829,0.1670,119.949,4,acoustic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113995,2C3TZjDRiAzdyViavDJ217,Rainy Lullaby,#mindfulness - Soft Rain for Mindful Meditatio...,Sleep My Little Boy,21,384999,False,0.172,0.2350,5,-16.393,1,0.0422,0.6400,0.928000,0.0863,0.0339,125.995,5,world-music
113996,1hIz5L4IB9hN3WRYPOCGPw,Rainy Lullaby,#mindfulness - Soft Rain for Mindful Meditatio...,Water Into Light,22,385000,False,0.174,0.1170,0,-18.318,0,0.0401,0.9940,0.976000,0.1050,0.0350,85.239,4,world-music
113997,6x8ZfSoqDjuNa5SVP5QjvX,Cesária Evora,Best Of,Miss Perfumado,22,271466,False,0.629,0.3290,0,-10.895,0,0.0420,0.8670,0.000000,0.0839,0.7430,132.378,4,world-music
113998,2e6sXL2bYv4bSz6VTdnfLs,Michael W. Smith,Change Your World,Friends,41,283893,False,0.587,0.5060,7,-10.889,1,0.0297,0.3810,0.000000,0.2700,0.4130,135.960,4,world-music


# Data cleaning

## Handle missing values

there are very few rows with missing values; therefore safe to remove.

In [5]:
df.dropna(inplace=True)
print(df.isna().sum())

track_id            0
artists             0
album_name          0
track_name          0
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64


lower case value of "artists" before changing dtype to list

In [6]:
df["artists"]= df["artists"].str.lower().str.strip()

## confirm column data types

In [7]:
# # make a list of artist/s
# df["artists"] = df["artists"].str.split(";")
# print(df["artists"].head())

In [8]:
# make sure album and track name is a string
df["album_name"]= df["album_name"].astype(str)
df["track_name"]= df["track_name"].astype(str)
# popularity is between 0 to 100
df["popularity"]= df["popularity"].astype("int8")
df["duration_ms"]=df["duration_ms"].astype("int32")
df["danceability"]=df["danceability"].astype("float32")
df["energy"]=df["energy"].astype("float32")
df["key"]=df["key"].astype("int8")
df["loudness"]=df["loudness"].astype("float32")
df["mode"]=df["mode"].astype("int8")
df["speechiness"]=df["speechiness"].astype("float32")
df["acousticness"]=df["acousticness"].astype("float32")
df["instrumentalness"]=df["instrumentalness"].astype("float32")
df["liveness"]=df["liveness"].astype("float32")
df["valence"]=df["valence"].astype("float32")
df["tempo"]=df["tempo"].astype("int16")
# most tracks in this DS are modern and this info is sort of useless
df= df.drop(columns=["time_signature"])


Standardize string values: lowercase them and make sure there are no unwanted spaces at the beginning

In [9]:
df[["track_name","album_name", "track_genre"]] = df[["track_name", "album_name", "track_genre"]].apply(lambda x: x.str.lower().str.strip())
df.head(10)

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,gen hoshino,comedy,comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,ben woodward,ghost (acoustic),ghost - acoustic,55,149610,False,0.42,0.166,1,-17.235001,1,0.0763,0.924,6e-06,0.101,0.267,77,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,ingrid michaelson;zayn,to begin again,to begin again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,kina grannis,crazy rich asians (original motion picture sou...,can't help falling in love,71,201933,False,0.266,0.0596,0,-18.514999,1,0.0363,0.905,7.1e-05,0.132,0.143,181,acoustic
4,5vjLSffimiIP26QG5WcN2K,chord overstreet,hold on,hold on,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119,acoustic
5,01MVOl9KtVTNfFiBU9I7dc,tyrone wells,days i will remember,days i will remember,58,214240,False,0.688,0.481,6,-8.807,1,0.105,0.289,0.0,0.189,0.666,98,acoustic
6,6Vc5wAMmXdKIAM7WUoEb7N,a great big world;christina aguilera,is there anybody out there?,say something,74,229400,False,0.407,0.147,2,-8.822,1,0.0355,0.857,3e-06,0.0913,0.0765,141,acoustic
7,1EzrEOXmMH3G43AXT1y7pA,jason mraz,we sing. we dance. we steal things.,i'm yours,80,242946,False,0.703,0.444,11,-9.331,1,0.0417,0.559,0.0,0.0973,0.712,150,acoustic
8,0IktbUcnAGrvD03AWnz3Q8,jason mraz;colbie caillat,we sing. we dance. we steal things.,lucky,74,189613,False,0.625,0.414,0,-8.7,1,0.0369,0.294,0.0,0.151,0.669,130,acoustic
9,7k9GuJYLp2AzqokyEdwEw2,ross copperman,hunger,hunger,56,205594,False,0.442,0.632,1,-6.77,1,0.0295,0.426,0.00419,0.0735,0.196,78,acoustic


## Scaling numeric values

### using minmaxscaler so that all values are between 0 and 1

In [10]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
numeric_cols= ["danceability","energy","valence","tempo","loudness"]
df[numeric_cols]= scaler.fit_transform(df[numeric_cols])
df[numeric_cols].head()

Unnamed: 0,danceability,energy,valence,tempo,loudness
0,0.686294,0.461,0.718593,0.358025,0.791391
1,0.426396,0.166,0.268342,0.316872,0.597377
2,0.44467,0.359,0.120603,0.312757,0.736123
3,0.270051,0.0596,0.143719,0.744856,0.573701
4,0.627411,0.443,0.167839,0.489712,0.737103


### One-hot-encoding genres

In [11]:
print(df["track_genre"].value_counts())
print(df["track_genre"].nunique())

track_genre
acoustic      1000
british       1000
electronic    1000
emo           1000
funk          1000
              ... 
honky-tonk     981
dance          965
german         963
classical      933
romance        904
Name: count, Length: 114, dtype: int64
114


In [12]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse_output= False)
track_genre_matrix= ohe.fit_transform(df["track_genre"].values.reshape(-1,1))
track_genre_df = pd.DataFrame(track_genre_matrix, columns=ohe.get_feature_names_out(['track_genre']))
track_genre_df

Unnamed: 0,track_genre_acoustic,track_genre_afrobeat,track_genre_alt-rock,track_genre_alternative,track_genre_ambient,track_genre_anime,track_genre_black-metal,track_genre_bluegrass,track_genre_blues,track_genre_brazil,...,track_genre_spanish,track_genre_study,track_genre_swedish,track_genre_synth-pop,track_genre_tango,track_genre_techno,track_genre_trance,track_genre_trip-hop,track_genre_turkish,track_genre_world-music
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113544,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
113545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
113546,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
113547,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Saving the cleaned DS

In [13]:
df.to_csv("dataset_cleaned.csv")

# Inception of vectors similarity

join the 2 matrices together 

In [14]:
print(track_genre_df.isna().sum())

track_genre_acoustic       0
track_genre_afrobeat       0
track_genre_alt-rock       0
track_genre_alternative    0
track_genre_ambient        0
                          ..
track_genre_techno         0
track_genre_trance         0
track_genre_trip-hop       0
track_genre_turkish        0
track_genre_world-music    0
Length: 114, dtype: int64


In [15]:
print(df.isna().sum())

track_id            0
artists             0
album_name          0
track_name          0
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
track_genre         0
dtype: int64


In [16]:
df = df.reset_index(drop=True)
genre_df = track_genre_df.reset_index(drop=True)
features_df = pd.concat([df[numeric_cols], track_genre_df], axis=1)
print(features_df.isna().sum())

danceability               0
energy                     0
valence                    0
tempo                      0
loudness                   0
                          ..
track_genre_techno         0
track_genre_trance         0
track_genre_trip-hop       0
track_genre_turkish        0
track_genre_world-music    0
Length: 119, dtype: int64


In [17]:
features_df= features_df.astype("float32")
features_df=features_df.reset_index(drop=True)
features_df

Unnamed: 0,danceability,energy,valence,tempo,loudness,track_genre_acoustic,track_genre_afrobeat,track_genre_alt-rock,track_genre_alternative,track_genre_ambient,...,track_genre_spanish,track_genre_study,track_genre_swedish,track_genre_synth-pop,track_genre_tango,track_genre_techno,track_genre_trance,track_genre_trip-hop,track_genre_turkish,track_genre_world-music
0,0.686294,0.4610,0.718593,0.358025,0.791391,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.426396,0.1660,0.268342,0.316872,0.597377,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.444670,0.3590,0.120603,0.312757,0.736123,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.270051,0.0596,0.143719,0.744856,0.573701,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.627411,0.4430,0.167839,0.489712,0.737103,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113544,0.174619,0.2350,0.034070,0.514403,0.612952,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
113545,0.176650,0.1170,0.035176,0.349794,0.577345,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
113546,0.638579,0.3290,0.746734,0.543210,0.714648,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
113547,0.595939,0.5060,0.415075,0.555556,0.714759,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [18]:
features_df = features_df.drop_duplicates()
features_df=features_df.reset_index(drop=True)
df = df.loc[features_df.index] 

## using cosine similarity

In [19]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def recommend(track_name, artist_name, df, features_df, top_n=5):
    """
    Recommend tracks similar to a given track by name and artist.

    Args:
        track_name (str): Name of the track.
        artist_name (str): Name of the artist (used to select the correct track).
        df (DataFrame): Original DataFrame containing track metadata.
        features_df (DataFrame): Feature matrix (numeric + one-hot genres).
        top_n (int): Number of recommendations to return.

    Returns:
        list: List of recommended tracks in "track_name — artist_name" format.
    """
    
    # Lowercase for case-insensitive matching (if you lowercased your DataFrame)
    track_name = track_name.lower()
    artist_name = artist_name.lower()
    
    # Check if track exists
    subset = df[(df["track_name"] == track_name) & (df["artists"].str.lower() == artist_name)]
    if subset.empty:
        return ["Track not found!"]

    # Get the correct row index
    idx = subset.index[0]

    # Get feature vector for the selected track
    track_vec = features_df.iloc[idx].values.reshape(1, -1)

    # Compute cosine similarity against all tracks
    similarities = cosine_similarity(track_vec, features_df.values)[0]

    # Get top N most similar tracks (skip itself)
    top_indices = np.argsort(similarities)[::-1][1:top_n+1]

    # Build recommendation list
    recommendations = []
    for i in top_indices:
        recommendations.append(f"{df.iloc[i]['track_name']} — {df.iloc[i]['artists']}")

    return recommendations


In [21]:
track = "We Don't Talk Anymore (feat. Selena Gomez)"
artist = "Charlie Puth;Selena Gomez"

recs = recommend(track, artist, df, features_df, top_n=5)
for r in recs:
    print(r)

poker face — lady gaga
the nights — avicii
pull-up — jason derulo
in the name of love — martin garrix;bebe rexha
let me love you — dj snake;justin bieber
