In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Read the track table from the CSV file that sits next to the notebook.
data = pd.read_csv(filepath_or_buffer='dataset.csv')

In [24]:
# Summarise every column of the loaded frame: non-null count and dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        114000 non-null  int64  
 6   duration_ms       114000 non-null  int64  
 7   explicit          114000 non-null  bool   
 8   danceability      114000 non-null  float64
 9   energy            114000 non-null  float64
 10  key               114000 non-null  int64  
 11  loudness          114000 non-null  float64
 12  mode              114000 non-null  int64  
 13  speechiness       114000 non-null  float64
 14  acousticness      114000 non-null  float64
 15  instrumentalness  114000 non-null  float64
 16  liveness          11

In [26]:
# (rows, columns) of the loaded frame.
data.shape

(114000, 21)

In [8]:
# Preview the first five rows. Ending the cell with the expression (instead of
# wrapping it in print) lets the notebook render the frame with its rich display.
data.head()

   Unnamed: 0                track_id                 artists  \
0           0  5SuOikwiRyPMVoIQDJUgSV             Gen Hoshino   
1           1  4qPNDBW1i3p13qLCt0Ki3A            Ben Woodward   
2           2  1iJBSr7s7jYXzM8EGcbK5b  Ingrid Michaelson;ZAYN   
3           3  6lfxq3CG4xtTiEg7opyCyx            Kina Grannis   
4           4  5vjLSffimiIP26QG5WcN2K        Chord Overstreet   

                                          album_name  \
0                                             Comedy   
1                                   Ghost (Acoustic)   
2                                     To Begin Again   
3  Crazy Rich Asians (Original Motion Picture Sou...   
4                                            Hold On   

                   track_name  popularity  duration_ms  explicit  \
0                      Comedy          73       230666     False   
1            Ghost - Acoustic          55       149610     False   
2              To Begin Again          57       210826     False   


In [15]:
# Numeric columns that describe each track; these are the inputs to the
# scaler and the nearest-neighbour model further down.
features = data.loc[:, [
    'danceability',
    'energy',
    'valence',
    'acousticness',
    'instrumentalness',
    'liveness',
    'speechiness',
    'popularity',
]]

In [16]:
# Discard every row that has a missing value in any feature column.
# NOTE(review): rows removed here are NOT removed from `data`. If anything is
# actually dropped, row positions in `features` stop lining up with `data`,
# which the recommendation lookup relies on - confirm nothing is dropped.
features = features.dropna(axis=0, how='any')

In [17]:
# Standardise the feature columns: learn the scaling parameters first, then
# apply them to the same rows. `scaler` is kept so the fitted transform can be reused.
scaler = StandardScaler().fit(features)
features_scaled = scaler.transform(features)


In [18]:
# Sanity check: show the first five rows of the standardised feature matrix.
print(features_scaled[:5])

[[ 0.62924424 -0.71714792  0.92930586 -0.85020151 -0.50410861  0.75874327
   0.55184753  1.78262719]
 [-0.84590798 -1.88997974 -0.79868969  1.8317324  -0.50409391 -0.59121068
  -0.07899331  0.97563254]
 [-0.74218634 -1.12266943 -1.36568823 -0.31549883 -0.50411187 -0.50716686
  -0.27382571  1.06529861]
 [-1.73330424 -2.31299433 -1.27697417  1.7745932  -0.50388348 -0.42837577
  -0.45730865  1.69296111]
 [ 0.29503007 -0.78871054 -1.18440298  0.46339878 -0.50411187 -0.68628526
  -0.30314514  2.18612451]]


In [20]:
# Neighbour search over the standardised features using cosine distance.
# Six neighbours by default: the query track itself plus five recommendations.
knn = NearestNeighbors(metric='cosine', n_neighbors=6)
knn.fit(features_scaled)

In [21]:
def recommend_song(song_name, song_data, model, n_recommendations=5, feature_matrix=None):
    """Recommend songs similar to the input song.

    Parameters
    ----------
    song_name : str
        Exact value to look up in the ``track_name`` column. If several rows
        carry this name, the first one is used as the query.
    song_data : pandas.DataFrame
        Track table containing ``track_name``, ``artists`` and ``popularity``.
        Row *positions* must correspond one-to-one to the rows of the feature
        matrix the model was fitted on.
    model : object
        Fitted neighbour model exposing
        ``kneighbors(X, n_neighbors=...) -> (distances, indices)``.
    n_recommendations : int, default 5
        Maximum number of songs to return.
    feature_matrix : array-like, optional
        Feature rows aligned with ``song_data``. When omitted, the notebook-level
        ``features_scaled`` array is used.

    Returns
    -------
    pandas.DataFrame
        ``track_name``, ``artists`` and ``popularity`` of the recommended rows,
        in the order returned by the model. Duplicate rows in ``song_data`` can
        still appear as separate recommendations.

    Raises
    ------
    IndexError
        If no row of ``song_data`` has ``track_name == song_name``.
    ValueError
        If ``feature_matrix`` and ``song_data`` have different numbers of rows.
    """
    if feature_matrix is None:
        # Fall back to the array built earlier in the notebook.
        feature_matrix = features_scaled
    feature_matrix = np.asarray(feature_matrix)

    # The model returns row positions, so the two tables must line up exactly;
    # otherwise the wrong songs would be returned without any error.
    if len(feature_matrix) != len(song_data):
        raise ValueError(
            f"feature matrix has {len(feature_matrix)} rows but song_data has "
            f"{len(song_data)}; they must be aligned row for row"
        )

    # Work with the row POSITION of the song, not its index label: the feature
    # matrix is a plain array and knows nothing about the DataFrame index.
    match_positions = np.flatnonzero((song_data['track_name'] == song_name).to_numpy())
    if match_positions.size == 0:
        raise IndexError(f"song {song_name!r} not found in column 'track_name'")
    song_pos = int(match_positions[0])

    # Feature vector of the query song, shaped as a single-row 2-D array.
    song_vector = feature_matrix[song_pos].reshape(1, -1)

    # Ask for one extra neighbour because the query song is normally among the results.
    _, indices = model.kneighbors(song_vector, n_neighbors=n_recommendations + 1)

    # Remove the query song explicitly: when other rows tie with it at distance
    # zero (duplicate tracks) it is not guaranteed to be the first neighbour.
    neighbor_positions = [int(pos) for pos in indices[0] if int(pos) != song_pos]
    neighbor_positions = neighbor_positions[:n_recommendations]

    recommended_songs = song_data.iloc[neighbor_positions]
    return recommended_songs[['track_name', 'artists', 'popularity']]

In [23]:
# Try the recommender on one track title that exists in the dataset and show the result.
song_name = "Shape of You"
recommendations = recommend_song(song_name=song_name, song_data=data, model=knn)
print("Recommendations for:", song_name)
print(recommendations)

Recommendations for: Shape of You
                                            track_name  \
104186       Colgando en tus manos (con Marta Sánchez)   
81040   Mayakkama Kalakkama (From "Thiruchitrambalam")   
65007   Mayakkama Kalakkama (From "Thiruchitrambalam")   
80026   Mayakkama Kalakkama (From "Thiruchitrambalam")   
11480                                        New Shoes   

                            artists  popularity  
104186   Carlos Baute;Marta Sánchez          77  
81040   Dhanush;Anirudh Ravichander          77  
65007   Dhanush;Anirudh Ravichander          77  
80026   Dhanush;Anirudh Ravichander          77  
11480                  Paolo Nutini          67  
