In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb  
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import sigmoid_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import preprocessing

In [93]:
# Main song table used by every cell below.
data = pd.read_csv("spotify_songs.csv")
# Second CSV; not referenced again in the cells shown here.
df = pd.read_csv("Spotify Data/data-clean.csv")

In [86]:
# Summarize the raw frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40560 entries, 0 to 40559
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   track_id           40560 non-null  object 
 1   time_signature     40560 non-null  float64
 2   chorus_hit         40560 non-null  float64
 3   sections           40560 non-null  float64
 4   target             40560 non-null  float64
 5   popularity         312 non-null    float64
 6   sm_target          40560 non-null  float64
 7   tiktok             40560 non-null  int64  
 8   spotify            40560 non-null  int64  
 9   track              40560 non-null  object 
 10  artist             40560 non-null  object 
 11  duration_ms        40560 non-null  float64
 12  danceability       40560 non-null  float64
 13  energy             40560 non-null  float64
 14  key                40560 non-null  object 
 15  loudness           40560 non-null  float64
 16  mode               405

In [87]:
# Columns that are not used by the recommender; removed in a single pass.
unused_columns = [
    "popularity", "era", "main_parent_genre", "sections", "chorus_hit",
    "tiktok", "sm_target", "spotify", "track",
]
data = data.drop(columns=unused_columns)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40560 entries, 0 to 40559
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          40560 non-null  object 
 1   time_signature    40560 non-null  float64
 2   target            40560 non-null  float64
 3   artist            40560 non-null  object 
 4   duration_ms       40560 non-null  float64
 5   danceability      40560 non-null  float64
 6   energy            40560 non-null  float64
 7   key               40560 non-null  object 
 8   loudness          40560 non-null  float64
 9   mode              40560 non-null  object 
 10  speechiness       40560 non-null  float64
 11  acousticness      40560 non-null  float64
 12  instrumentalness  40560 non-null  float64
 13  liveness          40560 non-null  float64
 14  valence           40560 non-null  float64
 15  tempo             40560 non-null  float64
dtypes: float64(12), object(4)
memory usage: 

In [88]:
# One-hot encode only the categorical audio attributes. Calling
# pd.get_dummies(data) with no `columns=` encodes EVERY object column,
# including `track_id` and `artist`: that creates one dummy column per
# distinct id/artist and removes `data['track_id']`, which the recommender
# cells below rely on.
# The membership check keeps the cell safe to re-run after the columns have
# already been encoded.
categorical_cols = [col for col in ('key', 'mode') if col in data.columns]
if categorical_cols:
    data = pd.get_dummies(data, columns=categorical_cols)

In [89]:
from sklearn.preprocessing import MinMaxScaler

# Numeric audio features that feed the similarity computations.
feature_cols = [
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'time_signature',
    'valence',
]

# Rescale each feature column with MinMaxScaler's default settings.
scaler = MinMaxScaler()
feature_frame = data[feature_cols]
# NOTE: despite its name, the printed result below is a plain array,
# not a DataFrame.
normalized_df = scaler.fit_transform(feature_frame)

# Show the first two scaled rows as a sanity check.
print(normalized_df[:2])

[[0.49196787 0.42206478 0.03811378 0.6199046  0.         0.0658215
  0.78355379 0.04197917 0.76900295 0.6        0.84839357]
 [0.01807229 0.50404858 0.04775985 0.50487572 0.107      0.1653144
  0.69396381 0.03510417 0.42167068 0.8        0.8002008 ]]


In [100]:
# Lookup table: track_id -> row label of `data` (used downstream as a row
# position into the similarity matrices).
indices = pd.Series(data.index, index=data['track_id'])
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already unique), so it never removed repeated track_ids. Deduplicate on the
# index labels instead, keeping the first occurrence, so that
# indices[track_id] always yields a single row rather than a Series.
indices = indices[~indices.index.duplicated(keep='first')]
# Create cosine similarity matrix based on given matrix.
# This is a dense (n_rows x n_rows) array, so memory grows quadratically
# with the number of songs.
cosine = cosine_similarity(normalized_df)
def generate_recommendation(track_id, model_type=cosine, top_n=10):
    """
    Purpose: Function for song recommendations.

    Inputs:
        track_id: Spotify track id of the query song; must be a label of the
            module-level `indices` Series.
        model_type: square similarity matrix whose row i holds the similarity
            of song i to every other song (defaults to the cosine matrix).
        top_n: number of recommendations to return (default 10).

    Output: Pandas Series of the `track_id`s of the most similar songs,
        ordered from most to least similar. The query song itself is never
        included.

    Raises:
        KeyError: if `track_id` is not present in `indices`.
        ValueError: if `top_n` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")
    if track_id not in indices.index:
        raise KeyError(
            f"Unknown track_id {track_id!r}; expected a Spotify track id, "
            "not a track title"
        )

    # Row of the query song in the similarity matrix.
    index = indices[track_id]
    # A track_id that occurs more than once yields a Series; use the first row.
    if isinstance(index, pd.Series):
        index = index.iloc[0]

    # Similarity of the query song to every song.
    scores = np.asarray(model_type[index])
    # Stable descending order: ties keep their original row order.
    order = np.argsort(-scores, kind='stable')
    # Drop the query song explicitly. Slicing off position 0 is wrong when
    # another song ties with it (e.g. identical feature vectors) and sorts
    # ahead of it.
    order = order[order != index][:top_n]

    # Top recommended songs.
    top_songs = data['track_id'].iloc[order]
    return top_songs


In [98]:
# List the track_ids that can be passed to generate_recommendation.
print(indices.index)

Index(['1dtKN6wwlolkM8XZy2y9C1', '5hjsmSnUefdUqzsDogisiX',
       '6uk8tI6pwxxdVTNlNOJeJh', '7aNjMJ05FvUXACPWZ7yJmv',
       '1rQ0clvgkzWr001POOPJWx', '32VBSoD2vcoIOiPEvAfFXU',
       '62eoocmGk59EMfuRcLBvUL', '0ecGwZA1ReAq4Mn3ZInaT4',
       '15gNqKkZN9Mq9AvnRqWJKb', '1hcMfYTsRTC4hIKbfosxjz',
       ...
       '4Ro98RCK90oHqqSZUnTFq5', '0aUWfpD3PlSv3FTTKcT2rN',
       '3bnVBN67NBEzedqQuWrpP4', '2QjOHCTQ1Jl3zawyYOpxh6',
       '4MofYf0f4ijlVV6elUW5S3', '4t1TljQWJ6ZuoSY67zVvBI',
       '2MShy1GSSgbmGUxADNIao5', '55qBw1900pZKfXJ6Q9A2Lc',
       '4o9npmYHrOF1rUxxTVH8h4', '2khIaVUkbMmDHB596lyMG3'],
      dtype='object', name='track_id', length=40560)


In [101]:
# Recommendations for one sample track using the cosine similarity matrix.
print("Recommended Songs:")
print(generate_recommendation('5hjsmSnUefdUqzsDogisiX', model_type=cosine).to_numpy())

Recommended Songs:
['1A4eGVWeU3TUd1MxNdxU65' '5Abd8qlHxfliMmkggUCiSY'
 '41TqRBgF4Ahyr4vGmZVk8b' '2KSwmLHkgjZCE49YFkgspo'
 '4ozKDDSnHMv3HRoPwUQ01x' '55mQhobuwtY7lfLAXylg1k'
 '7KmHVdey9cB6ITm229T6Jw' '18bH4uo8CFY2yVwHbjal6l'
 '2jONAYO6JniitrbnLEoCwX' '6RueOnbP7XQyZaru8NBCdi']


In [102]:
# Create sigmoid kernel matrix based on given matrix
sig_kernel = sigmoid_kernel(normalized_df)

print("Recommended Songs:")
# generate_recommendation looks songs up by track_id; a track title such as
# 'Parallel Lines' is not a key of `indices` (the `track` column was dropped
# earlier) and raises KeyError. Use the same sample track_id as the cosine
# example so the two similarity models can be compared directly.
print(generate_recommendation('5hjsmSnUefdUqzsDogisiX', sig_kernel).values)