### Song recommendation model MVP

In [18]:
import numpy as np
import pandas as pd
import pickle
from sklearn import datasets # sklearn comes with some toy datasets to practice
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
import config
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from fuzzywuzzy import fuzz #  fuzzy string matching techniques to identify similar strings in the DataFrame

import warnings

# Suppress the warning
warnings.filterwarnings("ignore", message="X does not have valid feature names") # Added this to reduce number of warnings appearing after user input

df = pd.read_csv('my_music.csv') 

# I am removing ID as we want to train our model exclusively on numerical variables

df_noid = df.drop("id", axis=1)

scaler = StandardScaler()
scaler.fit(df_noid)
df_noid_scaled = scaler.transform(df_noid)
X_scaled_df = pd.DataFrame(df_noid_scaled, columns = df_noid.columns)

kmeans = KMeans(n_clusters=102, n_init = 5, random_state=1234) # Went for 102 as multiple of 17 which was original number of playlists; maximum number that still generated decent cluster sizes
kmeans.fit(X_scaled_df)

labels = kmeans.labels_

clusters = kmeans.predict(X_scaled_df)
#clusters
pd.Series(clusters).value_counts().sort_index()

df["cluster"] = clusters

# Ask the user to input a song
song = input("Please enter the name of a song. Tip: include the artist name for optimal results: ")

# Print the song name entered by the user
print("You entered the search:", song)

#Initialize SpotiPy with user credentials
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id= config.client_id,
                                                           client_secret= config.client_secret))

song_result = sp.search(q=song,limit = 1, type = "track")
track_id=song_result["tracks"]["items"][0]["id"]

sp.audio_features(track_id)

df_billboard = pd.read_csv('billboard_hot_100_df.csv')

# Check if the user input is similar to any value in the DataFrame
matching_songs = df_billboard[df_billboard['songs'].apply(lambda x: fuzz.partial_ratio(song.lower(), x.lower())) > 74]

if not matching_songs.empty:
    print("\033[91m🔥 Your song is hot! 🔥\033[0m")
    
    # Retrieve a random song from the Billboard Hot 100 as a recommendation
    if not matching_songs.empty:
        recommendation = df_billboard.sample()
        print("\033[1m👇 Here's another hot song from the Billboard Hot 100 as a recommendation:\033[0m")
        print(recommendation)
        
        recommendation_song_name = recommendation.iloc[0]['songs']  # Assuming 'songs' is the column containing the song names
        song_result = sp.search(q=recommendation_song_name, limit=1, type="track")
        matching_track_id = song_result["tracks"]["items"][0]["id"]
        print("The Spotify ID of this track is", matching_track_id)

        # Embed the input track 
        print("You can listen to the recommended track below")
        input_track_iframe = IFrame(src="https://open.spotify.com/embed/track/"+matching_track_id,
                           width="320",
                           height="80",
                           frameborder="0",
                           allowtransparency="true",
                           allow="encrypted-media")
        display(input_track_iframe)
    
else:

        print("\033[1m\033[91m😔 Your song doesn't seem to be hot. Fear not! We're cooking something up for you.")
        sample_df = pd.DataFrame(sp.audio_features(track_id)).drop(columns = ["id","type","uri","track_href","analysis_url"]) # Remove non-numerical columns and ID
        sample_array_scaled = scaler.transform(sample_df)

        from sklearn.cluster import KMeans

        sample_pred = kmeans.predict(sample_array_scaled)
        sample_cluster = sample_pred[0]

        recommendation = df[df['cluster'] == sample_cluster].sample()

        import numpy as np

        # Assuming recommendation.id is a NumPy array
        recommendation_id = np.array(recommendation.id)

        # Convert the NumPy array to a string
        id_string = str(recommendation_id)

        # Split the string by single quotes and take the second element (index 1)
        recommendation_id_isolated = id_string.split("'")[1]

        from IPython.display import IFrame, display

        # Embed the input track
        input_track_iframe = IFrame(src="https://open.spotify.com/embed/track/"+track_id,
                            width="320",
                            height="80",
                            frameborder="0",
                            allowtransparency="true",
                            allow="encrypted-media")

        # Embed the recommended track
        recommendation_iframe = IFrame(src="https://open.spotify.com/embed/track/"+recommendation_id_isolated,
                               width="320",
                               height="80",
                               frameborder="0",
                               allowtransparency="true",
                               allow="encrypted-media")

        # Display both input and recommended tracks
        print("Your input track is: ")
        print("Cluster check: ", sample_cluster)
        display(input_track_iframe)
        print("\U0001F525 Our recommendation is: \U0001F525")
        display(recommendation_iframe) 



Please enter the name of a song. Tip: include the artist name for optimal results:  Despacito - Luis Fonsi


You entered the search: Despacito - Luis Fonsi
[1m[91m😔 Your song doesn't seem to be hot. Fear not! We're cooking something up for you.
Your input track is: 
Cluster check:  32


🔥 Our recommendation is: 🔥


### Checking count per cluster

In [9]:
pd.Series(clusters).value_counts().sort_index()

0      53
1      82
2      16
3      53
4      27
       ..
97     21
98     36
99     23
100    42
101    15
Name: count, Length: 102, dtype: int64

### Calculating the silhouette score

In [10]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import config
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from fuzzywuzzy import fuzz
from sklearn.metrics import silhouette_score
from IPython.display import IFrame, display

import warnings

# Suppress the warning
warnings.filterwarnings("ignore", message="X does not have valid feature names")

# Load the data
df = pd.read_csv('my_music.csv')

# Remove ID column
df_noid = df.drop("id", axis=1)

# Standardize the data
scaler = StandardScaler()
scaler.fit(df_noid)
df_noid_scaled = scaler.transform(df_noid)
X_scaled_df = pd.DataFrame(df_noid_scaled, columns=df_noid.columns)

# Train the KMeans model
kmeans = KMeans(n_clusters=85, n_init=5, random_state=1234)
kmeans.fit(X_scaled_df)

# Get the clusters
clusters = kmeans.predict(X_scaled_df)
df["cluster"] = clusters

# Ask the user to input a song
song = input("Please enter the name of a song. Tip: include the artist name for optimal results: ")
print("You entered the search:", song)

# Initialize SpotiPy with user credentials
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=config.client_id,
                                                           client_secret=config.client_secret))

# Search for the input song on Spotify
song_result = sp.search(q=song, limit=1, type="track")
track_id = song_result["tracks"]["items"][0]["id"]

# Retrieve audio features of the input song
sample_df = pd.DataFrame(sp.audio_features(track_id)).drop(columns=["id", "type", "uri", "track_href", "analysis_url"])
sample_array_scaled = scaler.transform(sample_df)

# Predict the cluster of the input song
sample_pred = kmeans.predict(sample_array_scaled)
sample_cluster = sample_pred[0]

# Calculate silhouette score
silhouette_avg = silhouette_score(X_scaled_df, clusters)
print("Silhouette Score:", silhouette_avg)

Please enter the name of a song. Tip: include the artist name for optimal results:  blur tender


You entered the search: blur tender
Silhouette Score: 0.1126319551117379


Note

A silhouette score ranges from -1 to 1, where:

A score closer to 1 indicates that the samples are well-clustered and far away from neighboring clusters.
A score around 0 indicates overlapping clusters with samples that are close to the decision boundary between two neighboring clusters.
A score closer to -1 indicates that the samples may have been assigned to the wrong clusters.