![logo_ironhack_blue 7](https://user-images.githubusercontent.com/23629340/40541063-a07a0a8a-601a-11e8-91b5-2f13e4e6b441.png)

# Lab | Clustering songs

## Introduction

Now it's time to cluster the songs of the **hot_songs** and **not_hot_songs** databases according to the song's audio features. For this purpose, you need to consider the following questions:

* Are you going to use all the audio features? If not, which ones do you think make the most sense to use?
* What is the optimal number of clusters (for methods that need to know this beforehand)?
* What is the best distance to use?
* What clustering method provides better results?
* Does the clustering method need a transformer?

## Considerations

Be aware that this process is extremely time-consuming!!! (It might take several hours on your laptop.) Therefore, when testing different options, save the models to disk so that you can reuse the best model later. Use pickle for this. You don't want to retrain the best model again once you know what the optimal parameters are for each method.
To determine which clustering method performs best, you need to be practical and think about how many clusters you might want to have, together with a [clustering metric](https://analyticsindiamag.com/a-tutorial-on-various-clustering-evaluation-metrics/) to evaluate how well or how poorly the songs were clustered.
If the number of clusters is too small, each cluster will be too big and generic. On the other hand, if the number of clusters is too big, then each cluster will be too specific and poorly populated (this also depends on how heterogeneous your dataset is).

On the other hand, when you train your clustering model, make sure to concatenate both databases (i.e. **hot_songs** and **not_hot_songs**) beforehand.
If you don't combine both datasets, the clusters obtained with **hot_songs** will differ from the ones obtained with the **not_hot_songs**
database even though they might have the same label, because they will contain different songs. However, after concatenating you will no longer know which original
dataframe each song belongs to. To prevent this problem, you can add a new column named "dataset" with a "flag" that records which dataset each
song came from.

Finally, add a new column to the full dataset **for each clustering method with the cluster membership of each song**.

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import pickle
import matplotlib.pyplot as plt

# Read the song table that holds the audio features
df = pd.read_csv('extended_songs.csv')

# Audio-feature columns that get standardised and then fed to KMeans
features_to_normalize = [
    'danceability', 'energy', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo',
    'time_signature', 'loudness', 'key', 'mode',
]

# Standardise those columns in place (the raw values in `df` are overwritten)
scaler = StandardScaler()
df[features_to_normalize] = scaler.fit_transform(df[features_to_normalize])

# Feature matrix used for every clustering run below
df_selected = df[features_to_normalize]

# Candidate cluster counts; silhouette_score needs at least 2 clusters
cluster_numbers = range(2, 11)

# One silhouette score per candidate k, in the same order as cluster_numbers
silhouette_scores = []
for k in cluster_numbers:
    # fit() returns the estimator itself, so the fitted model can be bound directly
    model = KMeans(n_clusters=k, random_state=1).fit(df_selected)
    score = silhouette_score(df_selected, model.labels_, metric='euclidean')
    silhouette_scores.append(score)

# Silhouette score against number of clusters
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(cluster_numbers, silhouette_scores, 'bo-')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Silhouette Score')
ax.set_title('Silhouette Method For Optimal Number of Clusters')
ax.grid(True)
plt.show()



In [None]:
# Number of clusters picked by hand after inspecting the silhouette plot
# NOTE(review): hard-coded; confirm it still matches the plot if the data changes
optimal_clusters = 7

# Fit KMeans on the standardised feature matrix with the chosen cluster count
kmeans = KMeans(n_clusters=optimal_clusters, random_state=1)
kmeans.fit(df_selected)

# Persist the fitted model; the context manager guarantees the file handle is
# flushed and closed (a bare open() passed to pickle.dump is never closed)
with open("kmeans.pkl", "wb") as model_file:
    pickle.dump(kmeans, model_file)

# Add the KMeans cluster membership to the dataset
df['kmeans_cluster'] = kmeans.labels_

In [None]:
from sklearn.cluster import KMeans
import pickle

# Fit KMeans with 7 clusters on the standardised feature matrix
kmeans = KMeans(n_clusters=7, random_state=1)
kmeans.fit(df_selected)

# Save the model; `with` closes the file handle once the pickle is written
# (a bare open() passed to pickle.dump is never closed explicitly)
with open("kmeans.pkl", "wb") as model_file:
    pickle.dump(kmeans, model_file)


In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score of the current `kmeans` labelling over `df_selected`,
# computed with the Euclidean metric
score = silhouette_score(df_selected, kmeans.labels_, metric='euclidean')

# Print the score formatted to three decimal places
print('Silhouette Score: %.3f' % score)


In [None]:
# Store the label KMeans assigned to each row in a `kmeans_cluster` column
# (the column is overwritten if it already exists)
df['kmeans_cluster'] = kmeans.labels_


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import pickle

# Load the data
df = pd.read_csv('extended_songs.csv')

# Define the features to be normalized
features_to_normalize = ['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'loudness', 'key', 'mode']

# Normalize the features and persist the fitted scaler so that new songs can
# be transformed with the same parameters at prediction time
scaler = StandardScaler()
df[features_to_normalize] = scaler.fit_transform(df[features_to_normalize])
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Select the features for clustering.
# This assignment used to live only inside the disabled 5-cluster block, so
# the fit below silently depended on a `df_selected` left over from an earlier
# cell (or raised NameError in a fresh kernel). It must be defined here.
df_selected = df[features_to_normalize]

# Disabled experiment with 5 clusters, kept for reference:
#
# kmeans5 = KMeans(n_clusters=5, random_state=1)
# kmeans5.fit(df_selected)
#
# # Save the KMeans model with 5 clusters
# with open("kmeans5.pkl", "wb") as f:
#     pickle.dump(kmeans5, f)
#
# # Calculate and print the Silhouette Score for 5 clusters
# score5 = silhouette_score(df_selected, kmeans5.labels_, metric='euclidean')
# print('Silhouette Score for 5 clusters: %.3f' % score5)
#
# # Add the KMeans cluster membership with 5 clusters to the dataset
# df['kmeans5_cluster'] = kmeans5.labels_

# Apply KMeans clustering with 7 clusters
kmeans7 = KMeans(n_clusters=7, random_state=1)
kmeans7.fit(df_selected)

# Save the KMeans model with 7 clusters; `with` closes the file handle
with open("kmeans7.pkl", "wb") as f:
    pickle.dump(kmeans7, f)

# Calculate and print the Silhouette Score for 7 clusters
score7 = silhouette_score(df_selected, kmeans7.labels_, metric='euclidean')
print('Silhouette Score for 7 clusters: %.3f' % score7)

# Add the KMeans cluster membership with 7 clusters to the dataset
df['kmeans7_cluster'] = kmeans7.labels_

# Save the new dataset as extended_songs_clustered.csv.
# NOTE: the feature columns in this file hold the standardised values, because
# they were overwritten in `df` by the scaler step above.
df.to_csv('extended_songs_clustered.csv', index=False)


In [None]:
# Reload the clustered dataset from disk and display up to its first 1600 rows
# (the trailing expression is what the notebook cell renders)
df = pd.read_csv('extended_songs_clustered.csv')
df.head(1600)

### Now we can use the KMeans model to predict the cluster membership of new songs

In [1]:
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import time
import numpy as np
import pickle
from credentials import *

# Restore the fitted scaler that was pickled at training time
with open('scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

# Restore the fitted 7-cluster KMeans model
with open('kmeans7.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Clustered song table used as the pool of recommendations
df = pd.read_csv('extended_songs_clustered.csv')

# Spotify client authenticated with the client-credentials flow;
# client_id / client_secret come from the local `credentials` module
credentials = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)
sp = spotipy.Spotify(client_credentials_manager=credentials)

def search_song(title, artist, matches=10):
    """Search Spotify for a track and return the candidate matches.

    Args:
        title: Track title to search for.
        artist: Artist name to search for.
        matches: Maximum number of results to request and return.

    Returns:
        A list of ``(song_id, song_title, first_artist_name)`` tuples, one per
        track returned by Spotify, in the order Spotify returned them. The list
        is empty when nothing matched or the search failed.
    """
    query = f'artist:{artist} track:{title}'
    try:
        results = sp.search(q=query, type='track', limit=matches)
        # Iterate over what actually came back instead of indexing
        # range(matches): Spotify may return fewer tracks than requested, and
        # indexing past the end used to trigger a misleading "Song not found."
        found = [
            (item['id'], item['name'], item['artists'][0]['name'])
            for item in results['tracks']['items'][:matches]
        ]
    except Exception as exc:
        # Best-effort boundary for the UI: report the failure instead of
        # crashing, but no longer swallow KeyboardInterrupt/SystemExit the way
        # a bare `except:` does.
        print(f"Song search failed: {exc}")
        return []

    if not found:
        print("Song not found.")

    return found

def recommend_songs(song_id, n_recommendations=5):
    """Recommend songs from the same KMeans cluster as the given Spotify track.

    The track's audio features are fetched from Spotify, scaled with the
    module-level ``scaler`` and assigned to a cluster by the module-level
    ``model``. Recommendations are sampled from rows of ``df`` in that cluster:
    'hot' songs when the track itself is listed in ``df`` as 'hot', 'not_hot'
    songs otherwise (including when the track is not in ``df`` at all).

    Args:
        song_id: Spotify track ID of the seed song.
        n_recommendations: Maximum number of songs to return (default 5).

    Returns:
        A DataFrame with columns ``Song``, ``Artist`` and ``URI`` and a
        1-based index. It is empty when no candidate exists or when Spotify
        provides no audio features for the track.
    """
    # Define the features the scaler was trained on
    features = ['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'loudness', 'key', 'mode']

    # Fetch audio features for the selected song
    audio_features = sp.audio_features([song_id])[0]
    if audio_features is None:
        # Spotify can answer with a null entry for a track; without this guard
        # the column selection below fails with an opaque KeyError.
        print("No audio features available for this song.")
        return pd.DataFrame(columns=['Song', 'Artist', 'URI'])

    # Only keep columns that the scaler was trained on, in training order
    song_df = pd.DataFrame([audio_features])[features]

    # Scale, then restore the feature names before predicting
    song_df_scaled = pd.DataFrame(scaler.transform(song_df), columns=features)

    # Predict the cluster
    cluster = model.predict(song_df_scaled)[0]

    # 'hot' seeds get 'hot' recommendations; every other case (known but not
    # hot, or unknown to our data) gets 'not_hot' ones
    song_in_data = df[df['id'] == song_id]
    is_hot = (not song_in_data.empty) and song_in_data.iloc[0]['source'] == 'hot'
    wanted_source = 'hot' if is_hot else 'not_hot'

    # Same cluster, wanted source, and never the seed song itself
    candidates = df[
        (df['kmeans7_cluster'] == cluster)
        & (df['source'] == wanted_source)
        & (df['id'] != song_id)
    ]

    # Sample up to n_recommendations rows (never a negative count)
    sample_size = max(0, min(n_recommendations, len(candidates)))
    recommendations = candidates.sample(sample_size)

    # Debugging step: Print the columns of the recommendations dataframe
    print(f"Recommendations columns: {recommendations.columns}")

    # Return the recommendations with display-friendly column names and a
    # 1-based index
    return (recommendations[['song', 'artist', 'uri']]
        .reset_index(drop=True)
        .rename(columns={'song': 'Song', 'artist': 'Artist', 'uri': 'URI'})
        .rename(lambda x: x + 1))

In [None]:
recommend_songs('3KkXRkHbMCARz0aVfEt68P')

In [12]:
from PyQt5.QtWidgets import QApplication, QWidget, QVBoxLayout, QPushButton, QLineEdit, QLabel, QInputDialog, QListWidget, QListWidgetItem, QDialog
from PyQt5.QtCore import Qt, QUrl
from PyQt5.QtGui import QDesktopServices, QFont
import webbrowser

class SongRecommenderApp(QWidget):
    """Main window: title/artist inputs plus a button that starts the search."""

    def __init__(self):
        super().__init__()

        self.title = 'SongGPT'
        self.initUI()

    def initUI(self):
        """Create the widgets and stack them vertically in the window."""
        self.setWindowTitle(self.title)

        # Header banner
        self.label_header = QLabel('SoundGPT - Make Your Sound Come True')
        self.label_header.setFont(QFont('Arial', 20, QFont.Bold))

        # Song title input
        self.label_song = QLabel('Enter a song title:')
        self.line_edit_song = QLineEdit()

        # Artist input
        self.label_artist = QLabel('Enter the song artist:')
        self.line_edit_artist = QLineEdit()

        # Search trigger
        self.button = QPushButton('Search Songs')
        self.button.clicked.connect(self.on_click)

        # Widgets are added top to bottom in display order
        layout = QVBoxLayout()
        for widget in (
            self.label_header,
            self.label_song,
            self.line_edit_song,
            self.label_artist,
            self.line_edit_artist,
            self.button,
        ):
            layout.addWidget(widget)

        self.setLayout(layout)

    def on_click(self):
        """Search for the entered song, let the user pick one, show recommendations."""
        # Look up candidate tracks for the entered title and artist
        suggestions = search_song(
            self.line_edit_song.text(),
            self.line_edit_artist.text(),
        )

        # Let the user choose among the candidates
        selection_dialog = SongSelectionDialog(suggestions)
        if not selection_dialog.exec_():
            # Dialog was dismissed without accepting a song
            return

        song_id, song, artist = selection_dialog.selected_song

        # Fetch audio features, scale them, predict the cluster and sample songs
        recommendations = recommend_songs(song_id)

        # Show the recommended songs in their own dialog
        results_dialog = SongRecommendationsDialog(recommendations)
        results_dialog.exec_()


class SongSelectionDialog(QDialog):
    """Dialog listing search results; double-clicking an entry accepts it."""

    def __init__(self, suggestions):
        """Fill the list from an iterable of ``(song_id, song, artist)`` tuples."""
        super().__init__()

        self.setWindowTitle('Select a Song')

        self.list_widget = QListWidget()
        for entry in suggestions:
            song_id, song, artist = entry
            list_item = QListWidgetItem(f'{song} by {artist}')
            # Keep the full (song_id, song, artist) tuple on the item so it
            # can be read back after the dialog closes
            list_item.setData(Qt.UserRole, (song_id, song, artist))
            self.list_widget.addItem(list_item)

        # A double click on any entry accepts the dialog
        self.list_widget.itemDoubleClicked.connect(self.accept)

        layout = QVBoxLayout()
        layout.addWidget(self.list_widget)
        self.setLayout(layout)

    @property
    def selected_song(self):
        """The ``(song_id, song, artist)`` tuple stored on the current list item."""
        return self.list_widget.currentItem().data(Qt.UserRole)


class SongRecommendationsDialog(QDialog):
    """Dialog listing recommended songs; double-clicking one opens its URI."""

    def __init__(self, recommendations):
        """Build the list from a DataFrame with ``Song``, ``Artist`` and ``URI`` columns."""
        super().__init__()

        self.setWindowTitle('Song Recommendations')

        # One entry per recommended song, with the URI attached to the item
        self.list_widget = QListWidget()
        for row in recommendations.itertuples():
            entry = QListWidgetItem(f'{row.Song} by {row.Artist}')
            entry.setData(Qt.UserRole, row.URI)
            self.list_widget.addItem(entry)
        self.list_widget.itemDoubleClicked.connect(self.on_item_double_clicked)

        # 'Repeat' only closes this dialog
        self.repeat_button = QPushButton('Repeat')
        self.repeat_button.clicked.connect(self.close)

        # 'Exit' quits the running QApplication
        self.exit_button = QPushButton('Exit')
        self.exit_button.clicked.connect(QApplication.instance().quit)

        layout = QVBoxLayout()
        for widget in (self.list_widget, self.repeat_button, self.exit_button):
            layout.addWidget(widget)
        self.setLayout(layout)

    def on_item_double_clicked(self, item):
        """Open the URI stored on the double-clicked item via ``webbrowser``."""
        webbrowser.open(item.data(Qt.UserRole))


def main():
    """Create the Qt application, show the recommender window, run the event loop."""
    qt_app = QApplication([])
    # Application-wide colours: light-green text on a black background
    qt_app.setStyleSheet("QWidget { background-color: black; color: lightgreen }")

    recommender_window = SongRecommenderApp()
    recommender_window.show()

    # Blocks until the application quits
    qt_app.exec_()


if __name__ == '__main__':
    main()


Recommendations columns: Index(['song', 'artist', 'source', 'id', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'type', 'id.1', 'uri', 'track_href',
       'analysis_url', 'duration_ms', 'time_signature', 'kmeans7_cluster'],
      dtype='object')
