# Lab | Unsupervised learning intro

It's time to perform clustering on the songs you collected. Remember that the ultimate goal of this little project is to improve the recommendations of artists. Clustering the songs will allow the recommendation system to limit the scope of the recommendations to songs that belong to the same cluster, i.e. songs with similar audio features.

The experiments you did with the Spotify API and the Billboard web scraping will allow you to create a pipeline such that when the user enters a song, you:

1. Check whether or not the song is in the Billboard Hot 200.
2. Collect the audio features from the Spotify API.

After that, you want to send the Spotify audio features of the submitted song to the clustering model, which should return a cluster number.

We want to have as many songs as possible to create the clustering model, so we will add the songs you collected to a bigger dataset available on Kaggle containing 160 thousand songs.

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn import cluster, datasets
from sklearn.preprocessing import StandardScaler

from matplotlib.lines import Line2D
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup

import requests

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

from random import randint

from sklearn.cluster import KMeans

import random

pd.set_option('display.max_columns', 165)
pd.set_option('display.max_rows', 165)

In [None]:
# Read the whole credentials file into `string`.
# The `with` block closes the file handle as soon as the read finishes,
# instead of leaving it open for the lifetime of the notebook kernel.
with open("secrets.txt", "r") as secrets_file:
    string = secrets_file.read()

In [None]:
string

In [None]:
string.split('\n')

In [None]:
# Parse "key: value" lines from the secrets text into a dictionary.
secrets_dict = {}
for line in string.splitlines():
    # Split on the FIRST colon only, so a value that itself contains ':'
    # is kept intact instead of being truncated.
    key, separator, value = line.partition(':')
    if not separator:
        # Blank line or a line without a colon: nothing to store.
        continue
    secrets_dict[key.strip()] = value.strip()
    # Show only the key name; echoing the secret value would leave it
    # in the saved notebook output.
    print(key.strip())

In [None]:
secrets_dict

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Build the Spotify client from the client id / client secret held in secrets_dict
_credentials_manager = SpotifyClientCredentials(
    client_id=secrets_dict['cid'],
    client_secret=secrets_dict['csecret'],
)
sp = spotipy.Spotify(auth_manager=_credentials_manager)

`songs_df` and `multiple_playlist_df` were created in the API Wrappers lab and saved as CSV files. In this lab, those CSV files are read back in.

In [None]:
songs_df = pd.read_csv('songs_df.csv')
songs_df

In [None]:
# Remove the 'Unnamed: 0' column (presumably the old index written out with the CSV - verify)
songs_df = songs_df.drop(columns=['Unnamed: 0'])
songs_df

In [None]:
multiple_playlist_df = pd.read_csv('multiple_playlist_df.csv')
multiple_playlist_df

In [None]:
# Remove the 'Unnamed: 0' column (presumably the old index written out with the CSV - verify)
multiple_playlist_df = multiple_playlist_df.drop(columns=['Unnamed: 0'])
multiple_playlist_df

In [None]:
multiple_playlist_df.columns

In [None]:
multiple_playlist_df.shape

In [None]:
multiple_playlist_df.info()

In [None]:
# statistical distribution of the data
multiple_playlist_df.describe()

In [None]:
# Audio-feature columns used as input to the clustering model
feature_columns = [
    'danceability', 'energy', 'key', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness', 'liveness',
    'valence', 'tempo', 'duration_ms', 'time_signature',
]
X = multiple_playlist_df[feature_columns]

In [None]:
print(X.shape)
X.head()


In [None]:
X.dtypes

In [None]:
# Fit the scaler on the feature matrix; the fitted `transformer` is reused
# later to scale the features of a newly submitted song the same way.
transformer = StandardScaler()
transformer.fit(X)

# Scale the features and put them back into a DataFrame with the original column names
X_normalized = transformer.transform(X)
X_norm = pd.DataFrame(data=X_normalized, columns=list(X.columns))
X_norm.head()

In [None]:
K = range(2, 20) # candidate cluster counts: 2 through 19
inertia = []

# Elbow method: fit one KMeans model per candidate k and record its inertia.
for k in K:
    kmeans = KMeans(n_clusters=k,
                    random_state=1234) # fixed seed so repeated runs give the same curve
    kmeans.fit(X_norm) # cluster the rows of X_norm into k groups
    inertia.append(kmeans.inertia_) # inertia: sum of squared distances of samples to their closest cluster center

import matplotlib.pyplot as plt
%matplotlib inline

# Plot inertia against k; the "elbow" where the curve flattens suggests a reasonable k.
plt.figure(figsize=(16,8))
plt.plot(K, inertia, 'bx-')
plt.xlabel('k')
plt.ylabel('inertia')
plt.xticks(np.arange(min(K), max(K)+1, 1.0))
plt.title('Elbow Method showing the optimal k')

In [None]:
from sklearn.metrics import silhouette_score
K = range(2, 20) # candidate cluster counts: 2 through 19
silhouette = []

# Fit one KMeans model per candidate k and score how well separated its clusters are.
for k in K:
    kmeans = KMeans(n_clusters=k,
                    random_state=1234) # fixed seed so repeated runs give the same curve
    # fit_predict fits the model and returns the training labels in one call,
    # avoiding a second full pass over the data just to re-predict them.
    labels = kmeans.fit_predict(X_norm)
    silhouette.append(silhouette_score(X_norm, labels))


plt.figure(figsize=(16,8))
plt.plot(K, silhouette, 'bx-')
plt.xlabel('k')
plt.ylabel('silhouette score')
plt.xticks(np.arange(min(K), max(K)+1, 1.0))
plt.title('Silhouette score showing the optimal k')

# The k with the highest silhouette score is the best candidate for the number of clusters.

In [None]:
# Final clustering model used by the recommender below.
kmeans = KMeans(n_clusters=8, random_state=1234) # 8 clusters; fixed seed so cluster numbering is stable between runs
kmeans.fit(X_norm)

In [None]:
# Predicting / assigning the clusters:
clusters = kmeans.predict(X_norm) # one cluster label per row of X_norm

# Check the size of the clusters (row count per label, ordered by label)
pd.Series(clusters).value_counts().sort_index()

In [None]:
clusters

In [None]:
kmeans.inertia_ # inertia of that model

In [None]:
# Wrap the predicted labels in a one-column DataFrame named 'k_cluster'
cluster_no = pd.DataFrame(data=clusters, columns=['k_cluster'], dtype=int)

# Inspect the resulting frame
print(cluster_no.shape)
cluster_no.head()

In [None]:
# Attach the cluster label of every song as the 'k_cluster' column.
# Plain column assignment (instead of pd.concat) keeps this cell safe to re-run:
# it overwrites the column rather than appending a duplicate 'k_cluster', and
# it raises on a length mismatch instead of silently introducing NaN rows
# through index alignment.
multiple_playlist_df['k_cluster'] = cluster_no['k_cluster'].to_numpy()

# checking the dataframe
print(multiple_playlist_df.shape)
multiple_playlist_df.head()

In [None]:
# checking for null
multiple_playlist_df.isnull().sum().sum()

In [None]:
# checking number of songs in each cluster
multiple_playlist_df['k_cluster'].value_counts()

In [None]:
# checking the songs in the cluster
multiple_playlist_df.loc[multiple_playlist_df['k_cluster'] == 4][:5]

In [None]:
favorite_song = input('Please enter your favorite song: ')

In [None]:
# Function to obtain an uri from a song name
def song_uri(favorite_song):
    """Return the Spotify track id of the top search hit for a song name.

    Parameters:
        favorite_song: free-text song name, passed as the search query.

    Returns:
        The part of the track URI after 'spotify:track:', or the string
        'Null' when the search fails or returns no tracks.
    """
    try:
        track = sp.search(q=favorite_song, limit=1)
        return track['tracks']['items'][0]['uri'].split('spotify:track:')[1]
    except Exception:
        # Best-effort lookup: any failure (no hit, API/network error) maps to
        # the 'Null' sentinel. `Exception` rather than a bare `except` so that
        # KeyboardInterrupt / SystemExit still propagate.
        return 'Null'

In [None]:
song_uri(favorite_song)

In [None]:
def collect_song_features(uri):
    """Fetch the Spotify audio features of one track as a single-row DataFrame.

    Parameters:
        uri: Spotify track id / URI, passed to `sp.audio_features`.

    Returns:
        A DataFrame with one row and one column per audio feature, in the
        order of `playlist_features_list` below.

    Raises:
        ValueError: if Spotify returns no audio features for `uri`
            (for example when the id does not refer to a known track).
    """
    playlist_features_list = ["danceability","energy","key","loudness","mode", "speechiness","acousticness",
                              "instrumentalness","liveness","valence","tempo","duration_ms","time_signature"]

    # Get audio features; the API answers with a list holding one entry per
    # requested track, and that entry is None when nothing is available.
    results = sp.audio_features(uri)
    audio_features = results[0] if results else None
    if audio_features is None:
        raise ValueError(f"Spotify returned no audio features for track {uri!r}")

    # Build the row directly so the columns get numeric dtypes
    # (appending to an empty frame would leave them as generic objects).
    return pd.DataFrame(
        [[audio_features[feature] for feature in playlist_features_list]],
        columns=playlist_features_list,
    )

In [None]:
collect_song_features('32OlwWuMpZ6b0aN2RZOeMS')

In [None]:
collect_song_features(song_uri(favorite_song))

In [None]:
# Random generator backed by the operating system's randomness source
secure_random = random.SystemRandom()
# Quick check: print one randomly chosen title from the 'song' column
print(secure_random.choice(songs_df['song']))

In [None]:
def song_recommender():
    """Ask the user for a favorite song and print one recommendation.

    If the entered title is found in `songs_df`, another song from that list
    is recommended. Otherwise the song is looked up on Spotify, its audio
    features are scaled with `transformer`, its cluster is predicted with
    `kmeans`, and a random song from the same cluster of
    `multiple_playlist_df` is recommended.

    Relies on the notebook-level objects `songs_df`, `multiple_playlist_df`,
    `transformer`, `kmeans`, `song_uri` and `collect_song_features`.
    Returns None; the result is printed.
    """
    favorite_song = input('Please enter your favorite song: ')

    is_favorite = songs_df['song'] == favorite_song
    if is_favorite.any():
        # Avoid recommending the entered song back to the user; fall back to
        # the full list only if it is the sole entry.
        candidates = songs_df[~is_favorite]
        if candidates.empty:
            candidates = songs_df
        title_column = 'song'
    else:
        track_id = song_uri(favorite_song)
        if track_id == 'Null':
            # song_uri signals "not found / lookup failed" with this sentinel.
            print("Sorry, we could not find that song on Spotify.")
            return
        try:
            df = collect_song_features(track_id)
        except (TypeError, ValueError):
            # The track exists but no audio features could be retrieved.
            print("Sorry, no audio features are available for that song.")
            return
        cluster = kmeans.predict(pd.DataFrame(transformer.transform(df), columns=df.columns))
        candidates = multiple_playlist_df[multiple_playlist_df['k_cluster'] == cluster[0]]
        title_column = 'track_name'

    # Take title and artist from the SAME sampled row: looking the artist up
    # again by title fails when a title appears on more than one row.
    pick = candidates.sample(n=1).iloc[0]
    print("Our song recommendation is: ", pick[title_column], 'by', pick['artist'])

In [None]:
song_recommender()