<a href="https://colab.research.google.com/github/luciaokay/DataMining2023/blob/main/Music_Recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import Libraries**

In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

import warnings
warnings.filterwarnings("ignore")

**Read Data**

In [None]:
# Read the three CSV files of the Spotify dataset (tracks, by-genre, by-year) in one pass.
data, genre_data, year_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Spotify_Dataset/data.csv',
        'Spotify_Dataset/data_by_genres.csv',
        'Spotify_Dataset/data_by_year.csv',
    )
)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36532 entries, 0 to 36531
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   valence           36532 non-null  float64
 1   year              36532 non-null  int64  
 2   acousticness      36532 non-null  float64
 3   artists           36532 non-null  object 
 4   danceability      36532 non-null  float64
 5   duration_ms       36532 non-null  int64  
 6   energy            36532 non-null  float64
 7   explicit          36532 non-null  int64  
 8   id                36532 non-null  object 
 9   instrumentalness  36532 non-null  float64
 10  key               36532 non-null  int64  
 11  liveness          36532 non-null  float64
 12  loudness          36532 non-null  float64
 13  mode              36532 non-null  int64  
 14  name              36531 non-null  object 
 15  popularity        36531 non-null  float64
 16  release_date      36531 non-null  object

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
genre_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2973 entries, 0 to 2972
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   mode              2973 non-null   int64  
 1   genres            2973 non-null   object 
 2   acousticness      2973 non-null   float64
 3   danceability      2973 non-null   float64
 4   duration_ms       2973 non-null   float64
 5   energy            2973 non-null   float64
 6   instrumentalness  2973 non-null   float64
 7   liveness          2973 non-null   float64
 8   loudness          2973 non-null   float64
 9   speechiness       2973 non-null   float64
 10  tempo             2973 non-null   float64
 11  valence           2973 non-null   float64
 12  popularity        2973 non-null   float64
 13  key               2973 non-null   int64  
dtypes: float64(11), int64(2), object(1)
memory usage: 325.3+ KB
None


In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
year_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   mode              100 non-null    int64  
 1   year              100 non-null    int64  
 2   acousticness      100 non-null    float64
 3   danceability      100 non-null    float64
 4   duration_ms       100 non-null    float64
 5   energy            100 non-null    float64
 6   instrumentalness  100 non-null    float64
 7   liveness          100 non-null    float64
 8   loudness          100 non-null    float64
 9   speechiness       100 non-null    float64
 10  tempo             100 non-null    float64
 11  valence           100 non-null    float64
 12  popularity        100 non-null    float64
 13  key               100 non-null    int64  
dtypes: float64(11), int64(3)
memory usage: 11.1 KB
None


**Clustering Genres with K-Means**

Here, the simple K-means clustering algorithm is used to divide the genres in this dataset into ten clusters based on the numerical audio features of each genre.

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Standardise the numeric genre features, then group the genres into ten K-Means clusters.
# random_state pins the centroid initialisation so the labels are reproducible across runs.
cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('kmeans', KMeans(n_clusters=10, random_state=42))])
# Drop a 'cluster' column left over from an earlier run of this cell; otherwise
# select_dtypes would pick up the old labels and feed them back in as a feature.
X = genre_data.select_dtypes(np.number).drop(columns=['cluster'], errors='ignore')
cluster_pipeline.fit(X)
genre_data['cluster'] = cluster_pipeline.predict(X)

In [None]:
# Visualizing the Clusters with t-SNE

from sklearn.manifold import TSNE

# Embed the standardised genre features in 2-D with t-SNE; random_state makes the layout repeatable.
tsne_pipeline = Pipeline([('scaler', StandardScaler()), ('tsne', TSNE(n_components=2, verbose=1, random_state=42))])
# Build the feature matrix from genre_data directly instead of relying on the global `X`,
# which a later cell rebinds to the song-level features. The cluster labels are excluded
# so the embedding uses the same columns the clustering was fitted on.
genre_features = genre_data.select_dtypes(np.number).drop(columns=['cluster'], errors='ignore')
genre_embedding = tsne_pipeline.fit_transform(genre_features)
projection = pd.DataFrame(columns=['x', 'y'], data=genre_embedding)
projection['genres'] = genre_data['genres']
projection['cluster'] = genre_data['cluster']

fig = px.scatter(
    projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'genres'])
fig.show()

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 2973 samples in 0.008s...
[t-SNE] Computed neighbors for 2973 samples in 0.744s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2973
[t-SNE] Computed conditional probabilities for sample 2000 / 2973
[t-SNE] Computed conditional probabilities for sample 2973 / 2973
[t-SNE] Mean sigma: 0.777516
[t-SNE] KL divergence after 250 iterations with early exaggeration: 76.106270
[t-SNE] KL divergence after 1000 iterations: 1.393033


**Clustering Songs with K-Means**

In [None]:
from sklearn.impute import SimpleImputer

# Impute missing values with the column mean, standardise, then assign each song to one of
# twenty K-Means clusters. random_state keeps the cluster labels reproducible across runs.
song_cluster_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=20, verbose=False, random_state=42))
])

# Exclude 'cluster_label' if this cell has already been run: it is numeric, so without the
# drop it would leak into the features (and into number_cols) on every re-run.
X = data.select_dtypes(np.number).drop(columns=['cluster_label'], errors='ignore')
number_cols = list(X.columns)
song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
data['cluster_label'] = song_cluster_labels

In [None]:
# Visualizing the Clusters with PCA
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
import plotly.express as px

# Create a new pipeline for PCA visualization with imputation
# Preprocessing + 2-component PCA used only for plotting the song clusters.
pca_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # mean-fill missing values before scaling
    ('scaler', StandardScaler()),
    ('PCA', PCA(n_components=2)),
])

# Reduce the song feature matrix X to two coordinates per song.
song_embedding = pca_pipeline.fit_transform(X)

# Attach the song title and cluster label to each projected point (aligned on the row index).
projection = pd.DataFrame(song_embedding, columns=['x', 'y']).assign(
    title=data['name'],
    cluster=data['cluster_label'],
)

# Scatter plot coloured by cluster, with coordinates and title shown on hover.
fig = px.scatter(
    projection,
    x='x',
    y='y',
    color='cluster',
    hover_data=['x', 'y', 'title'],
)
fig.show()

In [None]:
# Resize the existing scatter plot to a fixed 900 x 1000 canvas and render it again.
fig.update_layout(height=900, width=1000).show()

**Build Recommender System**

In [None]:
!pip install spotipy

Collecting spotipy
  Downloading spotipy-2.23.0-py3-none-any.whl (29 kB)
Collecting redis>=3.5.3 (from spotipy)
  Downloading redis-5.0.1-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.3/250.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: redis, spotipy
Successfully installed redis-5.0.1 spotipy-2.23.0


In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict

# Credentials must not live in the notebook. With no arguments, SpotifyClientCredentials reads
# the SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET environment variables; set them before
# running this cell. The key pair that was previously committed here should be revoked
# in the Spotify developer dashboard.
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials())

def find_song(name, year):
    """Look up a track on Spotify and return its features as a one-row DataFrame.

    Parameters
    ----------
    name : str
        Track title to search for.
    year : int
        Release year used to narrow the search.

    Returns
    -------
    pandas.DataFrame or None
        A single-row frame holding ``name``, ``year``, ``explicit``, ``duration_ms``,
        ``popularity`` and every key of the track's audio-features record, or ``None``
        when the search has no hit or Spotify returns no audio features for the track.
    """
    # Spotify field filters are written without a space after the colon; with the space
    # the filter names are treated as ordinary keywords.
    results = sp.search(q='track:{} year:{}'.format(name, year), limit=1)
    items = results['tracks']['items']
    if not items:
        return None

    track = items[0]
    audio_features = sp.audio_features(track['id'])[0]
    # The audio-features call yields None for tracks it has no analysis for.
    if audio_features is None:
        return None

    song_data = {
        'name': [name],
        'year': [year],
        'explicit': [int(track['explicit'])],
        'duration_ms': [track['duration_ms']],
        'popularity': [track['popularity']],
    }
    # Scalar audio features are broadcast against the single-element lists above;
    # keys present in both (e.g. duration_ms) take the audio-features value.
    song_data.update(audio_features)

    return pd.DataFrame(song_data)

In [None]:
from collections import defaultdict
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib

# Numeric feature columns used to build song vectors, in the order they are fed to the model.
number_cols = [
    'valence',
    'year',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'explicit',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'popularity',
    'speechiness',
    'tempo',
]


In [None]:
def get_song_data(song, spotify_data):
    """Fetch one song's record, preferring the local dataset over the Spotify API.

    ``song`` is a mapping with ``'name'`` and ``'year'`` keys. The first row of
    ``spotify_data`` whose name and year both match is returned as a Series.
    When no row matches, the result of ``find_song(name, year)`` is returned
    instead — note that this fallback is not a Series.
    """
    same_name = spotify_data['name'] == song['name']
    same_year = spotify_data['year'] == song['year']
    matches = spotify_data[same_name & same_year]

    # No local match: defer to the Spotify lookup.
    if len(matches) == 0:
        return find_song(song['name'], song['year'])

    return matches.iloc[0]


In [None]:
def get_mean_vector(song_list, spotify_data):
    """Average the numeric feature vectors of the given songs.

    Parameters
    ----------
    song_list : list of dict
        Songs to look up; each dict is passed to ``get_song_data`` and must carry ``'name'``.
    spotify_data : pandas.DataFrame
        Dataset forwarded to ``get_song_data``.

    Returns
    -------
    numpy.ndarray
        1-D float array with one mean value per entry of ``number_cols``.

    Raises
    ------
    ValueError
        If none of the songs could be resolved, since a mean of nothing is undefined.
    """
    song_vectors = []

    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        # get_song_data yields a Series for dataset hits but a one-row DataFrame for
        # Spotify lookups; flatten both to the same 1-D float shape before stacking.
        song_vector = np.asarray(song_data[number_cols], dtype=float).ravel()
        song_vectors.append(song_vector)

    if not song_vectors:
        raise ValueError('None of the given songs could be found in Spotify or in database')

    return np.mean(np.vstack(song_vectors), axis=0)

In [None]:
def flatten_dict_list(dict_list):
    """Turn a list of dicts into a dict of lists.

    Each key maps to the list of values it takes across ``dict_list``, in input order.
    Keys that do not appear in the first dict are collected as well (previously they
    raised ``KeyError``), and an empty input yields an empty mapping instead of
    raising ``IndexError``.

    Returns
    -------
    collections.defaultdict
        Mapping of key -> list of values.
    """
    # A list factory lets any key start its own list the first time it is seen.
    flattened_dict = defaultdict(list)

    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)

    return flattened_dict

In [None]:
def recommend_songs( song_list, spotify_data, n_songs=10):
    """Recommend the songs closest (cosine distance) to the centre of the seed songs.

    Parameters
    ----------
    song_list : list of dict
        Seed songs; each dict needs ``'name'`` and ``'year'``.
    spotify_data : pandas.DataFrame
        Candidate songs; must contain ``number_cols`` plus ``name``, ``year`` and ``artists``.
    n_songs : int, default 10
        Maximum number of recommendations returned.

    Returns
    -------
    list of dict
        Up to ``n_songs`` records with ``name``, ``year`` and ``artists``, nearest first,
        excluding any song whose name matches a seed.
    """
    metadata_cols = ['name', 'year', 'artists']
    song_dict = flatten_dict_list(song_list)

    song_center = get_mean_vector(song_list, spotify_data)

    # Reuse every fitted step before the final KMeans estimator (imputer + scaler).
    # Indexing steps[0] picked the imputer alone, so distances were computed on
    # unscaled features and dominated by the largest-magnitude column.
    preprocessor = song_cluster_pipeline[:-1]
    scaled_data = preprocessor.transform(spotify_data[number_cols])
    scaled_song_center = preprocessor.transform(
        pd.DataFrame(song_center.reshape(1, -1), columns=number_cols))

    distances = cdist(scaled_song_center, scaled_data, 'cosine')[0]
    ranked = spotify_data.iloc[np.argsort(distances)]

    # Remove the seed songs before truncating so the caller still gets n_songs results.
    rec_songs = ranked[~ranked['name'].isin(song_dict['name'])].head(n_songs)
    return rec_songs[metadata_cols].to_dict(orient='records')

In [None]:
# Example 1: five seed songs from 1991-1993, identified by title and year.
recommend_songs(
    song_list=[
        {'name': 'Come As You Are', 'year': 1991},
        {'name': 'Smells Like Teen Spirit', 'year': 1991},
        {'name': 'Lithium', 'year': 1992},
        {'name': 'All Apologies', 'year': 1993},
        {'name': 'Stay Away', 'year': 1993},
    ],
    spotify_data=data,
)

[{'name': 'Turn! Turn! Turn! (To Everything There Is a Season)',
  'year': 1965,
  'artists': "['The Byrds']"},
 {'name': 'All The Time', 'year': 2014, 'artists': "['Bahamas']"},
 {'name': 'The Way You Move (feat. Sleepy Brown)',
  'year': 2003,
  'artists': "['OutKast', 'Sleepy Brown']"},
 {'name': 'Swing, Swing',
  'year': 2002,
  'artists': "['The All-American Rejects']"},
 {'name': 'I Need Your Love (feat. Ellie Goulding)',
  'year': 2012,
  'artists': "['Calvin Harris', 'Ellie Goulding']"},
 {'name': "While We're Young", 'year': 2017, 'artists': "['Jhené Aiko']"},
 {'name': "Club Can't Handle Me (feat. David Guetta)",
  'year': 2010,
  'artists': "['Flo Rida', 'David Guetta']"},
 {'name': 'California Gurls',
  'year': 2012,
  'artists': "['Katy Perry', 'Snoop Dogg']"},
 {'name': 'Mi Corazoncito', 'year': 2006, 'artists': "['Aventura']"},
 {'name': 'Propuesta Indecente', 'year': 2014, 'artists': "['Romeo Santos']"}]

In [None]:
# Example 2: ten seed songs given with their artists as extra metadata.
recommend_songs(
    song_list=[
        {'name': 'Life is a Highway - From "Cars"', 'year': 2009, 'artists': "['Rascal Flatts']"},
        {'name': 'Of Wolf And Man', 'year': 1991, 'artists': "['Metallica']"},
        {'name': 'Somebody Like You', 'year': 2002, 'artists': "['Keith Urban']"},
        {'name': 'Kayleigh', 'year': 1992, 'artists': "['Marillion']"},
        {'name': 'Little Secrets', 'year': 2009, 'artists': "['Passion Pit']"},
        {'name': 'No Excuses', 'year': 1994, 'artists': "['Alice In Chains']"},
        {'name': 'Corazón Mágico', 'year': 1995, 'artists': "['Los Fugitivos']"},
        {'name': 'If Today Was Your Last Day', 'year': 2008, 'artists': "['Nickelback']"},
        {'name': "Let's Get Rocked", 'year': 1992, 'artists': "['Def Leppard']"},
        {'name': "Breakfast At Tiffany's", 'year': 1995, 'artists': "['Deep Blue Something']"},
    ],
    spotify_data=data,
)

[{'name': "Livin' It Up", 'year': 2001, 'artists': "['Ja Rule', 'Case']"},
 {'name': 'I Can Still Make Cheyenne',
  'year': 1996,
  'artists': "['George Strait']"},
 {'name': "We Don't Need Another Hero (Thunderdome)",
  'year': 1991,
  'artists': "['Tina Turner']"},
 {'name': 'When I Need You - Remastered',
  'year': 1976,
  'artists': "['Leo Sayer']"},
 {'name': 'Two Weeks', 'year': 2008, 'artists': "['All That Remains']"},
 {'name': 'Shake Ya Ass', 'year': 2000, 'artists': "['Mystikal']"},
 {'name': 'Free Yourself (feat. Missy Elliott)',
  'year': 2004,
  'artists': "['Fantasia', 'Missy Elliott']"},
 {'name': 'Photograph', 'year': 2013, 'artists': "['Ed Sheeran']"},
 {'name': 'To Each His Own', 'year': 1984, 'artists': "['Patrice Rushen']"},
 {'name': "We Don't Need Another Hero (Thunderdome)",
  'year': 1984,
  'artists': "['Tina Turner']"}]

In [None]:
# Example 3: five seed songs with release years ranging from 1987 to 2016.
recommend_songs(
    song_list=[
        {'name': 'BOOMBAYAH', 'year': 2016},
        {'name': 'Paradise City', 'year': 1987},
        {'name': 'Rosso Relativo', 'year': 2001},
        {'name': 'Counting Stars', 'year': 2013},
        {'name': 'The Nights', 'year': 2014},
    ],
    spotify_data=data,
)

[{'name': 'Prey', 'year': 2015, 'artists': "['The Neighbourhood']"},
 {'name': 'Cut My Lip', 'year': 2018, 'artists': "['Twenty One Pilots']"},
 {'name': 'Te Extraño, Te Olvido, Te Amo',
  'year': 1995,
  'artists': "['Ricky Martin']"},
 {'name': 'Carry Me Home',
  'year': 2016,
  'artists': "['Jorja Smith', 'Maverick Sabre']"},
 {'name': 'Fooled Around And Fell In Love',
  'year': 1975,
  'artists': "['Elvin Bishop']"},
 {'name': 'Golpes En El Corazón',
  'year': 2011,
  'artists': "['Los Tigres Del Norte', 'Paulina Rubio']"},
 {'name': 'Si tú no vuelves', 'year': 1993, 'artists': "['Miguel Bosé']"},
 {'name': 'The Way You Look Tonight',
  'year': 2003,
  'artists': "['Michael Bublé']"},
 {'name': 'Excitement (feat. PARTYNEXTDOOR)',
  'year': 2020,
  'artists': "['Trippie Redd', 'PARTYNEXTDOOR']"},
 {'name': 'Salvatore', 'year': 2015, 'artists': "['Lana Del Rey']"}]

In [None]:
# Example 4: five seed songs, three of them dated 2023.
recommend_songs(
    song_list=[
        {'name': 'White Ferrari', 'year': 2016},
        {'name': 'Pulsewidth', 'year': 1992},
        {'name': 'L3GO', 'year': 2023},
        {'name': 'I DM U', 'year': 2023},
        {'name': 'Square Heart', 'year': 2023},
    ],
    spotify_data=data,
)


[{'name': 'The Greatest', 'year': 2006, 'artists': "['Cat Power']"},
 {'name': "There's Still A Light In The House",
  'year': 2019,
  'artists': "['Valley']"},
 {'name': 'Retro (Rough)', 'year': 2014, 'artists': "['Childish Gambino']"},
 {'name': 'Babooshka - 2018 Remaster',
  'year': 1980,
  'artists': "['Kate Bush']"},
 {'name': 'cellophane', 'year': 2019, 'artists': "['FKA twigs']"},
 {'name': 'Si Yo Muero', 'year': 2014, 'artists': "['Romeo Santos']"},
 {'name': 'O Quizás Simplemente Le Regale una Rosa',
  'year': 1968,
  'artists': "['Leonardo Favio']"},
 {'name': 'If I Ever Leave This World Alive',
  'year': 2002,
  'artists': "['Flogging Molly']"},
 {'name': 'Changes',
  'year': 2004,
  'artists': "['Butterfly Boucher', 'David Bowie']"},
 {'name': 'Mira oye', 'year': 2002, 'artists': "['Los Tigrillos']"}]

In [None]:
# Example 5: five seed songs with release years between 2004 and 2022.
recommend_songs(
    song_list=[
        {'name': 'Mr. Brightside', 'year': 2004},
        {'name': 'Friends', 'year': 2014},
        {'name': '505', 'year': 2007},
        {'name': 'CUFF IT', 'year': 2022},
        {'name': 'Gold', 'year': 2020},
    ],
    spotify_data=data,
)

[{'name': 'Take It Easy - 2006 Remaster',
  'year': 2007,
  'artists': "['Travis Tritt']"},
 {'name': 'I Loved Her First', 'year': 2006, 'artists': "['Heartland']"},
 {'name': 'Clean My Wounds',
  'year': 1994,
  'artists': "['Corrosion Of Conformity']"},
 {'name': 'Mamma Mia', 'year': 1994, 'artists': "['ABBA']"},
 {'name': "Movin' Out (Anthony's Song)",
  'year': 1977,
  'artists': "['Billy Joel']"},
 {'name': 'I Cross My Heart - Pure Country Soundtrack Version',
  'year': 1992,
  'artists': "['George Strait']"},
 {'name': "Movin' Out (Anthony's Song)",
  'year': 1977,
  'artists': "['Billy Joel']"},
 {'name': 'Panama', 'year': 1996, 'artists': "['Van Halen']"},
 {'name': 'Living After Midnight',
  'year': 1980,
  'artists': "['Judas Priest']"},
 {'name': 'Cruel to Be Kind', 'year': 1979, 'artists': "['Nick Lowe']"}]

You can replace the seed songs in the lists above with any songs of your choice.