In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as img
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist
import time

In [2]:
tracks = pd.read_csv('data/tracks.csv')
print('Shape tracks dataset: ', tracks.shape)
tracks.head()

Shape tracks dataset:  (586672, 20)


Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


In [3]:
artists = pd.read_csv('data/artists.csv')
print('Shape tracks dataset: ', tracks.shape)
artists.head()

Shape tracks dataset:  (586672, 20)


Unnamed: 0,id,followers,genres,name,popularity
0,0DheY5irMjBUeLybbCUEZ2,0.0,[],Armid & Amir Zare Pashai feat. Sara Rouzbehani,0
1,0DlhY15l3wsrnlfGio2bjU,5.0,[],ปูนา ภาวิณี,0
2,0DmRESX2JknGPQyO15yxg7,0.0,[],Sadaa,0
3,0DmhnbHjm1qw6NCYPeZNgJ,0.0,[],Tra'gruda,0
4,0Dn11fWM7vHQ3rinvWEl4E,2.0,[],Ioannis Panoutsopoulos,0


In [4]:
tracks.dtypes

id                   object
name                 object
popularity            int64
duration_ms           int64
explicit              int64
artists              object
id_artists           object
release_date         object
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature        int64
dtype: object

In [5]:
tracks['release_date'] = pd.to_datetime(tracks['release_date'], errors='coerce')

In [6]:
tracks = tracks.drop_duplicates(subset=['name', 'artists'])

In [7]:
print('Shape tracks dataset: ', tracks.shape)


Shape tracks dataset:  (526610, 20)


In [8]:
artists.dtypes

id             object
followers     float64
genres         object
name           object
popularity      int64
dtype: object

In [9]:
artists = artists.drop_duplicates(subset='name')

In [10]:
print('Shape artists dataset: ', artists.shape)


Shape artists dataset:  (1134430, 5)


In [11]:
def col_to_list(df, column): 
    tr_df = df.copy()
    tr_df[column] = tr_df[column].str.replace(r"[\[''\]]", "", regex=True)
    tr_df[column] = tr_df[column].str.split(",")
    return tr_df

In [12]:
# create df to count artists
tr_tracks = col_to_list(tracks, 'artists')
tr_tracks = tr_tracks.explode('artists')
# remove trailing whitespaces
tr_tracks['artists'] = tr_tracks['artists'].str.strip() 

In [13]:
tr_artists = col_to_list(artists, 'genres')
tr_artists = tr_artists.explode('genres')
# remove trailing whitespaces
tr_artists['genres'] = tr_artists['genres'].str.strip()

In [14]:
tracks_part = tracks.loc[:, ['id', 'artists', 'id_artists']]
tracks_part = col_to_list(tracks_part, 'id_artists')
tracks_part = tracks_part.explode('id_artists')

In [15]:
# merging tracks part list and artists datasets
merged_tracks = tracks_part.merge(artists[['id', 'followers', 'genres']], how='left', left_on='id_artists',
                                 right_on='id')
merged_tracks = merged_tracks.drop('id_y', axis=1)
merged_tracks = merged_tracks.rename({'id_x': 'id'}, axis='columns')
merged_tracks = col_to_list(merged_tracks, 'genres')

In [16]:
list_merge = merged_tracks[['id', 'id_artists', 'followers', 'genres']].groupby(
                                                                         ['id'], 
                                                                         as_index=False).aggregate(lambda x: list(x))

In [17]:
def flatten_list(l):
    
    flat_list = [] 
    for sub in l:
        # nan is float type, ignore and do not add to list
        # so that e.g. [nan] becomes an empty list []
        if isinstance(sub, float):
            continue
        for j in sub:
            flat_list.append(j)

    return flat_list        

In [18]:
list_merge['genres'] = list_merge['genres'].apply(flatten_list)

In [19]:
# merge with "full" tracks dataset
tmp_tracks = tracks.drop('id_artists', axis=1)
merged_df = pd.merge(list_merge, tmp_tracks, on=['id', 'id'])

In [21]:
def get_model_columns(df): #uses the selected features

    return df.drop(['id', 'name','explicit', 'artists', 'mode',
                    'id_artists', 'release_date', 'key',  'time_signature', 'duration_ms',  # 'mode_0', 'mode_1',
                    'popularity', 'energy'], axis=1) #  'popularity', 'mode_0', 'mode_1'

In [22]:
# create dataset for model

X = get_model_columns(tracks)

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [24]:
model = KMeans(
    n_clusters=9, init='random', #determined using elbow method n_clusters = 9
    n_init=10, max_iter=300, 
    tol=1e-04, random_state=0, 
)
y_pred = model.fit_predict(X)

In [37]:
def get_song_ids_from_names(input_song_names, tracks_df):
    song_ids = []
    for song_name in input_song_names:
        # Find the corresponding song in the tracks DataFrame
        matching_song = tracks_df[tracks_df['name'] == song_name]
        if not matching_song.empty:
            # If a match is found, add the song ID to the list
            song_ids.append(matching_song.iloc[0]['id'])
    return song_ids

In [38]:
# Initialize an empty list to store user-selected song names
user_selected_songs = []
song_list = []
# Ask the user for song names and append them to the list
while True:
    song_name = input("Enter a song name (or type 'done' to finish): ")
    if song_name.lower() == 'done':
        break
    user_selected_songs.append(song_name)

# Convert user-selected song names to song IDs
user_song_ids = get_song_ids_from_names(user_selected_songs, tracks)

# Append the user-selected song IDs to the song_list
song_list.extend(user_song_ids)

# Get features for user-selected songs
user_songs = tracks[tracks['id'].isin(song_list)]


In [39]:
# get features
user_songs = tracks[tracks['id'].isin(song_list)]
user_songs

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
89082,1yvMUkIOTeUNtNWlWRgANS,Unstoppable,75,217747,0,['Sia'],['5WUlDfRSoLAfcVSX1WnrxN'],2016-10-21,0.468,0.779,9,-4.839,1,0.0779,0.112,0.000373,0.101,0.26,173.799,4
284034,1hda15yId8Z2U2buvdYFLe,No Good for Me,39,240240,0,['The Corrs'],['1VbWUxZTRNY2gw3qZ1tg9W'],1998-10-17,0.482,0.712,4,-5.23,0,0.027,0.0538,0.0,0.0944,0.275,173.952,4


In [40]:
# use list
songs_fts = get_model_columns(user_songs) 
songs_fts.describe()

Unnamed: 0,danceability,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
count,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
mean,0.475,-5.0345,0.05245,0.0829,0.000187,0.0977,0.2675,173.8755
std,0.009899,0.276479,0.035992,0.041154,0.000264,0.004667,0.010607,0.108187
min,0.468,-5.23,0.027,0.0538,0.0,0.0944,0.26,173.799
25%,0.4715,-5.13225,0.039725,0.06835,9.3e-05,0.09605,0.26375,173.83725
50%,0.475,-5.0345,0.05245,0.0829,0.000187,0.0977,0.2675,173.8755
75%,0.4785,-4.93675,0.065175,0.09745,0.00028,0.09935,0.27125,173.91375
max,0.482,-4.839,0.0779,0.112,0.000373,0.101,0.275,173.952


In [41]:
def get_frequent_clusters(predictions, top_n): 
    unique_values, frequency = np.unique(predictions, return_counts=True)
    cluster_num = min(top_n, unique_values.shape[0])
    
    # get most frequent clusters
    sorted_indexes = np.argsort(frequency)[::-1]

    frequent_clusters = unique_values[sorted_indexes]
    sorted_freq = frequency[sorted_indexes]
    freq_sum = np.sum(sorted_freq[:cluster_num]) # takeing sum for cluster_num 
    freq_perc = [(i / freq_sum) for i in sorted_freq[:cluster_num]]     # To what percentage should cluster x be used for the recommendation?
    return cluster_num, frequent_clusters, freq_perc

In [42]:
def make_recommendations(df, all_data, scaler, model, rec_max=5, top_n=3):

    # transform data and make cluster predictions
    song_ids = df['id'].values.tolist()
    X = get_model_columns(df)
    
    data = get_model_columns(all_data)
    data = scaler.transform(data)
    
    transformed_X = scaler.transform(X)
    predictions = model.predict(transformed_X)
    print('Prediction of cluster classes: ', predictions)
    
    # determine the most frequent cluster classes from user input 
    # dependent on the frequency it is decided how many recommendations come from the respective classes
    # e.g. 20 % of predictions is class 3: .2 * rec_max recommendations are done with user items with class prediction 3 
    cluster_num, frequent_clusters, freq_perc = get_frequent_clusters(predictions, top_n)
        
    recs_id = pd.DataFrame(columns=['id', 'similarity'])
    recs = pd.DataFrame(columns=all_data.columns)
    for i in range(cluster_num):
        # determine the number of recommendations
        rec_num = round(freq_perc[i] * rec_max)
        cluster_number = frequent_clusters[i]

        # make a mean vector out of songs from the cluster
        # find the position of elements from the cluster
        pos = np.where(predictions == cluster_number)[0] 
        cluster_songs = transformed_X[pos, :]

        mean_song = np.mean(cluster_songs, axis=0)
        
        # make rec_num recommendations using cluster_num

        # calculate similarity
        similarity = cdist(np.reshape(mean_song, (1,-1)), data)
        
        # create a DataFrame for similar songs with columns 'id' and 'similarity'
        similar_songs = pd.DataFrame({'id': all_data['id'], 'similarity': similarity.flatten()})
        
        # remove songs from the user_songs list
        similar_songs = similar_songs[~similar_songs['id'].isin(song_ids)]
        similar_songs = similar_songs.sort_values(by='similarity', ascending=True).reset_index(drop=True)
        
        # concatenate the DataFrames using pd.concat
        recs_id = pd.concat([recs_id, similar_songs], ignore_index=True)
        
    recs_id = recs_id.reset_index(drop=True)
    
    return recs_id.loc[:rec_max-1]

# Assuming you have defined your user_songs, tracks DataFrames, scaler, and model
recs = make_recommendations(user_songs, tracks, scaler, model)
rec_ids = list(recs['id'])
recommendations = tracks[tracks['id'].isin(rec_ids)]


Prediction of cluster classes:  [7 7]


In [43]:
recommendations

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
246310,67YDjSgxuKWtoZRG32rbWI,Innocent Black Coffee,30,236752,0,['Beissoul & Einius'],['6JOVhCHXHhk92Eo6ppVNlE'],2019-02-19,0.474,0.801,10,-4.848,1,0.0315,0.0495,0.0014,0.119,0.253,176.149,4
312257,6MFjDmgs7hS23QesiAOTU9,Kærlighed & Krig,41,235534,0,"['Burhan G', 'Molly Sandén']","['14nVYgOiwVBJIEATXIDYC8', '0NRMzT05nsc8mTm4iU...",2013-01-01,0.483,0.644,6,-4.701,1,0.0441,0.111,0.0,0.0906,0.278,170.21,4
416521,4up9fjkBtf5X2hmvV8YWyf,人前人後,46,253995,0,['告五人'],['6xErgeZYatiaQ36SB5bvi8'],2020-12-31,0.448,0.761,3,-5.024,1,0.0351,0.0482,0.00255,0.0801,0.284,170.217,4
545090,1KxjICDqDZDHVWbmfMiz5I,Trammipeatuses,19,255048,0,['Terminaator'],['1e69U0tUjsnT5qhuXkIMqH'],2006-05-25,0.466,0.598,4,-5.656,0,0.0271,0.0918,0.0,0.0859,0.3,169.851,3
562413,3jAjSQXOTgQDHjRtlvW6P6,りんどう,47,329443,0,['WANIMA'],['6YqdtpUutxodni6lUD4stM'],2019-10-23,0.493,0.658,7,-4.32,1,0.0321,0.0779,0.0,0.0863,0.286,174.964,4
