In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.cluster import KMeans, AgglomerativeClustering, Birch
from sklearn.decomposition import PCA

import seaborn as sb

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE

import warnings
warnings.filterwarnings('ignore')

In [3]:
#Read in teh csv file
playlistDF = pd.read_csv('cleaned_data.csv')
playlistDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          10000 non-null  object 
 1   artists           9999 non-null   object 
 2   album_name        9999 non-null   object 
 3   track_name        9999 non-null   object 
 4   popularity        10000 non-null  int64  
 5   duration_ms       10000 non-null  int64  
 6   explicit          10000 non-null  bool   
 7   danceability      10000 non-null  float64
 8   energy            10000 non-null  float64
 9   key               10000 non-null  int64  
 10  loudness          10000 non-null  float64
 11  mode              10000 non-null  int64  
 12  speechiness       10000 non-null  float64
 13  acousticness      10000 non-null  float64
 14  instrumentalness  10000 non-null  float64
 15  liveness          10000 non-null  float64
 16  valence           10000 non-null  float64

In [4]:
#Drop null values
playlistDF = playlistDF.dropna().reset_index(drop=True)
playlistDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          9999 non-null   object 
 1   artists           9999 non-null   object 
 2   album_name        9999 non-null   object 
 3   track_name        9999 non-null   object 
 4   popularity        9999 non-null   int64  
 5   duration_ms       9999 non-null   int64  
 6   explicit          9999 non-null   bool   
 7   danceability      9999 non-null   float64
 8   energy            9999 non-null   float64
 9   key               9999 non-null   int64  
 10  loudness          9999 non-null   float64
 11  mode              9999 non-null   int64  
 12  speechiness       9999 non-null   float64
 13  acousticness      9999 non-null   float64
 14  instrumentalness  9999 non-null   float64
 15  liveness          9999 non-null   float64
 16  valence           9999 non-null   float64


In [5]:
#Display playlistDF
playlistDF.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,1gUAX2ImxDsB3YDcyxMXlB,美波,カワキヲアメク,カワキヲアメク,71,251933,False,0.541,0.846,6,-2.729,0,0.0551,0.0122,0.000149,0.163,0.524,129.138,4,anime
1,1di1C0QI6Y92yZPYn6XYAZ,KANA-BOON,TIME,シルエット,73,240133,False,0.436,0.934,2,-2.685,1,0.0507,1e-05,0.14,0.321,0.384,91.481,4,anime
2,3khEEPRyBeOUabbmOPJzAG,Kenshi Yonezu,KICK BACK,KICK BACK,83,193495,False,0.577,0.941,1,-5.17,1,0.105,0.00207,3e-06,0.0891,0.292,101.921,4,anime
3,1rN9QoVxw5U7TJkyaUR8C1,TK from Ling tosite sigure,Fantastic Magic,unravel,73,238360,False,0.508,0.889,7,-2.755,0,0.0862,0.0495,0.0,0.0984,0.332,135.014,4,anime
4,23phSRwoMy48rwFpmuAP8q,Yoko Takahashi,残酷な天使のテーゼ/魂のルフラン,残酷な天使のテーゼ,60,247746,False,0.691,0.773,0,-5.244,0,0.0494,0.0174,0.000451,0.117,0.502,128.162,4,anime


In [6]:
#Display the column names for playlistDF
playlistDF.columns

Index(['track_id', 'artists', 'album_name', 'track_name', 'popularity',
       'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature', 'track_genre'],
      dtype='object')

In [7]:
# drop columns that won't be used for scaling
songQualities = playlistDF.drop(['track_id', 'artists', 'album_name', 'track_name', 'track_genre', 'duration_ms'], axis=1)
songQualities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   popularity        9999 non-null   int64  
 1   explicit          9999 non-null   bool   
 2   danceability      9999 non-null   float64
 3   energy            9999 non-null   float64
 4   key               9999 non-null   int64  
 5   loudness          9999 non-null   float64
 6   mode              9999 non-null   int64  
 7   speechiness       9999 non-null   float64
 8   acousticness      9999 non-null   float64
 9   instrumentalness  9999 non-null   float64
 10  liveness          9999 non-null   float64
 11  valence           9999 non-null   float64
 12  tempo             9999 non-null   float64
 13  time_signature    9999 non-null   int64  
dtypes: bool(1), float64(9), int64(4)
memory usage: 1.0 MB


In [8]:
# scale data in songQualities
songQualities = StandardScaler().fit_transform(songQualities) #makes list of lists
songQualities = pd.DataFrame(songQualities,
                             columns = ['popularity', 'explicit', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature'])
songQualities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   popularity        9999 non-null   float64
 1   explicit          9999 non-null   float64
 2   danceability      9999 non-null   float64
 3   energy            9999 non-null   float64
 4   key               9999 non-null   float64
 5   loudness          9999 non-null   float64
 6   mode              9999 non-null   float64
 7   speechiness       9999 non-null   float64
 8   acousticness      9999 non-null   float64
 9   instrumentalness  9999 non-null   float64
 10  liveness          9999 non-null   float64
 11  valence           9999 non-null   float64
 12  tempo             9999 non-null   float64
 13  time_signature    9999 non-null   float64
dtypes: float64(14)
memory usage: 1.1 MB


In [9]:
scaled_playlistDF= pd.concat([playlistDF[['track_id', 'artists', 'album_name', 'track_name', 'track_genre', 'duration_ms']],songQualities],axis=1)
# # scaled_playlistDF = scaled_playlistDF.iloc[0:9999,:]
scaled_playlistDF

Unnamed: 0,track_id,artists,album_name,track_name,track_genre,duration_ms,popularity,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,1gUAX2ImxDsB3YDcyxMXlB,美波,カワキヲアメク,カワキヲアメク,anime,251933,1.135754,-0.39795,-0.400266,1.020092,0.193256,1.264801,-1.262714,-0.354642,-1.105236,-0.316050,-0.248795,0.209782,0.251379,0.221667
1,1di1C0QI6Y92yZPYn6XYAZ,KANA-BOON,TIME,シルエット,anime,240133,1.251491,-0.39795,-1.100851,1.417430,-0.927929,1.276069,0.791945,-0.405302,-1.144854,0.334792,0.603762,-0.423554,-0.988341,0.221667
2,3khEEPRyBeOUabbmOPJzAG,Kenshi Yonezu,KICK BACK,KICK BACK,anime,193495,1.830177,-0.39795,-0.160065,1.449036,-1.208226,0.639683,0.791945,0.219894,-1.138160,-0.316728,-0.647555,-0.839747,-0.644642,0.221667
3,1rN9QoVxw5U7TJkyaUR8C1,TK from Ling tosite sigure,Fantastic Magic,unravel,anime,238360,1.251491,-0.39795,-0.620450,1.214246,0.473552,1.258142,-1.262714,0.003436,-0.984007,-0.316744,-0.597373,-0.658793,0.444825,0.221667
4,23phSRwoMy48rwFpmuAP8q,Yoko Takahashi,残酷な天使のテーゼ/魂のルフラン,残酷な天使のテーゼ,anime,247746,0.499201,-0.39795,0.600570,0.690483,-1.488522,0.620732,-1.262714,-0.420270,-1.088336,-0.314645,-0.497008,0.110258,0.219248,0.221667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,6L7edR8ePoAiOEthSAfeBD,Gerson Rufino,Reconstrução,Reconstrução,sertanejo,262143,-0.310959,-0.39795,1.121004,0.442146,-1.488522,0.476040,0.791945,-0.711568,-0.384364,-0.316744,-0.221816,1.119071,-0.542783,0.221667
9995,0t9X7I69027UILQB4WYCai,Chico Rey & Paraná,"Sucessos de Ouro, Vol. 15",Velha Porteira,sertanejo,224600,-0.484564,-0.39795,-0.480333,-0.253195,-0.367337,0.756716,0.791945,-0.704660,1.026179,-0.316744,-0.070730,0.526450,1.282049,-2.352601
9996,6nYIAevQfh7QhuQYXYLy50,Léo & Raphael,Na Contramão do Sucesso,Aqui É Parana Cunhado,sertanejo,153426,-0.426696,-0.39795,0.593898,1.381308,-1.208226,0.533661,-1.262714,1.117966,-0.052854,-0.316744,-0.956202,1.761455,1.863902,0.221667
9997,7ALQOUgUrUifvmcGMCyvld,Os Serranos,"Os Serranos Interpretam Sucessos Gaúchos, Vol. 3",O Casamento de Doralice,sertanejo,250600,-0.426696,-0.39795,1.080971,0.672422,-1.208226,0.364385,0.791945,-0.652848,-0.381114,-0.316552,-0.834254,1.716217,-0.970103,0.221667


In [10]:
# one-hot-encode the "track_genre" column
track_genres= pd.get_dummies(scaled_playlistDF["track_genre"])
track_genres

Unnamed: 0,anime,chill,emo,grunge,indian,k-pop,pop,pop-film,sad,sertanejo
0,1,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
9994,0,0,0,0,0,0,0,0,0,1
9995,0,0,0,0,0,0,0,0,0,1
9996,0,0,0,0,0,0,0,0,0,1
9997,0,0,0,0,0,0,0,0,0,1


In [11]:
#Add track_genres to playlist_df and drop track_genre column
scaled_playlistDF_with_genres = pd.concat([scaled_playlistDF,track_genres],axis=1)
machineLearningDF = scaled_playlistDF_with_genres.drop(
    ['track_id', 'artists', 'album_name', 'track_name', 'track_genre', 'duration_ms']
    , axis=1)
machineLearningDF

Unnamed: 0,popularity,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,...,anime,chill,emo,grunge,indian,k-pop,pop,pop-film,sad,sertanejo
0,1.135754,-0.39795,-0.400266,1.020092,0.193256,1.264801,-1.262714,-0.354642,-1.105236,-0.316050,...,1,0,0,0,0,0,0,0,0,0
1,1.251491,-0.39795,-1.100851,1.417430,-0.927929,1.276069,0.791945,-0.405302,-1.144854,0.334792,...,1,0,0,0,0,0,0,0,0,0
2,1.830177,-0.39795,-0.160065,1.449036,-1.208226,0.639683,0.791945,0.219894,-1.138160,-0.316728,...,1,0,0,0,0,0,0,0,0,0
3,1.251491,-0.39795,-0.620450,1.214246,0.473552,1.258142,-1.262714,0.003436,-0.984007,-0.316744,...,1,0,0,0,0,0,0,0,0,0
4,0.499201,-0.39795,0.600570,0.690483,-1.488522,0.620732,-1.262714,-0.420270,-1.088336,-0.314645,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,-0.310959,-0.39795,1.121004,0.442146,-1.488522,0.476040,0.791945,-0.711568,-0.384364,-0.316744,...,0,0,0,0,0,0,0,0,0,1
9995,-0.484564,-0.39795,-0.480333,-0.253195,-0.367337,0.756716,0.791945,-0.704660,1.026179,-0.316744,...,0,0,0,0,0,0,0,0,0,1
9996,-0.426696,-0.39795,0.593898,1.381308,-1.208226,0.533661,-1.262714,1.117966,-0.052854,-0.316744,...,0,0,0,0,0,0,0,0,0,1
9997,-0.426696,-0.39795,1.080971,0.672422,-1.208226,0.364385,0.791945,-0.652848,-0.381114,-0.316552,...,0,0,0,0,0,0,0,0,0,1


Finished with Pre-processing data, and now it's time to train the models!

In [12]:
X = machineLearningDF[['popularity', 'explicit', 'danceability',
                       'energy', 'key', 'loudness','mode', 'speechiness',
                       'acousticness', 'instrumentalness', 'liveness',
                       'valence', 'tempo', 'time_signature']]
X.head()

Unnamed: 0,popularity,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,1.135754,-0.39795,-0.400266,1.020092,0.193256,1.264801,-1.262714,-0.354642,-1.105236,-0.31605,-0.248795,0.209782,0.251379,0.221667
1,1.251491,-0.39795,-1.100851,1.41743,-0.927929,1.276069,0.791945,-0.405302,-1.144854,0.334792,0.603762,-0.423554,-0.988341,0.221667
2,1.830177,-0.39795,-0.160065,1.449036,-1.208226,0.639683,0.791945,0.219894,-1.13816,-0.316728,-0.647555,-0.839747,-0.644642,0.221667
3,1.251491,-0.39795,-0.62045,1.214246,0.473552,1.258142,-1.262714,0.003436,-0.984007,-0.316744,-0.597373,-0.658793,0.444825,0.221667
4,0.499201,-0.39795,0.60057,0.690483,-1.488522,0.620732,-1.262714,-0.42027,-1.088336,-0.314645,-0.497008,0.110258,0.219248,0.221667


In [17]:
genre_models_to_make = list(track_genres.columns)
genre_models_to_make

['anime',
 'chill',
 'emo',
 'grunge',
 'indian',
 'k-pop',
 'pop',
 'pop-film',
 'sad',
 'sertanejo']

In [20]:
#Make list of genres to train for

models={}

#import tree modeule and train_test_split
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle
from sklearn.neighbors import KNeighborsClassifier



#Loop through genres to make an h5 file for each
for genre in genre_models_to_make:
  #Make y = target genre
  y = machineLearningDF[[genre]]

  #Split into train_test
  X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1)

  #make the K Neighbors instance
  accuracy_list = []
  model_list = []
  neighbors_list = [1,3,5,7,9]

  for n in neighbors_list:
    kNNModel = KNeighborsClassifier(n_neighbors = n)
    kNNModel.fit(X_train, y_train)

    #OPTIONAL - SHOW PREDICTIONS FOR EACH MODEL
    predictedValues_KNeigh = kNNModel.predict(X_test)
    accuracy_list.append(accuracy_score(y_test,predictedValues_KNeigh))
    model_list.append(kNNModel)

  best_accuracy = max(accuracy_list)
  max_index = accuracy_list.index(best_accuracy)

  model = model_list[max_index]
  neighbors = neighbors_list[max_index]


  print(f"K Neighbors Accuracy for {genre} with {neighbors} neighbors: {(best_accuracy*100):.2f}%")
  print("")

  # Save the KNNmodel to an h5 file
  with open(f'{genre}Model.h5', 'wb') as file:
    pickle.dump(model, file)


K Neighbors Accuracy for anime with 5 neighbors: 91.16%

K Neighbors Accuracy for chill with 9 neighbors: 89.20%

K Neighbors Accuracy for emo with 9 neighbors: 89.32%

K Neighbors Accuracy for grunge with 9 neighbors: 91.64%

K Neighbors Accuracy for indian with 9 neighbors: 89.96%

K Neighbors Accuracy for k-pop with 7 neighbors: 91.44%

K Neighbors Accuracy for pop with 9 neighbors: 92.52%

K Neighbors Accuracy for pop-film with 9 neighbors: 88.92%

K Neighbors Accuracy for sad with 9 neighbors: 89.24%

K Neighbors Accuracy for sertanejo with 3 neighbors: 95.28%

