In [1]:
import numpy as np
import pandas as pd
import psycopg2 
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect, desc
from config import username, password
import io

In [2]:
# Build the PostgreSQL connection URL from the credentials imported from the
# local config module, then create the SQLAlchemy engine used by every
# database cell below.
# NOTE(review): the credentials are interpolated verbatim; a password containing
# characters such as '@' or '/' would presumably need URL-encoding — confirm.
db_url = f'postgresql+psycopg2://{username}:{password}@aws-gt-dataviz-finalpg-001.cloqvwuqbywl.us-east-1.rds.amazonaws.com:5432/spotify_db'
engine = create_engine(db_url)


In [3]:
# reflect an existing database into a new model
# (automap generates mapped classes from the schema instead of hand-written models)
Base = automap_base()

# reflect the tables
# Uses the engine created above to read the schema from the live database.
Base.prepare(engine, reflect=True)

In [4]:
# We can view all of the classes that automap found
# (displayed as the cell output; used to pick which tables to reference below)
Base.classes.keys()

['tracks_wcluster',
 'artists',
 'genres',
 'tracks',
 'years',
 'artistgenre',
 'kmodel']

In [5]:
# Save references to each table
# Only the tracks table is queried in this notebook; the references to the
# other mapped classes are kept commented out.
# Artistgenre = Base.classes.artistgenre
# Artists = Base.classes.artists
# Genres = Base.classes.genres
Tracks = Base.classes.tracks
# Years = Base.classes.years

In [6]:
# Create our session (link) from Python to the DB
# The session is used for the ORM query in the next cell and closed once the
# results have been loaded into a DataFrame.
session = Session(engine)

In [7]:
# Columns pulled from the tracks table. The same names, in the same order,
# are used to label the resulting DataFrame.
track_columns = ['id', 'name', 'artists', 'acousticness',
                 'danceability', 'energy', 'instrumentalness',
                 'valence', 'popularity', 'year',
                 'key', 'liveness', 'loudness', 'tempo']

# Fetch every track whose popularity is above 25.
tracks_q = (
    session.query(*[getattr(Tracks, col) for col in track_columns])
    .filter(Tracks.popularity > 25)
    .all()
)

# store results in dataframe
tracks_kDF = pd.DataFrame(tracks_q, columns=track_columns)

# Untouched copy of the raw values; tracks_kDF itself is rescaled for
# clustering in the next cell.
tracks_combined = tracks_kDF.copy()

#  Key: 0 is C natural, 1 is C♯, 2 is D♮ and so on up to 11, which is B♮
#  Loudness: Values typical range between -60 and 0 db.

In [8]:
# Rescale the features that are not already on a 0-1 scale before clustering.
#
# Every value is computed from the raw copy (tracks_combined) rather than from
# tracks_kDF itself, so re-running this cell gives the same result instead of
# dividing already-scaled values a second time.
#
# popularity and year end up in roughly 0-0.1 rather than 0-1.
# NOTE(review): presumably this is deliberate, to give them less weight in the
# distance metric — confirm.
tracks_kDF['popularity'] = tracks_combined['popularity']/1000
tracks_kDF['year'] = tracks_combined['year']/2021/10
# key is an integer pitch class from 0 to 11.
tracks_kDF['key'] = tracks_combined['key']/11
# loudness typically ranges between -60 and 0 dB.
tracks_kDF['loudness'] = tracks_combined['loudness']/60
# NOTE(review): 244 looks like the maximum tempo in the data set — confirm.
tracks_kDF['tempo'] = tracks_combined['tempo']/244

In [9]:
# Summary statistics of the numeric columns after rescaling (sanity check of
# the value ranges that will be fed to the clustering step).
tracks_kDF.describe()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,valence,popularity,year,key,liveness,loudness,tempo
count,86748.0,86748.0,86748.0,86748.0,86748.0,86748.0,86748.0,86748.0,86748.0,86748.0,86748.0
mean,0.355442,0.553626,0.565912,0.108406,0.544487,0.044582,0.098406,0.474711,0.198516,-0.169631,0.490503
std,0.329519,0.174511,0.251569,0.255943,0.261462,0.013155,0.000811,0.320874,0.174247,0.084293,0.122963
min,0.0,0.0,0.0,0.0,0.0,0.026,0.095002,0.0,0.0,-1.0,0.0
25%,0.0436,0.435,0.381,0.0,0.335,0.034,0.097773,0.181818,0.0931,-0.212983,0.395093
50%,0.254,0.563,0.581,7.4e-05,0.554,0.042,0.098417,0.454545,0.129,-0.155367,0.48098
75%,0.65,0.68,0.772,0.015,0.765,0.053,0.098961,0.727273,0.252,-0.108767,0.565707
max,0.996,0.988,1.0,1.0,1.0,0.1,0.1,1.0,1.0,0.0624,0.99798


In [10]:
# The query results are already held in DataFrames, so the ORM session is no
# longer needed; later database cells use the engine directly.
session.close()

In [11]:
# Kmeans

# Cluster on the numeric feature columns only. 'cluster_label' is excluded
# explicitly: a later cell adds it to tracks_kDF, so re-running this cell
# would otherwise feed the previous run's labels back in as a feature.
X = tracks_kDF.select_dtypes(np.number)
X = X.loc[:, X.columns != 'cluster_label']
number_cols = list(X.columns)

# k-means++ initialisation is random; a fixed seed makes the cluster
# assignments reproducible from one run of the notebook to the next.
tracks_fitted = KMeans(n_clusters=2200, init='k-means++', random_state=42, verbose=1).fit(X)


Initialization complete
Iteration 0, inertia 3331.4544584091277
Iteration 1, inertia 2949.51314866561
Iteration 2, inertia 2859.5731606137924
Iteration 3, inertia 2816.550073251434
Iteration 4, inertia 2791.284142739955
Iteration 5, inertia 2773.9639741990577
Iteration 6, inertia 2761.6255000470437
Iteration 7, inertia 2752.98411759109
Iteration 8, inertia 2746.2474193509165
Iteration 9, inertia 2740.9514727377377
Iteration 10, inertia 2736.9053346131604
Iteration 11, inertia 2733.6323813523545
Iteration 12, inertia 2731.1411322216773
Iteration 13, inertia 2729.203230797838
Iteration 14, inertia 2727.539212768596
Iteration 15, inertia 2726.0273141984194
Iteration 16, inertia 2724.6631388606993
Iteration 17, inertia 2723.4460669448094
Iteration 18, inertia 2722.4747634920514
Iteration 19, inertia 2721.6548587832663
Iteration 20, inertia 2720.954307070248
Iteration 21, inertia 2720.306450014419
Iteration 22, inertia 2719.773713610087
Iteration 23, inertia 2719.2801788609
Iteration 24, in

Iteration 28, inertia 2721.4121744203926
Iteration 29, inertia 2721.1645746511867
Iteration 30, inertia 2720.9791868777725
Iteration 31, inertia 2720.8301639300817
Iteration 32, inertia 2720.716419605048
Iteration 33, inertia 2720.6288779828496
Iteration 34, inertia 2720.550876193165
Iteration 35, inertia 2720.487376733379
Iteration 36, inertia 2720.4345471096062
Iteration 37, inertia 2720.377928766425
Iteration 38, inertia 2720.3473105198586
Iteration 39, inertia 2720.3270962231204
Iteration 40, inertia 2720.310814764985
Iteration 41, inertia 2720.294689618696
Iteration 42, inertia 2720.2744827338715
Iteration 43, inertia 2720.2585644883948
Iteration 44, inertia 2720.2517040764697
Iteration 45, inertia 2720.2440882379865
Iteration 46, inertia 2720.2408898190415
Iteration 47, inertia 2720.2394004395414
Iteration 48, inertia 2720.2388449765685
Converged at iteration 48: center shift 1.9627034540990653e-29 within tolerance 4.471886210897256e-06
Initialization complete
Iteration 0, inerti

Iteration 62, inertia 2716.651043987824
Iteration 63, inertia 2716.6214787580707
Iteration 64, inertia 2716.6051349211143
Iteration 65, inertia 2716.601586032127
Iteration 66, inertia 2716.5992853087023
Converged at iteration 66: center shift 1.9734958655281084e-29 within tolerance 4.471886210897256e-06
Initialization complete
Iteration 0, inertia 3333.56543341885
Iteration 1, inertia 2957.8139103504327
Iteration 2, inertia 2865.389536832649
Iteration 3, inertia 2821.065728529448
Iteration 4, inertia 2794.8464124703805
Iteration 5, inertia 2777.7947491442283
Iteration 6, inertia 2766.095954664822
Iteration 7, inertia 2756.7619965671383
Iteration 8, inertia 2749.1781590522414
Iteration 9, inertia 2743.529246045829
Iteration 10, inertia 2739.1416762784297
Iteration 11, inertia 2735.5708022038766
Iteration 12, inertia 2732.563142764473
Iteration 13, inertia 2730.215506428431
Iteration 14, inertia 2728.138030260878
Iteration 15, inertia 2726.196258290112
Iteration 16, inertia 2724.54476163

In [12]:
# Assign each track to a cluster of the fitted model. X is the same feature
# matrix the model was fitted on.
song_cluster_labels = tracks_fitted.predict(X)


In [13]:
# Attach the cluster assignment to the scaled feature table, one label per track.
tracks_kDF['cluster_label'] = song_cluster_labels

In [14]:
# save model to csv
# What is written is the scaled feature table with its cluster labels, not the
# fitted KMeans estimator itself.
tracks_kDF.to_csv('KmeansModel.csv', index=False)

In [15]:
#drops old table and creates new empty table
# head(0) has the frame's columns but no rows, so this only (re)creates the
# table structure; the rows are bulk-loaded in the next cell.
tracks_kDF.head(0).to_sql('kmodel', engine, if_exists='replace',index=False)

In [16]:
# KMEANS MODEL
# Bulk-load the scaled, labelled tracks into the empty kmodel table with COPY.
#
# pandas writes CSV (fields containing the delimiter, quotes or newlines are
# quoted), so the data is loaded with COPY's csv format. The plain text format
# used by copy_from would treat those quotes as data and backslashes in track
# or artist names as escape characters.
output = io.StringIO()
tracks_kDF.to_csv(output, header=False, index=False)
output.seek(0)

conn = engine.raw_connection()
try:
    cur = conn.cursor()
    try:
        # In csv format an unquoted empty field is loaded as NULL.
        cur.copy_expert("COPY kmodel FROM STDIN WITH (FORMAT csv)", output)
    finally:
        cur.close()
    conn.commit()
finally:
    # Always hand the connection back, even if the load fails.
    conn.close()

engine.execute('alter table kmodel add primary key(id)')

<sqlalchemy.engine.result.ResultProxy at 0x1f7c8404c10>

In [17]:
# create a data frame of tracks data merged with cluster label
model_k = tracks_kDF[['id','cluster_label']]

# Discard any cluster_label left over from a previous run of this cell.
# Without this, re-running the merge would produce cluster_label_x and
# cluster_label_y columns instead of a single cluster_label.
tracks_unlabelled = tracks_combined.loc[:, tracks_combined.columns != 'cluster_label']

# Left merge keeps every raw track row and adds its cluster label by id.
tracks_combined = tracks_unlabelled.merge(model_k, how='left', on='id')
tracks_combined

Unnamed: 0,id,name,artists,acousticness,danceability,energy,instrumentalness,valence,popularity,year,key,liveness,loudness,tempo,cluster_label
0,40W8Mm9t3ZO1iNQlls35lL,If,Bread,0.91200,0.388,0.1760,0.000336,0.342,60,1971,9,0.0977,-16.952,97.628,947
1,2wAfHM7Whz67VFbdanhZlk,Nobody Knows You When You're Down and Out,Bessie Smith,0.99600,0.614,0.0423,0.002930,0.211,41,1923,4,0.1830,-12.033,89.822,238
2,3eMrYc092k7SIJfWJ7oasR,Weather Bird,"Louis Armstrong, Earl Hines",0.98400,0.831,0.2620,0.912000,0.901,37,1923,8,0.2040,-12.386,104.606,2183
3,2AZgaYZSwUosJD71J2N2Zo,'Tain't Nobody's Bizness If I Do,Bessie Smith,0.99600,0.537,0.0443,0.000265,0.137,29,1923,3,0.1520,-16.474,80.468,581
4,6qRvnXftofjYJm1Mg98UWL,Need a Little Sugar in My Bowl,Bessie Smith,0.99200,0.693,0.0270,0.000000,0.402,26,1923,0,0.1340,-13.506,75.749,1547
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86743,3NUmUIyzNLBp8bCFMH8Mif,Waiting On A War,Foo Fighters,0.00984,0.530,0.7590,0.000000,0.502,69,2021,7,0.3190,-7.067,131.999,767
86744,0fJ1caLzidzTlIL3pPX1eU,Precious' Tale,Jazmine Sullivan,0.71500,0.734,0.3460,0.000000,0.930,59,2021,2,0.3940,-11.722,88.849,1398
86745,3HSUqAErTyFQWLfLdnFVnB,Connexion,ZAYN,0.49800,0.597,0.3680,0.000000,0.590,52,2021,2,0.1090,-10.151,171.980,1330
86746,660rulYF3eLCuW6rQpiMdL,Little Boy,Ashnikko,0.10500,0.781,0.4870,0.000000,0.327,61,2021,1,0.0802,-7.301,129.941,1486


In [18]:
#drops old table and creates new empty table
# head(0) has the frame's columns but no rows, so this only (re)creates the
# table structure; the rows are bulk-loaded in the next cell.
tracks_combined.head(0).to_sql('tracks_wcluster', engine, if_exists='replace',index=False)

In [19]:
# Tracks with clusters
# Bulk-load the raw track values plus cluster labels into the empty
# tracks_wcluster table with COPY.
#
# pandas writes CSV (fields containing the delimiter, quotes or newlines are
# quoted), so the data is loaded with COPY's csv format. The plain text format
# used by copy_from would treat those quotes as data and backslashes in track
# or artist names as escape characters.
output = io.StringIO()
tracks_combined.to_csv(output, header=False, index=False)
output.seek(0)

conn = engine.raw_connection()
try:
    cur = conn.cursor()
    try:
        # In csv format an unquoted empty field is loaded as NULL.
        cur.copy_expert("COPY tracks_wcluster FROM STDIN WITH (FORMAT csv)", output)
    finally:
        cur.close()
    conn.commit()
finally:
    # Always hand the connection back, even if the load fails.
    conn.close()

engine.execute('alter table tracks_wcluster add primary key(id)')

<sqlalchemy.engine.result.ResultProxy at 0x1f7c9e3bf70>

Stop here: the code below is only for reading the model back from CSV.

# Read CSV file into DataFrame model_kDF
# NOTE(review): this reads 'maraKmodel.csv', whereas the export cell earlier in
# this notebook writes 'KmeansModel.csv' — confirm which file is intended.
# NOTE(review): pandas writes CSV as UTF-8 by default; if this file was produced
# that way, encoding='latin1' would garble non-ASCII track and artist names — confirm.
model_kDF = pd.read_csv('maraKmodel.csv',  encoding='latin1')
# Keep only the identifying columns and the cluster assignment.
model_kDF = model_kDF[['id', 'name', 'artists', 'cluster_label']]