In [11]:
import io
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2 
import sqlalchemy
from sklearn.cluster import KMeans
from sqlalchemy import create_engine, func, inspect, desc
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session

from config import username, password

In [2]:
# Create the SQLAlchemy engine for the spotify_db Postgres database.
# The credentials come from the local config module. They are percent-encoded
# before being placed in the URL so that characters such as '@', ':', '/' or
# '%' in the username or password cannot corrupt the connection string.
DB_HOST = 'aws-gt-dataviz-finalpg-001.cloqvwuqbywl.us-east-1.rds.amazonaws.com'
DB_PORT = 5432
DB_NAME = 'spotify_db'
engine = create_engine(
    f"postgresql+psycopg2://{quote(username, safe='')}:{quote(password, safe='')}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)


In [3]:
# Build an automap base and reflect the existing database schema into it;
# the classes it generates are exposed under Base.classes.
Base = automap_base()

# Reflect the tables through the engine created earlier in the notebook.
Base.prepare(engine, reflect=True)

In [4]:
# Display the names of the classes that automap generated from the
# reflected tables.
Base.classes.keys()

['artistgenre', 'artists', 'genres', 'tracks', 'years']

In [5]:
# Keep a reference to the tracks class only; it is the one queried in the
# cells that follow. The other reflected classes (artistgenre, artists,
# genres, years) remain reachable through Base.classes if needed.
Tracks = Base.classes.tracks

In [6]:
# Open an ORM session bound to the engine (the link from Python to the DB).
session = Session(engine)

In [7]:
# Columns read from the tracks table; the same list labels the DataFrame, so
# the query and the frame cannot drift out of step.
track_fields = ['id', 'name', 'artists', 'acousticness',
                'danceability', 'energy', 'instrumentalness',
                'valence', 'popularity', 'year',
                'key', 'liveness', 'loudness', 'tempo']

# Fetch those columns for every track whose popularity is above 25.
selected = [getattr(Tracks, field) for field in track_fields]
tracks_q = session.query(*selected).filter(Tracks.popularity > 25).all()

# Store the result rows in a DataFrame.
tracks_kDF = pd.DataFrame(tracks_q, columns=track_fields)

#  Key: 0 is C natural, 1 is C♯, 2 is D♮ and so on up to 11, which is B♮
#  Loudness: values typically range between -60 and 0 dB.

In [8]:
# Rescale selected columns by fixed divisors before clustering. Each column is
# divided by its divisors in the order listed (year by 2021, then by 10).
# NOTE: tracks_kDF is overwritten in place, so running this cell a second time
# scales the values again; rebuild tracks_kDF from the query cell first.
column_divisors = {
    'popularity': (1000,),
    'year': (2021, 10),
    'key': (11,),
    'loudness': (60,),
    'tempo': (244,),
}
for column, divisors in column_divisors.items():
    scaled = tracks_kDF[column]
    for divisor in divisors:
        scaled = scaled / divisor
    tracks_kDF[column] = scaled

In [9]:
# Summary statistics of the numeric columns after rescaling.
tracks_kDF.describe()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,valence,popularity,year,key,liveness,loudness,tempo
count,86748.0,86748.0,86748.0,86748.0,86748.0,86748.0,86748.0,86748.0,86748.0,86748.0,86748.0
mean,0.355442,0.553626,0.565912,0.108406,0.544487,0.044582,0.098406,0.474711,0.198516,-0.169631,0.490503
std,0.329519,0.174511,0.251569,0.255943,0.261462,0.013155,0.000811,0.320874,0.174247,0.084293,0.122963
min,0.0,0.0,0.0,0.0,0.0,0.026,0.095002,0.0,0.0,-1.0,0.0
25%,0.0436,0.435,0.381,0.0,0.335,0.034,0.097773,0.181818,0.0931,-0.212983,0.395093
50%,0.254,0.563,0.581,7.4e-05,0.554,0.042,0.098417,0.454545,0.129,-0.155367,0.48098
75%,0.65,0.68,0.772,0.015,0.765,0.053,0.098961,0.727273,0.252,-0.108767,0.565707
max,0.996,0.988,1.0,1.0,1.0,0.1,0.1,1.0,1.0,0.0624,0.99798


In [10]:
# The query results are already held in tracks_kDF, so close the ORM session.
session.close()

In [24]:
# K-means clustering of the tracks on their numeric feature columns.

# Fixed seed so the k-means++ initialisation, and therefore the resulting
# cluster labels, are reproducible from one run to the next.
RANDOM_SEED = 42

# Use every numeric column as a feature, but never a previously assigned
# 'cluster_label': once a later cell has added that column to tracks_kDF,
# re-running this cell would otherwise feed the old labels back in as input.
X = tracks_kDF.select_dtypes(np.number).drop(columns=['cluster_label'], errors='ignore')
number_cols = list(X.columns)
tracks_fitted = KMeans(n_clusters=2200, init='k-means++', verbose=1,
                       random_state=RANDOM_SEED).fit(X)


Initialization complete
Iteration 0, inertia 3341.9734885488824
Iteration 1, inertia 2958.028526152823
Iteration 2, inertia 2865.533960463632
Iteration 3, inertia 2821.6744648528625
Iteration 4, inertia 2794.9874575827375
Iteration 5, inertia 2777.7174065733357
Iteration 6, inertia 2764.9304544126207
Iteration 7, inertia 2755.824337131769
Iteration 8, inertia 2748.985981834901
Iteration 9, inertia 2743.557497955236
Iteration 10, inertia 2739.283557948183
Iteration 11, inertia 2736.093943210369
Iteration 12, inertia 2733.7484163239
Iteration 13, inertia 2731.6034598485494
Iteration 14, inertia 2729.7038311203764
Iteration 15, inertia 2728.2267788671543
Iteration 16, inertia 2727.0176072898093
Iteration 17, inertia 2726.1145114514743
Iteration 18, inertia 2725.428496918189
Iteration 19, inertia 2724.8604746415763
Iteration 20, inertia 2724.4444227828976
Iteration 21, inertia 2724.055851575121
Iteration 22, inertia 2723.7154918814026
Iteration 23, inertia 2723.3802569295626
Iteration 24, 

Iteration 11, inertia 2735.4457168875574
Iteration 12, inertia 2732.767875136468
Iteration 13, inertia 2730.459237301054
Iteration 14, inertia 2728.7732792554243
Iteration 15, inertia 2727.3261916507336
Iteration 16, inertia 2726.088808099681
Iteration 17, inertia 2725.056582911842
Iteration 18, inertia 2724.1476627819475
Iteration 19, inertia 2723.398044686835
Iteration 20, inertia 2722.773655189112
Iteration 21, inertia 2722.222052141676
Iteration 22, inertia 2721.7109371506563
Iteration 23, inertia 2721.195695015317
Iteration 24, inertia 2720.7186201278378
Iteration 25, inertia 2720.2839529218927
Iteration 26, inertia 2719.902755022111
Iteration 27, inertia 2719.5562879217105
Iteration 28, inertia 2719.331186568866
Iteration 29, inertia 2719.1166061220315
Iteration 30, inertia 2718.8930980874106
Iteration 31, inertia 2718.6596275542247
Iteration 32, inertia 2718.4709176496563
Iteration 33, inertia 2718.299360895206
Iteration 34, inertia 2718.1633678739963
Iteration 35, inertia 2718.

Iteration 26, inertia 2718.348664312772
Iteration 27, inertia 2718.1545675111315
Iteration 28, inertia 2717.96921568589
Iteration 29, inertia 2717.8089924537544
Iteration 30, inertia 2717.6670082185656
Iteration 31, inertia 2717.5315101385854
Iteration 32, inertia 2717.441668348335
Iteration 33, inertia 2717.3576486958636
Iteration 34, inertia 2717.3059663320473
Iteration 35, inertia 2717.2527890070896
Iteration 36, inertia 2717.1776600263725
Iteration 37, inertia 2717.135644803808
Iteration 38, inertia 2717.100969241877
Iteration 39, inertia 2717.073000768697
Iteration 40, inertia 2717.039642181292
Iteration 41, inertia 2717.0286742081335
Iteration 42, inertia 2717.018926059787
Iteration 43, inertia 2717.0123751147416
Iteration 44, inertia 2717.0082214143054
Iteration 45, inertia 2717.006373271912
Iteration 46, inertia 2717.0034991649945
Iteration 47, inertia 2717.0024841389522
Converged at iteration 47: strict convergence.
Initialization complete
Iteration 0, inertia 3336.47898873178

In [25]:
# Ask the fitted KMeans model for the cluster index of every row in X.
song_cluster_labels = tracks_fitted.predict(X)


In [26]:
# Attach the predicted cluster index to each track row.
tracks_kDF['cluster_label'] = song_cluster_labels

In [72]:
# Export the track identifiers together with their cluster labels to CSV.
export_columns = ['id', 'name', 'artists', 'cluster_label']
model_k = tracks_kDF[export_columns]
model_k.to_csv('KmeansModel.csv', index=False)

In [18]:
# Load the cluster assignments back from 'KmeansModel.csv', the file this
# notebook writes in the export cell, so a fresh Restart & Run All does not
# depend on a separately supplied CSV. The export uses pandas' default UTF-8
# encoding, so it is read back as UTF-8; decoding it as latin1 would garble
# non-ASCII track and artist names.
# NOTE(review): the previous version read 'maraKmodel.csv' as latin1. If that
# externally produced file is the intended source, restore that path and
# encoding here.
model_kDF = pd.read_csv('KmeansModel.csv', encoding='utf-8')
model_kDF = model_kDF[['id', 'name', 'artists', 'cluster_label']]

In [19]:
# Drop any existing 'kmodel' table and recreate it empty: a zero-row slice of
# model_kDF carries the column layout but writes no rows.
empty_schema = model_kDF.iloc[0:0]
empty_schema.to_sql('kmodel', con=engine, if_exists='replace', index=False)

In [20]:
# KMEANS MODEL
# Bulk-load model_kDF into the 'kmodel' table with PostgreSQL COPY.
#
# The frame is serialised as ordinary CSV and loaded with FORMAT csv, so the
# quoting pandas applies to fields containing commas, quotes or newlines is
# parsed correctly by the server. COPY's text format, which copy_from uses,
# does not understand CSV quoting and would store the quote characters
# literally.
output = io.StringIO()
model_kDF.to_csv(output, header=False, index=False)
output.seek(0)

conn = engine.raw_connection()
try:
    with conn.cursor() as cur:
        # In CSV format an unquoted empty field is loaded as NULL, which is
        # how pandas writes missing values.
        cur.copy_expert("COPY kmodel FROM STDIN WITH (FORMAT csv)", output)
        conn.commit()
        # Add the primary key only after the rows are committed, as before.
        cur.execute('alter table kmodel add primary key(id)')
        conn.commit()
except Exception:
    # Discard any uncommitted work before re-raising the error.
    conn.rollback()
    raise
finally:
    # Always release the raw connection.
    conn.close()