In [1]:
import numpy as np
import pandas as pd
import psycopg2 
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect, desc
from config import username, password

In [2]:
# Build the SQLAlchemy engine for the hosted Postgres database.
# The username and password are imported from the local `config` module.
DB_HOST = 'aws-gt-dataviz-finalpg-001.cloqvwuqbywl.us-east-1.rds.amazonaws.com'
DB_PORT = 5432
DB_NAME = 'spotify_db'
engine = create_engine(
    f'postgresql+psycopg2://{username}:{password}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
)


In [3]:
# Reflect an existing database into a new automap model.
Base = automap_base()

# Reflect the tables reachable through `engine` so automap can generate mapped classes.
# NOTE(review): `reflect=True` is the older calling form; newer SQLAlchemy releases
# expect `Base.prepare(autoload_with=engine)` — confirm against the installed version.
Base.prepare(engine, reflect=True)

In [4]:
# List the names of all the classes that automap generated from the reflected tables
Base.classes.keys()

['artistgenre', 'kmodel', 'artists', 'genres', 'tracks', 'years']

In [5]:
# Bind the mapped classes used in this notebook to module-level names.
Artistgenre, Artists, Genres, Tracks, Years = (
    getattr(Base.classes, table_name)
    for table_name in ('artistgenre', 'artists', 'genres', 'tracks', 'years')
)

In [6]:
# Inspect the `tracks` table: print each column name next to its SQL type.
inspector = inspect(engine)
columns = inspector.get_columns('tracks')
for col_info in columns:
    print(f'{col_info["name"]} {col_info["type"]}')

acousticness DOUBLE PRECISION
artists TEXT
danceability DOUBLE PRECISION
duration_ms BIGINT
energy DOUBLE PRECISION
explicit BIGINT
id TEXT
instrumentalness DOUBLE PRECISION
key BIGINT
liveness DOUBLE PRECISION
loudness DOUBLE PRECISION
mode BIGINT
name TEXT
popularity BIGINT
release_date TEXT
speechiness DOUBLE PRECISION
tempo DOUBLE PRECISION
valence DOUBLE PRECISION
year BIGINT


In [7]:
# Create our session (link) from Python to the DB.
# It is closed explicitly in a later cell, after the query results are in a DataFrame.
session = Session(engine)

In [8]:
# Columns pulled from the `tracks` table; the same list names the DataFrame columns.
#   key: 0 is C natural, 1 is C♯, 2 is D♮ and so on up to 11, which is B♮
#   loudness: values typically range between -60 and 0 dB
track_columns = ['id', 'name', 'artists', 'acousticness',
                 'danceability', 'energy', 'instrumentalness',
                 'valence', 'popularity', 'year',
                 'key', 'liveness', 'loudness', 'tempo']

# Fetch only the tracks whose popularity is above 25
selected = [getattr(Tracks, col_name) for col_name in track_columns]
tracks_q = session.query(*selected).filter(Tracks.popularity > 25).all()

# store results in dataframe
tracks_kDF = pd.DataFrame(tracks_q, columns=track_columns)

In [9]:
# Rescale selected columns by fixed divisors before clustering.
# NOTE: this overwrites the columns of tracks_kDF in place, so running the cell a
# second time divides the already-scaled values again.
for feature, divisors in (('popularity', (1000,)),
                          ('year', (2021, 10)),
                          ('key', (11,)),
                          ('loudness', (60,)),
                          ('tempo', (244,))):
    scaled = tracks_kDF[feature]
    # Apply the divisors one after another, left to right
    for divisor in divisors:
        scaled = scaled / divisor
    tracks_kDF[feature] = scaled

In [10]:
# Summary statistics of the numeric columns after the rescaling step
tracks_kDF.describe()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,valence,popularity,year,key,liveness,loudness,tempo
count,86748.0,86748.0,86748.0,86748.0,86748.0,86748.0,86748.0,86748.0,86748.0,86748.0,86748.0
mean,0.355442,0.553626,0.565912,0.108406,0.544487,0.044582,0.098406,0.474711,0.198516,-0.169631,0.490503
std,0.329519,0.174511,0.251569,0.255943,0.261462,0.013155,0.000811,0.320874,0.174247,0.084293,0.122963
min,0.0,0.0,0.0,0.0,0.0,0.026,0.095002,0.0,0.0,-1.0,0.0
25%,0.0436,0.435,0.381,0.0,0.335,0.034,0.097773,0.181818,0.0931,-0.212983,0.395093
50%,0.254,0.563,0.581,7.4e-05,0.554,0.042,0.098417,0.454545,0.129,-0.155367,0.48098
75%,0.65,0.68,0.772,0.015,0.765,0.053,0.098961,0.727273,0.252,-0.108767,0.565707
max,0.996,0.988,1.0,1.0,1.0,0.1,0.1,1.0,1.0,0.0624,0.99798


In [11]:
# Close the database session; the query results already live in tracks_kDF
session.close()

In [12]:
# Kmeans

# Cluster on the numeric feature columns only. Later cells add two more numeric
# columns to tracks_kDF ('cluster_label' and 'search'); drop them explicitly so that
# re-running this cell does not feed them back in as features.
X = tracks_kDF.select_dtypes(np.number).drop(columns=['cluster_label', 'search'], errors='ignore')
number_cols = list(X.columns)

# Fixed seed: later cells look clusters up by their numeric id, and those ids are
# only stable between runs when the initialisation is deterministic.
tracks_fitted = KMeans(n_clusters=2200, init='k-means++', random_state=42, verbose=1).fit(X)


Initialization complete
Iteration 0, inertia 3331.354915726467
Iteration 1, inertia 2953.221140236694
Iteration 2, inertia 2863.5480365968574
Iteration 3, inertia 2818.454467230647
Iteration 4, inertia 2791.3196829904396
Iteration 5, inertia 2773.1065215679128
Iteration 6, inertia 2760.259278591015
Iteration 7, inertia 2751.449466428624
Iteration 8, inertia 2745.1797309791673
Iteration 9, inertia 2740.4640071570134
Iteration 10, inertia 2736.6441228562007
Iteration 11, inertia 2733.447476976392
Iteration 12, inertia 2730.9011393264536
Iteration 13, inertia 2728.6932269859426
Iteration 14, inertia 2726.967803876664
Iteration 15, inertia 2725.56882587243
Iteration 16, inertia 2724.40410519399
Iteration 17, inertia 2723.2897379951223
Iteration 18, inertia 2722.3162015925004
Iteration 19, inertia 2721.453309602554
Iteration 20, inertia 2720.576610850262
Iteration 21, inertia 2719.8106981302217
Iteration 22, inertia 2719.1217596770125
Iteration 23, inertia 2718.4623486029996
Iteration 24, i

Iteration 6, inertia 2767.682771114861
Iteration 7, inertia 2758.917843043462
Iteration 8, inertia 2752.0021260910826
Iteration 9, inertia 2746.648848803641
Iteration 10, inertia 2742.684307163927
Iteration 11, inertia 2739.3209283929373
Iteration 12, inertia 2736.5402331691857
Iteration 13, inertia 2734.297489746584
Iteration 14, inertia 2732.5471155198293
Iteration 15, inertia 2731.0661576776033
Iteration 16, inertia 2729.621991534917
Iteration 17, inertia 2728.537782443752
Iteration 18, inertia 2727.6544920710917
Iteration 19, inertia 2726.7374930131314
Iteration 20, inertia 2726.009677118585
Iteration 21, inertia 2725.426803713447
Iteration 22, inertia 2724.9784805991712
Iteration 23, inertia 2724.5839021409124
Iteration 24, inertia 2724.231667725587
Iteration 25, inertia 2723.9509990240535
Iteration 26, inertia 2723.6857445451924
Iteration 27, inertia 2723.3951793660644
Iteration 28, inertia 2723.116374005081
Iteration 29, inertia 2722.8808668669863
Iteration 30, inertia 2722.6911

Iteration 45, inertia 2716.133966324626
Converged at iteration 45: center shift 2.066779401626848e-29 within tolerance 4.471886210897255e-06
Initialization complete
Iteration 0, inertia 3338.946537961234
Iteration 1, inertia 2958.055301161176
Iteration 2, inertia 2869.193262040976
Iteration 3, inertia 2827.062458877974
Iteration 4, inertia 2800.567032547378
Iteration 5, inertia 2782.436481859234
Iteration 6, inertia 2769.394706503007
Iteration 7, inertia 2760.1173693912074
Iteration 8, inertia 2753.233699154231
Iteration 9, inertia 2748.0068523327036
Iteration 10, inertia 2743.8650991047975
Iteration 11, inertia 2740.1914725575875
Iteration 12, inertia 2737.1353660637706
Iteration 13, inertia 2734.748723202888
Iteration 14, inertia 2732.8172419723364
Iteration 15, inertia 2731.10525561709
Iteration 16, inertia 2729.623146766985
Iteration 17, inertia 2728.283758760637
Iteration 18, inertia 2727.1988655891855
Iteration 19, inertia 2726.2421669126
Iteration 20, inertia 2725.3324461095613


In [13]:
# Ask the fitted model for the cluster id of every row in the feature matrix
song_cluster_labels = tracks_fitted.predict(X)


In [14]:
# Attach the predicted cluster id to each track (X was derived from tracks_kDF, so rows line up)
tracks_kDF['cluster_label'] = song_cluster_labels

In [15]:
# Export the labelled tracks (track info, features and assigned cluster) to CSV
export_columns = ['id', 'name', 'artists', 'acousticness',
                  'danceability', 'energy', 'instrumentalness',
                  'valence', 'popularity', 'year',
                  'key', 'liveness', 'loudness', 'tempo', 'cluster_label']
model_k = tracks_kDF.loc[:, export_columns]
model_k.to_csv('maraKmodel2200.csv', index=False)

# Explore Model

In [16]:
# Look up a single track by its exact title
tracks_kDF[tracks_kDF['name'].eq("Tell Me I'm A Wreck")]

Unnamed: 0,id,name,artists,acousticness,danceability,energy,instrumentalness,valence,popularity,year,key,liveness,loudness,tempo,cluster_label
21649,203zTFd1zbfG0fiOG9OREv,Tell Me I'm A Wreck,['Every Avenue'],0.00188,0.578,0.76,0.0,0.7,0.062,0.099406,0.454545,0.142,-0.03845,0.524197,1903


In [17]:
# Position of the artist substring inside each track's `artists` text; -1 means "not found".
# A missing `artists` value makes .str.find return NaN, and NaN != -1 evaluates to True,
# so such rows would pass the `!= -1` filters in the cells below. Map them to -1.
# Other artists (presumably alternative search terms): All Time Low, Taylor Swift, Fleetwood Mac, Luke Bryan, Dan + Shay
tracks_kDF['search'] = tracks_kDF["artists"].str.find('Shania').fillna(-1).astype('int64')

In [None]:
# Tracks whose artist text matched the search, most popular first
tracks_kDF[tracks_kDF['search'].ne(-1)].sort_values('popularity', ascending=False)

In [None]:
# Distinct cluster ids that the matched tracks fall into
tracks_kDF.loc[tracks_kDF['search'].ne(-1), 'cluster_label'].unique()

In [None]:
# Every track assigned to cluster 1448, most popular first.
# NOTE(review): 1448 is a hard-coded cluster id, presumably picked from the lookup in
# the previous cell — confirm it still refers to the intended cluster after a refit.
recommended = (
    tracks_kDF[tracks_kDF['cluster_label'].eq(1448)]
    .sort_values('popularity', ascending=False)
)
recommended

In [None]:
# Summary statistics again, now including the numeric columns added since the first describe()
tracks_kDF.describe()

In [None]:
# Print each recommended track as "title - artists", one per line
for title, performers in zip(recommended['name'], recommended['artists']):
    print(f"{title} - {performers}")

In [None]:
# Elbow search: fit KMeans for k = 1, 11, ..., 91 and record each fitted model's inertia.
Sum_of_squared_distances = []
K = range(1, 100, 10)
for k in K:
    # Fixed seed so the inertia curve is reproducible from one run to the next
    km = KMeans(n_clusters=k, random_state=42).fit(X)
    Sum_of_squared_distances.append(km.inertia_)
    print(k)  # progress indicator

In [None]:
# sample elbow after added columns
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
# Save BEFORE show(): after show() the current figure may already have been released,
# in which case savefig would write an empty canvas.
plt.savefig('ElbowMethod-addedCol-max91.png')
plt.show()

In [None]:
# Print the cluster count and inertia for entries 1..9 of the elbow search
# (entry 0, the first k in K, is not printed).
for idx in range(1, 10):
    n_clusters, inertia = K[idx], Sum_of_squared_distances[idx]
    print(f'{n_clusters} Clusters - {inertia} inertia')

In [None]:
# Reference elbow plot (original author's note: don't mess with this one).
# pyplot is already imported as `plt` in the notebook's first cell.
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
# Save BEFORE show(): after show() the current figure may already have been released,
# in which case savefig would write an empty canvas.
plt.savefig('ElbowMethodK.png')
plt.show()

In [None]:
# Print the cluster count and inertia for entries 1..9 of the elbow search
# (entry 0, the first k in K, is not printed).
for idx in range(1, 10):
    n_clusters, inertia = K[idx], Sum_of_squared_distances[idx]
    print(f'{n_clusters} Clusters - {inertia} inertia')

In [None]:
# Visualizing the Clusters with PCA
import plotly.express as px 
from sklearn.decomposition import PCA

# Reduce the feature matrix to two principal components for plotting
song_embedding = PCA(n_components=2).fit_transform(X)

# One row per track: the 2-D coordinates plus the title and cluster id from tracks_kDF
projection = pd.DataFrame(song_embedding, columns=['x', 'y']).assign(
    title=tracks_kDF['name'],
    cluster=tracks_kDF['cluster_label'],
)

# Scatter coloured by cluster id; hovering shows the coordinates and the track title
fig = px.scatter(projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'title'])
fig.show()