In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import psycopg2
import sqlalchemy
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sqlalchemy import create_engine, func, inspect, desc
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session

from config import username, password

In [2]:
# create engine
# The user name and password come from the local `config` module; host, port and
# database name are fixed in the URL below.
connection_url = f'postgresql+psycopg2://{username}:{password}@aws-gt-dataviz-finalpg-001.cloqvwuqbywl.us-east-1.rds.amazonaws.com:5432/spotify_db'
engine = create_engine(connection_url)


In [3]:
# reflect an existing database into a new model
# automap_base() returns a declarative base whose mapped classes are generated from
# the tables found in the database instead of being written by hand.
Base = automap_base()

# reflect the tables
# NOTE(review): `reflect=True` is the legacy calling form; newer SQLAlchemy releases
# expect `Base.prepare(autoload_with=engine)` — confirm against the installed version.
Base.prepare(engine, reflect=True)

In [4]:
# We can view all of the classes that automap found
# (the cell output lists the names under which the mapped classes are registered)
Base.classes.keys()

['artistgenre', 'kmodel', 'artists', 'genres', 'tracks', 'years']

In [5]:
# Save references to each table
# Each reflected class is looked up by its table name and bound to a capitalised
# module-level name for use in queries.
Artistgenre, Artists, Genres, Tracks, Years = (
    getattr(Base.classes, table_name)
    for table_name in ('artistgenre', 'artists', 'genres', 'tracks', 'years')
)

In [6]:
# look at columns in tables
inspector = inspect(engine)

# Print "<column name> <column type>" for every column of the `tracks` table.
for col_info in inspector.get_columns('tracks'):
    print(col_info["name"], col_info["type"])

acousticness DOUBLE PRECISION
artists TEXT
danceability DOUBLE PRECISION
duration_ms BIGINT
energy DOUBLE PRECISION
explicit BIGINT
id TEXT
instrumentalness DOUBLE PRECISION
key BIGINT
liveness DOUBLE PRECISION
loudness DOUBLE PRECISION
mode BIGINT
name TEXT
popularity BIGINT
release_date TEXT
speechiness DOUBLE PRECISION
tempo DOUBLE PRECISION
valence DOUBLE PRECISION
year BIGINT


In [7]:
# Create our session (link) from Python to the DB
# The session is bound to `engine` and is used for the ORM query in the next cell.
session = Session(engine)

In [8]:
# One list drives both the SELECT and the DataFrame header, so the queried columns
# and the frame's column names always match.
track_fields = ['id', 'name', 'artists', 'acousticness',
                'danceability', 'energy', 'instrumentalness',
                'valence', 'popularity', 'year',
                'key', 'liveness', 'loudness', 'tempo']

# Fetch only tracks whose popularity is above 20.
tracks_q = (
    session.query(*(getattr(Tracks, field) for field in track_fields))
    .filter(Tracks.popularity > 20)
    .all()
)

# store results in dataframe
tracks_kDF = pd.DataFrame(tracks_q, columns=track_fields)

#  Key: 0 is C natural, 1 is C♯, 2 is D♮ and so on up to 11, which is B♮
#  Loudness: values typically range between -60 and 0 dB.

In [9]:
# Rescale selected columns by fixed constants. Each tuple lists the divisors that are
# applied, in order, to that column.
# Note: the columns are overwritten in place, so running this cell a second time
# divides the already-scaled values again.
column_divisors = {
    'popularity': (1000,),
    'year': (2021, 10),
    'key': (11,),
    'loudness': (60,),
    'tempo': (244,),
}
for column_name, divisors in column_divisors.items():
    rescaled = tracks_kDF[column_name]
    for divisor in divisors:
        rescaled = rescaled / divisor
    tracks_kDF[column_name] = rescaled

In [10]:
# Display the frame after the rescaling step above.
tracks_kDF

Unnamed: 0,id,name,artists,acousticness,danceability,energy,instrumentalness,valence,popularity,year,key,liveness,loudness,tempo
0,2wAfHM7Whz67VFbdanhZlk,Nobody Knows You When You're Down and Out,['Bessie Smith'],0.99600,0.614,0.0423,0.002930,0.211,0.041,0.095151,0.363636,0.1830,-0.200550,0.368123
1,3eMrYc092k7SIJfWJ7oasR,Weather Bird,"['Louis Armstrong', 'Earl Hines']",0.98400,0.831,0.2620,0.912000,0.901,0.037,0.095151,0.727273,0.2040,-0.206433,0.428713
2,2AZgaYZSwUosJD71J2N2Zo,'Tain't Nobody's Bizness If I Do,['Bessie Smith'],0.99600,0.537,0.0443,0.000265,0.137,0.029,0.095151,0.272727,0.1520,-0.274567,0.329787
3,0V1iYWPXCBTaB6dhbiprGF,Send Me to the 'Lectric Chair,['Bessie Smith'],0.98600,0.771,0.0905,0.000141,0.601,0.025,0.095151,0.272727,0.1520,-0.129800,0.358898
4,6qRvnXftofjYJm1Mg98UWL,Need a Little Sugar in My Bowl,['Bessie Smith'],0.99200,0.693,0.0270,0.000000,0.402,0.026,0.095151,0.000000,0.1340,-0.225100,0.310447
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98463,3NUmUIyzNLBp8bCFMH8Mif,Waiting On A War,['Foo Fighters'],0.00984,0.530,0.7590,0.000000,0.502,0.069,0.100000,0.636364,0.3190,-0.117783,0.540980
98464,0fJ1caLzidzTlIL3pPX1eU,Precious' Tale,['Jazmine Sullivan'],0.71500,0.734,0.3460,0.000000,0.930,0.059,0.100000,0.181818,0.3940,-0.195367,0.364135
98465,3HSUqAErTyFQWLfLdnFVnB,Connexion,['ZAYN'],0.49800,0.597,0.3680,0.000000,0.590,0.052,0.100000,0.181818,0.1090,-0.169183,0.704836
98466,660rulYF3eLCuW6rQpiMdL,Little Boy,['Ashnikko'],0.10500,0.781,0.4870,0.000000,0.327,0.061,0.100000,0.090909,0.0802,-0.121683,0.532545


In [11]:
# Summary statistics of the numeric columns (count, mean, std, min, quartiles, max).
tracks_kDF.describe()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,valence,popularity,year,key,liveness,loudness,tempo
count,98468.0,98468.0,98468.0,98468.0,98468.0,98468.0,98468.0,98468.0,98468.0,98468.0,98468.0
mean,0.37253,0.548568,0.555895,0.114267,0.544981,0.042022,0.09831,0.473339,0.201544,-0.173715,0.489701
std,0.334264,0.173603,0.253578,0.262269,0.261449,0.014184,0.000839,0.320456,0.178854,0.08454,0.122758
min,0.0,0.0,0.0,0.0,0.0,0.021,0.095002,0.0,0.0,-1.0,0.0
25%,0.0498,0.431,0.365,0.0,0.336,0.031,0.097674,0.181818,0.0935,-0.218033,0.394701
50%,0.281,0.557,0.568,9e-05,0.555,0.04,0.098268,0.454545,0.13,-0.160283,0.480412
75%,0.679,0.674,0.763,0.0185,0.766,0.052,0.098911,0.727273,0.254,-0.112296,0.564842
max,0.996,0.988,1.0,1.0,1.0,0.1,0.1,1.0,1.0,0.0624,0.99798


In [12]:
# The query results were already materialised with .all(), so the session is closed here.
session.close()

In [13]:
# Number of tracks that passed the popularity filter.
len(tracks_kDF)

98468

In [14]:
# Check the dtype of the artists column before cleaning it as text.
tracks_kDF['artists'].dtypes

dtype('O')

In [25]:
# str.strip treats its argument as a set of characters, so this removes any run of
# '[', ']' and "'" from both ends of each artists string (the list-literal wrapper).
# NOTE(review): a name that itself begins or ends with an apostrophe or bracket would
# lose that character as well — confirm this is acceptable for the data.
tracks_kDF['artists'] = tracks_kDF['artists'].str.strip("['']").astype(str)

In [26]:
def _join_single_quoted(raw_artists):
    """Replace the `', '` separator between quoted names with a plain `, `."""
    return raw_artists.replace("', '", ", ")


tracks_kDF['artists'] = tracks_kDF['artists'].map(_join_single_quoted).astype(str)

In [27]:
def _drop_double_quote_artifacts(raw_artists):
    """Remove the double-quote wrapper fragments, then every remaining double quote."""
    # Order matters: the bracketed fragments are removed before the bare quote.
    for fragment in ('"[""', '""]"', '"'):
        raw_artists = raw_artists.replace(fragment, '')
    return raw_artists


tracks_kDF['artists'] = tracks_kDF['artists'].map(_drop_double_quote_artifacts).astype(str)

In [29]:
def _fix_leftover_separator(raw_artists):
    """Replace a `, '` separator (comma, space, stray opening quote) with `, `."""
    return raw_artists.replace(", '", ", ")


tracks_kDF['artists'] = tracks_kDF['artists'].map(_fix_leftover_separator).astype(str)

In [None]:
# Display the frame to inspect the cleaned artists column.
tracks_kDF

In [21]:
# Rows whose artists string contains "Dre" (case-sensitive).
dre_mask = tracks_kDF['artists'].str.contains("Dre")
tracks_kDF.loc[dre_mask]

Unnamed: 0,id,name,artists,acousticness,danceability,energy,instrumentalness,valence,popularity,year,key,liveness,loudness,tempo
962,3AF40YsNhI8LfFnxKvcgGB,Chartreuse,Kenny Drew,0.9910,0.536,0.131,0.905000,0.09810,0.037,0.096685,0.181818,0.1280,-0.306083,0.531090
1020,63AH56BdSoBHKJAnsB6KiR,52nd Street Theme,Kenny Drew,0.9370,0.454,0.474,0.941000,0.78600,0.027,0.096685,0.181818,0.2530,-0.254700,0.658984
3582,00rDcOb5oBm6tUvhDnDhdJ,San Franciscan Nights,"GÃ¡bor SzabÃ³, The California Dreamers",0.8510,0.564,0.229,0.025000,0.40600,0.049,0.097328,0.000000,0.1250,-0.313600,0.386205
3787,6IIcvtmuGpWIasqOpyGlyY,"Tighten Up, Pt. 1",Archie Bell & The Drells,0.2420,0.708,0.441,0.000147,0.72400,0.045,0.097378,1.000000,0.0899,-0.215517,0.513078
7120,4m3OS54KWywYhP7WD7z1cg,Life in a Northern Town,The Dream Academy,0.2650,0.563,0.476,0.000000,0.43200,0.053,0.098219,0.363636,0.4940,-0.222367,0.498648
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98069,1H7Zqkq54andtaSSnLRrfp,You Will Be Found,"Ben Platt, Kristolyn Lloyd, Will Roland, Laura...",0.0865,0.293,0.624,0.000000,0.21300,0.063,0.099802,0.818182,0.0874,-0.118833,0.357930
98103,4gkJWcgc9QCFursmUBOirO,River Rain,Dreams of Dreams,0.1330,0.095,0.999,0.999000,0.00001,0.063,0.099802,0.545455,0.8520,-0.290617,0.368725
98193,6gcopravayFoqn40l3XHu6,Oh Darling,"Freddie Dredd, Soudiere",0.2670,0.724,0.680,0.000005,0.24700,0.065,0.099852,0.636364,0.1180,-0.134000,0.327857
98228,5F3DM6Iz4axGiLUhxvQLs1,welcome and goodbye,"Dream, Ivory",0.0196,0.583,0.717,0.745000,0.67300,0.063,0.099852,0.000000,0.0766,-0.086867,0.573750


In [22]:
# Same bracket/quote stripping as the earlier `str.strip("['']")` cell, followed by a
# display of the frame.
# NOTE(review): the execution counts suggest the cells were run out of order — confirm
# which of the two strip cells is meant to stay.
tracks_kDF['artists'] = tracks_kDF['artists'].str.strip("['']").astype(str)
tracks_kDF

Unnamed: 0,id,name,artists,acousticness,danceability,energy,instrumentalness,valence,popularity,year,key,liveness,loudness,tempo
0,2wAfHM7Whz67VFbdanhZlk,Nobody Knows You When You're Down and Out,Bessie Smith,0.99600,0.614,0.0423,0.002930,0.211,0.041,0.095151,0.363636,0.1830,-0.200550,0.368123
1,3eMrYc092k7SIJfWJ7oasR,Weather Bird,"Louis Armstrong, Earl Hines",0.98400,0.831,0.2620,0.912000,0.901,0.037,0.095151,0.727273,0.2040,-0.206433,0.428713
2,2AZgaYZSwUosJD71J2N2Zo,'Tain't Nobody's Bizness If I Do,Bessie Smith,0.99600,0.537,0.0443,0.000265,0.137,0.029,0.095151,0.272727,0.1520,-0.274567,0.329787
3,0V1iYWPXCBTaB6dhbiprGF,Send Me to the 'Lectric Chair,Bessie Smith,0.98600,0.771,0.0905,0.000141,0.601,0.025,0.095151,0.272727,0.1520,-0.129800,0.358898
4,6qRvnXftofjYJm1Mg98UWL,Need a Little Sugar in My Bowl,Bessie Smith,0.99200,0.693,0.0270,0.000000,0.402,0.026,0.095151,0.000000,0.1340,-0.225100,0.310447
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98463,3NUmUIyzNLBp8bCFMH8Mif,Waiting On A War,Foo Fighters,0.00984,0.530,0.7590,0.000000,0.502,0.069,0.100000,0.636364,0.3190,-0.117783,0.540980
98464,0fJ1caLzidzTlIL3pPX1eU,Precious' Tale,Jazmine Sullivan,0.71500,0.734,0.3460,0.000000,0.930,0.059,0.100000,0.181818,0.3940,-0.195367,0.364135
98465,3HSUqAErTyFQWLfLdnFVnB,Connexion,ZAYN,0.49800,0.597,0.3680,0.000000,0.590,0.052,0.100000,0.181818,0.1090,-0.169183,0.704836
98466,660rulYF3eLCuW6rQpiMdL,Little Boy,Ashnikko,0.10500,0.781,0.4870,0.000000,0.327,0.061,0.100000,0.090909,0.0802,-0.121683,0.532545


In [23]:
# Rows whose artists string contains "Pickett" (case-sensitive).
pickett_mask = tracks_kDF['artists'].str.contains("Pickett")
tracks_kDF.loc[pickett_mask]


Unnamed: 0,id,name,artists,acousticness,danceability,energy,instrumentalness,valence,popularity,year,key,liveness,loudness,tempo
2495,0xxZY5C9xxij3D1HkzbnfC,Monster Mash,"Bobby Boris Pickett, The Crypt-Kickers",0.375,0.541,0.504,0.0,0.707,0.053,0.097081,1.0,0.421,-0.236517,0.280664
2544,6GmL39a9OazWtyMkAbJz7v,Monster Mash,Bobby Boris Pickett,0.113,0.66,0.542,0.0,0.592,0.05,0.097081,1.0,0.346,-0.225183,0.576316
3128,4NRQwaks9r58tTDvr4iEyv,In the Midnight Hour,Wilson Pickett,0.12,0.75,0.444,4e-06,0.849,0.061,0.097229,0.363636,0.118,-0.143833,0.458684
3353,76ICmoJ4PcoMWoooaTxnQs,Land of 1000 Dances,Wilson Pickett,0.0128,0.618,0.588,0.0266,0.768,0.061,0.097279,0.181818,0.351,-0.193733,0.35616
3915,1MMp1H2Kib2BCDtdL5nL63,Hey Jude,Wilson Pickett,0.146,0.561,0.385,0.000496,0.664,0.053,0.097427,0.545455,0.171,-0.2555,0.334422
4186,79krwDoFzJ6dbNRwceAwgH,Don't Let the Green Grass Fool You,Wilson Pickett,0.442,0.573,0.707,0.000241,0.961,0.051,0.097476,0.545455,0.123,-0.138167,0.624865
14233,3W3FDMXmY4mzCg7IMus1ZW,Transylvania Twist,"Bobby Boris Pickett, The Crypt-Kickers",0.12,0.615,0.663,0.673,0.971,0.027,0.097081,0.636364,0.06,-0.201283,0.386455
14264,45WXeYmMCNqnR1ZqLlFpWB,Skully Gully,"Bobby Boris Pickett, The Crypt-Kickers",0.205,0.744,0.413,0.0515,0.743,0.026,0.097081,0.0,0.363,-0.1949,0.502193
14327,3fLAkdIr3hLjb9Ft5nvOiH,Monster's Holiday,"Bobby Boris Pickett, The Crypt-Kickers",0.47,0.715,0.453,0.0,0.68,0.023,0.097081,0.0,0.193,-0.228533,0.546414
20073,7mRak6wBx9OGKXr3zStoHW,Mustang Sally,Wilson Pickett,0.0906,0.76,0.522,3e-06,0.551,0.052,0.098565,0.0,0.174,-0.1348,0.449758


In [24]:
# Rows whose artists string contains "Yankovic" (case-sensitive).
yankovic_mask = tracks_kDF['artists'].str.contains("Yankovic")
tracks_kDF.loc[yankovic_mask]

Unnamed: 0,id,name,artists,acousticness,danceability,energy,instrumentalness,valence,popularity,year,key,liveness,loudness,tempo
493,55sdccuwTv6aPlwz39UVso,Too Fat Polka,Frankie Yankovic,0.655,0.791,0.507,0.0,0.964,0.021,0.096338,0.636364,0.0943,-0.178767,0.514307
9382,5r96TaQquRrlo3Ym3ZlSL2,"""Amish Paradise (Parody of """"Gangsta's Paradis...",Weird Al Yankovic,0.103,0.728,0.448,0.0,0.483,0.054,0.098763,0.727273,0.267,-0.175667,0.331566
19267,4is3oF4FlWmedh3TK6Ke7z,Fat,Weird Al Yankovic,0.166,0.87,0.551,3.7e-05,0.409,0.046,0.098367,0.545455,0.0642,-0.17605,0.479164
21531,74sUbOF9Zm8LdGUJjxleTl,"""The Saga Begins (Lyrical Adaption of """"Americ...",Weird Al Yankovic,0.332,0.487,0.429,0.0,0.508,0.051,0.098911,0.363636,0.207,-0.145267,0.588451
25711,7fGW74qgJrknzuhQ4A5foT,In Heaven There Is No Beer,Frankie Yankovic & His Yanks,0.348,0.586,0.585,0.00028,0.961,0.026,0.097229,0.454545,0.0611,-0.1689,0.510586
29061,77exFA9gOKLvj6yhyX07HD,My Bologna,Weird Al Yankovic,0.162,0.689,0.874,1e-06,0.648,0.04,0.09812,0.818182,0.0565,-0.138617,0.392119
29200,6tBzYurAiGkaGopgYPdNo7,I Love Rocky Road,Weird Al Yankovic,0.429,0.829,0.727,0.0,0.907,0.036,0.09812,0.636364,0.0709,-0.143433,0.416574
29330,7uwJC9ngTvHYBtk1DH0aBr,Eat It,Weird Al Yankovic,0.16,0.751,0.768,0.0,0.881,0.04,0.098169,0.636364,0.0409,-0.163883,0.605156
29461,2QuYig9VyECgbJIHHxYirK,Dare to Be Stupid,Weird Al Yankovic,0.113,0.638,0.961,9e-06,0.775,0.039,0.098219,0.363636,0.352,-0.110667,0.367795
29552,3gH52R54Atk3CF41PJMhFB,Yoda,Weird Al Yankovic,0.0499,0.561,0.841,0.0,0.75,0.037,0.098219,0.818182,0.232,-0.150783,0.656426


In [None]:
# Kmeans

# Name the feature columns explicitly instead of taking "every numeric column":
# later cells add numeric helper columns (`cluster_label`, `search`) to tracks_kDF,
# and re-running this cell would otherwise feed those back into the model.
FEATURE_COLS = ['acousticness', 'danceability', 'energy', 'instrumentalness',
                'valence', 'popularity', 'year', 'key', 'liveness', 'loudness', 'tempo']
X = tracks_kDF[FEATURE_COLS]
number_cols = list(X.columns)

# A fixed random_state makes the centroid initialisation, and therefore the cluster
# ids used further down, reproducible from one run to the next.
tracks_fitted = KMeans(n_clusters=2200, init='k-means++', random_state=42, verbose=1).fit(X)


In [None]:
# Assign each row of X to a cluster of the fitted model.
song_cluster_labels = tracks_fitted.predict(X)


In [None]:
# Store the predicted cluster of every track next to its features.
tracks_kDF['cluster_label'] = song_cluster_labels

In [None]:
# save model to csv
# Export the identifying columns, the features and the assigned cluster; helper
# columns added elsewhere in the notebook are left out.
export_columns = ['id', 'name', 'artists', 'acousticness',
                  'danceability', 'energy', 'instrumentalness',
                  'valence', 'popularity', 'year',
                  'key', 'liveness', 'loudness', 'tempo', 'cluster_label']
model_k = tracks_kDF[export_columns]
model_k.to_csv('chose2200_Kmodel.csv', index=False)

# Explore Model

In [None]:
# Exact-match variant, kept for reference:
# tracks_kDF.loc[tracks_kDF['name']== "Hotel California"]
# Substring match: also returns tracks whose title merely contains the phrase.
tracks_kDF[tracks_kDF['name'].str.contains("Hotel California")]

In [None]:
# str.find gives the position of the first occurrence of 'Extreme' in each artists
# string, or -1 when it is absent; the following cells treat `!= -1` as "matches".
# Note that this adds a numeric `search` column to tracks_kDF.
tracks_kDF['search']= tracks_kDF["artists"].str.find('Extreme')

# Presumably other artists to try as the search term:
# All Time Low, Taylor Swift, Fleetwood Mac, Luke Bryan, Dan + Shay

In [None]:
# Tracks whose artists matched the search term, most popular first.
matched_rows = tracks_kDF['search'] != -1
tracks_kDF.loc[matched_rows].sort_values(by='popularity', ascending=False)

In [None]:
# Distinct cluster labels among the tracks that matched the search term.
tracks_kDF.loc[tracks_kDF['search'] != -1, "cluster_label"].unique()

In [None]:
# Every track assigned to cluster 2194, most popular first.
# NOTE(review): cluster ids are assigned by the KMeans fit, so 2194 is only meaningful
# for the particular fit that produced it — confirm after refitting.
in_cluster = tracks_kDF['cluster_label'] == 2194
recommended = tracks_kDF.loc[in_cluster].sort_values(by='popularity', ascending=False)
recommended

In [None]:
# Summary statistics again; by now the frame may also hold the `cluster_label` and
# `search` columns added above.
tracks_kDF.describe()

In [None]:
# Number of tracks in the selected cluster.
len(recommended)

In [None]:
# Print one "<track name> - <artists>" line per recommended track.
for track_name, track_artists in zip(recommended['name'], recommended['artists']):
    print(f"{track_name} - {track_artists}")

In [None]:
# Elbow sweep: fit KMeans for k = 100, 200, ..., 2400 and record each model's inertia
# (sum of squared distances of samples to their closest cluster centre).
Sum_of_squared_distances = []
K = range(100,2500,100)
for k in K:
    # Seed every fit so the curve is reproducible and differences between k values
    # are not caused by a different random initialisation on each run.
    km = KMeans(n_clusters=k, random_state=42).fit(X)
    Sum_of_squared_distances.append(km.inertia_)
    print(k)  # progress marker: these fits are slow

In [None]:
# sample elbow after added columns
# Inertia against k, drawn on an explicit figure/axes pair and saved as a PNG.
elbow_fig, elbow_ax = plt.subplots()
elbow_ax.plot(K, Sum_of_squared_distances, 'bx-')
elbow_ax.set_xlabel('k')
elbow_ax.set_ylabel('Sum_of_squared_distances')
elbow_ax.set_title('Elbow Method For Optimal k')

elbow_fig.savefig('ElbowMethod-addedCol-max2500.png')
plt.show()

In [None]:
# Report the inertia for every k that was evaluated. Pairing the two sequences
# directly includes the first k and keeps working if the range of K is changed.
for k_value, inertia in zip(K, Sum_of_squared_distances):
    print(f'{k_value} Clusters - {inertia} inertia')

In [None]:
#don't mess with this one
# NOTE(review): this cell draws the same K / Sum_of_squared_distances series as the
# elbow-plot cell above and differs only in the output file name; presumably it is
# kept to preserve the figure from an earlier run — confirm before re-executing.
import matplotlib.pyplot as plt  # repeat of the import in the first cell

plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')

# Save before show() so the written file contains the drawn figure.
plt.savefig('ElbowMethodK.png')
plt.show()

In [None]:
# Report the inertia for every k that was evaluated. Pairing the two sequences
# directly includes the first k and keeps working if the range of K is changed.
for k_value, inertia in zip(K, Sum_of_squared_distances):
    print(f'{k_value} Clusters - {inertia} inertia')

In [None]:
# Visualizing the Clusters with PCA
# Project the feature matrix onto its first two principal components for plotting.
song_embedding = PCA(n_components=2).fit_transform(X)

# Give the projection the same index as X so the title and cluster columns assigned
# below line up row-for-row with the embedding, whatever index the frame carries.
projection = pd.DataFrame(song_embedding, columns=['x', 'y'], index=X.index)
projection['title'] = tracks_kDF['name']
projection['cluster'] = tracks_kDF['cluster_label']

# Interactive scatter: one point per track, coloured by cluster, title shown on hover.
fig = px.scatter(projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'title'])
fig.show()