In [14]:
import os
import pandas as pd

DATA_PATH = './data/'

# List of all relevant files
file_list = [DATA_PATH + filename for filename in os.listdir(DATA_PATH) if filename != '.gitkeep']

# List of Dataframes for each file
df_list = [pd.read_json(file) for file in file_list]

# Concatenating all df's together
df = pd.concat(df_list)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 174902 entries, 0 to 16487
Data columns (total 21 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   ts                                 174902 non-null  object 
 1   username                           174902 non-null  object 
 2   platform                           174902 non-null  object 
 3   ms_played                          174902 non-null  int64  
 4   conn_country                       174902 non-null  object 
 5   ip_addr_decrypted                  174902 non-null  object 
 6   user_agent_decrypted               172199 non-null  object 
 7   master_metadata_track_name         166423 non-null  object 
 8   master_metadata_album_artist_name  166423 non-null  object 
 9   master_metadata_album_album_name   166423 non-null  object 
 10  spotify_track_uri                  166423 non-null  object 
 11  episode_name                       7856 

In [31]:
df[['ts', 'ms_played', 'master_metadata_track_name']].head(20)

Unnamed: 0,ts,ms_played,master_metadata_track_name
0,2014-09-28T16:54:57Z,77475,Like I Love You - Video Edit
1,2014-08-19T13:26:58Z,1439,Rather Be (feat. Jess Glynne)
2,2020-03-17T08:16:55Z,4876,Feel Your Love Tonight - 2015 Remaster
3,2021-11-18T10:34:47Z,230922,CRUDELIA - I nervi
4,2022-07-01T05:24:40Z,23,Mantra
5,2022-01-07T14:55:11Z,129320,Cherry Stones
6,2016-04-08T13:28:31Z,1480,Restless Heart
7,2022-03-13T01:41:43Z,1710,"Mutter, der Mann mit dem Koks ist da - Mother'..."
8,2015-04-07T20:36:04Z,250066,Yeah!
9,2018-04-27T20:28:29Z,30680,Die letzten 30 Sekunden


In [26]:
print(sorted(df['reason_start'].unique().tolist()))
print(sorted(df['reason_end'].unique().tolist()))

['', 'appload', 'backbtn', 'clickrow', 'clickside', 'endplay', 'fwdbtn', 'playbtn', 'popup', 'remote', 'trackdone', 'trackerror', 'unknown', 'uriopen']
['', 'appload', 'backbtn', 'clickrow', 'clickside', 'endplay', 'fwdbtn', 'logout', 'popup', 'remote', 'trackdone', 'trackerror', 'unexpected-exit', 'unexpected-exit-while-paused', 'unknown', 'uriopen']


In [14]:
# Most playtime of all artists
sub_df = df[['master_metadata_album_artist_name', 'ms_played']]
sub_df = sub_df.rename(columns={'master_metadata_album_artist_name': 'artist'})
sub_df.groupby(['artist'])['ms_played'].sum().sort_values(ascending=False)

artist
Led Zeppelin          397958868
LGoony                390587297
Bilderbuch            360850531
Sabaton               275149926
Zugezogen Maskulin    270853651
                        ...    
Maximum Love                  0
Tokyo Blade                   0
Max Prosa                     0
Max Frost                     0
DUCKER SIMONIT                0
Name: ms_played, Length: 14288, dtype: int64

In [15]:
# Spotify API Integration
import os

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

CLIENT_ID = os.environ['SPOTIFY_CLIENT_ID']
CLIENT_SECRET = os.environ['SPOTIFY_CLIENT_SECRET']

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=CLIENT_ID,
                                                           client_secret=CLIENT_SECRET))

results = sp.search(q='led zeppelin', limit=20)
for idx, track in enumerate(results['tracks']['items']):
    print(idx, track['name'])

0 Immigrant Song - Remaster
1 Kashmir - Remaster
2 Stairway to Heaven - Remaster
3 Whole Lotta Love - 1990 Remaster
4 Going to California - Remaster
5 Ramble On - 1990 Remaster
6 D'yer Mak'er - Remaster
7 Black Dog - Remaster
8 Over the Hills and Far Away - Remaster
9 Tangerine - Remaster
10 Good Times Bad Times - 1993 Remaster
11 Hey, Hey, What Can I Do - Remaster
12 When the Levee Breaks - Remaster
13 D'yer Mak'er - Remaster
14 Rock and Roll - Remaster
15 Dazed and Confused - 1990 Remaster
16 Heartbreaker - 1990 Remaster
17 Fool in the Rain - Remaster
18 Babe I'm Gonna Leave You - 1990 Remaster
19 All My Love - Remaster


In [11]:
sp.artist('spotify:artist:3oKRxpszQKUjjaHz388fVA')

{'external_urls': {'spotify': 'https://open.spotify.com/artist/3oKRxpszQKUjjaHz388fVA'},
 'followers': {'href': None, 'total': 646864},
 'genres': ['aussietronica', 'indie soul'],
 'href': 'https://api.spotify.com/v1/artists/3oKRxpszQKUjjaHz388fVA',
 'id': '3oKRxpszQKUjjaHz388fVA',
 'images': [{'height': 640,
   'url': 'https://i.scdn.co/image/ab6761610000e5ebb6edcc3e5c79c2bb67a17d00',
   'width': 640},
  {'height': 320,
   'url': 'https://i.scdn.co/image/ab67616100005174b6edcc3e5c79c2bb67a17d00',
   'width': 320},
  {'height': 160,
   'url': 'https://i.scdn.co/image/ab6761610000f178b6edcc3e5c79c2bb67a17d00',
   'width': 160}],
 'name': 'Parcels',
 'popularity': 66,
 'type': 'artist',
 'uri': 'spotify:artist:3oKRxpszQKUjjaHz388fVA'}

In [16]:
sp.audio_features('spotify:track:0ax2Np3bXCUXCcYmcX5x1x')

[{'danceability': 0.724,
  'energy': 0.83,
  'key': 8,
  'loudness': -8.531,
  'mode': 0,
  'speechiness': 0.0328,
  'acousticness': 0.0418,
  'instrumentalness': 0.0158,
  'liveness': 0.0853,
  'valence': 0.821,
  'tempo': 119.976,
  'type': 'audio_features',
  'id': '0ax2Np3bXCUXCcYmcX5x1x',
  'uri': 'spotify:track:0ax2Np3bXCUXCcYmcX5x1x',
  'track_href': 'https://api.spotify.com/v1/tracks/0ax2Np3bXCUXCcYmcX5x1x',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/0ax2Np3bXCUXCcYmcX5x1x',
  'duration_ms': 237464,
  'time_signature': 4}]

In [None]:
sp.track('spotify:track:0ax2Np3bXCUXCcYmcX5x1x')

In [60]:
import pandas as pd
from database import get_sql_engine
from sqlalchemy import text

engine = get_sql_engine()

enao_df = pd.read_csv('enao.csv')
enao_df = enao_df[['genre', 'cluster']]
with engine.connect() as conn:
    genre_df = pd.read_sql(text('SELECT * FROM project.genres'), con=conn)
    genre_df = genre_df.drop(columns=['cluster'], axis=1)

genre_df = genre_df.join(enao_df.set_index('genre'), on='name', how='left')

genre_df.to_sql('genres', con=engine, if_exists='replace', index=False, schema='project')

KeyError: "['genre'] not in index"

In [62]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.colors

URL = "https://everynoise.com/"
page = requests.get(URL)

soup = BeautifulSoup(page.content, "html.parser")

In [63]:
names, colors, tops, lefts, font_sizes = [[] for _ in range(5)]

for genre in soup.find_all('div', class_='genre scanme'):
    name = genre.text.strip().replace('»', '')
    style = genre.attrs['style']
    (color, top, left, font_size) = style.split(';')
    color = color.split(':')[1].strip()
    top = top.split(':')[1].strip().replace('px', '')
    left = left.split(':')[1].strip().replace('px', '')
    font_size = font_size.split(':')[1].strip().replace('%', '')
    
    names.append(name)
    colors.append(color)
    tops.append(top)
    lefts.append(left)
    font_sizes.append(font_size)

genres = pd.DataFrame({'name': names, 'color': colors, 'top': tops, 'left': lefts, 'font_size': font_sizes})
genres[['top', 'left', 'font_size']] = genres[['top', 'left', 'font_size']].astype(int)
# split the column color consisting of hexadecimal value into three columns for red, green and blue
genres[['R', 'G', 'B']] = genres['color'].apply(lambda x: pd.Series(matplotlib.colors.hex2color(x)))
genres

Unnamed: 0,name,color,top,left,font_size,R,G,B
0,pop,#a78b09,5361,804,160,0.654902,0.545098,0.035294
1,dance pop,#b88410,3447,850,142,0.721569,0.517647,0.062745
2,rap,#a98405,5676,1095,140,0.662745,0.517647,0.019608
3,rock,#b1701d,10658,499,136,0.694118,0.439216,0.113725
4,hip hop,#ac7e0b,6452,1082,132,0.674510,0.494118,0.043137
...,...,...,...,...,...,...,...,...
6077,classical piano quartet,#21a3b4,21879,290,100,0.129412,0.639216,0.705882
6078,string quintet,#45a6b5,18529,499,100,0.270588,0.650980,0.709804
6079,quartetto d'archi,#34a45f,17883,436,100,0.203922,0.643137,0.372549
6080,classical string trio,#19ad82,21109,388,100,0.098039,0.678431,0.509804


In [64]:
from sklearn.cluster import DBSCAN, KMeans
from scipy.spatial.distance import pdist, squareform

# normalize top and left value
genres['top'] = genres['top'] / genres['top'].max()
genres['left'] = genres['left'] / genres['left'].max()


X_RGB = genres[['R', 'G', 'B']]
X_XY = genres[['top', 'left']]
X_ALL = genres[['R', 'G', 'B', 'top', 'left']]

centroids = genres.sort_values(by="font_size", ascending=False).head(40)

distances = pdist(centroids[['top', 'left']])

# Convert the pairwise distances to a square distance matrix
square_distances = squareform(distances)

# Set a threshold distance
threshold = 0.045

# Identify the rows to remove
rows_to_remove = set()
for i in range(square_distances.shape[0]):
    for j in range(i + 1, square_distances.shape[1]):
        if square_distances[i, j] < threshold:
            rows_to_remove.add(i)
            rows_to_remove.add(j)
            
centroids = centroids.drop(centroids.index[list(rows_to_remove)])


In [65]:
centroids = centroids.reset_index(drop=True)
centroids


Unnamed: 0,name,color,top,left,font_size,R,G,B
0,latin pop,#aa8b03,0.306417,0.546667,124,0.666667,0.545098,0.011765
1,modern rock,#bb771d,0.261658,0.304,123,0.733333,0.466667,0.113725
2,trap latino,#ad9602,0.214204,0.783333,122,0.678431,0.588235,0.007843
3,permanent wave,#aa7820,0.401553,0.328,121,0.666667,0.470588,0.12549
4,alternative metal,#d45f2a,0.287463,0.219333,120,0.831373,0.372549,0.164706
5,soft rock,#93820e,0.486915,0.419333,120,0.576471,0.509804,0.054902
6,contemporary country,#a68801,0.289153,0.368,120,0.65098,0.533333,0.003922
7,pop rock,#aa8011,0.33478,0.329333,120,0.666667,0.501961,0.066667
8,filmi,#8a8719,0.670747,0.521333,119,0.541176,0.529412,0.098039
9,canadian pop,#978a05,0.413519,0.429333,119,0.592157,0.541176,0.019608


In [42]:

centroids_rgb = centroids[['R', 'G', 'B']].values
centroids_xy = centroids[['top', 'left']].values
centroids_all = centroids[['top', 'left', 'R', 'G', 'B']].values

X = centroids_all

# kmeans = KMeans(n_clusters=len(centroids_rgb), init=centroids_rgb)
kmeans = KMeans(n_clusters=len(centroids_all))
kmeans.fit(X)

# dbscan = DBSCAN(eps=10, min_samples=50)
# dbscan.fit(X)

genres['cluster'] = kmeans.labels_



In [59]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
scatter = ax.scatter(genres['left'], genres['top'], c=genres['cluster'])
centroids_xy = centroids[['left', 'top']].values
plt.scatter(centroids_xy[:, 0], centroids_xy[:, 1], marker='*',
        s=100, linewidths=3, zorder=10, color='red')
# add cluster centroid to the plot


# add text to each point
for i, txt in enumerate(centroids['name']):
    ax.annotate(txt, (centroids_xy[i, 0], centroids_xy[i, 1]), fontsize=20)
# increase the size of the figure
fig.set_size_inches(18.5, 20.5)


Error in callback <function _draw_all_if_interactive at 0x14e951620> (for post_execute):


TypeError: only size-1 arrays can be converted to Python scalars

TypeError: only size-1 arrays can be converted to Python scalars

<Figure size 1850x2050 with 1 Axes>

In [44]:
genres['cluster_name'] = genres['cluster'].apply(lambda x: centroids['name'][x])
# genres.to_csv('enao.csv', index=False)

In [57]:
genres.sort_values(by=['cluster_name', 'font_size'], ascending=False).to_csv('clusrer.csv', index=False)


In [58]:
kmeans.cluster_centers_

array([[0.43813584, 0.14769362],
       [0.52180424, 0.68655769],
       [0.31135131, 0.69808374],
       [0.81507476, 0.47046037],
       [0.59734202, 0.33622906],
       [0.38314239, 0.34096182],
       [0.89977259, 0.21818182],
       [0.79115545, 0.6825404 ],
       [0.0918037 , 0.76214191],
       [0.17742965, 0.53330441],
       [0.25244226, 0.21587474],
       [0.62806322, 0.52575814],
       [0.41381978, 0.50885807]])