In [None]:
import pandas as pd
import datetime
import numpy as np
import pickle as pkl
import getpass
import datapane as dp 

# Load data and perform basic filtering

In [None]:
# Constants
TRACK_SKIP_MS = 30000 # Tracks played for less than this number of ms are considered 'skipped'
UPLOAD_TO_DATAPANE = False # When True, the tables/plots below are additionally uploaded as Datapane reports

In [None]:
import json

def load_track_history(paths=('data/MyData/StreamingHistory0.json',
                              'data/MyData/StreamingHistory1.json')):
    """Load the streaming-history JSON files into one DataFrame.

    Parameters
    ----------
    paths : iterable of str, optional
        JSON files to read. Each file must contain a JSON list of records;
        the lists are concatenated in the order given. Defaults to the two
        files the notebook originally read.

    Returns
    -------
    pandas.DataFrame
        One row per record, in file order.

    Raises
    ------
    FileNotFoundError
        If one of the files does not exist.
    """
    records = []
    for path in paths:
        # `with` closes the handle even if parsing fails; the encoding is
        # explicit so the result does not depend on the platform locale.
        with open(path, encoding='utf-8') as history_file:
            records.extend(json.load(history_file))
    return pd.DataFrame(records)


# Read the raw export, then derive local timestamps and a weekday flag
full_track_history = load_track_history()
# Parse the timestamps as UTC and shift them to Copenhagen local time
end_time_utc = pd.to_datetime(full_track_history['endTime'], utc=True)
full_track_history['endTime'] = end_time_utc.dt.tz_convert('Europe/Copenhagen')
# 'weekday' is a boolean flag: True for Monday-Friday (dayofweek 0-4), False for the weekend
is_weekday = full_track_history['endTime'].dt.dayofweek < 5
full_track_history['weekday'] = is_weekday.to_numpy()

In [None]:
# Load the track metadata map (track URI -> metadata dict).
# NOTE(review): unpickling executes arbitrary code; only load files from a trusted source.
with open('../vectormodels/track_map_clean.pkl', 'rb') as track_map_file:
    track_dict = pkl.load(track_map_file)
# One row per track; the dict key (the URI) is also exposed as a regular column
track_dict_pd = pd.DataFrame.from_dict(track_dict, orient='index')
track_dict_pd['track_uri'] = track_dict_pd.index
# Remember the original row label so the merge result can be aligned back onto the history
full_track_history['orig_index'] = full_track_history.index
merged_df = pd.merge(full_track_history, track_dict_pd, how='left', left_on=['artistName', 'trackName'], right_on=['artist_name', 'track_name'])[['orig_index', 'track_uri', 'artist_uri', 'album_uri']]
# In case of multiple matches, keep only one
merged_df = merged_df.drop_duplicates(['orig_index'])
merged_df.index = merged_df['orig_index']
# Column assignment aligns on the index, i.e. on the original row labels
for uri_column in ('track_uri', 'artist_uri', 'album_uri'):
    full_track_history[uri_column] = merged_df[uri_column]

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials,SpotifyOAuth

# Ask for the Spotify application credentials without echoing them.
# Explicit prompts make it clear which of the two values is being requested.
spotify_client_id = getpass.getpass('Spotify client id: ')
spotify_client_secret = getpass.getpass('Spotify client secret: ')

# OAuth client with permission to modify public playlists (used at the end of the notebook)
spotify_client = spotipy.Spotify(auth_manager=SpotifyOAuth(
    client_id=spotify_client_id, client_secret=spotify_client_secret, redirect_uri='http://localhost:8080', scope='playlist-modify-public'))


In [None]:
# Find tracks with missing uris
# One row per distinct (artist, track) pair that did not get a URI from the local metadata map
tracks_with_missing_uris = (
    full_track_history.loc[full_track_history['track_uri'].isnull()]
    .groupby(['artistName', 'trackName'])
    .size()
    .reset_index(name='Frequency')
)
# Ask the Spotify search API for the best match of each pair; None records "no match"
track_results = []
for track in tracks_with_missing_uris.itertuples(index=False):
    search = spotify_client.search(q='artist:{} track:{}'.format(track.artistName, track.trackName), limit=1, type='track')
    found_items = search['tracks']['items']
    track_results.append((track.artistName, track.trackName, found_items[0] if found_items else None))

# Cache the lookups; `with` makes sure the file is flushed and closed
with open('track_uri_lookup_list.pkl', 'wb') as lookup_file:
    pkl.dump(track_results, lookup_file)

In [None]:
# Merge URIs from spotify into full_track_history
with open('track_uri_lookup_list.pkl', 'rb') as lookup_file:
    track_results = pkl.load(lookup_file)
# Keep only successful lookups (third tuple element is the Spotify track object)
track_results_dict_list = [{'artistName': t[0], 'trackName': t[1], 'track_uri': t[2]['uri'], 'album_uri': t[2]['album']['uri'], 'artist_uri': t[2]['artists'][0]['uri']} for t in track_results if t[2] is not None]
# Explicit columns keep the merge below valid even when no lookup succeeded
track_results_df = pd.DataFrame(track_results_dict_list, columns=['artistName', 'trackName', 'track_uri', 'album_uri', 'artist_uri'])

merged_df = pd.merge(full_track_history, track_results_df, how='left', on=['artistName', 'trackName'])
# Existing URIs (suffix _x) win; gaps are filled from the API lookup (suffix _y).
# Plain assignment is used because `fillna(inplace=True)` on an accessed column
# is not guaranteed to write back into the frame.
for uri_column in ('track_uri', 'album_uri', 'artist_uri'):
    merged_df[uri_column] = merged_df[uri_column + '_x'].fillna(merged_df[uri_column + '_y'])
full_track_history = merged_df[full_track_history.columns]

In [None]:
# Find tracks that are themselves shorter than the skip threshold
# (a short play of such a track is not a skip)
unique_skipped_tracks = full_track_history.loc[full_track_history['msPlayed'] < TRACK_SKIP_MS]['track_uri'].dropna().unique()
TRACK_LOOKUP_BATCH_SIZE = 10  # number of track URIs sent per API request
short_track_uris = set()
for batch_start in range(0, len(unique_skipped_tracks), TRACK_LOOKUP_BATCH_SIZE):
    curr_batch = list(unique_skipped_tracks[batch_start:batch_start + TRACK_LOOKUP_BATCH_SIZE])
    tracks = spotify_client.tracks(curr_batch)
    for t in tracks['tracks']:
        # The endpoint returns null for ids it cannot resolve; skip those entries
        if t is not None and t['duration_ms'] < TRACK_SKIP_MS:
            short_track_uris.add(t['uri'])
with open('short_track_uris.pkl', 'wb') as short_tracks_file:
    pkl.dump(short_track_uris, short_tracks_file)
short_track_uris

In [None]:
# Keep a play when it reached the skip threshold, or when the track itself is shorter
# than the threshold. `>=` matches the TRACK_SKIP_MS definition: only plays *below*
# the threshold count as skipped.
reached_threshold = full_track_history['msPlayed'] >= TRACK_SKIP_MS
is_short_track = full_track_history['track_uri'].isin(short_track_uris)
# .copy() so later column assignments do not operate on a view of full_track_history
track_history_no_skip = full_track_history[is_short_track | reached_threshold].copy()
with open('track_history_no_skip_df.pkl', 'wb') as history_file:
    pkl.dump(track_history_no_skip, history_file)
track_history_no_skip

In [None]:
# Reload the cached, skip-filtered history (lets the notebook resume from here).
# NOTE(review): unpickling executes arbitrary code; only load files this notebook wrote.
with open('track_history_no_skip_df.pkl', 'rb') as history_file:
    track_history_no_skip = pkl.load(history_file)
track_history_no_skip

In [None]:
# Rank artists by their total listening time in ms
artist_play_time = track_history_no_skip.groupby('artistName')[['msPlayed']].sum()
most_played_artists_df = artist_play_time.sort_values(by='msPlayed', ascending=False)
if UPLOAD_TO_DATAPANE:
    # Publish the ranking as a one-table Datapane report
    table = dp.Table(most_played_artists_df)
    report = dp.Report(table)
    report.upload(name='Most Played Artists')

In [None]:
# Rank artists by how many (non-skipped) plays they have
plays_per_artist = track_history_no_skip.groupby('artistName')[['trackName']].count()
plays_per_artist.sort_values(by='trackName', ascending=False)

Quite similar, but Rainbow surpasses Porcupine Tree and Mastodon, probably due to shorter average track length.

In [None]:
# Rank (artist, track) pairs by their total listening time in ms
track_play_time = track_history_no_skip.groupby(['artistName', 'trackName'])[['msPlayed']].sum()
most_played_tracks_df = track_play_time.sort_values(by='msPlayed', ascending=False)
if UPLOAD_TO_DATAPANE:
    # Publish the ranking as a one-table Datapane report
    table = dp.Table(most_played_tracks_df)
    report = dp.Report(table)
    report.upload(name='Most Played Tracks')
most_played_tracks_df

In [None]:
# Rank (track, artist) pairs by number of plays
plays_per_track = (
    track_history_no_skip
    .groupby(['trackName', 'artistName'])[['msPlayed']]
    .count()
    .rename(columns={'msPlayed': 'count'})
)
plays_per_track.sort_values(by='count', ascending=False)

In [None]:
# Most skipped tracks
# A skip is a play below the threshold of a track that is not itself shorter than the threshold
below_threshold = full_track_history['msPlayed'] < TRACK_SKIP_MS
not_short_track = ~full_track_history.track_uri.isin(short_track_uris)
skipped_tracks = full_track_history.loc[not_short_track & below_threshold]
# Skips and total plays per (track, artist)
skip_counts = skipped_tracks.groupby(['trackName', 'artistName'])['msPlayed'].count().to_frame('skips')
total_num_plays = (
    full_track_history[['artistName', 'trackName', 'msPlayed']]
    .groupby(['trackName', 'artistName'])
    .count()
    .sort_values('msPlayed', ascending=False)
)
skip_and_play_counts = skip_counts.merge(total_num_plays, left_on=['artistName','trackName'],right_on=['artistName','trackName']).rename(columns={'msPlayed': 'plays'})
skip_and_play_counts['skipRate'] = skip_and_play_counts['skips'] / skip_and_play_counts['plays']
# Ignore rarely played tracks (3 plays or fewer), then order by skip rate and play count
played_often = skip_and_play_counts['plays'] > 3
skip_and_play_counts = skip_and_play_counts.loc[played_often].sort_values(['skipRate', 'plays'], ascending=False)
if UPLOAD_TO_DATAPANE:
    table = dp.Table(skip_and_play_counts)
    report = dp.Report(table)
    report.upload(name='Most Skipped Tracks')
skip_and_play_counts

In [None]:
# Most skipped artists
# A skip is a play below the threshold of a track that is not itself shorter than the threshold
below_threshold = full_track_history['msPlayed'] < TRACK_SKIP_MS
not_short_track = ~full_track_history.track_uri.isin(short_track_uris)
skipped_tracks = full_track_history.loc[not_short_track & below_threshold]
# Skips and total plays per artist
skip_counts = skipped_tracks.groupby(['artistName'])['msPlayed'].count().to_frame('skips')
total_num_plays = (
    full_track_history[['artistName', 'msPlayed']]
    .groupby(['artistName'])
    .count()
    .sort_values('msPlayed', ascending=False)
)
skip_and_play_counts = skip_counts.merge(total_num_plays, left_on=['artistName'],right_on=['artistName']).rename(columns={'msPlayed': 'plays'})
skip_and_play_counts['skipRate'] = skip_and_play_counts['skips'] / skip_and_play_counts['plays']
# Ignore rarely played artists (3 plays or fewer), then order by skip rate and play count
played_often = skip_and_play_counts['plays'] > 3
skip_and_play_counts = skip_and_play_counts.loc[played_often].sort_values(['skipRate', 'plays'], ascending=False)
if UPLOAD_TO_DATAPANE:
    table = dp.Table(skip_and_play_counts)
    report = dp.Report(table)
    report.upload(name='Most Skipped Artists')
skip_and_play_counts

# Days-of-week

In [None]:
import plotly.express as px

In [None]:
# Mean ms played per day, counting only days on which at least one track was played
ms_played_by_date = track_history_no_skip.groupby(track_history_no_skip['endTime'].dt.date)['msPlayed'].sum()
mean_ms_played_per_day = ms_played_by_date.sum() / len(ms_played_by_date)
mean_ms_played_per_day

In [None]:
# Total ms played across all non-skipped plays
track_history_no_skip['msPlayed'].sum()

In [None]:
# Weekday
# Total ms played for every (day-of-week, hour-of-day) pair present in the history
play_end_time = track_history_no_skip['endTime']
track_play_hour_weekday = track_history_no_skip.groupby([play_end_time.dt.weekday, play_end_time.dt.hour])['msPlayed'].sum()

# Create time matrix (rows: hour of day, columns: day of week); pairs without plays stay 0
week_time_matrix = np.zeros((24,7))
for (weekday, h), ms_played in track_play_hour_weekday.items():
    week_time_matrix[h, weekday] = ms_played

fig = px.imshow(
    week_time_matrix,
    origin='lower',
    template='plotly_dark',
    color_continuous_scale=['#11272e', '#1cc8ff'],
    labels=dict(x="Day of Week", y="Hour of Day", color="Play time"),
    x=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    y=np.arange(0,24),
)
fig.write_html('plot.html')
if UPLOAD_TO_DATAPANE:
    # Hide the colour bar in the uploaded version of the plot
    fig.update(layout_coloraxis_showscale=False)
    report = dp.Report(dp.Plot(fig)) #Create a report
    report.upload(name='Streaming Hour/Weekday Distribution')

In [None]:
# Hour of Day
# Total ms played per hour of day
track_play_hour = track_history_no_skip.groupby([track_history_no_skip['endTime'].dt.hour])['msPlayed'].sum()

# Create time matrix (a single column, one row per hour); hours without plays stay 0
hour_matrix = np.zeros((24,1))
for h, ms_played in track_play_hour.items():
    hour_matrix[h, 0] = ms_played

fig = px.imshow(
    hour_matrix,
    origin='lower',
    template='plotly_dark',
    color_continuous_scale=['#11272e', '#1cc8ff'],
    labels=dict(y="Hour of Day", color="Play time"),
    y=np.arange(0,24),
)
fig.write_html('plot.html')

In [None]:
# Day of Month
# Total ms played for every (day-of-month, hour-of-day) pair present in the history
play_end_time = track_history_no_skip['endTime']
track_play_hour_monthday = track_history_no_skip.groupby([play_end_time.dt.day, play_end_time.dt.hour])['msPlayed'].sum()

# Create time matrix (rows: hour of day, columns: day of month, 1-based day -> 0-based column)
monthday_time_matrix = np.zeros((24,31))
for (day_of_month, h), ms_played in track_play_hour_monthday.items():
    monthday_time_matrix[h, day_of_month - 1] = ms_played


fig = px.imshow(
    monthday_time_matrix,
    origin='lower',
    template='plotly_dark',
    color_continuous_scale=['#11272e', '#1cc8ff'],
    labels=dict(x="Day of Month", y="Hour of Day", color="Play time"),
    x=np.arange(1,32),
    y=np.arange(0,24),
)
fig.write_html('plot.html')

In [None]:
# Month
# Total ms played for every (month, hour-of-day) pair present in the history
play_end_time = track_history_no_skip['endTime']
track_play_hour_month = track_history_no_skip.groupby([play_end_time.dt.month, play_end_time.dt.hour])['msPlayed'].sum()

# Create time matrix (rows: hour of day, columns: month, 1-based month -> 0-based column)
month_time_matrix = np.zeros((24,12))
for (month, h), ms_played in track_play_hour_month.items():
    month_time_matrix[h, month - 1] = ms_played


# Axis ticks use calendar month numbers 1-12, matching the 1-based labels of the
# day-of-month plot, instead of the 0-based column index
fig = px.imshow(month_time_matrix, origin='lower', template='plotly_dark', color_continuous_scale=['#11272e', '#1cc8ff'],
                labels=dict(x="Month", y="Hour of Day", color="Play time"),
                x=np.arange(1,13),
                y=np.arange(0,24)
               )
fig.write_html('plot.html')

In [None]:
# Day of Month, per month
# Total ms played for every (month, day-of-month) pair present in the history
play_end_time = track_history_no_skip['endTime']
track_play_day_hour = track_history_no_skip.groupby([play_end_time.dt.month, play_end_time.dt.day])['msPlayed'].sum()

# Create time matrix (rows: day of month, columns: month; both 1-based -> 0-based index)
month_day_matrix = np.zeros((31,12))
for (month, day_of_month), ms_played in track_play_day_hour.items():
    month_day_matrix[day_of_month - 1, month - 1] = ms_played


# Axis ticks use calendar numbering (months 1-12, days 1-31) instead of the
# 0-based matrix indices
fig = px.imshow(month_day_matrix, origin='lower', template='plotly_dark', color_continuous_scale=['#11272e', '#1cc8ff'],
                labels=dict(x="Month", y="Day of Month", color="Play time"),
                x=np.arange(1,13),
                y=np.arange(1,32)
               )
fig.write_html('plot.html')

In [None]:
# Hour of day, each day
# Total ms played for every (day-of-year, hour-of-day) pair present in the history
play_end_time = track_history_no_skip['endTime']
track_play_date_hour = track_history_no_skip.groupby([play_end_time.dt.dayofyear, play_end_time.dt.hour])['msPlayed'].sum()

# Create time matrix (rows: hour of day, columns: day of year).
# 366 columns so that the last day of a leap year is not silently dropped.
# NOTE(review): plays from different calendar years share a column — confirm the export spans a single year.
DAYS_PER_YEAR = 366
hour_day_matrix = np.zeros((24, DAYS_PER_YEAR))
for (day_of_year, hour), ms_played in track_play_date_hour.items():
    hour_day_matrix[hour, day_of_year - 1] = ms_played


# Axis ticks use the 1-based day of year rather than the 0-based column index
fig = px.imshow(hour_day_matrix, origin='lower', template='plotly_dark', color_continuous_scale=['#11272e', '#1cc8ff'],
                labels=dict(x="Day of Year", y="Hour of Day", color="Play time"),
                x=np.arange(1, DAYS_PER_YEAR + 1),
                y=np.arange(0,24)
               )

fig.write_html('plot.html')

In [None]:
# Day of Year, Github style
# Total ms played per day of year
track_play_dayofyear = track_history_no_skip.groupby([track_history_no_skip['endTime'].dt.dayofyear])['msPlayed'].sum()

# Create time matrix: consecutive days fill a column of 7 cells, then move on to the next column.
# NOTE(review): the row is (day_of_year - 1) % 7, which equals the real weekday only
# when January 1st is a Monday — confirm the "Weekday" axis label is intended.
dayofyear_matrix = np.zeros((7,53))
for day_of_year, ms_played in track_play_dayofyear.items():
    week, day = divmod(day_of_year - 1, 7)
    # Convert ms to hours
    dayofyear_matrix[day, week] = ms_played / (1000.0 * 60.0 * 60.0)


fig = px.imshow(
    dayofyear_matrix,
    origin='lower',
    template='plotly_dark',
    color_continuous_scale=['#11272e', '#1cc8ff'],
    labels=dict(x="Week", y="Weekday", color="Play time (h)"),
    x=np.arange(0,53),
    y=np.arange(0,7),
)
fig.write_html('plot.html')
if UPLOAD_TO_DATAPANE:
    # Hide the colour bar in the uploaded version of the plot
    fig.update(layout_coloraxis_showscale=False)
    report = dp.Report(dp.Plot(fig)) #Create a report
    report.upload(name='test_plot')

# Vector Plots

In [None]:
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import OPTICS, MeanShift
from sklearn.neighbors import NearestNeighbors
import numpy as np
import plotly.express as px

In [None]:
# Load model
# Trained track2vec Word2Vec model; its vocabulary keys are looked up in track_dict below
model = Word2Vec.load('../vectormodels/track2vec.model')
# Normalised vectors, one row per vocabulary entry (row order follows model.wv.index_to_key)
X = model.wv.get_normed_vectors()

In [None]:
# Load labels
# One "<track name> - <artist name>" label per vocabulary entry, in vector order
labels = [
    track_meta['track_name'] + " - " + track_meta['artist_name']
    for track_meta in (track_dict[track_key] for track_key in model.wv.index_to_key)
]

In [None]:
import openTSNE

In [None]:
# PCA-reduce the track vectors before t-SNE. X_pca was referenced but never defined,
# which fails on a fresh kernel; this mirrors the artist section (PCA with 50 components).
X_pca = PCA(n_components=50, random_state=0).fit_transform(X)

# Seeded shuffle so the sample/rest split is reproducible between runs
X_sample_indices = np.random.default_rng(0).permutation(X_pca.shape[0])
# Inverse permutation: restores the original row order after stacking sample + rest
X_sample_indices_reverse = np.argsort(X_sample_indices)
# The first 100000 shuffled rows are embedded first; the remainder is placed afterwards
X_sample, X_rest = X_pca[X_sample_indices[:100000]], X_pca[X_sample_indices[100000:]]

In [None]:
# Perplexity-based nearest-neighbour affinities over ALL PCA-reduced track vectors;
# passed to the full-data TSNEEmbedding further down.
# NOTE(review): n_jobs=32 is hard-coded — confirm it suits the machine this runs on.
X_pca_affinities = openTSNE.affinity.PerplexityBasedNN(
    X_pca,
    perplexity=50,
    n_jobs=32,
    random_state=0,
)

In [None]:
# Affinities for the random sample only, with a much larger perplexity (500 vs. 50)
# than the full-data affinities; used to fit the initial sample embedding.
X_sample_affinities = openTSNE.affinity.PerplexityBasedNN(
    X_sample,
    perplexity=500,
    n_jobs=32,
    random_state=0,
    verbose=True,
)

In [None]:
# Embed the sample: PCA-based initial positions, then a t-SNE fit on the sample affinities
X_sample_init = openTSNE.initialization.pca(X_sample, random_state=42)
X_sample_embedding = openTSNE.TSNE(n_jobs=32, verbose=True).fit(affinities=X_sample_affinities, initialization=X_sample_init)

In [None]:
# Add remaining embeddings
X_rest_init = X_sample_embedding.prepare_partial(X_rest, k=1, perplexity=1/3)
X_init_full = np.vstack((X_sample_embedding, X_rest_init))[X_sample_indices_reverse]

In [None]:
# Rescale the initialisation so that the first dimension has a standard deviation of 1e-4.
# NOTE: this cell reads and overwrites X_init_full.
X_init_full = X_init_full / (np.std(X_init_full[:, 0]) * 10000)
# Display the per-dimension standard deviation as a sanity check
np.std(X_init_full, axis=0)

In [None]:
# Full-data embedding: start from the rescaled initialisation and the full-data affinities
X_tsne_embedding = openTSNE.TSNEEmbedding(
    X_init_full,
    X_pca_affinities,
    n_jobs=32,
    verbose=True,
    random_state=42,
)
# Two optimisation phases: 500 iterations with exaggeration 12 and momentum 0.5,
# followed by 750 iterations with exaggeration 4 and momentum 0.8
X_tsne_embedding = X_tsne_embedding.optimize(n_iter=500, exaggeration=12, momentum=0.5)
X_tsne_embedding = X_tsne_embedding.optimize(n_iter=750, exaggeration=4, momentum=0.8)

In [None]:
#pkl.dump(X_tsne_embedding,open('X_tsne_embedding.pkl', 'wb'))

In [None]:
import os

# Use the cached embedding when one exists on disk; otherwise keep the embedding
# computed by the cells above instead of failing on a missing file.
# NOTE(review): unpickling executes arbitrary code; only load files this notebook wrote.
if os.path.exists('X_tsne_embedding.pkl'):
    with open('X_tsne_embedding.pkl', 'rb') as embedding_file:
        X_tsne_embedding = pkl.load(embedding_file)

In [None]:
# 2-D coordinates of every track plus its "<track> - <artist>" label (used as hover text)
X_tsne_embedding_df = pd.DataFrame({'x': X_tsne_embedding[:, 0], 'y': X_tsne_embedding[:, 1]})
X_tsne_embedding_df['Title'] = labels

In [None]:
# Scatter plot of the whole track embedding with small, mostly transparent markers
fig = px.scatter(X_tsne_embedding_df, x='x', y='y', hover_data=['Title'])
fig.update_layout(template='seaborn')
fig.update_traces(marker=dict(size=4, opacity=0.2),
                  selector=dict(mode='markers'))
# NOTE: every plot cell in this notebook writes to the same 'plot.html', replacing the previous plot
fig.write_html('plot.html')

In [None]:
from annoy import AnnoyIndex

In [None]:
from annoy import AnnoyIndex
# Approximate nearest-neighbour index over the normalised track vectors.
# The dimension is taken from the data rather than hard-coded, so the cell
# keeps working if the model is retrained with a different vector size.
nn_index = AnnoyIndex(X.shape[1], 'angular')
for i, track_vector in enumerate(X):
    nn_index.add_item(i, track_vector)
print("Building...")
nn_index.build(100)  # 100 trees
print("Saving")
nn_index.save('distances_.ann')

In [None]:
# Re-open the saved index; the dimension must match the vectors it was built from,
# so it is read from X instead of being hard-coded
nn_index = AnnoyIndex(X.shape[1], 'angular')
nn_index.load('distances_.ann') # super fast, will just mmap the file

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
# For every track: summed Euclidean distance to its 9 nearest neighbours
all_distances = []
for idx in range(0, X.shape[0]):
    neighbor_indices = nn_index.get_nns_by_item(idx, 10)
    neighbor_points = X[neighbor_indices[1:]] # skip the first one, which should be itself
    # One pairwise call per track (1 x 9 result) instead of one call per neighbour
    cur_distances = euclidean_distances(X[idx:idx + 1], neighbor_points).sum()
    all_distances.append(cur_distances)
all_distances = np.asarray(all_distances)
# Min-max scale to [0, 1]
all_distances -= all_distances.min()
all_distances /= all_distances.max()

In [None]:
# Scaled neighbour distance per track, with values below 0.1 raised to 0.1
X_tsne_embedding_df['Dist'] = np.clip(all_distances, 0.1, 1.0)

In [None]:
# Track embedding coloured by the scaled neighbour distance ('Dist')
fig = px.scatter(X_tsne_embedding_df, x='x', y='y', hover_data=[
                 'Title'], color="Dist", template='plotly_dark')
#fig.update_layout(template='seaborn')
fig.update_traces(marker=dict(size=4, opacity=0.5),
                  selector=dict(mode='markers'))
fig.write_html('plot.html')

## Play history

In [None]:
# Add vector index to play history
track_history_indices = track_history_no_skip.apply(lambda row: model.wv.key_to_index[row['track_uri']] if row['track_uri'] in model.wv.key_to_index else -1, axis=1)
track_history_no_skip['track2vec_idx'] = track_history_indices

In [None]:
# Scale msPlayed
track_history_ms_played = track_history_no_skip.groupby(['track_uri']).sum('msPlayed').sort_values('msPlayed', ascending=False)
df_norm = (track_history_ms_played-track_history_ms_played.min())/(track_history_ms_played.max()-track_history_ms_played.min())
df_norm = df_norm.rename(columns={'msPlayed': 'msPlayed_scaled'})
track_history_no_skip = track_history_no_skip.merge(df_norm['msPlayed_scaled'], left_on='track_uri', right_on='track_uri', how='left')

In [None]:
# Mark which embedded tracks were played and attach their scaled play time.
# NOTE(review): the 100000 cut-off is kept from the original cell; the embedding frame
# appears to cover all vectors, so confirm whether the limit is still wanted.
MAX_TRACK2VEC_IDX = 100000
in_embedding = (track_history_no_skip['track2vec_idx'] != -1) & (track_history_no_skip['track2vec_idx'] < MAX_TRACK2VEC_IDX)
# One scaled value per embedding row. The series is indexed by track2vec_idx so that
# assignments below align on the embedding row, not on the play-history row label.
played_scaled = (
    track_history_no_skip.loc[in_embedding]
    .drop_duplicates('track2vec_idx')
    .set_index('track2vec_idx')['msPlayed_scaled']
)

X_tsne_embedding_df['played'] = X_tsne_embedding_df.index.isin(played_scaled.index)
# Tracks that were never played get NaN
X_tsne_embedding_df['msPlayed_scaled'] = played_scaled.reindex(X_tsne_embedding_df.index)

fig = px.scatter(X_tsne_embedding_df, x='x', y='y', hover_data=[
                 'Title'], color='played', opacity=0.2, template='plotly_dark', color_continuous_scale=['#1688ab', '#58d400'])

fig.write_html('plot.html')

# Artist2Vec

In [None]:
# Load model
# Trained artist2vec Word2Vec model; its vocabulary keys are looked up in artist_dict below
artist_model = Word2Vec.load('../vectormodels/artist2vec.model')
# Normalised vectors, one row per vocabulary entry (row order follows artist_model.wv.index_to_key)
artist_X = artist_model.wv.get_normed_vectors()

In [None]:
# Load labels
# The notebook imports pickle as `pkl`; the bare name `pickle` is not defined.
# NOTE(review): unpickling executes arbitrary code; only load files from a trusted source.
with open('../vectormodels/artist_meta.pkl', 'rb') as artist_meta_file:
    artist_dict = pkl.load(artist_meta_file)

# Top genres
from collections import Counter
# Number of artists carrying each genre tag
top_genres_dict = Counter(genre for artist in artist_dict.values() for genre in artist['genres'])

# Select topN genres (50 most frequent; ties ordered by genre name, descending)
top_genres = set([x[1] for x in sorted([(cnt, genre) for genre, cnt in top_genres_dict.items()], reverse=True)[:50]])

# Set lookup instead of scanning the whole history column once per artist
played_artist_uris = set(track_history_no_skip['artist_uri'].dropna())

artist_labels = []
artist_genres = []
artist_played = []
for v in artist_model.wv.index_to_key:
    t = artist_dict[v]
    artist_labels.append(t['name'])
    artist_played.append(v in played_artist_uris)
    # First of the artist's genres that is a top genre, otherwise 'Unknown'
    artist_genres.append(next((genre for genre in t['genres'] if genre in top_genres), 'Unknown'))


In [None]:
# Reduce the artist vectors to 50 principal components before computing t-SNE affinities
artist_pca = PCA(n_components=50)
artist_X_pca = artist_pca.fit_transform(artist_X)

In [None]:
# Shuffle the row positions, remember how to undo the shuffle, and split the
# shuffled rows into a sample (first 100000) and the remainder
n_artists = artist_X_pca.shape[0]
artist_X_sample_indices = np.random.permutation(list(range(n_artists)))
artist_X_sample_indices_reverse = np.argsort(artist_X_sample_indices)
artist_X_sample = artist_X_pca[artist_X_sample_indices[:100000]]
artist_X_rest = artist_X_pca[artist_X_sample_indices[100000:]]

In [None]:
# Perplexity-based nearest-neighbour affinities over ALL PCA-reduced artist vectors;
# passed to the full-data TSNEEmbedding further down.
# NOTE(review): n_jobs=32 is hard-coded — confirm it suits the machine this runs on.
artist_X_pca_affinities = openTSNE.affinity.PerplexityBasedNN(
    artist_X_pca,
    perplexity=50,
    n_jobs=32,
    random_state=0,
)

# Affinities for the random sample only, with a much larger perplexity (500 vs. 50);
# used to fit the initial sample embedding.
artist_X_sample_affinities = openTSNE.affinity.PerplexityBasedNN(
    artist_X_sample,
    perplexity=500,
    n_jobs=32,
    random_state=0,
    verbose=True,
)

In [None]:
# Embed the sample: PCA-based initial positions, then a t-SNE fit on the sample affinities
artist_X_sample_init = openTSNE.initialization.pca(artist_X_sample, random_state=42)
artist_X_sample_embedding = openTSNE.TSNE(n_jobs=32, verbose=True).fit(affinities=artist_X_sample_affinities, initialization=artist_X_sample_init)

In [None]:
# Add remaining embeddings
artist_X_rest_init = artist_X_sample_embedding.prepare_partial(artist_X_rest, k=1, perplexity=1/3)
artist_X_init_full = np.vstack((artist_X_sample_embedding, artist_X_rest_init))[artist_X_sample_indices_reverse]

In [None]:
# Rescale the initialisation so that the first dimension has a standard deviation of 1e-4.
# NOTE: this cell reads and overwrites artist_X_init_full.
artist_X_init_full = artist_X_init_full / (np.std(artist_X_init_full[:, 0]) * 10000)
# Display the per-dimension standard deviation as a sanity check
np.std(artist_X_init_full, axis=0)

In [None]:
# Full-data embedding: start from the rescaled initialisation and the full-data affinities
artist_X_tsne_embedding = openTSNE.TSNEEmbedding(
    artist_X_init_full,
    artist_X_pca_affinities,
    n_jobs=32,
    verbose=True,
    random_state=42,
)
# Two optimisation phases: 500 iterations with exaggeration 12 and momentum 0.5,
# followed by 750 iterations with exaggeration 4 and momentum 0.8
artist_X_tsne_embedding = artist_X_tsne_embedding.optimize(n_iter=500, exaggeration=12, momentum=0.5)
artist_X_tsne_embedding = artist_X_tsne_embedding.optimize(n_iter=750, exaggeration=4, momentum=0.8)

# Cache the result. The notebook imports pickle as `pkl` (the bare name `pickle`
# is not defined), and `with` makes sure the file is flushed and closed.
with open('artist_X_tsne_embedding.pkl', 'wb') as embedding_file:
    pkl.dump(artist_X_tsne_embedding, embedding_file)


In [None]:
#artist_X_tsne_embedding = pickle.load(open('artist_X_tsne_embedding.pkl', 'rb'))

In [None]:
# 2-D coordinates of every artist plus name, selected genre and played flag (one value per vector row)
artist_X_tsne_embedding_df = pd.DataFrame({'x': artist_X_tsne_embedding[:, 0], 'y': artist_X_tsne_embedding[:, 1]})
artist_X_tsne_embedding_df['Title'] = artist_labels
artist_X_tsne_embedding_df['Genre'] = artist_genres
artist_X_tsne_embedding_df['Played'] = artist_played

In [None]:
from annoy import AnnoyIndex
# Approximate nearest-neighbour index over the normalised artist vectors.
# The dimension is taken from the data rather than hard-coded, so the cell
# keeps working if the model is retrained with a different vector size.
artist_nn_index = AnnoyIndex(artist_X.shape[1], 'angular')
for i, artist_vector in enumerate(artist_X):
    artist_nn_index.add_item(i, artist_vector)
print("Building...")
artist_nn_index.build(100)  # 100 trees
print("Saving")
artist_nn_index.save('artist_distances.ann')

In [None]:
# For every artist: summed Euclidean distance to its 9 nearest neighbours
artist_all_distances = []
for idx in range(0, artist_X.shape[0]):
    neighbor_indices = artist_nn_index.get_nns_by_item(idx, 10)
    neighbor_points = artist_X[neighbor_indices[1:]] # skip the first one, which should be itself
    # Distances are measured from the ARTIST vector; this previously used X[idx],
    # i.e. the track vector that happens to share the same row number
    cur_distances = euclidean_distances(artist_X[idx:idx + 1], neighbor_points).sum()
    artist_all_distances.append(cur_distances)
artist_all_distances = np.asarray(artist_all_distances)
# Min-max scale to [0, 1]
artist_all_distances -= artist_all_distances.min()
artist_all_distances /= artist_all_distances.max()

In [None]:
# Scaled neighbour distance per artist, with values below 0.1 raised to 0.1
artist_X_tsne_embedding_df['Dist'] = np.clip(artist_all_distances, 0.1, 1.0)

In [None]:
# Artist embedding coloured by the scaled neighbour distance ('Dist')
fig = px.scatter(artist_X_tsne_embedding_df, x='x', y='y', hover_data=[
                 'Title'], color="Dist", template='plotly_dark')
#fig.update_layout(template='seaborn')
fig.update_traces(marker=dict(size=4, opacity=0.5),
                  selector=dict(mode='markers'))
fig.write_html('plot.html')

In [None]:
# Artist embedding coloured by genre (one of the top genres, or 'Unknown')
fig = px.scatter(artist_X_tsne_embedding_df, x='x', y='y', hover_data=[
                 'Title', 'Genre'], color="Genre", template='plotly_dark')
#fig.update_layout(template='seaborn')
fig.update_traces(marker=dict(size=4, opacity=0.5),
                  selector=dict(mode='markers'))
fig.write_html('plot.html')

In [None]:
# Artist embedding coloured by whether the artist occurs in the (non-skipped) play history
fig = px.scatter(artist_X_tsne_embedding_df, x='x', y='y', hover_data=[
                 'Title', 'Genre'], color="Played", template='plotly_dark')
#fig.update_layout(template='seaborn')
fig.update_traces(marker=dict(size=4, opacity=0.5),
                  selector=dict(mode='markers'))
fig.write_html('plot.html')

# Lyrics

In [None]:
import sqlite3
import pandas as pd
# Connection to the lyrics SQLite database; reused by the lyrics cells below
dbcon = sqlite3.connect('../lyrics/mxm_dataset.db')

In [None]:
# Inspect the lyrics table: print its column layout, then the number of distinct tracks
cursor = dbcon.cursor()
for statement in ("PRAGMA table_info(lyrics)", "select count(distinct track_id) from lyrics"):
    cursor.execute(statement)
    print(cursor.fetchall())

In [None]:
# Track-matching table for the lyrics database.
# '<SEP>' is a multi-character separator, which only the python parser engine supports;
# requesting it explicitly avoids the implicit fallback. `with` closes the file afterwards.
with open('../lyrics/mxm_779k_matches.txt', 'rt') as matches_file:
    lyrics_meta = pd.read_csv(matches_file, sep='<SEP>', engine='python')

In [None]:
# Track ids that actually have lyrics stored in the database
cursor = dbcon.cursor()
cursor.execute("SELECT distinct track_id FROM lyrics")
tid_lyrics_in_db = pd.DataFrame(cursor.fetchall(), columns=['tid'])
# Remove tracks without full lyrics in db: the left join keeps exactly the ids found above
lyrics_meta = tid_lyrics_in_db.merge(lyrics_meta, how='left', on='tid')

In [None]:
# Attach Spotify URIs to the lyrics metadata by matching on artist and title.
# NOTE: lyrics_meta is overwritten with the reduced column set.
lyrics_with_uris = lyrics_meta.merge(
    track_dict_pd,
    how='left',
    left_on=['artist', 'title'],
    right_on=['artist_name', 'track_name'],
)
lyrics_meta = lyrics_with_uris[['tid', 'track_uri', 'artist_uri', 'album_uri']]

In [None]:
import requests

API_KEY = getpass.getpass('Musixmatch API key: ')
# Footer that the API appends to the lyrics text it returns
LYRICS_DISCLAIMER = '******* This Lyrics is NOT for Commercial use *******'

# Played tracks that have no entry in the lyrics database
tracks_without_lyrics = track_history_no_skip[~track_history_no_skip['track_uri'].isin(lyrics_meta['track_uri'])].drop_duplicates(['track_uri'])
lyrics = []
for n_requested, (idx, track) in enumerate(tracks_without_lyrics.iterrows()):
    api_params = dict(
        q_track=track['trackName'],
        q_artist=track['artistName'],
        apikey=API_KEY
    )
    # A timeout keeps one stalled request from hanging the whole loop
    resp = requests.get(url="https://api.musixmatch.com/ws/1.1/matcher.lyrics.get", params=api_params, timeout=30)
    lyric_response = resp.json()
    if lyric_response['message']['header']['status_code'] != 200:
        continue
    lyrics_text = lyric_response['message']['body']['lyrics']['lyrics_body']
    # Cut off the disclaimer footer. partition() returns the full text when the footer
    # is absent, whereas rpartition() would return an empty string in that case.
    lyrics_text, _, _ = lyrics_text.partition(LYRICS_DISCLAIMER)
    lyrics.append((track['track_uri'], lyrics_text))
    # Progress output based on the number of requests made, not on the row label
    if n_requested % 100 == 0:
        print(n_requested)

In [None]:
# Lyrics fetched from the API, one row per track, cached on disk
api_lyrics_df = pd.DataFrame(lyrics, columns=['track_uri', 'lyrics'])
with open('lyrics_from_api.pkl', 'wb') as api_lyrics_file:
    pkl.dump(api_lyrics_df, api_lyrics_file)

In [None]:
# Lookup lyrics of listened tracks from db
# Rows without a track URI are dropped up front: they cannot be joined to a
# lyrics entry in a meaningful way
tracks_with_lyrics = (
    track_history_no_skip[track_history_no_skip['track_uri'].isin(lyrics_meta['track_uri'])]
    .dropna(subset=['track_uri'])
    .drop_duplicates(['track_uri'])
)
# Join track id
tracks_with_lyrics = pd.merge(tracks_with_lyrics, lyrics_meta[['track_uri', 'tid']], how='left', left_on=['track_uri'], right_on=['track_uri']).drop_duplicates(['track_uri'])
lyrics = []
cursor = dbcon.cursor()  # one cursor is reused for all lookups
for idx, track in tracks_with_lyrics.iterrows():
    # The words stored for the track are concatenated into one space-separated string
    cursor.execute("SELECT group_concat(word, ' ') FROM lyrics WHERE track_id = ? GROUP BY track_id", [track['tid']])
    lyrics_row = cursor.fetchone()
    if lyrics_row is None:
        # No words stored for this track id; skip it instead of raising IndexError
        continue
    lyrics.append((track['track_uri'], lyrics_row[0]))
db_lyrics_df = pd.DataFrame(lyrics, columns=['track_uri', 'lyrics'])

In [None]:
# Combine database and API lyrics and drop tracks without usable text.
# pd.concat replaces DataFrame.append (removed in pandas 2.0). Empty strings are
# explicitly turned into NaN so dropna() removes them; replace('') without a
# replacement value instead filled them with the previous row's lyrics.
all_lyrics_df = pd.concat([db_lyrics_df, api_lyrics_df]).replace('', np.nan).dropna()
all_lyrics_df

In [None]:
from transformers import pipeline
# Emotion classifier; return_all_scores=True requests a score for every label rather than only the best one.
# NOTE(review): recent transformers releases appear to deprecate return_all_scores in favour of top_k=None — confirm.
classifier = pipeline("text-classification",model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True)

In [None]:
import pickle as pkl

BATCH_SIZE = 100  # lyrics classified per call
MAX_WORDS = 250   # crude word-level cut to stay below the model's 512-token input limit

all_predictions = []
for batch_start in range(0, len(all_lyrics_df), BATCH_SIZE):
    batch = all_lyrics_df.iloc[batch_start:batch_start + BATCH_SIZE]
    batch_texts = [' '.join(text.split()[:MAX_WORDS]) for text in batch['lyrics']]
    try:
        prediction = classifier(batch_texts)
    except Exception:
        # Show the offending batch, then let the error propagate
        print("ERROR")
        print(batch)
        raise
    # One (track_uri, scores) tuple per classified lyric
    all_predictions.extend(zip(batch['track_uri'], prediction))

with open('lyric_topics.pkl', 'wb') as topics_file:
    pkl.dump(all_predictions, topics_file)

In [None]:
# Show the first (track_uri, scores) entry as a sanity check
all_predictions[0]

In [None]:
# Reload the cached predictions (lets the notebook resume without re-running the classifier).
# NOTE(review): unpickling executes arbitrary code; only load files this notebook wrote.
with open('lyric_topics.pkl', 'rb') as topics_file:
    all_predictions = pkl.load(topics_file)

In [None]:
# One row per prediction: the track URI plus one score column per emotion label
topic_rows = []
for track_uri, label_scores in all_predictions:
    topic_rows.append({'spotify_uri': track_uri, **{entry['label']: entry['score'] for entry in label_scores}})
lyric_topics_df = pd.DataFrame(topic_rows).set_index('spotify_uri')

# Plot with lyrics

In [None]:
# Average the scores of URIs that occur more than once, leaving one row per URI
lyric_topics_df = lyric_topics_df.groupby(level=0).mean()
# Attach the emotion scores to every play; plays without lyrics get NaN scores
track_history_no_skip_topics = track_history_no_skip.merge(
    lyric_topics_df,
    how='left',
    left_on=['track_uri'],
    right_index=True,
)
track_history_no_skip_topics

In [None]:
# Render a sequence of PNG frames ("timeline/<i>_plot.png"): for each 6-hour step the
# track embedding is coloured by how recently each track was played, and a small inset
# bar chart shows the emotion mix of the lyrics of recently played tracks.
import os
import mplcairo
import matplotlib
# NOTE(review): this backend name is macOS-specific — confirm before running on another platform.
matplotlib.use("module://mplcairo.macosx")
print(matplotlib.get_backend())
from matplotlib.font_manager import FontProperties
from matplotlib import font_manager

import seaborn as sns
import matplotlib.pyplot as plt

# Output directory for the frames
if not os.path.exists("timeline"):
    os.mkdir("timeline")

# Emoji font used for the tick labels of the inset bar chart (macOS system font path)
prop = FontProperties(fname='/System/Library/Fonts/Apple Color Emoji.ttc', size=28)
sns.set(font="Meiryo")

    
# Calculate diff from current date
# The timeline starts at midnight 7 days before the first row's endTime and ends
# 7 days after the last row's endTime.
# NOTE(review): assumes track_history_no_skip is in chronological order — confirm.
curr_date = track_history_no_skip.iloc[0].endTime - datetime.timedelta(days=7)
curr_date = curr_date.replace(hour=0, minute=0, second=0)
#curr_date = track_history_no_skip.iloc[0].endTime
#curr_date += datetime.timedelta(days=30)
max_date = track_history_no_skip.iloc[-1].endTime + datetime.timedelta(days=7)
#max_date = curr_date + datetime.timedelta(days=30)

#X_tsne_embedding_df['freshness'] = 60*60*24*5
# 'freshness' starts as NaN for every track and is filled in inside the loop below
X_tsne_embedding_df['freshness'] = np.nan
nan_indices = X_tsne_embedding_df['freshness'].isna()

LYRIC_TOPICS = ['sadness', 'joy', 'love', 'anger', 'fear'] # Without 'Surprise' since it never occurred
LYRIC_TOPICS_LABELS = ['😢', '🤣', '❤️', '😡', '😱']

# Frame counter, used in the output file name
i = 0
sns.set(font="Meiryo")



while curr_date < max_date:
    # Freshness of a play = absolute time distance to curr_date in seconds, capped at 5 days
    track_history_no_skip_topics['freshness'] = abs((track_history_no_skip_topics['endTime'] - curr_date).dt.total_seconds()).clip(0, 60*60*24*5)
    # Per embedding row, keep the freshest (smallest) value; index -1 means "not in the model"
    most_fresh = track_history_no_skip_topics[['freshness', 'track2vec_idx']].groupby(['track2vec_idx']).min()
    X_tsne_embedding_df.loc[most_fresh[most_fresh.index != -1].index, 'freshness'] = most_fresh[most_fresh.index != -1]['freshness']
    
    # Plays strictly inside the 5-day window drive the emotion bar chart
    fresh_topics = track_history_no_skip_topics[track_history_no_skip_topics['freshness'] < 60*60*24*5]
    if fresh_topics.size > 0:
        # Weight each play's emotion scores by (1 - freshness / 5 days); missing scores count as 0
        weighted_topic_distribution = np.multiply(fresh_topics[LYRIC_TOPICS].values, np.expand_dims((1-fresh_topics['freshness']/(60*60*24*5)).values,axis=-1))
        weighted_topic_distribution = np.nan_to_num(weighted_topic_distribution, 0)
        weighted_topic_distribution = np.mean(weighted_topic_distribution, axis=0)
        if np.sum(weighted_topic_distribution) > 0:
            # Scale the mean vector to unit L2 length
            norm = np.linalg.norm(weighted_topic_distribution)
            weighted_topic_distribution = weighted_topic_distribution/norm
    else:
        weighted_topic_distribution = [0.0] * len(LYRIC_TOPICS)

    # Start from a cleared figure and (re)apply the styling for this frame
    plt.clf()
    sns.set(style="ticks", context="talk")
    plt.style.use("dark_background")
    plt.axis('off')
    sns.despine(fig=None, ax=None, top=True, right=True, left=True, bottom=True, offset=None, trim=False)
    # Background layer: tracks without a freshness value, drawn with the 5-day cap as their value
    plot = sns.scatterplot(data=X_tsne_embedding_df[X_tsne_embedding_df['freshness'].isna()].replace(np.nan, 60*60*24*5), x='x', y='y', hue='freshness', legend=False, palette=sns.color_palette("mako_r", as_cmap=True), hue_norm=(0, 60*60*24*5*1.2), linewidth=0, alpha=0.3, s=5)
    # Foreground layer: tracks that have a freshness value, drawn larger and more opaque
    plot = sns.scatterplot(data=X_tsne_embedding_df[X_tsne_embedding_df['freshness'].notna()], x='x', y='y', hue='freshness', legend=False, palette=sns.color_palette("mako_r", as_cmap=True), hue_norm=(0, 60*60*24*5*1.2), linewidth=0, alpha=0.8, s=8)
    plot.set_title(curr_date.strftime("%b %d %Y %H:%M:%S"))
    fig = plot.get_figure()
    
    # Inset axes in the lower-right corner for the emotion bar chart
    ax2 = fig.add_axes([0.8, 0.15, 0.15, 0.15])  #[lowerCorner_x, lowerCorner_y, width, height]
    ax2.set_ylim([0, 1.0])
    ax2.tick_params(color='#696969', labelcolor='#696969')
    for spine in ax2.spines.values():
        spine.set_edgecolor('#696969')
    plot = sns.barplot(x=LYRIC_TOPICS, y=weighted_topic_distribution, ax=ax2, color='#02b6ed')
    # Emoji tick labels instead of the emotion names
    plot.set_xticklabels(LYRIC_TOPICS_LABELS, fontproperties=prop)
    plot.get_yaxis().set_visible(False)
    plt.setp(ax2.patches, linewidth=0)
    fig.set_size_inches(20,10)
    fig.savefig("timeline/{}_plot.png".format(i)) 
    
    # Advance to the next frame: 6 hours later
    i += 1
    curr_date += datetime.timedelta(hours=6)

# My Year as a Playlist

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

# For every calendar month: cluster the vectors of the tracks played that month into
# 5 groups and pick, for each cluster centre, the closest track in the whole vocabulary
monthly_tracks = []
kmeans_classifiers = []
for month in range(1, 13):
    played_in_month = track_history_no_skip['endTime'].dt.month == month
    month_tracks = track_history_no_skip[played_in_month]
    # Only plays whose track is known to the model (-1 marks unknown tracks)
    known_vector_idx = month_tracks.loc[month_tracks['track2vec_idx'] != -1, 'track2vec_idx']
    month_embeddings = X[known_vector_idx]
    # Fit to tracks
    kmeans = KMeans(n_clusters=5)
    kmeans.fit(month_embeddings)
    # Find centroids in original space
    closest_tracks, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, X)
    monthly_tracks.append(closest_tracks)
    kmeans_classifiers.append(kmeans)


In [None]:
import os

# Visualize
# write_html does not create missing directories, so make sure the target exists
os.makedirs('plots', exist_ok=True)
for i in range(0, 12):
    month_name = datetime.datetime.strptime(str(i+1), "%m").strftime('%B')
    # Assign every track in the vocabulary to one of this month's clusters
    classes = kmeans_classifiers[i].predict(X)
    fig = px.scatter(X_tsne_embedding_df, x='x', y='y', title="Clusters of {}".format(month_name), hover_data=[
                     'Title'], color=[str(x) for x in classes], template='plotly_dark')
    #fig.update_layout(template='seaborn')
    fig.update_traces(marker=dict(size=4, opacity=0.5),
                      selector=dict(mode='markers'))
    fig.write_html('plots/clusters_{}.html'.format(month_name))

In [None]:
# Create playlist
# One entry per (month, cluster-centre track): month name, track URI and the track's metadata
playlist_tracks = []
for month_number, t_list in enumerate(monthly_tracks, start=1):
    month_name = datetime.datetime.strptime(str(month_number), "%m").strftime('%B')
    for vector_idx in t_list:
        # Vocabulary row -> track URI -> metadata dict
        t_id = model.wv.index_to_key[vector_idx]
        track_meta = track_dict[t_id]
        playlist_tracks.append({'month': month_name, 'track_uri': t_id, **track_meta})


In [None]:
# A track picked for several months is kept only once (its first occurrence)
playlist_df = pd.DataFrame(playlist_tracks).drop_duplicates(subset=['track_uri'])
playlist_df

In [None]:
# Create the playlist in the authenticated user's account and add the selected tracks.
# NOTE: running this cell again creates another playlist with the same name.
# NOTE(review): newer spotipy releases appear to deprecate user_playlist_add_tracks in favour of playlist_add_items — confirm.
user_id = spotify_client.current_user()['id']
playlist = spotify_client.user_playlist_create(user_id, 'My Year as a Playlist - 2021', description="Playlist generated by AI")
spotify_client.user_playlist_add_tracks(user_id, playlist['id'], tracks=playlist_df['track_uri'])