In [1]:
#import packages for clustering
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import euclidean
from collections import defaultdict
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

# Load the track-level metadata; the CSV column "Unnamed: 0" is used as the row index.
final = pd.read_csv("../raw_data/track_meta_milestone3.csv", index_col="Unnamed: 0")

# Restrict the frame to the identifier, descriptive and audio-feature columns used below.
final_new = final[['Playlistid', 'Trackid', 'Artist_Name', 'Track_uri',
                   'Track_Name', 'Album_Name', 'Track_Duration',
                   'acousticness', 'artist_genres', 'artist_popularity',
                   'danceability', 'energy', 'instrumentalness', 'key',
                   'liveness', 'loudness', 'mode', 'speechiness', 'tempo',
                   'time_signature', 'valence', 'Track', 'Artist']] #keep columns of interest

In [2]:
# Seed NumPy's global RNG so the playlist sample below is reproducible.
np.random.seed(123)
subset10k = np.random.choice(final_new.Playlistid.unique(), size = 10000, replace = False) #randomly select 10,000 distinct playlist ids (sampling without replacement)

In [3]:
df10k = final_new[final_new.Playlistid.isin(subset10k)] #keep every row whose playlist is one of the 10k sampled ids

In [4]:
len(df10k.Playlistid.unique()) #sanity check: should equal the 10,000 sampled playlist ids

10000

## Train-Validation-Test Split

In [5]:
## Train-Validation-Test Split
# Row-level split: 20% of the rows of df10k are held out as `test`, then 20% of the
# remaining rows are held out as `val` (roughly 64% / 16% / 20% train / val / test).
# Both splits are stratified on Playlistid, so each playlist's rows are divided
# between the sets in approximately those proportions.
train, test = train_test_split(df10k, test_size=0.2, random_state=48, stratify = df10k['Playlistid'])
train, val = train_test_split(train, test_size=0.2, random_state=48, stratify = train['Playlistid'])

In [6]:
# 1-based running row number; 'index' is later used as one of the merge keys (joinCols)
# when the scaled features are joined back onto the unscaled training rows.
train['index'] = np.arange(1, len(train)+1)

## Scale Features for Clustering

In [7]:
# Numeric features that are standardised before clustering (mode excluded from analysis).
scaleCols = ['acousticness', 'danceability', 'energy', 'instrumentalness',
             'key', 'liveness', 'loudness', 'speechiness', 'tempo','valence', 'time_signature', 'artist_popularity']

# Fit the scaler on the training rows only.
scaler = StandardScaler().fit(train.loc[:, scaleCols])

# Work on a copy so the unscaled values in `train` stay available for the merge below.
train_scaled = train.copy()
train_scaled[scaleCols] = scaler.transform(train_scaled[scaleCols])
train_scaled['index'] = np.arange(1, len(train_scaled)+1) #reappend index column

# Tag every standardised column with a '_scaled' suffix so it can sit next to its raw counterpart.
train_scaled = train_scaled.rename(columns = {col: col + '_scaled' for col in scaleCols})

# Columns shared by both frames that identify a row; every other shared column
# (e.g. 'Track', 'Artist') receives the default _x / _y suffixes in the merge.
joinCols = ['index', 'Playlistid', 'Trackid', 'Track_uri', 'Artist_Name',
            'Track_Name', 'Album_Name', 'Track_Duration',
            'artist_genres', 'mode']

# Merge scaled data with original data.
train_new = train.merge(train_scaled, on = joinCols, how = 'outer')

## Clustering

In [8]:
#initiate clusters
def makeCluster(df, n, rs, cols):
    '''
    Fit KMeans on the chosen columns and annotate a copy of the data with the result.

    Parameters:
    1) df: master data frame
    2) n: number of clusters
    3) rs: random state
    4) cols: list of clustering variables

    Returns:
    1) copy of df with a 1-based 'cluster_label' column plus one constant column
       per (centroid, variable) pair, named 'c<centroid><variable>'
    2) data frame of centroid coordinates: a 'columns' column holding the variable
       names and one column 'c<centroid>' per centroid (0-based)
    '''
    clustered = df.copy()

    # fit the model on the clustering variables only
    model = KMeans(n_clusters = n, random_state = rs).fit(clustered.loc[:, cols])
    centers = model.cluster_centers_

    # labels are shifted by 1 so 0 implies non-existence in prediction
    clustered['cluster_label'] = model.labels_ + 1

    # collect centroid coordinates and broadcast each one onto the data frame
    centroid_table = {'columns': list(cols)}
    for idx, centre in enumerate(centers):
        centroid_table['c' + str(idx)] = list(centre)
        for feature, coord in zip(cols, centre):
            clustered['c' + str(idx) + feature] = coord

    return clustered, pd.DataFrame.from_dict(centroid_table)

In [9]:
# Scaled audio features used as clustering variables. 'artist_popularity_scaled' is
# produced by the scaling step but is not listed here, so it is not passed to the clustering.
clusterCols = ['acousticness_scaled','danceability_scaled', 
               'energy_scaled', 'instrumentalness_scaled',
               'key_scaled', 'liveness_scaled', 'loudness_scaled',
               'speechiness_scaled', 'tempo_scaled', 'time_signature_scaled',
               'valence_scaled'] #variables to cluster

In [10]:
#train cluster
#train cluster: 8 clusters, random state 48, fitted on the scaled training features
train_cluster, train_centroids = makeCluster(train_new, 8, 48, clusterCols)

In [11]:
#rank centroids
def rankC(dfCentroid, n):
    '''
    Rank centroids by Euclidean distance from each centroid.

    Parameters:
    1) dfCentroid: data frame with one column 'c<i>' of coordinates per centroid
    2) n: number of centroids to compare (columns 'c0' ... 'c<n-1>' are read)

    Returns:
    defaultdict mapping str(i) to an array of centroid numbers ordered from the
    closest to centroid i (centroid i itself, at distance 0) to the farthest
    '''
    ids = range(n)

    # distance table: row i, column str(j) holds the distance between centroid i and centroid j
    table = {'cluster': list(ids)}
    for j in ids:
        table[str(j)] = [euclidean(dfCentroid['c' + str(i)], dfCentroid['c' + str(j)])
                         for i in ids]
    distances = pd.DataFrame(table)

    # sorting a column ascending and reading the row labels yields the nearest-first order
    ordered = defaultdict(list)
    for j in ids:
        ordered[str(j)] = distances[str(j)].sort_values(ascending = True).index.values
    return ordered

# Nearest-first ordering of the centroids for each centroid; the 8 must match the number of clusters passed to makeCluster.
orderRankc = rankC(train_centroids, 8)

In [12]:
#initialize prediction cluster
#initialize prediction cluster
# 'Track_x' / 'Artist_x' carry the merge suffix because 'Track' and 'Artist' exist in both
# merged frames but are not in joinCols; they are renamed back to their original names here.
prediction_cluster = train_cluster[['Playlistid', 'Trackid', 'Track_uri', 'Artist_Name', 'Track_Name',
                                   'artist_genres','artist_popularity', 'cluster_label', 'Track_x', 'Artist_x']].rename(columns = {'Track_x':'Track',
                                                                                                                                   'Artist_x':'Artist'})

In [13]:
#compute the times an artist appears in one cluster
#count, per cluster, how many rows (non-null Playlistid) each artist contributes
mode_artist = (
    prediction_cluster
    .groupby(['cluster_label', 'Artist_Name'])['Playlistid']
    .count()
    .reset_index(name = 'mode_artist')
)
#attach the per-(cluster, artist) count to every matching row
#NOTE(review): this reassigns prediction_cluster, so re-running the cell merges a second time
prediction_cluster = prediction_cluster.merge(mode_artist, on = ['cluster_label', 'Artist_Name'])

In [14]:
#compute the times a track appears in one cluster
#count, per cluster, how many rows (non-null Playlistid) share each track name
#NOTE(review): grouping is by Track_Name, so different tracks with the same name are counted together - confirm this is intended
mode_track = (
    prediction_cluster
    .groupby(['cluster_label', 'Track_Name'])['Playlistid']
    .count()
    .reset_index(name = 'mode_track')
)
#attach the per-(cluster, track name) count to every matching row
prediction_cluster = prediction_cluster.merge(mode_track, on = ['cluster_label', 'Track_Name'])

## Prediction

In [15]:
import predict_cluster #original prediction model
import predict_cluster_updated #tuned prediction model

## Performance

In [16]:
def r_precision(prediction, val_set):
    """Fraction of the ground-truth entries that appear among the predictions.

    Parameters
    ----------
    prediction : list
        Predicted items.
    val_set : pandas.Series
        Ground-truth items.

    Returns
    -------
    float
        Number of entries of ``val_set`` found in ``prediction`` divided by the
        length of ``val_set``; 0.0 when ``val_set`` is empty.
    """
    n_truth = val_set.shape[0]
    # An empty ground truth would otherwise evaluate 0/0 -> nan (with a RuntimeWarning),
    # and a single nan would contaminate any average taken over the scores.
    if n_truth == 0:
        return 0.0
    score = np.sum(val_set.isin(prediction))/n_truth
    return score

### NDCG Code Source: https://gist.github.com/bwhite/3726239
def dcg_at_k(r, k, method=0):
    """Discounted cumulative gain of the first ``k`` relevance scores.

    Parameters
    ----------
    r : sequence of numbers
        Relevance scores in rank order (first element is the top-ranked item).
    k : int
        Number of leading scores to evaluate.
    method : int, optional
        0 -> weights are [1.0, 1.0, 0.6309, 0.5, ...] (first two items undiscounted)
        1 -> weights are [1.0, 0.6309, 0.5, 0.4307, ...]

    Returns
    -------
    float
        The discounted cumulative gain, or 0. when no scores remain after truncation.

    Raises
    ------
    ValueError
        If ``method`` is neither 0 nor 1 (only checked for non-empty input).
    """
    # np.asfarray was removed in NumPy 2.0; np.asarray with dtype=float performs the
    # same float64 conversion and works on both old and new NumPy versions.
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k, method=0):
    """Normalised DCG of the first ``k`` relevance scores.

    The DCG of ``r`` is divided by the DCG of the same scores sorted in
    descending order. Returns 0. when that ideal DCG is zero.

    ``r``, ``k`` and ``method`` are passed through to ``dcg_at_k``.
    """
    ideal = dcg_at_k(sorted(r, reverse=True), k, method)
    # the actual DCG is only computed when there is something to normalise by
    return dcg_at_k(r, k, method) / ideal if ideal else 0.

In [17]:
len(test.Playlistid.unique()) #number of distinct playlists present in the test split

10000

In [19]:
#validation check
#validation check: score the original prediction model on every playlist in the validation split
rps = []    # R-precision per playlist
ndcgs = []  # NDCG per playlist
count = 1   # 1-based playlist counter, used only for the progress print
for pid in val.Playlistid.unique():
    #track progress
    if count%100 == 0:
        print(count)
    # cPredict lives in predict_cluster (not shown here); presumably it returns a sequence of Track_uri values - TODO confirm
    ps = predict_cluster.cPredict(prediction_cluster, pid, orderRankc, val) # predictions
    vs = val[val.Playlistid == pid].Track_uri # ground truth
    rps.append(r_precision(ps, vs))
    
    # binary relevance vector in prediction order: 1 where the predicted item is in the ground truth
    r = np.zeros(len(ps))
    for i, p in enumerate(ps):
        if np.any(vs.isin([p])):
            r[i] = 1
    # NDCG over the full prediction list (k = number of predictions)
    ndcgs.append(ndcg_at_k(r, len(r)))
    count += 1

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000


In [20]:
# Average the per-playlist validation scores.
avg_rp = np.mean(rps)
avg_ndcg = np.mean(ndcgs)
print('Avg. R-Precision: ', avg_rp)
print('Avg. NDCG: ', avg_ndcg)
# NOTE(review): the label says 'Total Sum' but the value printed is the mean of the two averages.
print('Total Sum: ', np.mean([avg_rp, avg_ndcg]))

Avg. R-Precision:  0.029928074931039567
Avg. NDCG:  0.04289429582709006
Total Sum:  0.03641118537906481


In [19]:
#test check
#test check: score the tuned prediction model on every playlist in the test split
rps_updated = []    # R-precision per playlist
ndcgs_updated = []  # NDCG per playlist
count = 1           # 1-based playlist counter, used only for the progress print
for pid in test.Playlistid.unique():
    #track progress
    if count%100 == 0:
        print(count)
    # cPredict lives in predict_cluster_updated (not shown here); presumably it returns a sequence of Track_uri values - TODO confirm
    ps = predict_cluster_updated.cPredict(prediction_cluster, pid, orderRankc, test) # predictions
    vs = test[test.Playlistid == pid].Track_uri # ground truth
    rps_updated.append(r_precision(ps, vs))
    
    # binary relevance vector in prediction order: 1 where the predicted item is in the ground truth
    r = np.zeros(len(ps))
    for i, p in enumerate(ps):
        if np.any(vs.isin([p])):
            r[i] = 1
    # NDCG over the full prediction list (k = number of predictions)
    ndcgs_updated.append(ndcg_at_k(r, len(r)))
    count += 1

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000


In [20]:
# Average the per-playlist test scores of the tuned model.
avg_rp_updated = np.mean(rps_updated)
avg_ndcg_updated = np.mean(ndcgs_updated)
print('Avg. R-Precision: ', avg_rp_updated)
print('Avg. NDCG: ', avg_ndcg_updated)
# NOTE(review): the label says 'Total Sum' but the value printed is the mean of the two averages.
print('Total Sum: ', np.mean([avg_rp_updated, avg_ndcg_updated]))

Avg. R-Precision:  0.11821517705293787
Avg. NDCG:  0.1330804266303709
Total Sum:  0.1256478018416544
