In [8]:
import numpy as np
import pandas as pd

feature_MUSIC_dict = {
    'danceability': np.array([-0.37, 0.05, -0.35, 0.08, 0.43]),
    'energy': np.array([-0.64, -0.46, -0.13, 0.66, -0.03]),
    'instrumentalness': np.array([0.20, -0.47, 0.28, 0.09, -0.01]),
    'liveness': np.array([-0.69, -0.12, -0.07, 0.43, 0.02]),
    'loudness': np.array([-0.58, -0.19, -0.44, 0.79, -0.21]),
    'valence': np.array([-0.04, 0.18, 0.24, -0.34, 0.18]),
}
feature_MUSIC_matrix = [MUSIC for MUSIC in feature_MUSIC_dict.values()]

msd = pd.read_hdf('data/full_msd_with_audio_features.h5', key='df')
msd_audio_only = msd[['song_id'] + list(feature_MUSIC_dict.keys())]

def get_MUSIC(song_ids):
    song_vectors = msd_audio_only.loc[msd['song_id'].isin(song_ids)][list(
        feature_MUSIC_dict.keys())].values
    return np.sum(
        np.dot(song_vectors, feature_MUSIC_matrix), axis=0) / len(song_ids)


In [9]:
triplets = pd.read_hdf('data/triplets.h5', key='df')

In [10]:
triplets.head()

Unnamed: 0,play_count,song_id,user_id
0,1,SOAKIMP12A8C130995,b80344d063b5ccb3212f76538f3d9e43d87dca9e
1,1,SOAPDEY12A81C210A9,b80344d063b5ccb3212f76538f3d9e43d87dca9e
2,2,SOBBMDR12A8C13253B,b80344d063b5ccb3212f76538f3d9e43d87dca9e
3,1,SOBFNSP12AF72A0E22,b80344d063b5ccb3212f76538f3d9e43d87dca9e
4,1,SOBFOVM12A58A7D494,b80344d063b5ccb3212f76538f3d9e43d87dca9e


In [11]:
print(triplets.shape)

(48373586, 3)


In [16]:
# create userid_playlist_dict where key is user id and value is the 2d list of song ids/playcounts
import time

def create_userid_playlist_dict(triplets):
    
    userid_playlist_dict = {}
    
    start_time = time.time()
    i = 0
    for index, row in triplets.iterrows():
        if row['user_id'] not in userid_playlist_dict:
            userid_playlist_dict[row['user_id']] = [row['song_id']]
        else:
            userid_playlist_dict[row['user_id']].append(row['song_id'])
        i += 1
        if i % 1000000 == 0:
            print(i)
            print(time.time() - start_time)
    
    return userid_playlist_dict

# calculate MUSIC score for every user in playlist_dict
# create dict of userid -> MUSIC score

def calculate_user_MUSIC_score(userid_playlist_dict):
    
    user_MUSIC_dict = {}
    
    i = 0
    for key, val in userid_playlist_dict.items():
        MUSIC_score = get_MUSIC(val)
        user_MUSIC_dict[key] = MUSIC_score
        i += 1
        if i % 100 == 0:
            print(i)
    
    return user_MUSIC_dict


# select one user at random

# perform kNN to find k nearest neighbours to that person

In [17]:
userid_playlist_dict = create_userid_playlist_dict(triplets)

1000000
84.05733108520508
2000000
142.669095993042
3000000
208.37852716445923
4000000
266.2397372722626
5000000
323.69307708740234
6000000
381.52230405807495
7000000
439.80319690704346
8000000
497.9907331466675
9000000
556.4368042945862
10000000
614.0467021465302
11000000
671.9736251831055
12000000
730.5401420593262
13000000
787.8881611824036
14000000
846.0090320110321
15000000
904.2330329418182
16000000
961.8722202777863
17000000
1019.9542760848999
18000000
1078.9642460346222
19000000
1136.399244070053
20000000
1196.060297012329
21000000
1255.5716762542725
22000000
1315.1208992004395
23000000
1376.3081171512604
24000000
1433.1432812213898
25000000
1491.1401121616364
26000000
1548.6891210079193
27000000
1605.8124511241913
28000000
1663.331757068634
29000000
1720.780839920044
30000000
1777.7668991088867
31000000
1838.0327219963074
32000000
1896.7292470932007
33000000
1954.787052154541
34000000
2013.8235161304474
35000000
2072.2242062091827
36000000
2129.1356501579285
37000000
2186.43528

In [42]:
import time

start = time.time()

df_dict = {}
df_dict['user_id'] = []
df_dict['playlist'] = []
for key, val in userid_playlist_dict.items():
    df_dict['user_id'].append(key)
    df_dict['playlist'].append(val)
    
df = pd.DataFrame(df_dict)

In [43]:
df.head()

Unnamed: 0,user_id,playlist
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,"[SOAKIMP12A8C130995, SOAPDEY12A81C210A9, SOBBM..."
1,85c1f87fea955d09b4bec2e36aee110927aedf9a,"[SOACWYB12AF729E581, SOAUSXX12A8C136188, SOBVA..."
2,bd4c6e843f00bd476847fb75c47b4fb430a06856,"[SOBDRND12A8C13FD08, SOCHBAJ12AAF3B3A4F, SOCZT..."
3,8937134734f869debcab8f23d77465b4caaa85df,"[SOAFPAX12AB0187A17, SOEBOAR12A6D4FD136, SOFRD..."
4,969cc6fb74e076a68e36a04409cb9d3765757508,"[SOABRAB12A6D4F7AAF, SOAOQFD12A6D4FAAA9, SOBFP..."


In [44]:
df.to_hdf('userid_playlist.h5', key='df', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->['user_id', 'playlist']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)
