In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import theano.tensor as T
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import Adam 
from tensorflow.python.keras import backend as K
from tensorflow.keras import layers
from tensorflow.keras import initializers
from tensorflow.python.keras.models import Sequential, Model, load_model, save_model
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense
import numpy
from tensorflow.python.keras.layers.core import Dense, Lambda, Activation
from tensorflow.python.keras.layers import Embedding, Input, Dense, merge, Reshape, merge, Flatten, concatenate
from tensorflow.python.keras.regularizers import l2
from time import time
import multiprocessing as mp
import sys
import math
import json, sys, random, os, datetime, math




INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [2]:
%%time
def make_playlist_df(path, num_slices):
    df_list = []
    count = 0
    filenames = os.listdir(r'C:\Users\Mehar Unissa\Desktop\data')
    for filename in sorted(filenames):
        if count < num_slices and filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((path, filename))
            data = json.load(open(fullpath))
            slice_df = pd.DataFrame.from_dict(data['playlists'], orient='columns')
            df_list.append(slice_df)
            count += 1
    return pd.concat(df_list)

path = r'C:\Users\Mehar Unissa\Desktop\data'
traindata = make_playlist_df(path, 10)

Wall time: 6.81 s


In [10]:
# also read in the challenge dataset which has missing songs
# that I want the model to predict
t = json.load(open(r'C:\Users\Mehar Unissa\Desktop\data/challenge_set.json'))
challenge_df = pd.DataFrame.from_dict(t['playlists'], orient='columns')

# combine train and challenge so can use cat code to map
# track ids to an index 0-N across both datasets
train_challengedata = pd.concat([traindata, challenge_df])

In [11]:
complete_array = []
for index, row in train_challengedata.iterrows():
    for track in row['tracks']:
        complete_array.append([track['track_uri'], track['artist_name'], track['track_name'], row['pid'], row['num_holdouts']])
df = pd.DataFrame(complete_array, columns=['trackid', 'artist_name', 'track_name', 'pid', 'num_holdouts'])

print(df.shape)
df.head()   # is a df of all track ids, corresponding artist names, track names and playlist ids

(951568, 5)


Unnamed: 0,trackid,artist_name,track_name,pid,num_holdouts
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Missy Elliott,Lose Control (feat. Ciara & Fat Man Scoop),0,
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Britney Spears,Toxic,0,
2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Beyoncé,Crazy In Love,0,
3,spotify:track:1AWQoqb9bSvzTjaLralEkT,Justin Timberlake,Rock Your Body,0,
4,spotify:track:1lzr43nnXAijIGYnCT8M8H,Shaggy,It Wasn't Me,0,


In [12]:
# turn songs into their unique cat codes so have a 0-N index for tracks
df['trackindex'] = df['trackid'].astype('category').cat.codes
print(len(df['trackindex'].unique()))
df.head()

189359


Unnamed: 0,trackid,artist_name,track_name,pid,num_holdouts,trackindex
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Missy Elliott,Lose Control (feat. Ciara & Fat Man Scoop),0,,12216
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Britney Spears,Toxic,0,,153108
2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Beyoncé,Crazy In Love,0,,13148
3,spotify:track:1AWQoqb9bSvzTjaLralEkT,Justin Timberlake,Rock Your Body,0,,28658
4,spotify:track:1lzr43nnXAijIGYnCT8M8H,Shaggy,It Wasn't Me,0,,43317


In [13]:
# split training and challenge data
train = df[pd.isnull(df['num_holdouts'])]
challenge = df[pd.notnull(df['num_holdouts'])]
train.head()

Unnamed: 0,trackid,artist_name,track_name,pid,num_holdouts,trackindex
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Missy Elliott,Lose Control (feat. Ciara & Fat Man Scoop),0,,12216
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Britney Spears,Toxic,0,,153108
2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Beyoncé,Crazy In Love,0,,13148
3,spotify:track:1AWQoqb9bSvzTjaLralEkT,Justin Timberlake,Rock Your Body,0,,28658
4,spotify:track:1lzr43nnXAijIGYnCT8M8H,Shaggy,It Wasn't Me,0,,43317


In [14]:
challenge.head()

Unnamed: 0,trackid,artist_name,track_name,pid,num_holdouts,trackindex
670568,spotify:track:66U0ASk1VHZsqIkpMjKX3B,AronChupa,Little Swing,1000000,70.0,148529
670569,spotify:track:5MhsZlmKJG6X5kTHkdwC4B,AronChupa,I'm an Albatraoz,1000000,70.0,130788
670570,spotify:track:0GZoB8h0kqXn7XFm4Sj06k,Lorde,Yellow Flicker Beat - From The Hunger Games: M...,1000000,70.0,6745
670571,spotify:track:35kahykNu00FPysz3C2euR,Lorde,White Teeth Teens,1000000,70.0,75377
670572,spotify:track:3G6hD9B2ZHOsgf4WfNu7X1,Lorde,Team,1000000,70.0,79446


In [15]:
%%time
# save data in dok matrix (optimized sparse matrix object)
# create a sparse playlistid x trackindex matrix
# if a playlistid i has song j, mat[i,j]=1
mat = sp.dok_matrix((train.shape[0], len(df['trackindex'].unique())), dtype=np.float32)
for pid, trackindex in zip(train['pid'], train['trackindex']):
    mat[pid, trackindex] = 1.0
# sp.save_npz('spotify_train_matrix.npz', mat)

Wall time: 7.88 s


In [1]:
def get_model(num_users, num_items, latent_dim=8, dense_layers=[64, 32, 16, 8],
              reg_layers=[0, 0, 0, 0], reg_mf=[0,2]):

    # input layer
    input_user = Input(shape=(1,), dtype='int32', name='user_input')
    input_item = Input(shape=(1,), dtype='int32', name='item_input')
    
    # embedding layer
    mf_user_embedding = Embedding(input_dim=num_users, output_dim=latent_dim,
                        name='mf_user_embedding',
                        embeddings_initializer='RandomNormal',
       mf_item_embedding = Embedding(input_dim=num_items, output_dim=latent_dim,
                        name='mf_item_embedding',
                        embeddings_initializer='RandomNormal',
                        embeddings_regularizer=l2(reg_mf[0]), input_length=1.0)
    mlp_user_embedding = Embedding(input_dim=num_users, output_dim=int(dense_layers[0]/2),
                                   name='mlp_user_embedding',
                         embeddings_initializer='RandomNormal',
                         embeddings_regularizer=l2(reg_layers[0]), 
                         input_length=1)
    mlp_item_embedding = Embedding(input_dim=num_items, output_dim=int(dense_layers[0]/2),
                         name='mlp_item_embedding',
                         embeddings_initializer='RandomNormal',
                         embeddings_regularizer=l2(reg_layers[0]), 
                         input_length=1)

    # MF latent vector
    mf_user_latent = Flatten()(mf_user_embedding(input_user))
    mf_item_latent = Flatten()(mf_item_embedding(input_item))
    mf_cat_latent = tf.multiply()([mf_user_latent, mf_item_latent])

    # MLP latent vector
    mlp_user_latent = Flatten()(mlp_user_embedding(input_user))
    mlp_item_latent = Flatten()(mlp_item_embedding(input_item))
    mlp_cat_latent = Concatenate()([mlp_user_latent, mlp_item_latent])
    
    mlp_vector = mlp_cat_latent
    # build dense layer for model
    for i in range(1,len(dense_layers)):
        layer = Dense(dense_layers[i],
                      activity_regularizer=l2(reg_layers[i]),
                      activation='relu',
                      name='layer%d' % i)
        mlp_vector = layer(mlp_vector)

    predict_layer = Concatenate()([mf_cat_latent, mlp_vector])
    result = Dense(1, activation='sigmoid', 
                   kernel_initializer='lecun_uniform',name='result')

    model = Model(inputs=[input_user,input_item], outputs=result(predict_layer))

    return model

# get the training samples
def get_train_samples(train_mat, num_negatives):
    user_input, item_input, labels = [], [], []
    num_user, num_item = train_mat.shape
    for (u, i) in train_mat.keys():
        user_input.append(u)
        item_input.append(i)
        labels.append(1)
        # negative instances
        for t in range(num_negatives):
            j = np.random.randint(num_item)
            while (u, j) in train_mat.keys():
                j = np.random.randint(num_item)
            user_input.append(u)
            item_input.append(j)
            labels.append(0)
    return user_input, item_input, labels

# hyperparameters
loaded = True
verbose = 1
epochs = 5   # I'd prefer to do 10, but time constraints
batch_size = 256
latent_dim = 8
dense_layers = [64, 32, 16, 8]
reg_layers = [0, 0, 0, 0]
reg_mf = [0]
num_negatives = 4
learning_rate = 0.001
learner = 'adam'
dataset = 'spotify'

# loading data
if loaded:
    train_mat = mat
else:
    train_mat = sp.load_npz('spotify_train_matrix.npz')
    
num_users, num_items = train_mat.shape
print('Done loading data!')

NameError: name 'mat' is not defined

In [34]:
%%time

# get model
model = get_model(num_users, num_items, latent_dim, dense_layers, reg_layers, reg_mf)
model.compile(optimizer=Adam(lr=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())
    
# train model
# generate training instances
user_input, item_input, labels = get_train_samples(train_mat, num_negatives)

# training
hist = model.fit([np.array(user_input), np.array(item_input)], np.array(labels), 
                 batch_size=batch_size, epochs=epochs, verbose=verbose, shuffle=True)



Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
item_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
mlp_user_embedding (Embedding)  (None, 1, 32)        21458176    user_input[0][0]                 
__________________________________________________________________________________________________
mlp_item_embedding (Embedding)  (None, 1, 32)        6059488     item_input[0][0]                 
______________________________________________________________________________________________

In [35]:
# save model
model_file = '%s_NCF_%d_%s.h5' % (dataset, latent_dim, str(dense_layers))
model.save(model_file, overwrite=True)

In [36]:
%%time
from keras.models import load_model
from sklearn.cluster import KMeans

# this is a nice rock/oldies playlist
desired_user_id = 500
model_path = 'spotify_NCF_8_[64, 32, 16, 8].h5'
print('using model: %s' % model_path)
model = load_model(model_path)
print('Loaded model!')

mlp_user_embedding_weights = (next(iter(filter(lambda x: x.name == 'mlp_user_embedding', model.layers))).get_weights())

# get the latent embedding for your desired user
user_latent_matrix = mlp_user_embedding_weights[0]
one_user_vector = user_latent_matrix[desired_user_id,:]
one_user_vector = np.reshape(one_user_vector, (1,32))

print('\nPerforming kmeans to find the nearest users/playlists...')
# get 100 similar users
kmeans = KMeans(n_clusters=100, random_state=0, verbose=0).fit(user_latent_matrix)
desired_user_label = kmeans.predict(one_user_vector)
user_label = kmeans.labels_
neighbors = []
for user_id, user_label in enumerate(user_label):
    if user_label == desired_user_label:
        neighbors.append(user_id)
print('Found {0} neighbor users/playlists.'.format(len(neighbors)))

using model: spotify_NCF_8_[64, 32, 16, 8].h5
Loaded model!

Performing kmeans to find the nearest users/playlists...
Found 204 neighbor users/playlists.
Wall time: 17min 1s
Parser   : 201 ms


In [37]:
# get the tracks in similar users' playlists
tracks = []
for user_id in neighbors:
    tracks += list(df[df['pid'] == int(user_id)]['trackindex'])
print('Found {0} neighbor tracks from these users.'.format(len(tracks))) 

users = np.full(len(tracks), desired_user_id, dtype='int32')
items = np.array(tracks, dtype='int32')

print('\nRanking most likely tracks using the NeuMF model...')
# and predict tracks for my user
results = model.predict([users,items],batch_size=100, verbose=0) 
results = results.tolist()
print('Ranked the tracks!')

Found 11891 neighbor tracks from these users.

Ranking most likely tracks using the NeuMF model...
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: invalid syntax (tmplrsn5oar.py, line 48)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: invalid syntax (tmplrsn5oar.py, line 48)
Ranked the tracks!


In [38]:
results_df = pd.DataFrame(np.nan, index=range(len(results)), columns=['probability','track_name', 'track artist'])
print(results_df.shape)

# loop through and get the probability (of being in the playlist according to my model), the track, and the track's artist 
for i, prob in enumerate(results):
    results_df.loc[i] = [prob[0], df[df['trackindex'] == i].iloc[0]['track_name'], df[df['trackindex'] == i].iloc[0]['artist_name']]
results_df = results_df.sort_values(by=['probability'], ascending=False)

results_df.head(20)

(11891, 3)


Unnamed: 0,probability,track_name,track artist
4962,0.999999,Runaway,Timeflies
3177,0.999999,Oceans Away,Elton John
6228,0.999999,Yo Me Imagino,"Franco ""El Gorilla"""
715,0.999995,Daisy Cutter,311
9901,0.999995,""" Frühlingssinfonie "" , Sinfonie Nr. 1, B-Dur,...",Heribert Brandt
9158,0.999995,Christmastime Is Here - Vocal,Vince Guaraldi Trio
5018,0.999995,Elder Scrolls,Piano Tribute Players
10950,0.999995,La Negra Caderona,Aniceto Molina
6091,0.999995,The Three Of Me,William Bell
10715,0.999995,La Llave De Mi Corazón,Don Omar


In [39]:
df[df['pid'] == 500].head(20)


Unnamed: 0,trackid,artist_name,track_name,pid,num_holdouts,trackindex
33415,spotify:track:2H3ZUSE54pST4ubRd5FzFR,Marvin Gaye,Ain't No Mountain High Enough,500,,55489
33416,spotify:track:1QEEqeFIZktqIpPI4jSVSF,Boston,More Than a Feeling,500,,34827
33417,spotify:track:7BY005dacJkbO6EPiOh2wb,The Animals,House Of The Rising Sun,500,,174830
33418,spotify:track:43btz2xjMKpcmjkuRsvxyg,John Mellencamp,Jack & Diane,500,,98805
33419,spotify:track:00MI0oGDVJYM1qWbyUOIhH,Tommy Tutone,867-5309 / Jenny,500,,148
33420,spotify:track:6N1EjQjnvhOjFrF6oUmGPa,Bryan Adams,Summer Of '69,500,,154990
33421,spotify:track:3vV3cr2TpPqFk07zxYUbla,Quiet Riot,Cum on Feel the Noize,500,,95644
33422,spotify:track:0XIvZ82aDF7JiSi3ZE320u,Scorpions,Rock You Like A Hurricane,500,,13331
33423,spotify:track:5tVA6TkbaAH9QMITTQRrNv,Tom Petty,Free Fallin',500,,143462
33424,spotify:track:63OFKbMaZSDZ4wtesuuq6f,Steppenwolf,Born To Be Wild,500,,147302
