In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
%matplotlib inline

import os, math
import keras.backend as K
from numpy import random
from __future__ import division
from sklearn import dummy, metrics, cross_validation, ensemble
from keras.layers import Input, Embedding, Flatten, Dropout, Conv2D, merge, normalization, MaxPooling1D,Dense, Dot, Concatenate, Merge, Conv1D, Add,add
from keras.utils import to_categorical
from keras.regularizers import l2
from keras.models import Model
from IPython.display import SVG
from keras.optimizers import Adam
from keras.utils.vis_utils import model_to_dot
from keras.callbacks import ModelCheckpoint
from keras import regularizers
from keras.callbacks import LearningRateScheduler, ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.utils.vis_utils import plot_model
import keras
import deepdish as dd
import tensorflow
from read_activations import *

from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from PIL import Image as PImage

In [8]:
# Root folder for every artifact this notebook reads or writes.
HOME_DIR = '/home/jvidyala/final_data/'

# Sub-folders (trailing slash kept so callers can append a file name directly):
# saved train split, saved test split, and model weight checkpoints.
TRAIN_DATA = os.path.join(HOME_DIR, 'mf_train_data/')
TEST_DATA = os.path.join(HOME_DIR, 'mf_test_data/')
WEIGHTS_DIR = os.path.join(HOME_DIR, 'model_weights/')

Data loading/preprocessing

In [None]:
def create_training_set(triplets_path="/home/jvidyala/data/train_triplets.txt",
                        test_size=0.1, random_state=None):
    """Read the play-count triplets, integer-encode the ids and save a train/test split.

    Parameters
    ----------
    triplets_path : str
        Tab-separated file whose columns are read as user_id, song_id, play_count.
    test_size : float
        Fraction of rows written to the test split.
    random_state : int or None
        Seed forwarded to ``train_test_split``; pass an int for a reproducible split.

    Returns
    -------
    (n_users, n_songs) : tuple of int
        Number of distinct users and songs found in the file.

    Side effects
    ------------
    * Sets the module-level names ``n_users`` and ``n_songs``, which
      ``create_model`` reads to size its embedding tables.
    * Writes six HDF5 files (song ids, user ids, targets) under ``TRAIN_DATA``
      and ``TEST_DATA``.
    """
    # create_model() looks these up as globals, so publish them at module level.
    global n_users, n_songs

    # NOTE(review): skiprows=1 discards the first line of the file even though
    # column names are supplied explicitly -- confirm the file really has a header.
    dataset = pd.read_csv(triplets_path, sep="\t", skiprows=1,
                          names=["user_id", "song_id", "play_count"])

    n_users = len(dataset.user_id.unique())
    n_songs = len(dataset.song_id.unique())

    # Replace the raw string ids by dense integer codes (0 .. n-1) usable as
    # embedding indices.
    dataset.user_id = dataset.user_id.astype('category').cat.codes.values
    dataset.song_id = dataset.song_id.astype('category').cat.codes.values

    # The original referenced songids/userids/y without ever defining them.
    # The play count is assumed to be the regression target -- it is the only
    # remaining column.
    songids = dataset.song_id.values
    userids = dataset.user_id.values
    y = dataset.play_count.values

    # train_test_split returns a (train, test) pair per input array, in order.
    a_songid, b_songid, a_userid, b_userid, a_y, b_y = train_test_split(
        songids, userids, y, test_size=test_size, random_state=random_state)

    # Make sure the output folders exist before writing into them.
    for out_dir in (TRAIN_DATA, TEST_DATA):
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)

    dd.io.save(TRAIN_DATA + 'train_songid.h5', a_songid)
    dd.io.save(TEST_DATA + 'test_songid.h5', b_songid)
    dd.io.save(TRAIN_DATA + 'train_userid.h5', a_userid)
    dd.io.save(TEST_DATA + 'test_userid.h5', b_userid)
    dd.io.save(TRAIN_DATA + 'train_y.h5', a_y)
    dd.io.save(TEST_DATA + 'test_y.h5', b_y)

    return n_users, n_songs

Create model, define callbacks

In [None]:
def step_decay(epoch):
    """Return the learning rate for ``epoch`` under a stepwise halving schedule.

    The rate starts at 0.0005 and is multiplied by 0.5 once for each full
    group of 3 contained in ``epoch + 1``.
    """
    base_rate, decay_factor, epochs_per_step = 0.0005, 0.5, 3

    # Number of halvings applied so far.
    steps_taken = math.floor((epoch + 1) / epochs_per_step)
    return base_rate * math.pow(decay_factor, steps_taken)

In [1]:
def def_callbacks():
    """Assemble the list of Keras callbacks handed to ``fit``.

    The returned list contains, in order: the ``step_decay`` learning-rate
    scheduler, a reduce-on-plateau rule monitoring the training loss, early
    stopping on the validation loss, and a weights-only checkpoint written
    under ``WEIGHTS_DIR``.
    """
    # Checkpoint names embed the epoch number and the validation loss.
    weights_path = WEIGHTS_DIR + 'weights--{epoch:02d}-{val_loss:.2f}.h5'

    # NOTE(review): the scheduler and the plateau rule both set the learning
    # rate; the scheduler presumably overrides the plateau reduction at the
    # start of each epoch -- confirm that using both is intended.
    return [
        LearningRateScheduler(step_decay),
        ReduceLROnPlateau(monitor='loss', factor=0.2,
                          patience=2, min_lr=0.0001),
        EarlyStopping(monitor='val_loss', patience=2),
        ModelCheckpoint(filepath=weights_path, monitor='val_loss',
                        save_weights_only=True),
    ]

In [2]:
def create_model():
    """Build the hybrid matrix-factorisation / MLP play-count regressor.

    Reads the module-level names ``n_songs`` and ``n_users`` to size the
    embedding tables; both must be defined before this is called.

    Architecture
    ------------
    * Inputs: ``song_input`` and ``user_input``, one integer id each.
    * MF branch: element-wise product of 10-d song and user embeddings.
    * MLP branch: concatenated 5-d song and user embeddings passed through
      Dense(20) and Dense(10).
    * The two branch vectors are combined with a dot product, per-song and
      per-user bias embeddings are added, and the result goes through two
      Dense(128) + Dropout(0.4) + BatchNormalization stages to a single
      linear output.

    Returns
    -------
    keras.models.Model
        Uncompiled model taking ``[song_ids, user_ids]`` as inputs.
    """
    song_input = Input(shape=(1,), name='song_input')
    user_input = Input(shape=(1,), name='user_input')

    # Every table is sized by the vocabulary of the ids it looks up. The user
    # tables previously used n_songs, so user ids >= n_songs were out of range.
    # NOTE: weight files saved with the old (n_songs-row) user tables will not
    # load into this corrected model.
    MF_Embedding_Song = Embedding(input_dim=n_songs, output_dim=10, name='mf_embedding_song', input_length=1)
    MF_Embedding_User = Embedding(input_dim=n_users, output_dim=10, name='mf_embedding_user', input_length=1)

    MLP_Embedding_Song = Embedding(input_dim=n_songs, output_dim=5, name='mlp_embedding_song', input_length=1)
    MLP_Embedding_User = Embedding(input_dim=n_users, output_dim=5, name='mlp_embedding_user', input_length=1)

    song_bias = Embedding(input_dim=n_songs, output_dim=1, input_length=1)(song_input)
    user_bias = Embedding(input_dim=n_users, output_dim=1, input_length=1)(user_input)

    # Matrix-factorisation branch: element-wise product of the latent vectors.
    mf_song_latent = Flatten()(MF_Embedding_Song(song_input))
    mf_user_latent = Flatten()(MF_Embedding_User(user_input))
    mf_vector = keras.layers.multiply([mf_song_latent, mf_user_latent])

    # MLP branch: concatenate the latent vectors and pass them through two
    # dense layers; the last one has 10 units to match the MF vector.
    mlp_song_latent = Flatten()(MLP_Embedding_Song(song_input))
    mlp_user_latent = Flatten()(MLP_Embedding_User(user_input))
    mlp_vector = Concatenate()([mlp_song_latent, mlp_user_latent])

    mlp_vector = Dense(20, activation='relu', activity_regularizer=l2(0.001))(mlp_vector)
    mlp_vector = Dense(10, activation='relu', activity_regularizer=l2(0.001))(mlp_vector)

    # Dot product over the feature axis of the two 10-d branch vectors, then
    # add the song and user biases.
    predict_vector = Dot(axes=-1)([mlp_vector, mf_vector])
    predict_vector = add([predict_vector, song_bias, user_bias])
    predict_vector = Flatten()(predict_vector)

    predict_vector = Dropout(0.4)(Dense(128, activation='relu', kernel_regularizer=l2(0.001))(predict_vector))
    predict_vector = normalization.BatchNormalization()(predict_vector)
    predict_vector = Dropout(0.4)(Dense(128, activation='relu', kernel_regularizer=l2(0.001))(predict_vector))
    predict_vector = normalization.BatchNormalization()(predict_vector)
    prediction = Dense(1)(predict_vector)

    model_deep_nmf = Model(inputs=[song_input, user_input], outputs=prediction)

    return model_deep_nmf

In [3]:
def train_model(model_deep_nmf):
    """Compile ``model_deep_nmf`` and fit it on the saved train/test split.

    The arrays are read from the HDF5 files stored under ``TRAIN_DATA`` and
    ``TEST_DATA``. The model is compiled with MSE loss and the Adam optimizer
    and trained for up to 50 epochs with the callbacks from ``def_callbacks``;
    the test split is used as validation data.

    Parameters
    ----------
    model_deep_nmf : keras model
        Model taking ``[song_ids, user_ids]`` as inputs.

    Returns
    -------
    The history object returned by ``fit`` (previously computed but dropped).
    """
    # dd.io.load takes only the path here; the original also passed the
    # not-yet-assigned target variable, which raised UnboundLocalError.
    a_songid = dd.io.load(TRAIN_DATA + 'train_songid.h5')
    b_songid = dd.io.load(TEST_DATA + 'test_songid.h5')
    a_userid = dd.io.load(TRAIN_DATA + 'train_userid.h5')
    b_userid = dd.io.load(TEST_DATA + 'test_userid.h5')
    a_y = dd.io.load(TRAIN_DATA + 'train_y.h5')
    b_y = dd.io.load(TEST_DATA + 'test_y.h5')

    model_deep_nmf.compile(loss='mse', optimizer='adam')
    callbacks_list = def_callbacks()
    history = model_deep_nmf.fit([a_songid, a_userid], a_y,
                                 epochs=50,
                                 validation_data=([b_songid, b_userid], b_y),
                                 batch_size=64, callbacks=callbacks_list)
    return history

In [None]:
def predict_song_playcount(model_deep_nmf, userid,
                           weights_file='weights--09--40.10.h5', song_ids=None):
    """Predict a play count for ``userid`` on every candidate song.

    Parameters
    ----------
    model_deep_nmf : keras model
        Model taking ``[song_ids, user_ids]`` as inputs.
    userid : int
        Integer-encoded user id.
    weights_file : str
        File name, relative to ``WEIGHTS_DIR``, of the weights to load.
    song_ids : array-like or None
        Candidate integer-encoded song ids. When None, the ids stored in
        ``TRAIN_DATA + 'train_songid.h5'`` are used.

    Returns
    -------
    (user_predictions, predicted_songs)
        ``user_predictions`` maps song id -> rounded predicted play count;
        ``predicted_songs`` lists the song ids from highest to lowest
        prediction.
    """
    # NOTE(review): the default name has a double dash before the loss value,
    # while def_callbacks writes 'weights--{epoch:02d}-{val_loss:.2f}.h5' --
    # confirm this file actually exists.
    model_deep_nmf.load_weights(WEIGHTS_DIR + weights_file)

    if song_ids is None:
        # The original read an undefined global `a_songid`; load the saved
        # training ids instead.
        song_ids = dd.io.load(TRAIN_DATA + 'train_songid.h5')

    # The training ids have one entry per (user, song) row; score each song once.
    song_ids = np.unique(song_ids)
    if len(song_ids) == 0:
        return {}, []

    user_ids = np.repeat(userid, len(song_ids))

    # Input order must match the model definition and training: [song, user].
    predictions = model_deep_nmf.predict([song_ids, user_ids])

    user_predictions = {}
    for song, prediction in zip(song_ids, predictions):
        user_predictions[int(song)] = round(prediction[0])

    predicted_songs = sorted(user_predictions, key=user_predictions.get, reverse=True)
    return user_predictions, predicted_songs

In [None]:
def main(userid):
    """Return play-count predictions for ``userid``, training first if needed.

    When ``WEIGHTS_DIR`` is missing or empty, the training set is created and
    the model is trained. The model is always built, so prediction also works
    when weights already exist.

    Parameters
    ----------
    userid : int
        Integer-encoded user id forwarded to ``predict_song_playcount``.

    Returns
    -------
    (user_predictions, prediction_list)
        The two values returned by ``predict_song_playcount``.
    """
    # Keep the model reachable from other notebook cells. The original wrote
    # `global model_deep_nmf = create_model()`, which is a SyntaxError.
    global model_deep_nmf

    needs_training = not os.path.isdir(WEIGHTS_DIR) or not os.listdir(WEIGHTS_DIR)

    if needs_training:
        if not os.path.isdir(WEIGHTS_DIR):
            os.makedirs(WEIGHTS_DIR)
        create_training_set()

    # NOTE(review): create_model reads the globals n_songs / n_users; they must
    # already be defined in the kernel, including when training is skipped.
    model_deep_nmf = create_model()

    if needs_training:
        train_model(model_deep_nmf)

    user_predictions, prediction_list = predict_song_playcount(model_deep_nmf, userid)

    return user_predictions, prediction_list
