#### Matrix factorization via NN, contrasting MAE with surpriselib's baseline algos SVD, SVD++

In [1]:
import os
import numpy as np
import pandas as pd


data_path = os.path.expanduser('C:\\Users\\might\\Desktop\\jupyter notebooks\\u.data')

* #### original df containing all 100k data points

In [41]:
df = pd.read_csv(data_path, sep='\t', header=None)
df[2]= df[2].apply(lambda x: int(x>3))
df.head(5)

Unnamed: 0,0,1,2,3
0,196,242,0,881250949
1,186,302,0,891717742
2,22,377,0,878887116
3,244,51,0,880606923
4,166,346,0,886397596


In [95]:
shuffled_df = df.loc[np.random.randint(0, 100000, size=df.shape[0])]

#sm_train_df = shuffled_df[:80000]#small dataframe, trimming data points to random 2000 datapoints
#test_df= shuffled_df[-20000:]

#print('new trimmed dataset:', sm_train_df.shape, '\nnew test dataset:', test_df.shape)
print('shape of dataset:', shuffled_df.shape)

shape of dataset: (100000, 4)


* #### Training & testing results with SVD from surpirselib
    * Using same dataframes for all three

In [96]:
from surprise import SVD
from surprise import Dataset
from surprise import Reader

from surprise.model_selection import train_test_split

rdr = Reader(rating_scale=(0,1))
#train_data = Dataset.load_from_df(sm_train_df[[0,1,2]], reader= rdr)
data = Dataset.load_from_df(shuffled_df[[0,1,2]], reader= rdr)

#trainset = train_data.build_full_trainset()#contains 80k data points

trainset, testset = train_test_split(data, test_size=0.2)

#test_data= Dataset.load_from_df(test_df[[0,1,2]], reader= rdr)
#testset = test_data.build_full_trainset().build_testset()#testset for surprise.SVD

In [97]:
from surprise import accuracy
import time

t1= time.time()

algo = SVD()

algo.fit(trainset)

print('training time:', time.time()-t1)

training time: 3.6322832107543945


In [98]:
pred = algo.test(testset)

print('\nTest results on 80k training data & 20k test data -- rmse:{}, mae:{}'.format(accuracy.rmse(pred), accuracy.mae(pred)))

RMSE: 0.3977
MAE:  0.3390

Test results on 80k training data & 20k test data -- rmse:0.3977382532089304, mae:0.33896013958287446


* #### Training & testing results with SVD++ from surpirselib
    * Using same dataframes for both 

In [115]:
from surprise import SVDpp

t1= time.time()

algo = SVDpp()

algo.fit(trainset)

print('training time:', time.time()-t1)


training time: 132.8025074005127


In [117]:
pred = algo.test(testset)

print('\nTest results on 80k training data & 20k test data -- rmse:{}, mae:{}'.format(accuracy.rmse(pred), accuracy.mae(pred)))

RMSE: 0.4148
MAE:  0.3563

Test results on 80k training data & 20k test data -- rmse:0.414811923180537, mae:0.35634124624684144


* #### Training & testing results with matrix factorization implemented with NN
    * Using same dataframes as above

In [99]:
#print('new trimmed dataset:', sm_train_df.shape, '\nnew test dataset:', test_df.shape)

print('shape of dataset:', shuffled_df.shape)

shape of dataset: (100000, 4)


In [100]:
shuffled_df.head(5)

Unnamed: 0,0,1,2,3
94422,378,38,0,880333383
89311,894,313,1,883518874
32277,498,1007,0,881954219
70178,10,617,1,877892160
30079,57,126,0,883697293


In [106]:
print('max value of user id:', max(shuffled_df[0]),'\nmax value of movie id:', max(shuffled_df[1]),'\nTherefore concatenated shape of input layer:',(max(shuffled_df[0])+max(shuffled_df[1]),))

max value of user id: 943 
max value of movie id: 1682 
Therefore concatenated shape of input layer: (2625,)


In [107]:
x_train_user_in = to_categorical(shuffled_df[0])#contains the one-hot encoded user_id data, shaped (batch_size, max(sm_df[0]))
x_train_movie_in = to_categorical(shuffled_df[1])#contains the one-hot encoded movie_id data, shaped (batch_size, max(sm_df[1]))

y_ratings= shuffled_df[2]
print('shape of x_train_user_in:', x_train_user_in.shape,'\nshape of x_train_movie_in:', x_train_movie_in.shape,
      '\nshape of x_train_user_in:',y_ratings.shape)

shape of x_train_user_in: (100000, 944) 
shape of x_train_movie_in: (100000, 1683) 
shape of x_train_user_in: (100000,)


In [104]:
import keras
from keras.models import Model
from keras.layers import Input, Dense
from keras import metrics
from keras.utils import to_categorical

In [108]:
user_in_layer = Input(shape=(max(shuffled_df[0])+1,))#top half of input layer,+1 to accomodate on-hot encoded vectors
movie_in_layer = Input(shape=(max(shuffled_df[1])+1,))#bottom half of input layer

#total input moves as concatenated user_id & movie_id one-hot encoded forms

hid_layer_u = Dense(100)(user_in_layer)#latent dimension k=100 for user_id #_Embedding(100, )
hid_layer_m= Dense(100)(movie_in_layer)#latent dimension k=100 for movie_id

merge_layer = keras.layers.dot([hid_layer_u, hid_layer_m], axes=1)



predictions = Dense(1, activation='sigmoid')(merge_layer)

model = Model(inputs=[user_in_layer, movie_in_layer], outputs= predictions)

model.compile(loss= 'binary_crossentropy', optimizer='sgd', metrics= ['mae', 'accuracy'])

In [109]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 944)          0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, 1683)         0                                            
__________________________________________________________________________________________________
dense_9 (Dense)                 (None, 100)          94500       input_7[0][0]                    
__________________________________________________________________________________________________
dense_10 (Dense)                (None, 100)          168400      input_8[0][0]                    
__________________________________________________________________________________________________
dot_3 (Dot

In [114]:
model.fit([x_train_user_in, x_train_movie_in], y_ratings, verbose=1, epochs=40, batch_size=32, validation_split=0.2)

Train on 80000 samples, validate on 20000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x1f8ac2e9400>

* #### Therefore with 80k training datapoints & 20k validation datapoints:
    * MAE value with surprise SVD implementation: 0.3390
    * MAE value with surprise SVD++ implementation: 0.3563
    * MAE value after 40 epochs with matrix factorization with NN implementation: 0.3120 
    

