In [1]:
# Enable module reloading
%load_ext autoreload
%autoreload 2

%load_ext tensorboard

import datetime
import os
import sys
sys.path.append('..')

import tensorflow as tf
import tensorflow.keras as keras
import numpy as np
import time;

from tensorflow.keras import initializers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Input, Dense, Flatten, concatenate
from src.data.load_data import *
from sklearn.model_selection import train_test_split

In [2]:
layers = 2
units_in_layers = [50, 5]
units_first_layer = 100

# load data // load_data.py
df = load_dataset(filename='../data/Jester-Dataset-ratings.csv')

train, test = train_test_split(df, test_size=0.2)

user_ids, joke_ids, ratings = get_data(df=train, batch_size=100000)

In [3]:
# neural network architecture // model.py

# input layers
user_input = Input(shape=(1,), dtype='float64')
joke_input = Input(shape=(1,), dtype='float64')

# embedding layers
user_embedding = Embedding(
    input_dim=len(user_ids),
    output_dim=int(units_first_layer / 2),
    input_length=1
)

joke_embedding = Embedding(
    input_dim=len(joke_ids),
    output_dim=int(units_first_layer / 2),
    input_length=1
)


# flatten embeddings
user_latent = Flatten()(user_embedding(user_input))
joke_latent = Flatten()(joke_embedding(joke_input))

# concatenate user and joke embeddings

final_vector = concatenate([user_latent, joke_latent])


# hidden layers
for i in range(1, layers):
    layer = Dense(units=units_in_layers[i], activation='relu')
    final_vector = layer(final_vector)

# output layer
output = Dense(units=1, activation='tanh')(final_vector)

# Training routine // train.py

model = Model(
    inputs=[user_input, joke_input],
    outputs=output
)

model.compile(
    optimizer='adam', 
    loss='mean_squared_error', 
    metrics=['accuracy']
)

tensorboard_callback = keras.callbacks.TensorBoard(
    log_dir=os.path.join("../logs", str(time.time())),
    histogram_freq=1)

model.fit(
    x=[np.array(user_ids), np.array(joke_ids)],
    y=np.array(ratings),
    batch_size=100, 
    epochs=10,
    callbacks=[tensorboard_callback],
    validation_split=0.2
)

Train on 80000 samples, validate on 20000 samples
Epoch 1/10
11500/80000 [===>..........................] - ETA: 1:16 - loss: 29.1579 - accuracy: 0.0019

KeyboardInterrupt: 

In [44]:
result = model.predict([np.array(user_ids), np.array(joke_ids)])

In [45]:
pd.DataFrame(result).describe()

Unnamed: 0,0
count,80000.0
mean,0.301933
std,0.934859
min,-1.0
25%,-0.998457
50%,0.999995
75%,1.0
max,1.0


In [46]:
np.column_stack((result, ratings))[:10]

array([[ 1.        ,  2.375     ],
       [ 1.        ,  0.469     ],
       [ 1.        ,  7.406     ],
       [-0.99997735, -4.344     ],
       [ 0.99992305,  0.906     ],
       [ 1.        ,  1.219     ],
       [ 0.99993145, -9.031     ],
       [-0.99925292, -3.031     ],
       [ 1.        ,  3.375     ],
       [ 1.        ,  8.281     ]])

In [47]:
result_data = pd.DataFrame(data={'user': user_ids, 'joke': joke_ids, 'rating': result.T[0]})
real_data = pd.DataFrame(data={'user': user_ids, 'joke': joke_ids, 'rating': ratings})

In [48]:
from src.recommender_model import *
from src.svd import *

recommender_model = RecommenderModel(model)
recommender_model.fit(train)
svd = Svd()
svd.fit(train)

# print(recommender_model.predictions)

users = test['USER_ID'].unique()
precision_model = 0
precision_svd = 0
for user in users:
    user_rec_model = list(recommender_model.recommend(user, 10)['JOKE_ID'])
    user_rec_svd = list(svd.recommend(user, 10)['JOKE_ID'])
    user_test = list(test[test['USER_ID'] == user]['JOKE_ID'])
    
    user_precision_model = len(set(user_rec_model).intersection(user_test)) / 10
    user_precision_svd = len(set(user_rec_svd).intersection(user_test)) / 10
    
    precision_model = precision_model + user_precision_model
    precision_svd = precision_svd + user_precision_svd

print('precision@10 model', precision_model / len(users))
print('precision@10 svd', precision_svd / len(users))

INFO:src.recommender_model:RecommenderModel initialized
INFO:src.recommender_model:Data handling
INFO:src.recommender_model:Predicting
INFO:src.recommender_model:Saving
INFO:src.recommender_model:Done
INFO:src.svd:SVD model initialized
INFO:src.svd:Creating pivot table
INFO:src.svd:Sparse rating with shape: (2388, 140)
INFO:src.svd:Decompomposing matrix
INFO:src.svd:Calculating prediction
INFO:src.svd:Done


precision@10 model 0.016853428955625357
precision@10 svd 0.22469744509189066


In [None]:
df.head()