In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras import Input
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split


# 11.1 Data preparation


In [4]:
# Read data from 2 files:
df_R = pd.read_csv('ratings.csv')
df_M = pd.read_csv('movies.csv')

#Join dataframe rating and movie
df = pd.merge(df_R,df_M)

print(df)

          userId  movieId  rating   timestamp                         title  \
0              1      296     5.0  1147880044           Pulp Fiction (1994)   
1              3      296     5.0  1439474476           Pulp Fiction (1994)   
2              4      296     4.0  1573938898           Pulp Fiction (1994)   
3              5      296     4.0   830786155           Pulp Fiction (1994)   
4              7      296     4.0   835444730           Pulp Fiction (1994)   
...          ...      ...     ...         ...                           ...   
25000090  162358   200192     2.0  1553453039   Den frusna leoparden (1986)   
25000091  162358   200194     2.0  1553453843             Tough Luck (2004)   
25000092  162386   139970     3.5  1549215965  I Don't Speak English (1995)   
25000093  162386   200726     4.0  1554651417          The Graduates (1995)   
25000094  162386   200728     4.0  1554651472    Il pesce innamorato (1999)   

                               genres  
0         C

In [5]:
#Preapare Train, validation data
df_train, df_val = train_test_split(df, train_size=0.007,test_size=0.003, stratify=df['rating'])#using option stratify=df.rating

# Re-numbering userId ( to eliminate skipping id)
userId_list = list(set(df['userId']))
index = 0
dict_users = {}
for ids in sorted(userId_list):
    dict_users.update ({ids : index})
    index += 1

# Re-numbering movieId
movieId_list = list(set(df['movieId']))
index = 0
dict_movies = {}
for ids in sorted(movieId_list):
    dict_movies.update ({ids : index})
    index += 1


In [6]:
#Map new id to train, validation for both “userId”, “movieId”
df_train["userId"] = df_train["userId"].map(dict_users)
df_train["movieId"] = df_train["movieId"].map(dict_movies)
df_val["userId"] = df_val["userId"].map(dict_users)
df_val["movieId"] = df_val["movieId"].map(dict_movies)

In [7]:
num_unique_users = len(userId_list)
num_unique_movies = len(movieId_list)

# 11.2 model preparing and training

In [8]:
# Create Model NCF
EMBEDDING_SIZE=100
users_input = Input(shape=(1,), name="users_input")
users_embedding = Embedding(num_unique_users + 1, EMBEDDING_SIZE,name="users_embeddings")(users_input)
users_bias = Embedding(num_unique_users + 1, 1, name="users_bias")(users_input)
movies_input = Input(shape=(1,), name="movies_input")
movies_embedding = Embedding(num_unique_movies + 1, EMBEDDING_SIZE,name="movies_embeddings")(movies_input)
movies_bias = Embedding(num_unique_movies + 1, 1, name="movies_bias")(movies_input)
dot_product_users_movies = tf.math.multiply(users_embedding, movies_embedding)
input_terms = dot_product_users_movies + users_bias + movies_bias
input_terms = Flatten(name="fl_inputs")(input_terms)
output = Dense(1, activation="relu", name="output")(input_terms)
model = Model(inputs=[users_input, movies_input], outputs=output)
opt_adam = Adam(lr = 0.005)
model.compile(optimizer=opt_adam, loss= ['mse'], metrics=['mean_absolute_error'])


  super().__init__(name, **kwargs)


In [9]:
# ------------ View Model Summary -------------------------------
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 users_input (InputLayer)       [(None, 1)]          0           []                               
                                                                                                  
 movies_input (InputLayer)      [(None, 1)]          0           []                               
                                                                                                  
 users_embeddings (Embedding)   (None, 1, 100)       16254200    ['users_input[0][0]']            
                                                                                                  
 movies_embeddings (Embedding)  (None, 1, 100)       5904800     ['movies_input[0][0]']           
                                                                                              

In [12]:
# ------------ Training Model -------------------------------
BS = [512,1024,2048] #[64, 128, 256, 512, 1024, 2048] # try at least 3 values
EP = [10,20,50] #[5, 10, 20, 50, 60 ] # try at least 3 values
# history = model.fit(x=[df_train.userId, df_train.movieId],y=df_train.rating,batch_size=BS,epochs=EP,verbose=1,validation_data=([df_val.userId, df_val.movieId], df_val.rating))


In [13]:
for i in range(len(BS)):    
    history = model.fit(x=[df_train.userId, df_train.movieId],y=df_train.rating,batch_size=BS[i],epochs=EP[i],verbose=1,validation_data=([df_val.userId, df_val.movieId], df_val.rating))    
    # ------------ Visualize loss -------------------------------
    plt.plot(history.history["mean_absolute_error"],'y',linewidth=2,label ='train')
    plt.plot(history.history["val_mean_absolute_error"],'m',linewidth=2,label ='test')
    plt.title('model loss mean absolute error')
    plt.xlabel('Epochs')
    plt.ylabel('mean absolute error')
    plt.legend()
    fig_name = (f'../Picture/Lab11/MAE_Bs{BS[i]}_Ep{EP[i]}.png')
    plt.savefig(fig_name,dpi=500)
    plt.show()
    plt.plot(history.history["loss"],'y',linewidth=2,label ='train')
    plt.plot(history.history["val_loss"],'m',linewidth=2,label ='test')
    plt.title('model loss MSE')
    plt.xlabel('Epochs')
    plt.ylabel('mean square error')
    plt.legend()
    fig_name = (f'../Picture/Lab11/MSE_Bs{BS[i]}_Ep{EP[i]}.png')
    plt.savefig(fig_name,dpi=500)
    plt.show()
    out_path = f'../Picture/Lab11/predict_Bs{BS[i]}_Ep{EP[i]}.csv'
    userPredictR = pd.DataFrame(model.predict([df_val.userId, df_val.movieId]))
    pd.DataFrame.to_csv(userPredictR,out_path)

Epoch 1/10
 76/342 [=====>........................] - ETA: 50s - loss: 7.4630 - mean_absolute_error: 2.3567

In [None]:
# ------------ Model Prediction -------------------------------
userPredictR = pd.DataFrame(model.predict([df_val.userId, df_val.movieId]))
userPredictR
pd.DataFrame.to_csv(userPredictR,f'../Picture/Lab11/predict.csv')



In [None]:
# plt.plot(history.history["mean_absolute_error"],'y',linewidth=2,label ='train')
# plt.plot(history.history["val_mean_absolute_error"],'m',linewidth=2,label ='test')
# plt.title('model loss mean absolute error')
# plt.xlabel('Epochs')
# plt.ylabel('mean absolute error')
# plt.legend()
# plt.plot()

NameError: name 'plt' is not defined