<a href="https://colab.research.google.com/github/orenshor/eCommerce_project/blob/master/task_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Embedding, Input, Dense, Flatten, Dropout
from keras.optimizers import Adam, Adamax
from keras.layers import Concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [0]:
RATING_DATA_TEST_FILE = "u1.test"
RATING_DATA_TRAIN_FILE = "u1.base"
MODEL_WEIGHTS_FILE = "u_emb_weights.h5"

USER_DATA_FILE = 'u.user'
MODEL_WEIGHTS_FILE_CORE = 'u_emb_weights'

In [0]:
#import of the data

user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
movie_cols = ['user_id','movie_id','rating','timestamp']

# load users data
user_data = pd.read_csv(USER_DATA_FILE, sep='|', engine='python', encoding='latin-1', names=user_cols)

# replace values in the data
user_data['gender'].replace(['F','M'],['0','1'],inplace=True)
user_data['age'] = pd.cut(user_data['age'],bins=[0,18,25,30,40,50,100], labels=["0","1","2","3","4","5"])

# load movies data
ratings_train = pd.read_csv(RATING_DATA_TRAIN_FILE, sep='\t', engine='python', encoding='latin-1', names=movie_cols)

# marge users info and ratings 
ratings_train = pd.merge(ratings_train, user_data, on='user_id',how='inner')
print(ratings_train.head())
# train
max_userid = ratings_train['user_id'].drop_duplicates().max()
max_movieid = ratings_train['movie_id'].drop_duplicates().max()
ratings_train['user_emb_id'] = ratings_train['user_id'] - 1
ratings_train['movie_emb_id'] = ratings_train['movie_id'] - 1
print(str(len(ratings_train))+' ratings loaded from train')

# test
ratings_test = pd.read_csv(RATING_DATA_TEST_FILE, sep='\t',   engine='python', encoding='latin-1',  names=movie_cols)
ratings_test = pd.merge(ratings_test, user_data, on='user_id', how='inner')
print(ratings_test.head())
ratings_test['user_emb_id'] = ratings_test['user_id'] - 1
ratings_test['movie_emb_id'] = ratings_test['movie_id'] - 1
print(str(len(ratings_test))+' ratings loaded from test')



In [0]:
# train values
train_users = ratings_train['user_emb_id'].values
train_movies = ratings_train['movie_emb_id'].values
train_ratings = ratings_train['rating'].values
train_gender = ratings_train['gender'].values
train_age = ratings_train['age'].values

Test_Users = ratings_test['user_emb_id'].values
Test_Movies = ratings_test['movie_emb_id'].values
Test_Ratings = ratings_test['rating'].values

In [0]:
# model from ex. 4 with adjustments
def get_ncf_model_gen1(num_users, num_items, latent_dim, hidden_dim,do):
    # Input variables
    user_input = Input(shape=(1,), dtype='int32', name = 'user_input')
    item_input = Input(shape=(1,), dtype='int32', name = 'item_input')
    gender_input = Input(shape=(1,), dtype='float32', name = 'gender_input')
    age_input = Input(shape=(1,), dtype='float32', name = 'age_input')

    NCF_Embedding_User = Embedding(input_dim = num_users, output_dim = latent_dim, name = 'user_embedding', input_length=1)
    NCF_Embedding_Item = Embedding(input_dim = num_items, output_dim = latent_dim, name = 'item_embedding', input_length=1)   
    
    # Crucial to flatten an embedding vector!
    user_latent = Flatten()(NCF_Embedding_User(user_input))
    item_latent = Flatten()(NCF_Embedding_Item(item_input))
    
    # Concat user and item embeddings with gender
    conc = Concatenate()([user_latent, item_latent, gender_input, age_input])
    drop = Dropout(do)(conc)
    hid1 = Dense(hidden_dim, activation='relu')(conc)
    drop2  = Dropout(do)(hid1)
    prediction = Dense(1, activation='relu', kernel_initializer='lecun_uniform', name = 'prediction')(drop2)
    
    model = Model(inputs=[user_input, item_input, gender_input, age_input], outputs=prediction)
    print("ncf model gender 1")
    model.summary()

    return model

In [0]:
K_LATENT = 20
hidden_dim = 20
do = 0.3

NCF_G_model1 = get_ncf_model_gen1(max_userid, max_movieid, K_LATENT,hidden_dim, do)
NCF_G_model1.compile(loss='mse',optimizer=Adamax(),metrics=['mae'])

In [0]:
callbacks_ncfg = [EarlyStopping('val_loss', patience=20), 
             ModelCheckpoint(MODEL_WEIGHTS_FILE_CORE +'_ncfg_'+str(do)+'_'+str(K_LATENT)+'.h5', save_best_only=True)]
history_history_ncfg = NCF_G_model1.fit([train_users, train_movies, train_gender, train_age], train_ratings, epochs=100, validation_split=.1, verbose=1, callbacks=callbacks_ncfg, batch_size = 32)

In [29]:
# test values
test_users = ratings_test['user_emb_id'].values
test_movies = ratings_test['movie_emb_id'].values
test_ratings = ratings_test['rating'].values
test_gender = ratings_test['gender'].values
test_age = ratings_test['age'].values

preddict_model_gen = NCF_G_model1.predict([test_users,test_movies, test_gender, test_age])
test_predict1 = pd.DataFrame(data=preddict_model_gen, columns=['prediction'])
test_predict1['actual_rating'] = test_ratings

MAE1 = np.sum(abs(test_predict1['actual_rating']-test_predict1['prediction']))/test_predict1.shape[0]
print("MAE = "+ str(MAE1))

MAE = 0.739847119140625


In [0]:
def get_ncf_model_gen2(num_users, num_items, latent_dim, hidden_dim,do):
    # Input variables
    user_input = Input(shape=(1,), dtype='int32', name = 'user_input')
    item_input = Input(shape=(1,), dtype='int32', name = 'item_input')
    gender_input = Input(shape=(1,), dtype='float32', name = 'gender_input')
    age_input = Input(shape=(1,), dtype='float32', name = 'age_input')

    NCF_Embedding_User = Embedding(input_dim = num_users, output_dim = latent_dim, name = 'user_embedding', input_length=1)
    NCF_Embedding_Item = Embedding(input_dim = num_items, output_dim = latent_dim, name = 'item_embedding', input_length=1)   
    
    # Crucial to flatten an embedding vector!
    user_latent = Flatten()(NCF_Embedding_User(user_input))
    item_latent = Flatten()(NCF_Embedding_Item(item_input))
    
    # Concat user and item embeddings with gender
    conc = Concatenate()([user_latent, item_latent, gender_input, age_input])
    drop = Dropout(0.3)(conc)
    hid1 = Dense(hidden_dim, activation='relu')(conc)
    drop2  = Dropout(do)(hid1)
    hid2 = Dense(hidden_dim,activation='relu')(drop2)
    drop3  = Dropout(do)(hid2)
    prediction = Dense(1, activation='relu', kernel_initializer='lecun_uniform', name = 'prediction')(drop3)
    
    model = Model(inputs=[user_input, item_input, gender_input, age_input], outputs=prediction)
    print("ncf model gender 2")
    model.summary()

    return model

In [0]:
K_LATENT = 20
hidden_dim = 20
do = 0.5
NCF_G_model2 = get_ncf_model_gen2(max_userid, max_movieid, K_LATENT, hidden_dim, do)
NCF_G_model2.compile(loss='mse',optimizer=Adamax(),metrics=['mae'])

In [0]:
callbacks_ncfg2 = [EarlyStopping('val_loss', patience=20), 
             ModelCheckpoint(MODEL_WEIGHTS_FILE_CORE+'_ncfg_'+str(do)+'_'+str(K_LATENT)+'.h5', save_best_only=True)]
history_history_ncfg2 = GNCFG_model2.fit([Users_train, Movies_train, Gender_train, Age_train], Ratings_train, epochs=100, validation_split=.1, verbose=1, callbacks=callbacks_ncfg2, batch_size = 32)