In [1]:


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split



# Load the ratings file; the four columns are named explicitly via names=.
dataset = pd.read_csv('/content/drive/MyDrive/CS550 final project/final proj code/small data/ratings.csv', header=0, names=['user_id', 'movie_id', 'rating', 'timestamp'])
#train = pd.read_csv('/content/drive/MyDrive/CS550 final project/final proj code/train_data.csv', header=0, names=['user_id', 'movie_id', 'rating', 'timestamp', 'title', 'genres'])
#test = pd.read_csv('/content/drive/MyDrive/CS550 final project/final proj code/test_data.csv', header=0, names=['user_id', 'movie_id', 'rating', 'timestamp', 'title', 'genres'])


# Map movie IDs to contiguous integers in [1, num_movies], numbered in order of
# first appearance. pd.factorize yields 0-based codes in that same order, so a
# single vectorized call replaces the per-row iterrows()/.at loop.
movie_codes, original_movie_ids = pd.factorize(dataset['movie_id'])

# original movie ID -> new contiguous ID (1-based)
movie_id_to_new_id = {
    original_id: new_id
    for new_id, original_id in enumerate(original_movie_ids.tolist(), start=1)
}
dataset['movie_id'] = movie_codes + 1

num_users = len(dataset.user_id.unique())
num_movies = len(dataset.movie_id.unique())


# Fixed seed so the 80/20 split (and every table derived from `test`) is the
# same on each Restart & Run All. Model training itself is still unseeded.
train, test = train_test_split(dataset, test_size=0.2, random_state=42)


In [2]:
# Row count, distinct users, distinct movies -- one value per line.
print(len(dataset), num_users, num_movies, sep='\n')

100836
610
9724


## GMF (Generalized Matrix Factorization) LAYER

In [3]:
from keras.models import Model, Sequential
from keras.layers import Embedding, Flatten, Input, add, concatenate, Dropout, Dense, BatchNormalization
from tensorflow.python.keras.layers import Dense


import tensorflow as tf

In [4]:


# Build the GMF baseline: one embedding per movie and per user, scored by
# the dot product of the two flattened embedding vectors.

# Width of each embedding vector (second argument to Embedding below).
gmf_latent_dim = 10

# One integer ID per sample for each input.
movie_input = Input(shape=[1],name='movie-input')

user_input = Input(shape=[1],name='user-input')



# MF Embeddings
# Movie IDs were remapped to 1..num_movies when the data was loaded, so the
# table is sized num_movies + 1 to cover the largest index.
movie_embedding_mf = Embedding(num_movies + 1, gmf_latent_dim, name='movie-embedding-mf')(movie_input)
movie_vec_mf = Flatten(name='flatten-movie-mf')(movie_embedding_mf)
#movie_vec_mf = Dropout(0.2)(movie_vec_mf)

# NOTE(review): assumes every user_id is <= num_users -- TODO confirm against the ratings file.
user_embedding_mf = Embedding(num_users + 1, gmf_latent_dim, name='user-embedding-mf')(user_input)
user_vec_mf = Flatten(name='flatten-user-mf')(user_embedding_mf)
#user_vec_mf = Dropout(0.2)(user_vec_mf)


# Predicted rating = dot product of the movie and user vectors.
dotted = tf.keras.layers.dot([movie_vec_mf, user_vec_mf], axes=-1, name='DotProduct')
# Input order is [user, movie]; fit/predict calls must pass data in that order.
model = Model([user_input, movie_input], dotted)

# Compile with the Adam optimizer and MSE loss; also monitor MAE and MSE as metrics.
model.compile('adam', loss='mean_squared_error', metrics=['mae', 'mse'])


In [5]:
# Print the layer-by-layer summary of the GMF model built in the previous cell.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 movie-input (InputLayer)       [(None, 1)]          0           []                               
                                                                                                  
 user-input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 movie-embedding-mf (Embedding)  (None, 1, 10)       97250       ['movie-input[0][0]']            
                                                                                                  
 user-embedding-mf (Embedding)  (None, 1, 10)        6110        ['user-input[0][0]']             
                                                                                              

In [6]:

from sklearn.metrics import mean_absolute_error

# 10 epochs is used because neither 5 nor 15 produced better results.
history = model.fit(
    x=[train['user_id'], train['movie_id']],
    y=train['rating'],
    epochs=10,
)


# Ground truth for the held-out split, and the GMF predictions for the same rows.
gmf_true = test['rating']
gmf_pred = model.predict(x=[test['user_id'], test['movie_id']])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [7]:
# MAE of the GMF predictions on the test split (displayed as the cell output).
mean_absolute_error(y_true=gmf_true, y_pred=gmf_pred)

0.7834510124548488

In [8]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error


In [9]:
# MAE and RMSE of the GMF predictions on the test split.
mae = mean_absolute_error(gmf_true, gmf_pred)
# Take the square root explicitly: the `squared=False` flag of
# mean_squared_error is deprecated and removed in newer scikit-learn releases.
rms = np.sqrt(mean_squared_error(gmf_true, gmf_pred))
print(mae)
print(rms)

0.7834510124548488
1.110617256259809


In [10]:
# Keep the GMF predictions under a separate name for the result tables built later.
GMF_pred = gmf_pred

In [11]:
# Number of test rows that received a GMF prediction.
print(gmf_pred.shape[0])

20168


## MLP (Multi-Layer Perceptron) Layer APPLICATION IN NCF


In [12]:
from keras.models import Model, Sequential
from keras.layers import Embedding, Flatten, Input, add, concatenate, Dropout, Dense, BatchNormalization

# Build the combined model: an MLP tower over concatenated user/movie
# embeddings plus a dot-product (MF) branch, merged by a small dense head.

# Embedding widths for the MLP branch (movie, user) and for the MF branch.
movie_latent_dim = 10
user_latent_dim = 10
gmf_latent_dim = 10

# Define inputs
movie_input = Input(shape=[1],name='movie-input')
user_input = Input(shape=[1], name='user-input')

# MLP Embeddings
# Separate embedding tables from the MF branch below; tables are sized
# count + 1 (movie IDs were remapped to 1..num_movies at load time).
movie_embedding_mlp = Embedding(num_movies + 1, movie_latent_dim, name='movie_id-mlp')(movie_input)
movie_vec_mlp = Flatten(name='flatten-movie-mlp')(movie_embedding_mlp)

user_embedding_mlp = Embedding(num_users + 1, user_latent_dim, name='user-id-mlp')(user_input)
user_vec_mlp = Flatten(name='flatten-user-mlp')(user_embedding_mlp)

# MF Embeddings
# Unlike the standalone GMF model, dropout (rate 0.2) is applied to both MF vectors here.
movie_embedding_mf = Embedding(num_movies + 1, gmf_latent_dim, name='movie-embedding-mf')(movie_input)
movie_vec_mf = Flatten(name='flatten-movie-mf')(movie_embedding_mf)
movie_vec_mf = Dropout(0.2)(movie_vec_mf)

user_embedding_mf = Embedding(num_users + 1, gmf_latent_dim, name='user-embedding-mf')(user_input)
user_vec_mf = Flatten(name='flatten-user-mf')(user_embedding_mf)
user_vec_mf = Dropout(0.2)(user_vec_mf)


# MLP layers (decrease density)
# Layer widths shrink 120 -> 75 -> 50 -> 20 -> 1.
concat = concatenate([movie_vec_mlp, user_vec_mlp])
concat_dropout = Dropout(0.2)(concat)
#layer 1
fc_1 = Dense(120, name='fc-1', activation='relu')(concat_dropout)
fc_1_bn = BatchNormalization(name='batch-norm-1')(fc_1)
fc_1_dropout = Dropout(0.2)(fc_1_bn)
#layer 2
fc_2 = Dense(75, name='fc-2', activation='relu')(fc_1_dropout)
fc_2_bn = BatchNormalization(name='batch-norm-2')(fc_2)
fc_2_dropout = Dropout(0.2)(fc_2_bn)


# layer 3
# Batch norm and dropout are disabled (commented out) for layers 3 and 4.
fc_3 = Dense(50, name='fc-3', activation='relu')(fc_2_dropout)
#fc_3_bn = BatchNormalization(name='batch-norm-3')(fc_3)
#fc_3_dropout = Dropout(0.2)(fc_3_bn)

# layer 4
fc_4 = Dense(20, name='fc-4', activation='relu')(fc_3)
#fc_4_bn = BatchNormalization(name='batch-norm-4')(fc_4)
#fc_4_dropout = Dropout(0.2)(fc_4_bn)

# Prediction from both layers
# ncf_layer result
# Single-unit output of the MLP tower. Note that the name `ncf_pred` is rebound
# to the prediction array in the fit/predict cell further down.
ncf_pred = Dense(1, name='NCF-pred', activation='relu')(fc_4)
# from MF section, the MF layer's result
dotted = tf.keras.layers.dot([movie_vec_mf, user_vec_mf], axes=1, normalize=False, name='DotProduct')

# combine results
# Concatenate the MF output with the MLP output along the last axis.
combine_mlp_mf = concatenate([dotted, ncf_pred], axis=-1)

# Two 100-unit dense layers on top of the merged pair; no activation argument is passed.
result_combine = Dense(100, name='combine-mf-mlp')(combine_mlp_mf)
deep_combine = Dense(100, name='fully-connected-1')(result_combine)

# Final prediction
#result = Dense(1, name='result', activation='relu')(combine_mlp_mf)
result = Dense(1, name='result', activation='relu')(deep_combine)

# Input order is [user, movie], matching the fit/predict calls.
model = Model([user_input, movie_input], result)
# Adam with a learning rate of 0.01; the loss here is MAE (the GMF model above uses MSE).
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), loss='mean_absolute_error', metrics=['mae', 'mse'])

In [13]:
# Print the layer-by-layer summary of the combined MLP + MF model.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 movie-input (InputLayer)       [(None, 1)]          0           []                               
                                                                                                  
 user-input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 movie_id-mlp (Embedding)       (None, 1, 10)        97250       ['movie-input[0][0]']            
                                                                                                  
 user-id-mlp (Embedding)        (None, 1, 10)        6110        ['user-input[0][0]']             
                                                                                            

In [14]:


# Train the combined model for 10 epochs on the training split.
history = model.fit(
    x=[train['user_id'], train['movie_id']],
    y=train['rating'],
    epochs=10,
)

# Ground truth for the held-out split, and the model's predictions for the same rows.
ncf_true = test['rating']
ncf_pred = model.predict(x=[test['user_id'], test['movie_id']])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
#from sklearn.metrics import mean_squared_error
# MAE and RMSE of the NCF predictions on the test split.
mae = mean_absolute_error(ncf_true, ncf_pred)
# Take the square root explicitly: the `squared=False` flag of
# mean_squared_error is deprecated and removed in newer scikit-learn releases.
rms = np.sqrt(mean_squared_error(ncf_true, ncf_pred))
print(mae)
print(rms)

0.7041938564149409
0.9266612747940155


In [16]:
# Keep the NCF predictions under a separate name for the result tables built later.
MLP_pred = ncf_pred

## COMPILE TABLE

In [None]:
#

In [18]:
# Independent copies of the test split, one per model, so each can receive
# its own prediction column without touching `test`.
MLP_table, GMF_table = test.copy(deep=True), test.copy(deep=True)

In [None]:
# Inspect the NCF table before the prediction column is added.
print(MLP_table)

In [None]:
# Inspect the GMF table before the prediction column is added.
print(GMF_table)

In [21]:
# Place the NCF predictions at column position 3, right after the true rating,
# which is the positional layout the metric functions unpack.
# NOTE(review): 'NCP_est' looks like a typo for 'NCF_est'; left unchanged because
# it is the header written to the CSV files. This cell is not re-runnable: a
# second insert of the same column raises.
MLP_table.insert(loc=3, column='NCP_est', value=MLP_pred)

In [None]:
#print(MLP_table)

In [23]:
# Place the GMF predictions at column position 3, right after the true rating,
# which is the positional layout the metric functions unpack.
GMF_table.insert(loc=3, column='GMF_est', value=GMF_pred)

In [None]:
#print(GMF_table)

In [25]:
# Persist the GMF test-set predictions (index column included).
GMF_table.to_csv(path_or_buf='GMF results.csv')

In [26]:
# Persist the NCF test-set predictions (index column included).
MLP_table.to_csv(path_or_buf='NCF results.csv')

In [27]:
# Start the combined table from a copy of the NCF table (which already holds
# the NCF estimate column); the GMF estimates are added in the next cell.
compiled_results = MLP_table.copy(deep=True)

In [28]:
# Add the GMF estimates at position 3, pushing the NCF estimate column to position 4.
compiled_results.insert(loc=3, column='GMF_est', value=GMF_pred)

In [None]:
# Inspect the combined table holding both models' estimates.
print(compiled_results)

In [30]:
# Persist the combined NCF + GMF prediction table (index column included).
compiled_results.to_csv(path_or_buf='NCF+GMF results.csv')

## 4 Measures

In [31]:
from collections import defaultdict
import math

In [32]:
#test_data_pred_list = test_data_pred_table.values.tolist()

def recall_function(dataframe, top_ratings_from_user):
    """Per-user hit ratio over each user's highest truly-rated items.

    Every row of ``dataframe`` is unpacked positionally as
    (user_id, movie_id, true_rating, predicted_rating, <ignored>).

    For each user the rows are ordered by TRUE rating, highest first, and at
    most ``top_ratings_from_user`` of them are examined. The stored score is

        (# examined rows with true >= 3.5 and predicted >= 3.5) / (# examined rows)

    or 1 when no row was examined.

    NOTE(review): ranking by the true rating and dividing by the number of
    examined rows differs from the usual recall@k definition -- confirm this
    is the intended measure.

    Returns:
        defaultdict mapping user_id -> one-element list holding that score.
    """
    pairs_by_user = defaultdict(list)
    for user_id, _movie_id, true_rating, predicted_rating, _ in dataframe.values.tolist():
        pairs_by_user[user_id].append([true_rating, predicted_rating])

    recall_list = defaultdict(list)

    for user_id, pairs in pairs_by_user.items():
        # Stable descending sort on the true rating.
        pairs.sort(key=lambda pair: pair[0], reverse=True)

        examined = 0
        hits = 0
        for position, (true_rating, predicted_rating) in enumerate(pairs):
            if position >= top_ratings_from_user:
                break
            examined += 1
            if true_rating >= 3.5 and predicted_rating >= 3.5:
                hits += 1

        recall_list[user_id].append(hits / examined if examined != 0 else 1)

    return recall_list
     

In [33]:
#test_data_pred_list = test_data_pred_table.values.tolist()

def precision_function(dataframe, top_ratings_from_user):
    """Per-user share of relevant top items that were also predicted relevant.

    ``dataframe`` is wrapped in ``pd.DataFrame`` and every row is unpacked
    positionally as (user_id, movie_id, true_rating, predicted_rating, <ignored>).

    For each user the rows are ordered by TRUE rating, highest first, and at
    most ``top_ratings_from_user`` of them are examined. The stored score is

        (# examined rows with true >= 3.5 and predicted >= 3.5)
        / (# examined rows with true >= 3.5)

    or 1 when no examined row has a true rating of at least 3.5.

    NOTE(review): ranking by the true rating and dividing by the number of
    relevant rows differs from the usual precision@k definition -- confirm
    this is the intended measure.

    Returns:
        defaultdict mapping user_id -> one-element list holding that score.
    """
    rows = pd.DataFrame(dataframe).values.tolist()

    pairs_by_user = defaultdict(list)
    for user_id, _movie_id, true_rating, predicted_rating, _ in rows:
        pairs_by_user[user_id].append([true_rating, predicted_rating])

    p_list = defaultdict(list)

    for user_id, pairs in pairs_by_user.items():
        # Stable descending sort on the true rating.
        pairs.sort(key=lambda pair: pair[0], reverse=True)

        relevant = 0
        hits = 0
        for position, (true_rating, predicted_rating) in enumerate(pairs):
            if position >= top_ratings_from_user:
                break
            if true_rating >= 3.5:
                relevant += 1
                if predicted_rating >= 3.5:
                    hits += 1

        p_list[user_id].append(hits / relevant if relevant != 0 else 1)

    return p_list
     

In [34]:
def get_average(measure_list):
    """Mean of the first element of every value in ``measure_list``.

    ``measure_list`` is a mapping whose values are sequences (as produced by
    recall_function / precision_function). Raises ZeroDivisionError when the
    mapping is empty.
    """
    running_sum = 0
    for key in measure_list:
        running_sum += measure_list[key][0]

    return running_sum / len(measure_list)
    

In [35]:
def f_measure(recall, precision):
    """Harmonic mean of ``recall`` and ``precision`` (the F1 score).

    Returns 0.0 when ``recall + precision`` is 0, instead of raising
    ZeroDivisionError.
    """
    denominator = recall + precision
    if denominator == 0:
        return 0.0
    return (2 * recall * precision) / denominator

In [36]:
def ndcg_function(prediction, top_k):
    """Return the mean NDCG@``top_k`` over all users in ``prediction``.

    Every row of ``prediction`` is unpacked positionally as
    (user_id, movie_id, true_rating, predicted_rating, <ignored>).

    For each user the items are ranked by predicted rating (highest first),
    the first ``top_k`` are kept, and their true ratings are used as gains:

        DCG = gain_1 + sum(gain_i / log2(i) for i = 2..n)

    The ideal DCG is the DCG of those same gains sorted in descending order,
    so the score reflects how well the kept items are ordered.

    Exactly ``top_k`` items are kept per user, matching the cut-off used by
    recall_function / precision_function (the earlier ``flag > top_k`` test
    let ``top_k + 1`` items through).

    Returns:
        The average per-user NDCG as a float. 0.0 when ``prediction`` has no
        rows. A user with no kept items (``top_k`` <= 0) or with an ideal DCG
        of 0 contributes 0.0.
    """

    def dcg(gains):
        # First gain is undiscounted; the gain at 1-based rank i >= 2 is divided by log2(i).
        discounted = 0
        for rank, gain in enumerate(gains[1:], start=2):
            discounted += gain / math.log2(rank)
        return gains[0] + discounted

    def ndcg(gains):
        if not gains:
            return 0.0
        ideal = dcg(sorted(gains, reverse=True))
        if ideal == 0.0:
            return 0.0
        return dcg(gains) / ideal

    # Group rows by user: predicted ratings for ranking, true ratings for gains.
    predicted_by_user = defaultdict(list)
    true_rating_by_user = defaultdict(dict)
    for user_id, movie_id, true_rating, predicted_rating, _ in prediction.values.tolist():
        predicted_by_user[user_id].append([movie_id, predicted_rating])
        # setdefault keeps the first rating seen if a (user, movie) pair repeats.
        true_rating_by_user[user_id].setdefault(movie_id, true_rating)

    if not predicted_by_user:
        return 0.0

    ndcg_by_user = dict()
    for user_id, user_ratings in predicted_by_user.items():
        # Stable descending sort on the predicted rating.
        user_ratings.sort(key=lambda pair: pair[1], reverse=True)

        true_ratings = true_rating_by_user[user_id]
        gains = []
        for position, (movie_id, _estimate) in enumerate(user_ratings):
            if position >= top_k:
                break
            # Dict lookup replaces the linear scan; 0 if the movie is unknown.
            gains.append(true_ratings.get(movie_id, 0))

        ndcg_by_user[user_id] = ndcg(gains)

    total = 0
    for value in ndcg_by_user.values():
        total += value
    return total / len(ndcg_by_user)

In [37]:
def create_4_measures_table(top_k):
    """Build a one-row-per-model table of recall, precision, F-measure and NDCG.

    Reads the module-level ``GMF_table`` and ``MLP_table`` frames and scores
    each with recall_function, precision_function, f_measure and
    ndcg_function at cut-off ``top_k``.

    Returns:
        DataFrame with columns
        ['Algorithm', 'Recall', 'Precision', 'F-Measure', 'NDCG'] and a fresh
        0..n-1 index. Rows appear most-recently-evaluated first (NCF, then
        GMF), the order produced by the earlier prepend-with-concat loop.
    """
    columns = ['Algorithm', 'Recall', 'Precision', 'F-Measure', 'NDCG']

    def compile_4_measures(pred, top_k):
        # Average the per-user recall and precision, then derive F from the averages.
        average_recall = get_average(recall_function(pred, top_k))
        average_precision = get_average(precision_function(pred, top_k))
        f_m = f_measure(average_recall, average_precision)
        final_ndcg = ndcg_function(pred, top_k)
        return average_recall, average_precision, f_m, final_ndcg

    model_dict = {
        'Generalized Matrix Factorization': GMF_table,
        'Neural Collaborative Filtering': MLP_table,
    }
    list_order = ['Generalized Matrix Factorization', 'Neural Collaborative Filtering']

    # Collect plain rows and build the frame once. Concatenating onto an empty
    # DataFrame inside a loop relies on deprecated dtype handling in pandas.
    rows = []
    for algor_str in list_order:
        recall, precision, f_m, ndcg = compile_4_measures(model_dict[algor_str], top_k)
        rows.append([algor_str, recall, precision, f_m, ndcg])

    # Keep the historical row order (last evaluated model first).
    rows.reverse()

    return pd.DataFrame(rows, columns=columns)
    

In [38]:
# Cut-off used for every ranking metric in the table.
top_k = 10
ncf_4_measures_table = create_4_measures_table(top_k=top_k)

In [None]:
# Display the four-measure comparison table as the cell output.
ncf_4_measures_table

In [40]:
# Persist the four-measure comparison table (index column included).
ncf_4_measures_table.to_csv(path_or_buf='NCF 4 measures.csv')

## STORE MAE AND RMSE

In [41]:
# [MAE, RMSE] per model, in the order GMF, NCF.
# Computed from the predictions held in this session rather than pasted in by
# hand, so the stored table always matches the prediction CSVs written above.
mae_rsme_list = [
    [
        float(mean_absolute_error(y_true, y_pred)),
        float(np.sqrt(mean_squared_error(y_true, y_pred))),
    ]
    for y_true, y_pred in ((gmf_true, GMF_pred), (ncf_true, MLP_pred))
]

In [None]:
# Labels for the entries of mae_rsme_list, in the same order.
algor_list = ['GMF', 'NCF']

# One [Algorithm, MAE, RSME] row per model. The frame is built in a single call:
# concatenating onto an empty DataFrame inside a loop relies on deprecated
# dtype handling in pandas.
error_rows = [
    [algor_name, value_pair[0], value_pair[1]]
    for algor_name, value_pair in zip(algor_list, mae_rsme_list)
]

# Keep the historical row order (last model first), as the earlier
# prepend-with-concat loop produced.
MAE_RSME_table = pd.DataFrame(error_rows[::-1], columns=['Algorithm', 'MAE', 'RSME'])

print(MAE_RSME_table)
    

In [43]:
# Persist the MAE / RMSE comparison table (index column included).
MAE_RSME_table.to_csv(path_or_buf='NCF_MAE_RSME_table.csv')