# Neural Network-Based Collaborative Filtering

In this notebook we will implement a collaborative filtering model based on a Deep Neural Network (DNN). In contrast to classic matrix factorization, where an inner product of the latent features is calculated, **the NCF model is able to learn an arbitrary function to encapsulate non-linear user-item-interactions** utilizing a multi-layer perceptron architecture. The model is also able to learn different weights for the different latent factors.

## Import Python Packages

In [None]:
import numpy as np
import tensorflow as tf
import scipy.sparse as sp
import pandas as pd
from datetime import datetime

from sklearn.model_selection import train_test_split

from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (Embedding, Input, Dense, Reshape, 
                                     Flatten, Dropout)
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import initializers
from tensorflow.keras.metrics import MeanSquaredError, Precision, AUC

from NCFHelper import eval_one_rating

## Data Import

In [None]:
# Choose dataset type and size
dataset_type = "train"
dataset_size = "small"

# Seed for the train/validation split that follows the data import. It has to
# exist before that split runs on a fresh kernel. The model-parameter cell
# assigns SEED as well; keep both values identical.
SEED = 420

# All input files live in one directory derived from the two choices above.
data_path = f"../../data/mind_{dataset_size}_{dataset_type}/"
train_filename = data_path + f"{dataset_size}_train.csv"
test_filename = data_path + f"{dataset_size}_test.csv"
test_negatives_fn = data_path + f"{dataset_size}_test_negatives.tsv"

In [None]:
# Scan the training CSV once to find the largest user and article index.
# Fields 2 and 3 (0-based) of every data row are parsed as integers.
num_users, num_articles = 0, 0
with open(train_filename, "r") as f:
    header = f.readline()  # the first line is consumed and never parsed
    for line in f:
        line_list = line.split(",")
        u, i = int(line_list[2]), int(line_list[3])
        if u > num_users:
            num_users = u
        if i > num_articles:
            num_articles = i

# Turn the largest index into a size, so that the largest index itself is a
# valid position in the interaction matrix.
num_users += 1
num_articles += 1

In [None]:
# Display the user and article counts derived from the training file.
num_users, num_articles

In [None]:
# Sparse user x article matrix; an entry is set to 1.0 for every
# (user, article) pair that appears in the training CSV.
train = sp.dok_matrix((num_users, num_articles), dtype=np.float32)

with open(train_filename, "r") as f:
    header = f.readline()  # the first line is consumed and never parsed
    for line in f:
        line_list = line.split(",")
        user = int(line_list[2])
        article = int(line_list[3])
        train[user, article] = 1.0

In [None]:
# Build the training triples: every stored (user, article) pair becomes one
# positive example (label 1) followed by `np_ratio` negatives (label 0).
np_ratio = 4
user_train, article_train, labels_train = [], [], []
for (u, i) in train.keys():
    # positive instance
    user_train.append(u)
    article_train.append(i)
    labels_train.append(1)
    # negative instances: redraw until the article is one the user has no
    # stored interaction with. Draws are independent, so the same negative
    # can be picked more than once for a user.
    for _ in range(np_ratio):
        while True:
            j = np.random.randint(num_articles)
            if (u, j) not in train.keys():
                break
        user_train.append(u)
        article_train.append(j)
        labels_train.append(0)

In [None]:
# Sanity check: the three lists are appended to in lockstep, so their lengths should match.
len(user_train), len(article_train), len(labels_train)

In [None]:
# Parse the test-negatives TSV. Each line starts with a "(user,article)" pair,
# followed by tab-separated integer article ids.
test_positives = []
test_negatives = []
with open(test_negatives_fn, "r") as f:
    for line in f:
        line_list = line.split("\t")
        # First field: strip the parentheses, then split into user and article.
        ua = line_list[0].strip("()").split(",")
        user, article = int(ua[0]), int(ua[1])
        test_positives.append([user, article])
        # Remaining fields: one list of negatives per positive, same order.
        test_negatives.append([int(neg) for neg in line_list[1:]])

In [None]:
# Sanity check: one list of negatives is stored per test positive, so both lengths should match.
len(test_positives), len(test_negatives)

In [None]:
# Hold out 20% of the (user, article) pairs and their labels for validation.
# The pairs are stacked into one two-column array so that users and articles
# are shuffled together. Requires SEED to be defined.
ua_pairs = np.column_stack((user_train, article_train))
ua_train, ua_val, label_train, label_val = train_test_split(
    ua_pairs,
    labels_train,
    test_size=0.2,
    random_state=SEED,
)

In [None]:
# Column 0 of the stacked pairs holds the users, column 1 the articles.
# Note that user_train/article_train now refer to the training share only.
user_train, user_val = ua_train[:, 0], ua_val[:, 0]
article_train, article_val = ua_train[:, 1], ua_val[:, 1]

## Initialize Model Parameters

In [None]:
# Reproducibility
SEED = 420

# Network shape: layers[0] is the width of the concatenated user/article
# embeddings (each embedding gets half of it); the remaining entries are the
# unit counts of the Dense layers. reg_layers holds the matching L2 strengths.
layers = [64, 32, 16, 8]
reg_layers = [1e-4] * 4
num_layer = len(layers)

# Dropout switch and the rates handed to the Dropout layers
dropout = True
dropout_rates = [0, 0.2, 0.2, 0]

# Optimisation settings
loss = 'binary_crossentropy'
learning_rate = 0.001
batch_size = 256
epochs = 10

## Build Model

In [None]:
# Both inputs carry a single int32 index per sample.
input_spec = dict(shape=(1,), dtype='int32')
user_input = Input(name='user_input', **input_spec)
article_input = Input(name='article_input', **input_spec)

In [None]:
# User embedding: half of layers[0], so that user and article embeddings
# together have width layers[0] once concatenated.
MLP_Embedding_User = Embedding(
    name='user_embedding',
    input_dim=num_users,
    output_dim=layers[0] // 2,
    embeddings_regularizer=l2(reg_layers[0]),
    input_length=1,
)

In [None]:
# Article embedding: same width and regularisation as the user embedding.
MLP_Embedding_Article = Embedding(
    name='article_embedding',
    input_dim=num_articles,
    output_dim=layers[0] // 2,
    embeddings_regularizer=l2(reg_layers[0]),
    input_length=1,
)

In [None]:
# Look up each embedding and flatten away the length-1 sequence axis.
user_embedded = MLP_Embedding_User(user_input)
user_latent = Flatten()(user_embedded)

article_embedded = MLP_Embedding_Article(article_input)
article_latent = Flatten()(article_embedded)

In [None]:
# Join the user and article latent vectors along the last axis; this is the input to the Dense layers.
vector = Concatenate(axis=-1)([user_latent, article_latent])

In [None]:
# Stack the hidden layers. layers[0] describes the embedding width, so the
# Dense layers start at index 1.
for idx in range(1, num_layer):
    vector = Dense(layers[idx], activation='relu',
                   kernel_regularizer=l2(reg_layers[idx]), name=f'layer{idx}')(vector)
    # Dropout is applied only when enabled. With `dropout = False` the Dense
    # output is passed on unchanged instead of going through an undefined
    # (or stale) Dropout layer.
    if dropout:
        # Rates are indexed with idx-1, i.e. dropout_rates[0] belongs to the
        # first Dense layer; the last entry of dropout_rates is not used.
        vector = Dropout(dropout_rates[idx-1], seed=SEED)(vector)

In [None]:
# Single sigmoid unit on top of the last hidden layer.
prediction_layer = Dense(
    1,
    activation='sigmoid',
    kernel_initializer='lecun_uniform',
    name='prediction',
)
prediction = prediction_layer(vector)

In [None]:
# Assemble the MLP model: inputs are [user index, article index], output is the sigmoid prediction.
model = Model(inputs=[user_input, article_input], outputs=prediction)

In [None]:
# Compile with Adam and the configured loss; precision and AUC are tracked
# during training. `learning_rate` is the supported keyword for the step size
# (the old `lr` alias is deprecated and rejected by current Keras releases).
model.compile(optimizer=Adam(learning_rate=learning_rate), loss=loss,
              metrics=[Precision(), AUC()])

In [None]:
# Print the layer-by-layer overview of the compiled MLP model.
model.summary()

## Fitting of the model

In [None]:
# Split only while user_train still has one entry per label. If the identical
# split in the data-import section has already run, user_train/article_train
# hold just the training share, and re-splitting them against the full
# labels_train would fail on mismatched lengths. In that case the existing
# split (ua_train, ua_val, label_train, label_val) is kept.
if len(user_train) == len(labels_train):
    ua_train, ua_val, label_train, label_val = train_test_split(
        np.column_stack((user_train, article_train)),
        labels_train, random_state=SEED, test_size=0.2)

In [None]:
# Column 0 of the stacked pairs holds the users, column 1 the articles.
user_train, user_val = ua_train[:, 0], ua_val[:, 0]
article_train, article_val = ua_train[:, 1], ua_val[:, 1]

In [None]:
# Train the MLP model. Inputs are passed as [users, articles], matching the
# order of the model's input layers. shuffle=False keeps the sample order
# fixed across epochs.
train_inputs = [np.array(user_train), np.array(article_train)]
train_labels = np.array(label_train)
val_inputs = [np.array(user_val), np.array(article_val)]
val_labels = np.array(label_val)

hist = model.fit(
    train_inputs,
    train_labels,
    validation_data=(val_inputs, val_labels),
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    shuffle=False,
)

## Evaluation of the Model

In [None]:
# K is handed to eval_one_rating as the cutoff and reused in the NDCG label below.
K = 10
# One evaluation step per test positive.
iterations = len(test_positives)

In [None]:
# Evaluate every test case and collect the three per-case metrics.
# eval_one_rating comes from NCFHelper; it is presumably ranking the positive
# against its negatives at cutoff K - verify against the helper.
hits, ndcgs, rrs = [], [], []
for idx in range(iterations):
    # Progress in percent, overwritten in place via the carriage return.
    progress = round((idx / iterations) * 100, 2)
    print(progress, end="\r")
    hr, ndcg, rr = eval_one_rating(idx, model, test_positives, test_negatives, K)
    hits += [hr]
    ndcgs += [ndcg]
    rrs += [rr]

In [None]:
# Average the per-case metrics over all test cases and report them.
hr, mrr, ndcg = (np.array(values).mean() for values in (hits, rrs, ndcgs))

print("Hit ratio:            ", hr)
print("Mean reciprocal rank: ", mrr)
print(f"NDCG@{K}:            ", ndcg)

In [None]:
# Output prefix for the trained MLP model; the timestamp is appended on save.
# NOTE(review): an S3 destination such as f's3://{bucket}/ncf-large/ncf-model-large-'
# needs a `bucket` variable, which this notebook does not define. Define it
# and switch the prefix if the model should be written to S3.
model_out_file = f"trained-models/{dataset_size}_MLP_{layers}_"

# datetime.now has to be called to obtain a datetime. The format avoids "/"
# and ":" so the timestamp stays a single, portable path component.
date_time = datetime.now().strftime("%Y-%m-%d_%H-%M")

In [None]:
# Persist the trained model under the output prefix followed by the timestamp.
model.save(model_out_file+date_time)

The NCF model can also be extended to incorporate information about users and articles (e.g., categories or titles) by adding further input layers. In that case it becomes a hybrid of collaborative and content-based filtering.

Another established extension combines the matrix factorization approach with the DNN approach in a single model. This is implemented in the following section:

# Neural Matrix Factorization (NeuMF)

## Initialize Model Parameters

In [None]:
# Training schedule
EPOCHS = 20
BATCH_SIZE = 256
LR = 0.001
LEARNER = "adam"

# Matrix-factorization branch: embedding size and regularisation
NUM_FACTORS = 8
REG_MF = 0

# MLP branch: LAYERS[0] is the width of the concatenated embeddings, the
# remaining entries are Dense unit counts
LAYERS = [64, 32, 16, 8]
REG_LAYERS = [0] * 4
REGS = [0, 0]

# Negatives per positive
NUM_NEG = 4

In [None]:
# Used in the NDCG label printed at the end of the NeuMF evaluation.
topK = 10

In [None]:
# Number of entries in LAYERS; bounds the loop that stacks the Dense layers.
NUM_LAYER = len(LAYERS)

## Build Model

In [None]:
# Fresh input tensors for the NeuMF model: one int32 index per sample each.
input_spec = dict(shape=(1,), dtype='int32')
user_input = Input(name='user_input', **input_spec)
article_input = Input(name='article_input', **input_spec)

### User and Article Embeddings

#### Matrix Factorization

In [None]:
# User embedding of the matrix-factorization branch (NUM_FACTORS dimensions).
MF_Embedding_User = Embedding(
    name='mf_user_embedding',
    input_dim=num_users,
    output_dim=NUM_FACTORS,
    input_length=1,
)

In [None]:
# Article embedding of the matrix-factorization branch (NUM_FACTORS dimensions).
MF_Embedding_Article = Embedding(
    name='mf_article_embedding',
    input_dim=num_articles,
    output_dim=NUM_FACTORS,
    input_length=1,
)

#### Multi-Layer Perceptron

In [None]:
# User embedding of the MLP branch: half of LAYERS[0].
MLP_Embedding_User = Embedding(
    name='mlp_user_embedding',
    input_dim=num_users,
    output_dim=LAYERS[0] // 2,
    input_length=1,
)

In [None]:
# Article embedding of the MLP branch: half of LAYERS[0].
MLP_Embedding_Article = Embedding(
    name='mlp_article_embedding',
    input_dim=num_articles,
    output_dim=LAYERS[0] // 2,
    input_length=1,
)

### MF and MLP Prediction

In [None]:
# Matrix-factorization branch: flatten both embeddings and combine them with
# an element-wise product.
mf_user_latent = Flatten()(MF_Embedding_User(user_input))
mf_article_latent = Flatten()(MF_Embedding_Article(article_input))

# Multiply is not among the layers imported at the top of the notebook, so it
# is referenced through the already imported `tf` module.
mf_vector = tf.keras.layers.Multiply()([mf_user_latent, mf_article_latent])

In [None]:
# MLP branch: flatten both embeddings, then join them along the last axis.
mlp_user_embedded = MLP_Embedding_User(user_input)
mlp_user_latent = Flatten()(mlp_user_embedded)

mlp_article_embedded = MLP_Embedding_Article(article_input)
mlp_article_latent = Flatten()(mlp_article_embedded)

mlp_vector = Concatenate(axis=-1)([mlp_user_latent, mlp_article_latent])

In [None]:
# Stack the ReLU layers of the MLP branch. LAYERS[0] is the embedding width,
# so the Dense layers start at index 1.
for idx in range(1, NUM_LAYER):
    mlp_vector = Dense(
        LAYERS[idx],
        activation='relu',
        name=f'layer{idx}',
    )(mlp_vector)

In [None]:
# Join the outputs of the MF branch and the MLP branch into one feature vector.
predict_vector = Concatenate()([mf_vector, mlp_vector])

In [None]:
# Single sigmoid unit on top of the joined MF and MLP features.
prediction_layer = Dense(
    1,
    activation='sigmoid',
    kernel_initializer='lecun_uniform',
    name='prediction',
)
prediction = prediction_layer(predict_vector)

## Compile and Fit Model

In [None]:
# Assemble the NeuMF model. This rebinds `model`, so the MLP model built earlier is no longer reachable under that name.
model = Model([user_input, article_input], prediction)

In [None]:
# Compile NeuMF with Adam and binary cross-entropy; MSE, precision and AUC are
# tracked during training. `learning_rate` is the supported keyword for the
# step size (the old `lr` alias is deprecated and rejected by current Keras
# releases).
model.compile(optimizer=Adam(learning_rate=LR), loss='binary_crossentropy',
              metrics=[MeanSquaredError(), Precision(), AUC()])

In [None]:
# Print the layer-by-layer overview of the compiled NeuMF model.
model.summary()

In [None]:
# Train NeuMF on the train/validation arrays produced by the split. Inputs
# are passed as [users, articles], matching the order of the input layers.
# The number of epochs comes from the EPOCHS setting of this section.
hist_neu = model.fit([np.array(user_train), np.array(article_train)], #input
                     np.array(label_train), # labels
                     validation_data=([np.array(user_val), np.array(article_val)], np.array(label_val)),
                     batch_size=BATCH_SIZE,
                     epochs=EPOCHS,
                     verbose=1)

## Evaluate Model

In [None]:
# K is handed to eval_one_rating as the cutoff for the NeuMF evaluation.
K = 10
# One evaluation step per test positive.
iterations = len(test_positives)

In [None]:
# Evaluate the NeuMF model on every test case and collect the per-case
# metrics. The NeuMF model is bound to `model` (there is no `model_neu` in
# this notebook), so that is the object passed to the helper.
hits, ndcgs, rrs = [], [], []
for idx in range(iterations):
    # Progress in percent, overwritten in place via the carriage return.
    print(round((idx/iterations)*100, 2), end="\r")
    hr, ndcg, rr = eval_one_rating(idx, model, test_positives, test_negatives, K)
    hits.append(hr)
    ndcgs.append(ndcg)
    rrs.append(rr)

In [None]:
# Average the per-case metrics over all test cases and report them.
hr, mrr, ndcg = (np.array(values).mean() for values in (hits, rrs, ndcgs))

print("Hit ratio:            ", hr)
print("Mean reciprocal rank: ", mrr)
print(f"NDCG@{topK}:         ", ndcg)