# Goal: RMSE 1.005

# Student ID

In [1]:
# Student identifier; used later as the base name of the predictions CSV.
STUDENT_ID = '20491384'

# Download Data

In [2]:
# Fetch the project archive. The URL is quoted so the shell never treats the
# `?` in the query string as a glob character.
!wget -q "https://hkustconnect-my.sharepoint.com/:u:/g/personal/nnanda_connect_ust_hk/Ea7QgUR0Cp9Hu7cB6ApBqG8BPv4dlpSUzbqu4xa4Lv-rfw?download=1" -O "Project3 data.zip"
# -o overwrites previously extracted files without prompting, so this cell can
# be re-run (e.g. after Restart & Run All) without blocking on a prompt.
!unzip -q -o "Project3 data.zip"

# Import Libraries

In [3]:
from math import sqrt
from keras.layers import Concatenate, Dense, Dot, Dropout, Embedding, Input, Reshape
from keras.models import Model
from keras.callbacks import Callback, ModelCheckpoint
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

# RMSE Calculation

In [4]:
def rmse(pred, actual):
    """Root-mean-squared error over the entries whose actual value is nonzero.

    Entries where ``actual`` is 0 are skipped in both arrays; only positions
    with a nonzero actual value contribute to the error.

    Args:
        pred: Array-like of predictions. It is indexed with the nonzero
            positions of ``actual``, so it may carry extra trailing axes of
            length 1 (e.g. shape (n, 1) against an ``actual`` of shape (n,)).
        actual: Array-like of ground-truth values.

    Returns:
        The RMSE as a Python float.

    Raises:
        ValueError: If ``actual`` has no nonzero entry, or if the selected
            predictions and actual values differ in length.
    """
    pred = np.asarray(pred)
    actual = np.asarray(actual)

    # Compute the positions once so both arrays are filtered identically.
    observed = actual.nonzero()
    pred = pred[observed].flatten()
    actual = actual[observed].flatten()

    if actual.size == 0:
        raise ValueError("rmse needs at least one nonzero entry in `actual`")
    if pred.shape != actual.shape:
        raise ValueError(
            "pred and actual disagree in length after filtering: {} vs {}".format(
                pred.shape, actual.shape))

    # math.sqrt returns a plain Python float.
    return sqrt(np.mean((pred - actual) ** 2))

# Build NCF Model

In [5]:
def build_cfmodel(n_users, n_items, embed_size, output_layer='dot',
                  hidden_units=(64, 32), dropout_rate=0.15):
    """Build an uncompiled Keras collaborative-filtering model.

    A user index and an item index are each embedded into ``embed_size``
    dimensions and then combined into one scalar output per pair.

    Args:
        n_users: Size of the user embedding table (``input_dim``).
        n_items: Size of the item embedding table (``input_dim``).
        embed_size: Dimension of both embeddings.
        output_layer: ``'dot'`` for a dot product of the two embeddings, or
            ``'mlp'`` for Dense/Dropout layers over their concatenation.
        hidden_units: Widths of the ReLU Dense layers in the MLP head, in
            order. Ignored when ``output_layer == 'dot'``.
        dropout_rate: Dropout rate applied after every hidden Dense layer.
            Ignored when ``output_layer == 'dot'``.

    Returns:
        A ``Model`` with inputs ``[user_input, item_input]``.

    Raises:
        NotImplementedError: If ``output_layer`` is not ``'dot'`` or ``'mlp'``.
    """
    # Validate first so an unsupported option fails before any layer is built.
    if output_layer not in ('dot', 'mlp'):
        raise NotImplementedError(
            "output_layer must be 'dot' or 'mlp', got {!r}".format(output_layer))

    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    item_input = Input(shape=(1,), dtype='int32', name='item_input')

    # `input_length` is not passed: the Input shape already fixes the sequence
    # length at 1. Reshape then drops that length-1 axis.
    user_emb = Embedding(output_dim=embed_size, input_dim=n_users)(user_input)
    user_emb = Reshape((embed_size,))(user_emb)
    item_emb = Embedding(output_dim=embed_size, input_dim=n_items)(item_input)
    item_emb = Reshape((embed_size,))(item_emb)

    if output_layer == 'dot':
        model_output = Dot(axes=1)([user_emb, item_emb])
    else:
        hidden = Concatenate()([user_emb, item_emb])
        for units in hidden_units:
            hidden = Dense(units, activation='relu')(hidden)
            hidden = Dropout(dropout_rate)(hidden)
        # Linear output unit: the model regresses the rating directly.
        model_output = Dense(1)(hidden)

    return Model(inputs=[user_input, item_input], outputs=model_output)

# Feature Engineering

In [6]:
tr_df = pd.read_csv("data/train.csv")
val_df = pd.read_csv("data/valid.csv")
te_df = pd.read_csv("data/test.csv")

# Index 0 is reserved for ids that never occur in the training split.
UNK_INDEX = 0


def _build_vocab(ids):
    """Map each distinct id in `ids` to 1..N, plus 'unk' -> UNK_INDEX.

    Indices follow the order of first appearance in `ids`, so the mapping is
    the same on every run. Iterating a `set` of strings gives an order that
    can change between kernels and would not match a saved checkpoint.
    """
    vocab = {key: idx for idx, key in enumerate(ids.unique(), start=1)}
    vocab['unk'] = UNK_INDEX
    return vocab


def _encode_ids(ids, vocab):
    """Translate a Series of raw ids to int64 indices; unseen ids -> UNK_INDEX."""
    return ids.map(vocab).fillna(UNK_INDEX).astype('int64').values


# Build User/Item vocabulary (from the training split only).
user_set = set(tr_df.user_id.unique())
business_set = set(tr_df.business_id.unique())
user_vocab = _build_vocab(tr_df.user_id)
n_users = len(user_vocab)
business_vocab = _build_vocab(tr_df.business_id)
n_items = len(business_vocab)

tr_users = _encode_ids(tr_df.user_id, user_vocab)
tr_items = _encode_ids(tr_df.business_id, business_vocab)
tr_ratings = tr_df.stars.values
val_users = _encode_ids(val_df.user_id, user_vocab)
val_items = _encode_ids(val_df.business_id, business_vocab)
val_ratings = val_df.stars.values
te_users = _encode_ids(te_df.user_id, user_vocab)
te_items = _encode_ids(te_df.business_id, business_vocab)

# NCF - MLP model

In [14]:
def run_cfmodel(n_users, n_items, embed_size=50, output_layer='mlp', epochs=1):
    """Train a fresh CF model and print its train and validation RMSE.

    The model comes from ``build_cfmodel`` and is compiled with the Adam
    optimizer and MSE loss. It is fitted on the module-level training arrays
    (``tr_users``, ``tr_items``, ``tr_ratings``) with a
    ``ModelCheckpoint('model.h5')`` callback. RMSE is then printed for the
    training arrays and for ``val_users`` / ``val_items`` / ``val_ratings``.

    Args:
        n_users: Passed through to ``build_cfmodel``.
        n_items: Passed through to ``build_cfmodel``.
        embed_size: Embedding dimension.
        output_layer: Head type forwarded to ``build_cfmodel``.
        epochs: Number of training epochs.

    Returns:
        None. Results are printed only.
    """
    net = build_cfmodel(
        n_users=n_users,
        n_items=n_items,
        embed_size=embed_size,
        output_layer=output_layer,
    )
    net.compile(optimizer='adam', loss='mse')

    checkpoint = ModelCheckpoint('model.h5')
    net.fit(
        [tr_users, tr_items],
        tr_ratings,
        epochs=epochs,
        verbose=1,
        callbacks=[checkpoint],
    )

    # Score the training split first, then the validation split.
    splits = (
        ("TRAIN RMSE: ", tr_users, tr_items, tr_ratings),
        ("VALID RMSE: ", val_users, val_items, val_ratings),
    )
    for label, users, items, ratings in splits:
        print(label, rmse(net.predict([users, items]), ratings))

## Dot

### Embed Size = 10

#### Epochs = 1

In [24]:
run_cfmodel(n_users, n_items, embed_size=10, output_layer="dot", epochs=1)

TRAIN RMSE:  3.9667903315326707
VALID RMSE:  3.9756482112397027


#### Epochs = 5

In [25]:
run_cfmodel(n_users, n_items, embed_size=10, output_layer="dot", epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
TRAIN RMSE:  1.6110357019853387
VALID RMSE:  1.8353262771407703


#### Epochs = 10

In [26]:
run_cfmodel(n_users, n_items, embed_size=10, output_layer="dot", epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
TRAIN RMSE:  0.9306065545071966
VALID RMSE:  1.259925692480558


### Embed Size = 50

#### Epochs = 1

In [18]:
run_cfmodel(n_users, n_items, embed_size=50, output_layer="dot", epochs=1)

TRAIN RMSE:  3.7989770014460955
VALID RMSE:  3.8449542198812656


#### Epochs = 5

In [19]:
run_cfmodel(n_users, n_items, embed_size=50, output_layer="dot", epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
TRAIN RMSE:  0.8865298999067136
VALID RMSE:  1.2756073970107678


#### Epochs = 10

In [20]:
run_cfmodel(n_users, n_items, embed_size=50, output_layer="dot", epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
TRAIN RMSE:  0.46621924739969073
VALID RMSE:  1.2588486338518


### Embed Size = 100

#### Epochs = 1

In [21]:
run_cfmodel(n_users, n_items, embed_size=100, output_layer="dot", epochs=1)

TRAIN RMSE:  3.6030992730828313
VALID RMSE:  3.6946762884103315


#### Epochs = 5

In [22]:
run_cfmodel(n_users, n_items, embed_size=100, output_layer="dot", epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
TRAIN RMSE:  0.6721273207784683
VALID RMSE:  1.276091022273684


#### Epochs = 10

In [23]:
run_cfmodel(n_users, n_items, embed_size=100, output_layer="dot", epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
TRAIN RMSE:  0.24875279949782256
VALID RMSE:  1.3163336141787305


## MLP

### Embed Size = 10

#### Epochs = 1

In [33]:
run_cfmodel(n_users, n_items, embed_size=10, output_layer="mlp", epochs=1)

TRAIN RMSE:  0.9847803356651544
VALID RMSE:  1.0364505131302184


#### Epochs = 5

In [34]:
run_cfmodel(n_users, n_items, embed_size=10, output_layer="mlp", epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
TRAIN RMSE:  0.8594429383803304
VALID RMSE:  1.0490212042450915


#### Epochs = 10

In [35]:
run_cfmodel(n_users, n_items, embed_size=10, output_layer="mlp", epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
TRAIN RMSE:  0.6776780737048899
VALID RMSE:  1.1512020683870534


### Embed Size = 50

#### Epochs = 1

In [27]:
run_cfmodel(n_users, n_items, embed_size=50, output_layer="mlp", epochs=1)

TRAIN RMSE:  0.9805220396541752
VALID RMSE:  1.0445426220905691


#### Epochs = 5

In [28]:
run_cfmodel(n_users, n_items, embed_size=50, output_layer="mlp", epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
TRAIN RMSE:  0.8077190499069206
VALID RMSE:  1.0661419055223293


#### Epochs = 10

In [29]:
run_cfmodel(n_users, n_items, embed_size=50, output_layer="mlp", epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
TRAIN RMSE:  0.4831816176666143
VALID RMSE:  1.1553359661944838


### Embed Size = 100

#### Epochs = 1

In [30]:
run_cfmodel(n_users, n_items, embed_size=100, output_layer="mlp", epochs=1)

TRAIN RMSE:  0.9788794684000723
VALID RMSE:  1.044591126818643


#### Epochs = 5

In [31]:
run_cfmodel(n_users, n_items, embed_size=100, output_layer="mlp", epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
TRAIN RMSE:  0.7787421566428055
VALID RMSE:  1.0644568907165561


#### Epochs = 10

In [32]:
run_cfmodel(n_users, n_items, embed_size=100, output_layer="mlp", epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
TRAIN RMSE:  0.5007167627135511
VALID RMSE:  1.1582035329455567


# Make Predictions

In [37]:
# Final configuration: MLP head over 10-dimensional embeddings, one epoch.
model = build_cfmodel(n_users, n_items, embed_size=10, output_layer="mlp")
model.compile(optimizer='adam', loss='mse')

checkpoint_cb = ModelCheckpoint('model.h5')
history = model.fit(
    [tr_users, tr_items],
    tr_ratings,
    epochs=1,
    verbose=1,
    callbacks=[checkpoint_cb],
)

# Report the fit on the training split.
train_pred = model.predict([tr_users, tr_items])
print("TRAIN RMSE: ", rmse(train_pred, tr_ratings))

# After this cell, `y_pred` holds the validation-split predictions.
y_pred = model.predict([val_users, val_items])
print("VALID RMSE: ", rmse(y_pred, val_ratings))

TRAIN RMSE:  0.9798878090075481
VALID RMSE:  1.0350033556132094


In [38]:
# Score the test split explicitly. `y_pred` holds the validation predictions,
# so writing it out would not produce a test-set submission.
te_pred = model.predict([te_users, te_items])

# The model emits one column per row; keep it as the single `pred` column.
res_df = pd.DataFrame({'pred': te_pred[:, 0]})
res_df.to_csv("{}.csv".format(STUDENT_ID), index=False)
print("Writing test predictions to file done.")

Writing test predictions to file done.
