In [1]:
import pandas as pd
import numpy as np
from CDAE import CDAE
from Dataset import Dataset
from Evaluator import Evaluator
import torch

The model is built as follows:
1. The encoder layer has the shape `total_number_of_items` $\times$ `the_hidden_dimension`

$$
z_u = h(W^{\intercal}\widetilde{y}_u + V_u + b)
$$

- $W$ weight matrix `total_number_of_items` $\times$ `the_hidden_dimension`
- $\widetilde{y}_u$ the user-rating input matrix with the shape of `batch_size` $\times$ `total_number_of_items`
- $V_u$ user-specific vector: each user's id is embedded as its own unique vector, which is added into the formula above and serves to improve the performance metrics for that user.

2. The hidden layer (50 $\times$ 50)
3. The decoder layer has the shape `the_hidden_dimension` $\times$ `total_number_of_items`


**The model faces two major problems:**

- Cold start: a new user has not yet been seen by the model during training, so the engine cannot produce good recommendations for that user at first. One solution is to recommend related items to the new user initially, before their interactions are fed to the engine.
- Dataset splitting: the data utility splits the ratings randomly at a predefined percentage, so a user with too few ratings can end up with missing (NA) ratings in either the test set or the train set. This problem is overcome by setting a threshold that keeps only users with more than 4 item ratings.

Another limitation is that this autoencoder model only considers binary user-item interactions (also called implicit feedback, i.e. 0 or 1) rather than a rating scale such as 1-5, on the premise that performance on the two rating systems would not differ. The advantage of this approach rests on the assumption that users rarely rate items but often click to view them; such a click can be counted as interest and recorded as a rating of 1 in the dataset.

#### Load in data

In [3]:
# Read the ratings CSV into a DataFrame (with pandas' Python parser engine)
# and preview the first rows.
data = pd.read_csv("goodread_books/ratings.csv", engine="python")
data.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [None]:
# (rows, columns) of the freshly loaded ratings table.
data.shape

(5976479, 3)

In [None]:
# Shorten the id column names to `user` / `item`; `rating` keeps its name,
# so it needs no entry in the mapping.
column_aliases = {"user_id": "user", "book_id": "item"}
data = data.rename(columns=column_aliases)
data.shape

(5976479, 3)

## Preprocess data

- Remove ratings that are equal to zero
- Set threshold and scale down dataset

In [None]:
# Keep every row whose rating is not exactly zero, then preview the result.
has_nonzero_rating = data["rating"] != 0
data = data[has_nonzero_rating]
data.head()

Unnamed: 0,user,item,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [None]:
# (rows, columns) after the zero-rating rows have been removed.
data.shape

(5976479, 3)

In [None]:
# Keep only "knowledgeable" users: those with strictly more than 20 ratings.
min_ratings_threshold = 20

# Non-null rating count for every user id.
num_ratings_per_user = data.groupby('user')['rating'].count()

# User ids whose count exceeds the threshold, then the rows belonging to them.
exceeds_threshold = num_ratings_per_user > min_ratings_threshold
knowledgeable_user_ids = num_ratings_per_user.loc[exceeds_threshold].index
is_knowledgeable_row = data['user'].isin(knowledgeable_user_ids)
knowledgeable_user_ratings = data[is_knowledgeable_row]

In [None]:
# Show the size and the first rows of the user-filtered ratings.
print(knowledgeable_user_ratings.shape)
print(knowledgeable_user_ratings.head())

(5976440, 3)
   user  item  rating
0     1   258       5
1     2  4081       4
2     2   260       5
3     2  9296       5
4     2  2318       3


In [None]:
# Confirm that the filtered result is still a DataFrame.
type(knowledgeable_user_ratings)

pandas.core.frame.DataFrame

In [None]:
# Keep only items that received at least 8 ratings from the retained users.
min_ratings_count_threshold = 8

# Non-null rating count for every item id.
rating_counts = knowledgeable_user_ratings.groupby('item')['rating'].count()

is_popular = rating_counts >= min_ratings_count_threshold
popular_books = rating_counts.loc[is_popular].index
is_popular_row = knowledgeable_user_ratings['item'].isin(popular_books)
final_ratings = knowledgeable_user_ratings[is_popular_row]

In [None]:
# Confirm that the item-filtered result is still a DataFrame.
type(final_ratings)

pandas.core.frame.DataFrame

In [None]:
# Drop every user left with 4 or fewer ratings after the item filter.
# The original looped over users and, for each one to remove, re-scanned and
# re-copied the whole frame; a single vectorized mask gives the same rows
# in one pass.
max_items_to_drop = 4

# Per-row size of the user group that the row belongs to.
items_per_user = final_ratings.groupby('user')['user'].transform('size')

# Negating the "too few" mask (instead of testing "enough") keeps any row
# whose user is not part of a group, exactly as the loop did.
has_too_few_items = items_per_user <= max_items_to_drop
final_ratings = final_ratings[~has_too_few_items]

In [None]:
# Report how many distinct users and items survive the filtering.
distinct_users = final_ratings["user"].unique()
distinct_items = final_ratings["item"].unique()
num_users = len(distinct_users)
num_items = len(distinct_items)
print('Initial users: {}, items: {}'.format(num_users, num_items))

Initial users: 53422, items: 10000


In [None]:
# (rows, columns) of the fully filtered ratings.
final_ratings.shape

(5976440, 3)

In [None]:
# Work on a copy of the filtered ratings and write it to disk without the
# index column; this is the file the hyperparameter cell points `data_path` at.
# Finally preview what was saved.
df = final_ratings.copy()
df.to_csv("books/edited_ratings.csv", index=False)
df.head()

Unnamed: 0,user,item,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [None]:
# Sanity check: how many users are left with exactly one rating?
ratings_per_user = df.groupby("user").size()
counter = int((ratings_per_user == 1).sum())
print(counter)

0


In [None]:
# Define hyperparameters
data_path = "books/edited_ratings.csv"  # filtered ratings file handed to Dataset
train_ratio = 0.8  # presumably the per-user train share; not passed anywhere in the visible cells — TODO confirm
hidden_dim = 50  # hidden dimension passed to CDAE
num_epochs = 200  # number of training epochs
batch_size = 512  # batch size given to model.train_loop
testing_batch_size = 512  # batch size given to evaluator.evaluate
learning_rate = 0.01  # optimizer learning rate
early_stop = True  # enables the patience-based early stop in the training loop
patience = 50  # non-improving epochs tolerated before early stop
top_k = [10]  # cut-off list handed to Evaluator

In [None]:
# Build the project Dataset from the saved ratings file, targeting the CPU.
device = torch.device("cpu")
dataset = Dataset(
    data_path=data_path,
    save_path="training.json",
    # NOTE(review): the ratings file is written with pandas' default comma
    # separator, yet a tab is passed here — confirm how Dataset uses `sep`.
    sep="\t",
    device=device
)

Preprocess start...
Initial users: 53422, items: 10000
Assign new user id from 0..n
Assign new item id from 0..n
Split data into training set and test set
# zero train, test: 0, 0
Preprocess finished.


In [None]:
# Display the summary statistics reported by the Dataset object.
dataset.obtain_data_statistic()

{'Total users': 53422,
 'Total items': 10000,
 'Total ratings': 5976440,
 'Sparsity ratio': 98.8812773763618,
 'Min/Max/Avg. ratings per users': [21, 200, 111.87226236382016],
 'Number of train users': 53422,
 'Number of train ratings': 4759728,
 'Number of test users': 53422,
 'Number of test ratings': 1216712}

A rating of 1 in the matrix indicates an interaction between a user and an item. For training, 80% of each user's ratings are taken and placed in the training set.

In [None]:
# Quick arithmetic illustration: 16 out of 20 ratings is a 0.8 share
# (presumably mirroring the 80% train split — confirm).
16/20

0.8

In [None]:
# Fetch the two evaluation structures and the item popularity from the
# Dataset; all three are handed to the Evaluator in the next cell.
eval_pos, eval_target = dataset.eval_data()
item_popularity = dataset.item_popularity

In [None]:
# Create the evaluator for the configured top-k cut-offs and a CDAE model
# sized from the dataset's user/item counts, then print the model layout.
evaluator = Evaluator(eval_pos, eval_target, item_popularity, top_k)
model = CDAE(num_users=dataset.num_users, 
            num_items=dataset.num_items,
            hidden_dim=hidden_dim)
print(model)

CDAE(
  (user_embedding): Embedding(53422, 50)
  (encoder): Linear(in_features=10000, out_features=50, bias=True)
  (decoder): Linear(in_features=50, out_features=10000, bias=True)
)


In [None]:
import copy
import time
from torch.utils.tensorboard import SummaryWriter

# Train the model epoch by epoch, evaluate after every epoch, keep a snapshot
# of the best weights (by NDCG@10) and stop early once `patience` consecutive
# epochs bring no improvement. Epoch count, learning rate and patience come
# from the hyperparameter cell instead of being redefined here.
writer = SummaryWriter()
epochs = num_epochs
standard = 'NDCG@10'  # metric that decides which epoch is "best"
best_epoch = -1
best_score = None
best_params = None
endure = 0  # consecutive epochs without improvement

if len(list(model.parameters())) > 0:
    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
else:
    optimizer = None

try:
    for epoch in range(1, epochs + 1):
        # train for an epoch
        epoch_start = time.time()
        loss = model.train_loop(dataset, optimizer, batch_size, verbose=False)
        train_elapsed = time.time() - epoch_start

        # evaluate
        score = evaluator.evaluate(model, testing_batch_size)
        epoch_elapsed = time.time() - epoch_start

        score_str = ' '.join(['%s=%.4f' % (m, score[m]) for m in score])

        print(f"Training epoch {epoch}/{epochs}, {epoch_elapsed:.2f}, {train_elapsed:.2f} loss = {loss:.2f}, {score_str}")

        # Log before the early-stop check so the final epoch is recorded too.
        writer.add_scalar('Loss/train', loss, epoch)

        if best_score is None or score[standard] >= best_score[standard]:
            best_epoch = epoch
            best_score = score
            # model.parameters() is only a generator over the live tensors;
            # a deep copy of the state dict is a real snapshot that can be
            # restored later with model.load_state_dict(best_params).
            best_params = copy.deepcopy(model.state_dict())
            endure = 0
        else:
            endure += 1
            if early_stop and endure >= patience:
                print('Early Stop Triggered...')
                break
finally:
    # Flush and release the TensorBoard event file even if training fails.
    writer.close()

print('Training Finished.')
if best_score is not None:
    best_score_str = ' '.join(['%s = %.4f' % (k, best_score[k]) for k in best_score])
    print(f'Best score at epoch {best_epoch}: {best_score_str}')

Training epoch 1/200, 17.31, 6.47 loss = 41852880.00, Prec@10=0.0730 Recall@10=0.0732 NDCG@10=0.0754 Nov@10=2.9455 Gini-D=0.1157
Training epoch 2/200, 15.73, 6.25 loss = 41097316.00, Prec@10=0.0790 Recall@10=0.0793 NDCG@10=0.0809 Nov@10=3.6707 Gini-D=0.2073
Training epoch 3/200, 16.29, 6.49 loss = 40867560.00, Prec@10=0.0818 Recall@10=0.0821 NDCG@10=0.0840 Nov@10=3.9732 Gini-D=0.2351
Training epoch 4/200, 15.76, 6.04 loss = 40730756.00, Prec@10=0.0843 Recall@10=0.0845 NDCG@10=0.0867 Nov@10=4.0046 Gini-D=0.2196
Training epoch 5/200, 15.63, 6.01 loss = 40638116.00, Prec@10=0.0867 Recall@10=0.0869 NDCG@10=0.0895 Nov@10=3.9921 Gini-D=0.2075
Training epoch 6/200, 16.48, 6.28 loss = 40571900.00, Prec@10=0.0876 Recall@10=0.0878 NDCG@10=0.0903 Nov@10=3.9845 Gini-D=0.1933
Training epoch 7/200, 16.87, 6.32 loss = 40524408.00, Prec@10=0.0886 Recall@10=0.0889 NDCG@10=0.0912 Nov@10=3.9592 Gini-D=0.1894
Training epoch 8/200, 16.64, 6.29 loss = 40487632.00, Prec@10=0.0898 Recall@10=0.0901 NDCG@10=0.0

Since each user's preference set is split 80/20 between training and test, the evaluation metrics are calculated on the data reconstructed from the corrupted input, measuring how many of the user's held-out preferences (the remaining 20%) are correctly recovered.

In [None]:
from utils import inference

# Run inference for a single user from that user's training interactions.
target_user = 20
user_id = torch.LongTensor([target_user])
print(user_id.shape)

# Densify only the requested row instead of the whole user x item matrix, and
# build the tensor straight from the ndarray (wrapping an ndarray in a Python
# list makes torch warn about a slow conversion).
# Assumes train_matrix is a scipy.sparse matrix (it exposes .toarray()) — TODO confirm.
user_row = dataset.train_matrix.tocsr()[target_user].toarray()  # shape (1, num_items)
user_ratings_tensor = torch.as_tensor(user_row, dtype=torch.float32)
print(user_ratings_tensor)
it = inference(model, user_id=user_id, user_ratings_tensor=user_ratings_tensor, apply_dropout=True)
print(it)

torch.Size([1])
tensor([[0., 0., 1.,  ..., 0., 0., 0.]])
[18  9 20  2 15  4 21 17 16 14]


  user_ratings_tensor = torch.FloatTensor([np.array(dataset.train_matrix.toarray()[20])])


In [None]:
from utils import inference

# Run inference with the embedding of user 500.
user_id = torch.LongTensor([500])
print(user_id.shape)

# NOTE(review): the ratings below come from row 20 of the training matrix
# while the user id is 500 — confirm this pairing is intentional (e.g. to
# show the effect of the user embedding) and not a copy-paste slip.
ratings_row = 20

# Densify only the requested row instead of the whole user x item matrix, and
# build the tensor straight from the ndarray to avoid the slow list-of-ndarray path.
# Assumes train_matrix is a scipy.sparse matrix (it exposes .toarray()) — TODO confirm.
user_row = dataset.train_matrix.tocsr()[ratings_row].toarray()  # shape (1, num_items)
user_ratings_tensor = torch.as_tensor(user_row, dtype=torch.float32)
it = inference(model, user_id=user_id, user_ratings_tensor=user_ratings_tensor, apply_dropout=True)
print(it)

torch.Size([1])
[15  9 19  2 14  4 21 18 17 16]


A sparse matrix is a matrix in which most entries are zero; only the non-zero values are kept, together with the indices at which they occur.