<a href="https://colab.research.google.com/github/van26101998/Recommendation-System/blob/main/MF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install required packages
!pip install -q comet_ml wandb

In [None]:
# Start a Weights & Biases run in the "mf" project. The returned run handle is
# kept so the run can be closed explicitly at the end of the notebook.
import wandb
wandb_run = wandb.init(project="mf")

[34m[1mwandb[0m: Currently logged in as: [33mvan26101998[0m (use `wandb login --relogin` to force relogin)


In [None]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, Subset

from torchsummary import summary
import numpy as np
import pandas as pd
import zipfile
import random

from sklearn.model_selection import train_test_split

# MovieLens 1M Dataset

In [None]:
# Download the MovieLens 1M archive; -c lets wget continue a partial download
# (and do nothing when the file is already complete).
!wget -c http://files.grouplens.org/datasets/movielens/ml-1m.zip
import zipfile
# Unpack into the current working directory; later cells read ./ml-1m/ratings.dat.
with zipfile.ZipFile('ml-1m.zip', 'r') as zip_ref:
    zip_ref.extractall()

--2021-01-06 22:57:18--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [None]:
# ratings.dat has no header row; fields are separated by "::", which is why the
# python parsing engine is requested and the column names are supplied here.
ratings_df = pd.read_csv(
    './ml-1m/ratings.dat',
    sep="::",
    engine='python',
    names=['user', 'item', 'rating', 'timestamp'],
)
# Sorted distinct raw ids; their positions in these lists become the embedding indices.
users, items = (sorted(ratings_df[column].unique()) for column in ('user', 'item'))
ratings_df

Unnamed: 0,user,item,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [None]:
# train / validate / test split: 10% of all ratings are held out for testing,
# then 10% of the remainder for validation (roughly 81% / 9% / 10% overall).
# A fixed seed makes the split, and therefore every metric reported below,
# reproducible across kernel restarts.
SPLIT_SEED = 42
train_df, test_df = train_test_split(ratings_df, test_size=0.1, random_state=SPLIT_SEED)
train_df, validate_df = train_test_split(train_df, test_size=0.1, random_state=SPLIT_SEED)

In [None]:
# Report how many ratings ended up in each split.
split_sizes = [len(split) for split in (train_df, validate_df, test_df)]
print("number of train-validate-test:", *split_sizes)

number of train-validate-test: 810169 90019 100021


In [None]:
class ML1MDataset(Dataset):
    """
        Torch dataset over MovieLens 1M rating rows.
        + Params:
            - ratings_df: DataFrame with 4 columns: user, item, rating, timestamp
            - users: list of all raw user ids
            - items: list of all raw item ids

        Each sample is (user index, item index, rating); an index is the position
        of the raw id inside `users` / `items`.
    """
    def __init__(self, ratings_df: pd.DataFrame, users, items):
        self.users, self.items = users, items
        self.num_users, self.num_items = len(users), len(items)
        self.num_ratings = len(ratings_df)

        # Row-major array copy of the frame; rows are unpacked in __getitem__.
        self.ratings_numpy = ratings_df.to_numpy()

        # raw id -> contiguous 0-based index
        self.user_to_id = dict(zip(users, range(self.num_users)))
        self.item_to_id = dict(zip(items, range(self.num_items)))

    def __len__(self):
        return self.num_ratings

    def __getitem__(self, idx):
        # The timestamp column is read but not part of the sample.
        raw_user, raw_item, raw_rating, _ = self.ratings_numpy[idx]
        # Multiplying by 1.0 turns the rating into a floating-point value.
        return self.user_to_id[raw_user], self.item_to_id[raw_item], 1.0 * raw_rating

In [None]:
# Wrap each split in a dataset. All three share the full user/item id lists, so
# an id maps to the same index in every split.
trainset, testset, validateset = (
    ML1MDataset(split_df, users, items) for split_df in (train_df, test_df, validate_df)
)

# Model

In [None]:
def l2_regularize(array):
    """
    Return the sum of the squared entries of `array` (its squared L2 norm)
    as a 0-dimensional tensor.
    """
    squared = torch.pow(array, 2.0)
    return squared.sum()

In [None]:
class MF(nn.Module):
    """
    Matrix-factorization rating model: the predicted rating for a (user, item)
    pair is the dot product of their k-dimensional embeddings.
    """
    # Iteration counter (not updated anywhere inside this class)
    itr = 0

    def __init__(self, n_user, n_item, k=10, lamda=1.0):
        """
        :param n_user: Number of distinct users (rows of the user embedding table)
        :param n_item: Number of distinct items (rows of the item embedding table)
        :param k: Embedding dimension
        :param lamda: Regularization constant
        """
        super(MF, self).__init__()

        self.k = k
        self.n_user = n_user
        self.n_item = n_item
        self.lamda = lamda

        self.user = nn.Embedding(n_user, k)
        self.item = nn.Embedding(n_item, k)

    def forward(self, user_id, item_id):
        """
        Predict ratings for a batch of (user, item) index pairs.

        :param user_id: Tensor of user indices
        :param item_id: Tensor of item indices, same length as user_id
        :return: One predicted rating per pair
        """
        vector_user = self.user(user_id)
        vector_item = self.item(item_id)

        # The user-item interaction: p_u * q_i is a dot product between the 2 vectors above
        ui_interaction = torch.sum(vector_user * vector_item, dim=1)

        return ui_interaction

    def loss(self, prediction, target):
        """
        Mean squared error between prediction and target plus
        lamda * (squared L2 norm of both embedding tables).

        :param prediction: Predicted ratings as returned by forward()
        :param target: Observed ratings with the same number of elements
        :return: Scalar loss tensor
        """
        # Align the target with the prediction's shape. A bare squeeze() would turn
        # a batch of one into a 0-d tensor and make mse_loss broadcast with a
        # shape-mismatch warning. Inputs with a different element count keep the
        # original squeeze() handling.
        if target.numel() == prediction.numel():
            target = target.reshape(prediction.shape)
        else:
            target = target.squeeze()

        # Calculate the Mean Squared Error between target = R_ui and prediction = p_u * q_i
        loss_mse = F.mse_loss(prediction, target)

        # Compute L2 regularization over user (P) and item (Q) matrices
        prior_user = l2_regularize(self.user.weight) * self.lamda
        prior_item = l2_regularize(self.item.weight) * self.lamda

        total = loss_mse + prior_user + prior_item

        return total


# Train

In [None]:
def train(model, trainloader, optimizer, device):
    """
    Run one optimisation pass over `trainloader`.

    :param model: Module exposing forward(user, item) and loss(prediction, target)
    :param trainloader: Iterable of (user, item, rating) batches
    :param optimizer: Optimizer that updates the model's parameters
    :param device: Device the batches are moved to before the forward pass
    :return: Mean of the per-batch loss values over the pass
    :raises ValueError: If `trainloader` yields no batches
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for user, item, rating in trainloader:
        user = torch.as_tensor(user, dtype=torch.long, device=device)
        item = torch.as_tensor(item, dtype=torch.long, device=device)
        rating = torch.as_tensor(rating, dtype=torch.float32, device=device)

        pred_rating = model(user, item)
        loss = model.loss(pred_rating, rating)

        model.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError("trainloader yielded no batches")

    # Divide by the number of batches, not by the last batch index (count - 1).
    return total_loss / num_batches

In [None]:
def test(model, testloader, device):
    model.eval()
    with torch.no_grad():
        mse = 0.0
        num_ratings = testloader.dataset.num_ratings
        for i, (user, item, rating) in enumerate(testloader):
            user = torch.LongTensor(user).to(device)
            item = torch.LongTensor(item).to(device)
            rating = torch.FloatTensor(rating.float()).to(device)

            pred_rating = model(user, item)
            pred_rating = torch.clamp(pred_rating, 0.0, 5.0)

            square_error = nn.MSELoss(reduction='sum')(pred_rating, rating)

            mse += square_error.item() / num_ratings
    
    return np.sqrt(mse)

In [None]:
# Hyperparameters and run configuration
lr = 0.01          # learning rate passed to the optimizer
epochs = 50        # number of passes over the training split
k = 5              # embedding dimension of the MF model
lamda = 1e-6       # weight of the L2 penalty in the model's loss
batch_size = 512   # ratings per mini-batch for every DataLoader
num_users = len(users)
num_items = len(items)
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines/runtimes without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Reshuffle the training ratings at the start of every epoch so the mini-batches
# differ between epochs. The evaluation loaders keep a fixed order, which does
# not affect RMSE.
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)
validateloader = DataLoader(validateset, batch_size=batch_size)
testloader = DataLoader(testset, batch_size=batch_size)

In [None]:
# Build the factorization model and place its parameters on the chosen device.
model = MF(n_user=num_users, n_item=num_items, k=k, lamda=lamda).to(device)

In [None]:
# Register the model with the active wandb run; log="all" requests both
# gradient and parameter tracking (per the wandb.watch documentation).
wandb.watch(model, log="all")

[<wandb.wandb_torch.TorchGraph at 0x7f1e30e40748>]

In [None]:
# Adam over every model parameter (both embedding tables), using the learning
# rate from the configuration cell.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
# Record the run configuration alongside the metrics in Weights & Biases.
params = dict(
    k=k,
    device=device,
    batch_size=batch_size,
    epochs=epochs,
    lr=lr,
    lamda=lamda,
    optimizer=optimizer,
)
wandb.config.update(params)

In [None]:
from time import time

t0 = time()  # wall-clock start of the whole training run
for epoch in range(epochs):
    # One optimisation pass, then RMSE on the training and validation splits.
    loss = train(model, trainloader, optimizer, device)
    train_rmse, valid_rmse = (
        test(model, loader, device) for loader in (trainloader, validateloader)
    )

    print('epoch {} \t train_loss {} \t train_rmse {} \t valid_rmse {}'.format(epoch+1, loss, train_rmse, valid_rmse))

    # Epochs are reported 1-based, both in the printout and as the wandb step.
    metrics = dict(train_loss=loss, train_rmse=train_rmse, valid_rmse=valid_rmse)
    wandb.log(metrics, step=epoch+1)

print("total time: ", time() - t0)

epoch 1 	 train_loss 9.132557337865817 	 train_rmse 1.0452972202990103 	 valid_rmse 1.082289954453703
epoch 2 	 train_loss 1.0014640973732534 	 train_rmse 0.927961329646787 	 valid_rmse 0.9561961631225312
epoch 3 	 train_loss 0.9273177271169597 	 train_rmse 0.9127266512598265 	 valid_rmse 0.9431116786012644
epoch 4 	 train_loss 0.8952544440813222 	 train_rmse 0.8929090393525995 	 valid_rmse 0.9283095406383498
epoch 5 	 train_loss 0.8691152001904175 	 train_rmse 0.8811796595241137 	 valid_rmse 0.9205127627907763
epoch 6 	 train_loss 0.8538089723864337 	 train_rmse 0.8742473632852857 	 valid_rmse 0.9164770694899819
epoch 7 	 train_loss 0.8444549454147686 	 train_rmse 0.8696597344875376 	 valid_rmse 0.914162832404108
epoch 8 	 train_loss 0.8381225630998913 	 train_rmse 0.8663482126940114 	 valid_rmse 0.9126002961362777
epoch 9 	 train_loss 0.833540667361767 	 train_rmse 0.8638270736726343 	 valid_rmse 0.9114642053593208
epoch 10 	 train_loss 0.8300728533421696 	 train_rmse 0.8618095282010

In [None]:
# Final evaluation: RMSE of the trained model on the held-out test split.
test_rmse = test(model, testloader, device)
print('test rmse:', test_rmse)

test rmse: 0.8945088308462378


In [None]:
# Log the test metric to the same run; no explicit step is passed here.
wandb.log({"test_rmse": test_rmse})

In [None]:
# Close the Weights & Biases run that was opened at the top of the notebook.
wandb_run.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
_step,50.0
_runtime,658.0
_timestamp,1609974494.0
train_loss,0.80196
train_rmse,0.84185
valid_rmse,0.89102
test_rmse,0.89451


0,1
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_rmse,█▄▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_rmse,█▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_rmse,▁
