<a href="https://colab.research.google.com/github/van26101998/Recommendation-System/blob/main/DeepAutoEncoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prerequisites

In [None]:
# install required packages: the Comet ML and Weights & Biases tracking clients
# NOTE(review): versions are not pinned, so a re-run may pick up different releases.
!pip install -q comet_ml wandb

In [None]:
import wandb
# Start a Weights & Biases run; the handle is kept so the run can be finished at the end of the notebook.
wandb_run = wandb.init(project="deeprec-new-ml1m")


[34m[1mwandb[0m: Currently logged in as: [33mvan26101998[0m (use `wandb login --relogin` to force relogin)


In [None]:
import os

from comet_ml import Experiment

# Create the Comet experiment.
# The API key is a secret and must not be stored in the notebook: it is taken
# from the COMET_API_KEY environment variable (set it with `%env`, a Colab
# secret or the shell before running this cell). When the variable is unset,
# `None` is passed and Comet falls back to its own configuration lookup.
# NOTE(review): the key that used to be hard-coded here has been published with
# the notebook and should be revoked.
experiment = Experiment(
    api_key=os.environ.get("COMET_API_KEY"),
    project_name="deeprec-new-ml1m",
    workspace="van26101998",
)

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/van26101998/deeprec-new-ml1m/ec6377ec01674b74a830f371aaaea205



In [None]:
# Random seed handed to the train/validate/test splits and logged with the run parameters.
SEED = 42

In [None]:
from typing import List, Dict, Text
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, Subset

from torchsummary import summary
import numpy as np
import pandas as pd
import zipfile
import random

from sklearn.model_selection import train_test_split
# Seed every random number generator imported above so that weight
# initialisation and dropout are repeatable for the logged SEED.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Movielens 1M Dataset

In [None]:
!wget -c http://files.grouplens.org/datasets/movielens/ml-1m.zip
import zipfile
with zipfile.ZipFile('ml-1m.zip', 'r') as zip_ref:
    zip_ref.extractall()

--2020-12-26 09:45:01--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [None]:
# ratings.dat is read without a header row (column names are supplied here) and
# with the multi-character separator "::", hence engine='python'.
ratings_df = pd.read_csv('./ml-1m/ratings.dat', sep="::", engine='python', names=['user', 'item', 'rating', 'timestamp'])
# Sorted unique ids: a user's / item's position in these lists is the index used by the dataset below.
users = sorted(ratings_df['user'].unique())
items = sorted(ratings_df['item'].unique())
ratings_df

Unnamed: 0,user,item,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [None]:
# train validate test split
train_df, test_df = train_test_split(ratings_df, test_size=0.1, random_state=SEED)
train_df, validate_df = train_test_split(train_df, test_size=0.1, random_state=SEED)

In [None]:
# Sanity check: number of ratings in each split.
print("number of train-validate-test:", len(train_df), len(validate_df), len(test_df))

number of train-validate-test: 810169 90019 100021


In [None]:
class ML1MDataset(Dataset):
    """
        MovieLens 1M Dataset: one dense rating vector per user.

        Sample `idx` is a float32 tensor of length `len(items)` holding the ratings
        that user `users[idx]` gave in `ratings_df`; positions of items without a
        rating are 0. Users that have no rating in `ratings_df` yield an all-zero vector.

        + Params:
            - ratings_df: DataFrame with 4 columns: user, item, rating, timestamp
            - users: list all users (position in the list = sample index)
            - items: list all items (position in the list = index inside the vector)
    """
    def __init__(self, ratings_df: pd.DataFrame, users: List[int], items: List[int]):
        self.users = users
        self.items = items

        self.num_users = len(self.users)
        self.num_items = len(self.items)

        self.user_to_id = {user: index for index, user in enumerate(self.users)}
        self.item_to_id = {item: index for index, item in enumerate(self.items)}

        # dictionary with key - user, value - (a dictionary key - item, value - rating)
        self.data = {}
        for user, item, rating, _timestamp in ratings_df.to_numpy():
            self.data.setdefault(user, {})[item] = rating

    def __len__(self):
        # one sample per known user, whether or not the user has ratings in this split
        return self.num_users

    def __getitem__(self, idx):
        user = self.users[idx]

        # float32 buffer: an integer buffer would silently truncate fractional ratings
        ratings_vector = np.zeros(self.num_items, dtype=np.float32)
        for item, rating in self.data.get(user, {}).items():
            # raises KeyError for an item that is missing from `items`
            ratings_vector[self.item_to_id[item]] = rating

        return torch.from_numpy(ratings_vector)

In [None]:
# Every split is indexed with the full user and item lists, so all three
# datasets have the same length and the same vector width.
trainset, testset, validateset = (
    ML1MDataset(split_df, users, items)
    for split_df in (train_df, test_df, validate_df)
)

In [None]:
# Number of samples in the dataset: one per entry of `users`.
trainset.num_users

6040

# Model

In [None]:
def get_activation_layer(activation_func: str):
    """
        Create the activation module for a name (matched case-insensitively).
        Supported names: `selu`, `relu`, `sigmoid`, `tanh`.
        Any other name raises KeyError.
    """
    factories = {
        "selu": nn.SELU,
        "relu": nn.ReLU,
        "sigmoid": nn.Sigmoid,
        "tanh": nn.Tanh,
    }
    key = str.lower(activation_func)

    return factories[key]()

In [None]:
class _TiedLinear(nn.Module):
    """
        Decoder layer of the constrained autoencoder.
        Computes `x @ W + b`, where `W` is the weight matrix of an encoder `nn.Linear`
        (the transposed encoder weight acts as this layer's weight). The weight is read
        from the encoder layer on every forward pass; only the bias belongs to this module.
    """
    def __init__(self, encoder_layer):
        super(_TiedLinear, self).__init__()
        # Held in a tuple so nn.Module does not register the encoder layer a second
        # time as a child of this module (it already is a member of the Sequential).
        self._encoder_ref = (encoder_layer,)
        self.in_features = encoder_layer.out_features
        self.out_features = encoder_layer.in_features
        self.bias = nn.Parameter(torch.empty(
            self.out_features,
            dtype=encoder_layer.weight.dtype,
            device=encoder_layer.weight.device,
        ))
        # same value range nn.Linear draws its bias from by default
        bound = self.in_features ** -0.5
        nn.init.uniform_(self.bias, -bound, bound)

    def forward(self, x):
        return nn.functional.linear(x, self._encoder_ref[0].weight.t(), self.bias)

    def extra_repr(self):
        return 'in_features={}, out_features={}, tied=True'.format(self.in_features, self.out_features)


class DeepAutoEncoder(nn.Module):
    def __init__(self, layer_sizes, activation_func='selu', is_constrained=True, dropout=0.0):
        """
            Params:
                layer_sizes: list of integers, if layer_sizes = [1000, 512, 256], the model has 2 encoder layers (1000*512) -> (512*256), and 2 decoder layers (256*512) -> (512*1000)
                activation_func: activation function, one of `relu`, `tanh`, `sigmoid`, `selu`; it follows every linear layer, including the last decoder layer
                is_constrained: if True, the decoder re-uses weights of the encoder (each decoder layer applies the transposed weight of its mirror encoder layer and only owns a bias)
                dropout: dropout probability applied to the encoder output; no dropout layer is added when it is 0
        """
        super(DeepAutoEncoder, self).__init__()

        layers = list()
        encoder_linears = list()

        # encoder
        for i in range(len(layer_sizes) - 1):
            linear_layer = nn.Linear(layer_sizes[i], layer_sizes[i+1])
            self.init_weight(linear_layer)
            encoder_linears.append(linear_layer)
            layers.append(linear_layer)
            layers.append(get_activation_layer(activation_func))

        if dropout > 0:
            layers.append(nn.Dropout(dropout))

        # decoder: mirrors the encoder, from the innermost layer outwards
        for i in range(len(layer_sizes) - 1, 0, -1):
            if is_constrained:
                # Tie in the forward pass. Assigning a transposed view to `weight.data`
                # only aliases storage, and that aliasing is lost as soon as the module
                # is converted (e.g. `.to('cuda')`), leaving two independent matrices.
                linear_layer = _TiedLinear(encoder_linears[i-1])
            else:
                linear_layer = nn.Linear(layer_sizes[i], layer_sizes[i-1])
                self.init_weight(linear_layer)
            layers.append(linear_layer)
            layers.append(get_activation_layer(activation_func))

        self.dae = nn.Sequential(*layers)


    def init_weight(self, layer):
        """Xavier-uniform weights and a constant bias of 0.01 for `nn.Linear` layers; other layers are left untouched."""
        if isinstance(layer, nn.Linear):
            nn.init.xavier_uniform_(layer.weight)
            layer.bias.data.fill_(0.01)


    def forward(self, x):
        """Encode and decode `x`; the output has the same last dimension as the input."""
        return self.dae(x)

# Evaluate function

In [None]:
def masked_se(y_true, y_pred):
    """
        Sum of squared errors over the observed ratings.
        Entries where `y_true` is 0 are treated as unobserved: the prediction is
        zeroed there, so those positions add nothing to the sum.
        Return: scalar tensor
    """
    observed = torch.ne(y_true, 0).float()

    return nn.functional.mse_loss(y_pred * observed, y_true, reduction='sum')

In [None]:
def masked_mse(y_true, y_pred):
    """
        Mean squared error over the observed ratings.
        Entries where `y_true` is 0 are treated as unobserved and ignored.
        Params:
            y_true: target ratings, 0 marks a missing rating
            y_pred: predictions, same shape as `y_true`
        Return: scalar tensor; 0 when `y_true` holds no rating at all
    """
    mask = (y_true != 0).float()
    # at least 1, so a batch without any rating gives 0 instead of 0 / 0 = NaN
    num_ratings = torch.clamp(torch.sum(mask), min=1.0)
    squared_error = nn.functional.mse_loss(y_pred * mask, y_true, reduction='sum')

    return squared_error / num_ratings

In [None]:
def masked_rmse(y_true, y_pred):
    """
        Root of `masked_mse` for the same arguments.
        Return: scalar tensor
    """
    mse = masked_mse(y_true, y_pred)

    return mse.sqrt()

In [None]:
def masked_rmse_clip(y_true, y_pred, min_clip=1, max_clip=5):
    """
        `masked_rmse` computed after limiting the predictions to [min_clip, max_clip].
        Return: scalar tensor
    """
    clipped = torch.clamp(y_pred, min=min_clip, max=max_clip)

    return masked_rmse(y_true, clipped)

# Train-test function

In [None]:
def train(model, optimizer, data_loader, criterion, device):
    """
        Run one optimisation pass over `data_loader`
        Params:
            model: network being trained; every batch is both its input and its target
            optimizer: optimizer stepped once per batch
            data_loader: iterable of batch tensors
            criterion: loss function, called as criterion(target, prediction)
            device: device the batches are moved to
        Return:
            mean of the per-batch losses
    """
    model.train()
    running_loss, num_batches = 0.0, 0
    for num_batches, batch in enumerate(data_loader, start=1):
        batch = batch.to(device)
        prediction = model(batch)
        loss = criterion(batch.float(), prediction)
        # clear the gradients left by the previous batch before back-propagating
        model.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    return running_loss / num_batches

In [None]:
def test(model, data_loader, device):
    """
        Evaluate the model
        Params:
            model: network to evaluate (switched to eval mode)
            data_loader: iterable of batch tensors; every batch is fed to the model and is also the target
            device: device the batches are moved to for the forward pass
        Return: RMSE over the non-zero target entries, with predictions clipped by `masked_rmse_clip` (scalar CPU tensor)
    """
    model.eval()
    targets, predicts = list(), list()
    with torch.no_grad():
        for data in data_loader:
            output = model(data.to(device))
            # keep tensors instead of converting each batch to nested Python lists
            targets.append(data.float().cpu())
            predicts.append(output.float().cpu())

    if targets:
        targets, predicts = torch.cat(targets), torch.cat(predicts)
    else:
        # empty loader: evaluate on empty tensors, as before
        targets, predicts = torch.Tensor(), torch.Tensor()
    # TODO
    # NOTE(review): input and target are the same batch, so the score measures how well
    # ratings that were given to the model are reconstructed, not how well held-out
    # ratings are predicted from training ratings - confirm this is intended.
    return masked_rmse_clip(targets, predicts)

# Train on MovieLens1M

In [None]:
# parameters
input_dim = len(items)
layer_sizes = [input_dim, 512, 512, 1024]
device = torch.device('cuda')
batch_size = 512
epochs = 100
lr = 0.005
momentum = 0.9
criterion = masked_mse
dropout = 0.8
activation = 'selu'
is_constrained = True

In [None]:
# One loader per split, all with the same batch size; no shuffling is requested,
# so the users are visited in index order.
trainloader, validateloader, testloader = (
    DataLoader(split, batch_size=batch_size)
    for split in (trainset, validateset, testset)
)

In [None]:
# Build the autoencoder from the hyper-parameters above and move it to the chosen device.
model = DeepAutoEncoder(layer_sizes, dropout=dropout, is_constrained=is_constrained, activation_func=activation)
model = model.to(device)
# Per-layer overview for an input of shape (1, number of items).
summary(model, (1, layer_sizes[0]))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1               [-1, 1, 512]       1,897,984
              SELU-2               [-1, 1, 512]               0
            Linear-3               [-1, 1, 512]         262,656
              SELU-4               [-1, 1, 512]               0
            Linear-5              [-1, 1, 1024]         525,312
              SELU-6              [-1, 1, 1024]               0
           Dropout-7              [-1, 1, 1024]               0
            Linear-8               [-1, 1, 512]         524,800
              SELU-9               [-1, 1, 512]               0
           Linear-10               [-1, 1, 512]         262,656
             SELU-11               [-1, 1, 512]               0
           Linear-12              [-1, 1, 3706]       1,901,178
             SELU-13              [-1, 1, 3706]               0
Total params: 5,374,586
Trainable param

In [None]:
# Let W&B track the model; log="all" asks for both gradients and parameters.
wandb.watch(model, log="all")

[<wandb.wandb_torch.TorchGraph at 0x7f57f57de080>]

In [None]:
# SGD with momentum over all model parameters.
optimizer = torch.optim.SGD(params=model.parameters(), lr=lr, momentum=momentum)

In [None]:
# Hyper-parameters of this run, recorded in both trackers (Comet and W&B).
params = dict(
    seed=SEED,
    layer_sizes=layer_sizes,
    device=device,
    batch_size=batch_size,
    epochs=epochs,
    lr=lr,
    dropout=dropout,
    momentum=momentum,
    activation=activation,
    optimizer=optimizer,
    is_constrained=is_constrained,
)
experiment.log_parameters(params)
wandb.config.update(params)

In [None]:
from time import time
# Train for `epochs` passes; after each pass the clipped RMSE is measured on the
# training and validation loaders and the three metrics are sent to both trackers.
t0 = time()
for epoch in range(epochs):
    train_loss = train(model, optimizer, trainloader, criterion, device)
    # test() switches the model to eval mode; train() switches it back at the next epoch
    train_rmse_clip = test(model, trainloader, device)
    valid_rmse_clip = test(model, validateloader, device)
    # print('epoch {} \t train_loss {} '.format(epoch+1, train_loss))
    print('epoch {} \t train_loss {} \t train_rmse {} \t valid_rmse {}'.format(epoch+1, train_loss, train_rmse_clip, valid_rmse_clip))

    metrics = {
        "train_loss": train_loss,
        "train_rmse_clip": train_rmse_clip,
        "valid_rmse_clip": valid_rmse_clip
    }
    # epochs are reported 1-based to Comet
    experiment.log_metrics(metrics, epoch=epoch+1)
    wandb.log(metrics)
# wall-clock duration of the whole loop, in seconds
print("total time: ", time() - t0)

epoch 1 	 train_loss 8.66086765130361 	 train_rmse 1.6905927658081055 	 valid_rmse 1.651010513305664
epoch 2 	 train_loss 4.250160257021586 	 train_rmse 1.354318618774414 	 valid_rmse 1.4420223236083984
epoch 3 	 train_loss 2.712116519610087 	 train_rmse 1.2138255834579468 	 valid_rmse 1.2998994588851929
epoch 4 	 train_loss 2.1522141098976135 	 train_rmse 1.1271229982376099 	 valid_rmse 1.197977900505066
epoch 5 	 train_loss 1.878717948993047 	 train_rmse 1.0759445428848267 	 valid_rmse 1.1381431818008423
epoch 6 	 train_loss 1.7160055041313171 	 train_rmse 1.043956995010376 	 valid_rmse 1.0980967283248901
epoch 7 	 train_loss 1.5928086737791698 	 train_rmse 1.027258038520813 	 valid_rmse 1.0864953994750977
epoch 8 	 train_loss 1.5087817509969075 	 train_rmse 1.0170612335205078 	 valid_rmse 1.0792722702026367
epoch 9 	 train_loss 1.439484308163325 	 train_rmse 1.00852632522583 	 valid_rmse 1.0666700601577759
epoch 10 	 train_loss 1.371719519297282 	 train_rmse 0.9943221211433411 	 val

In [None]:
# Final evaluation on the test loader; the result is a scalar tensor.
test_rmse_clip = test(model, testloader, device)
print('test rmse:', test_rmse_clip)

test rmse: tensor(0.8897)


In [None]:
# Record the final test metric in both trackers.
experiment.log_metric("test_rmse_clip", test_rmse_clip)
wandb.log({"test_rmse_clip": test_rmse_clip})

In [None]:
# Close the W&B run and the Comet experiment.
wandb_run.finish()
experiment.end()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,0.80493
train_rmse_clip,0.88516
valid_rmse_clip,0.88174
_step,100.0
_runtime,686.0
_timestamp,1608976581.0
test_rmse_clip,0.88973


0,1
train_loss,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_rmse_clip,█▄▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_rmse_clip,█▅▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
test_rmse_clip,▁


COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/van26101998/deeprec-new-ml1m/ec6377ec01674b74a830f371aaaea205
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     loss [120]            : (0.7562607526779175, 10.384223937988281)
COMET INFO:     test_rmse_clip        : 0.8897333741188049
COMET INFO:     train_loss [100]      : (0.7901901056369146, 8.66086765130361)
COMET INFO:     train_rmse_clip [100] : (0.8383203148841858, 1.6905927658081055)
COMET INFO:     valid_rmse_clip [100] : (0.8527966737747192, 1.651010513305664)
COMET INFO:   Parameters:
COMET INFO:     activation     : selu
COMET INFO:     batch_size     : 512
COMET INFO:     device         : cuda
COMET INFO:     dropout        : 0.8
COMET INFO:     epochs         : 100
COMET INFO:     is_constrained : True
COMET INFO:     layer_