<a href="https://colab.research.google.com/github/van26101998/Recommendation-System/blob/main/DeepAE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
# install required packages
!pip install -q comet_ml wandb

[K     |████████████████████████████████| 245kB 5.8MB/s 
[K     |████████████████████████████████| 1.8MB 22.7MB/s 
[K     |████████████████████████████████| 512kB 20.9MB/s 
[K     |████████████████████████████████| 204kB 37.1MB/s 
[K     |████████████████████████████████| 133kB 41.1MB/s 
[K     |████████████████████████████████| 163kB 41.5MB/s 
[K     |████████████████████████████████| 81kB 11.6MB/s 
[K     |████████████████████████████████| 102kB 14.1MB/s 
[K     |████████████████████████████████| 71kB 11.8MB/s 
[?25h  Building wheel for subprocess32 (setup.py) ... [?25l[?25hdone
  Building wheel for configobj (setup.py) ... [?25l[?25hdone


In [None]:
import wandb
# Start a Weights & Biases run in the "deeprec-new-ml1m" project. The returned
# handle is kept so the run can be closed with wandb_run.finish() at the end.
wandb_run = wandb.init(project="deeprec-new-ml1m")


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
import os
from getpass import getpass

from comet_ml import Experiment

# Never hard-code the Comet API key in the notebook: it gets committed and shared
# together with the code. The key is read from the COMET_API_KEY environment
# variable, with an interactive prompt as fallback (the typed value is not echoed).
# NOTE(review): the key that used to be written in this cell has been published
# and should be revoked / rotated in the Comet account settings.
comet_api_key = os.environ.get("COMET_API_KEY") or getpass("Comet API key: ")

# Create the Comet experiment that receives the parameters and metrics logged below.
experiment = Experiment(
    api_key=comet_api_key,
    project_name="deeprec-new-ml1m",
    workspace="van26101998",
)

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/van26101998/deeprec-new-ml1m/d2fdb5df6f7b4c198ad05cae97f5a717



In [None]:
# Global random seed. It is passed as `random_state` to the train/validate/test
# split and logged with the run parameters. A fixed integer makes the split (and
# therefore the reported metrics) reproducible; with None every run differs.
SEED = 42

In [None]:
from typing import List, Dict, Text
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, Subset

from torchsummary import summary
import numpy as np
import pandas as pd
import zipfile
import random

from sklearn.model_selection import train_test_split
# Seed every random number generator available in this notebook so that weight
# initialisation and dropout are reproducible. Skipped when SEED is None, because
# torch.manual_seed() does not accept None.
if SEED is not None:
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

# MovieLens 1M Dataset

In [None]:
# Download the MovieLens 1M archive (-c continues a partially downloaded file)
!wget -c http://files.grouplens.org/datasets/movielens/ml-1m.zip
import zipfile
# Extract every member of the archive into the current working directory
with zipfile.ZipFile('ml-1m.zip', 'r') as zip_ref:
    zip_ref.extractall()

--2021-01-06 16:12:15--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip’


2021-01-06 16:12:17 (4.27 MB/s) - ‘ml-1m.zip’ saved [5917549/5917549]



In [None]:
# Column names are supplied explicitly; the multi-character "::" separator
# requires pandas' python parsing engine.
ratings_df = pd.read_csv('./ml-1m/ratings.dat', sep="::", engine='python', names=['user', 'item', 'rating', 'timestamp'])
# Sorted unique ids: their positions in these lists are the indices used by ML1MDataset
users = sorted(ratings_df['user'].unique())
items = sorted(ratings_df['item'].unique())
ratings_df

Unnamed: 0,user,item,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [None]:
# train validate test split
# 10% of all ratings are held out for testing, then 10% of the remaining
# ratings are held out for validation; both splits use SEED as random_state.
train_df, test_df = train_test_split(ratings_df, test_size=0.1, random_state=SEED)
train_df, validate_df = train_test_split(train_df, test_size=0.1, random_state=SEED)

In [None]:
# Sanity check: number of ratings in each of the three splits
print("number of train-validate-test:", len(train_df), len(validate_df), len(test_df))

number of train-validate-test: 810169 90019 100021


In [None]:
class ML1MDataset(Dataset):
    """
        MovieLens 1M Dataset.

        One sample per user: a dense float vector with one entry per item that
        holds the user's rating of that item, or 0 where no rating is available.

        + Params:
            - ratings_df: DataFrame whose first three columns are user, item, rating
              (further columns, e.g. the timestamp of ratings.dat, are ignored)
            - users: list all users; position in the list = sample index
            - items: list all items; position in the list = index in the rating vector

        + Attributes:
            - data: dictionary user -> {item -> rating}
            - num_users, num_items: lengths of `users` and `items`
            - num_ratings: number of rows of ratings_df
            - user_to_id, item_to_id: value -> position lookups for `users` / `items`
    """
    def __init__(self, ratings_df: pd.DataFrame, users: List[int], items: List[int]):
        # dictionary with key - user, value - (a dictionary key - item, value - rating)
        self.data = {}
        # itertuples keeps each column's own type (to_numpy() would up-cast the
        # whole row to one common dtype) and avoids building a numpy row per rating.
        for user, item, rating, *_ in ratings_df.itertuples(index=False, name=None):
            self.data.setdefault(user, {})[item] = rating

        self.users = users
        self.items = items

        self.num_users = len(self.users)
        self.num_items = len(self.items)
        self.num_ratings = len(ratings_df)

        self.user_to_id = {user: index for index, user in enumerate(self.users)}
        self.item_to_id = {item: index for index, item in enumerate(self.items)}

    def __len__(self):
        """Number of samples = number of users (not number of ratings)."""
        return self.num_users

    def __getitem__(self, idx):
        """Return the rating vector (float32 tensor of length num_items) of user number idx."""
        user = self.users[idx]

        # The buffer is float32 from the start: an integer buffer silently
        # truncates fractional ratings (3.5 -> 3) when they are written into it.
        ratings_vector = np.zeros(self.num_items, dtype=np.float32)

        # A user without any rating in this split simply yields an all-zero vector.
        for item, rating in self.data.get(user, {}).items():
            ratings_vector[self.item_to_id[item]] = rating

        return torch.from_numpy(ratings_vector)

In [None]:
# All three datasets share the same `users` / `items` lists, so sample indices
# and rating-vector positions refer to the same user / item in every split.
trainset =  ML1MDataset(train_df, users, items)
testset = ML1MDataset(test_df, users, items)
validateset = ML1MDataset(validate_df, users, items)

In [None]:
# Number of users = number of samples in each dataset
trainset.num_users

6040

# Model

In [None]:
def get_activation_layer(activation_func: str):
    """
        Return a new activation module for the given name (case-insensitive).
        Supported names: `selu`, `relu`, `sigmoid`, `tanh`.
        Any other name raises KeyError.
    """
    factories = {
        "selu": nn.SELU,
        "relu": nn.ReLU,
        "sigmoid": nn.Sigmoid,
        "tanh": nn.Tanh,
    }
    name = str.lower(activation_func)
    return factories[name]()

In [None]:
class DeepAutoEncoder(nn.Module):
    def __init__(self, layer_sizes, activation_func='selu', is_constrained=True, dropout=0.0):
        """
            Params:
                layer_sizes: list of integers, if layer_sizes = [1000, 512, 256], the model has 2 encoder layers (1000*512) -> (512*256), and 2 decoder layers (256*512) -> (512*1000)
                activation_func: activation function, one of `relu`, `tanh`, `sigmoid`, `selu`
                is_constrained: if True, the decoder re-uses weights of the encoder
                    (each decoder layer applies the transposed weight of its mirror
                    encoder layer and only owns a bias)
                dropout: dropout probability applied to the output of the encoder;
                    no dropout layer is added when it is 0
        """
        super(DeepAutoEncoder, self).__init__()

        layers = list()
        encoder_linears = list()

        # encoder
        for in_features, out_features in zip(layer_sizes[:-1], layer_sizes[1:]):
            linear_layer = nn.Linear(in_features, out_features)
            self.init_weight(linear_layer)
            encoder_linears.append(linear_layer)
            layers.append(linear_layer)
            layers.append(get_activation_layer(activation_func))

        if dropout > 0:
            layers.append(nn.Dropout(dropout))

        # decoder: mirror of the encoder, walking the layer sizes backwards
        for i in range(len(layer_sizes) - 1, 0, -1):
            if is_constrained:
                # encoder_linears[i - 1] maps layer_sizes[i-1] -> layer_sizes[i];
                # the tied layer maps back layer_sizes[i] -> layer_sizes[i-1].
                # Copying a transposed view into `.weight.data` (the previous
                # approach) only ties the initial values: Module.to(device) or a
                # dtype cast replaces the data of each parameter separately, after
                # which encoder and decoder are trained independently.
                linear_layer = _TiedLinear(encoder_linears[i - 1])
            else:
                linear_layer = nn.Linear(layer_sizes[i], layer_sizes[i - 1])
                self.init_weight(linear_layer)
            layers.append(linear_layer)
            layers.append(get_activation_layer(activation_func))

        self.dae = nn.Sequential(*layers)


    def init_weight(self, layer):
        """Xavier-uniform weights and constant 0.01 bias for nn.Linear layers; other layers are left untouched."""
        if isinstance(layer, nn.Linear):
            nn.init.xavier_uniform_(layer.weight)
            layer.bias.data.fill_(0.01)


    def forward(self, x):
        """Run x through encoder, optional dropout and decoder; the output has the shape of x."""
        return self.dae(x)


class _TiedLinear(nn.Module):
    """
        Decoder layer computing x @ W + b, where W is the weight of an encoder
        nn.Linear layer (stored as out_features x in_features) and b is a bias
        owned by this layer.
    """
    def __init__(self, encoder_layer):
        super(_TiedLinear, self).__init__()
        # The very same Parameter object is registered in both layers, so there
        # is a single weight tensor that receives the gradients of both uses.
        self.weight = encoder_layer.weight
        # Output size of this layer = input size of the encoder layer. The bias
        # starts at 0.01 like the biases set by DeepAutoEncoder.init_weight.
        self.bias = nn.Parameter(torch.full((encoder_layer.in_features,), 0.01))

    def forward(self, x):
        return nn.functional.linear(x, self.weight.t(), self.bias)

# Training and evaluation functions

In [None]:
def masked_se(y_true, y_pred):
    """
        Squared error summed over the observed (non-zero) entries of y_true only;
        predictions at positions where y_true is 0 are ignored.
        Return: (sum of squared errors, number of observed entries)
    """
    observed = (y_true != 0).float()
    sum_squared_error = nn.MSELoss(reduction='sum')(y_pred * observed, y_true)

    return sum_squared_error, torch.sum(observed)

In [None]:
def train(model, optimizer, data_loader, device):
    """
        Train the model for one pass over data_loader
        Params:
            model: network that reconstructs a batch of rating vectors
            optimizer: optimizer updating the parameters of `model`
            data_loader: iterable of rating batches (0 = no rating); its
                `dataset.num_ratings` attribute must hold the total number of ratings
            device: device every batch is moved to before the forward pass
        Return:
            mean squared error over all ratings of the dataset, accumulated in
            train mode from the forward pass that precedes each update
    """
    model.train()
    mse_total = 0.0
    num_ratings_total = data_loader.dataset.num_ratings

    for batch in data_loader:
        ratings = batch.to(device)
        output = model(ratings)
        square_error, num_ratings = masked_se(ratings.float(), output)

        if num_ratings.item() == 0:
            # A batch without any observed rating would give a 0/0 = NaN loss
            # whose NaN gradients corrupt every weight; it carries no training
            # signal (its squared error is 0), so it is skipped.
            continue

        # The update uses the mean error of the batch ...
        loss = square_error / num_ratings
        model.zero_grad()
        loss.backward()
        optimizer.step()

        # ... while the reported value is normalised by the size of the dataset
        mse_total += square_error.item() / num_ratings_total


    return mse_total

In [None]:
def test(model, data_loader, device):
    """
        Evaluate the model in eval mode, without gradient tracking.
        Every batch of data_loader is both the model input and the target;
        predictions are clamped to [1.0, 5.0] and only non-zero target entries
        contribute (see masked_se).
        Return: RMSE over data_loader.dataset.num_ratings ratings
    """
    model.eval()
    total_ratings = data_loader.dataset.num_ratings
    mse_total = 0.0

    with torch.no_grad():
        for batch in data_loader:
            ratings = batch.to(device)
            prediction = torch.clamp(model(ratings), 1.0, 5.0)
            batch_square_error, _ = masked_se(ratings.float(), prediction)
            mse_total += batch_square_error.item() / total_ratings

    return np.sqrt(mse_total)

# Train on MovieLens1M

In [None]:
# parameters
input_dim = len(items)                      # one input/output unit per item
layer_sizes = [input_dim, 512, 512, 1024]   # encoder sizes; the decoder mirrors them
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs (slowly) on a machine or Colab runtime without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_size = 512                            # users per batch
epochs = 100
lr = 0.005                                  # SGD learning rate
momentum = 0.9                              # SGD momentum
dropout = 0.8                               # dropout probability after the encoder
activation = 'selu'
is_constrained = True                       # decoder re-uses the encoder weights

In [None]:
# Shuffle the training users every epoch: without it SGD sees exactly the same
# batches in exactly the same order in all epochs. The order is irrelevant for
# evaluation, so the validation and test loaders keep the dataset order.
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)
validateloader = DataLoader(validateset, batch_size=batch_size)
testloader = DataLoader(testset, batch_size=batch_size)

In [None]:
# Build the autoencoder from the hyper-parameters defined above and move it to `device`
model = DeepAutoEncoder(layer_sizes, dropout=dropout, is_constrained=is_constrained, activation_func=activation)
model = model.to(device)
# Per-layer summary for an input of shape (1, number of items)
summary(model, (1, layer_sizes[0]))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1               [-1, 1, 512]       1,897,984
              SELU-2               [-1, 1, 512]               0
            Linear-3               [-1, 1, 512]         262,656
              SELU-4               [-1, 1, 512]               0
            Linear-5              [-1, 1, 1024]         525,312
              SELU-6              [-1, 1, 1024]               0
           Dropout-7              [-1, 1, 1024]               0
            Linear-8               [-1, 1, 512]         524,800
              SELU-9               [-1, 1, 512]               0
           Linear-10               [-1, 1, 512]         262,656
             SELU-11               [-1, 1, 512]               0
           Linear-12              [-1, 1, 3706]       1,901,178
             SELU-13              [-1, 1, 3706]               0
Total params: 5,374,586
Trainable param

In [None]:
# Register the model with the W&B run; per the wandb.watch documentation,
# log="all" tracks both gradients and parameter values.
wandb.watch(model, log="all")

[<wandb.wandb_torch.TorchGraph at 0x7fe94a805b00>]

In [None]:
# SGD with momentum over all parameters of the model
optimizer = torch.optim.SGD(params=model.parameters(), lr=lr, momentum=momentum)

In [None]:
# Collect the configuration of this run once and send the same dictionary to
# both trackers (Comet and W&B) so the two dashboards stay consistent.
params = {
    "seed": SEED,
    "layer_sizes": layer_sizes,
    "device": device,
    "batch_size": batch_size,
    "epochs": epochs,
    "lr": lr,
    "dropout": dropout,
    "momentum": momentum,
    "activation": activation,
    "optimizer": optimizer,
    "is_constrained": is_constrained
}
experiment.log_parameters(params)
wandb.config.update(params)

In [None]:
from time import time
t0 = time()
for epoch in range(epochs):
    # One optimisation pass; train_loss is the MSE accumulated in train mode
    train_loss = train(model, optimizer, trainloader, device)
    # RMSE in eval mode with predictions clamped to [1, 5] (hence the "_clip" suffix)
    train_rmse_clip = test(model, trainloader, device)
    valid_rmse_clip = test(model, validateloader, device)
    
    print('epoch {} \t train_loss {} \t train_rmse {} \t valid_rmse {}'.format(epoch+1, train_loss, train_rmse_clip, valid_rmse_clip))

    # Same metrics go to both trackers; epochs are reported 1-based
    metrics = {
        "train_loss": train_loss,
        "train_rmse_clip": train_rmse_clip,
        "valid_rmse_clip": valid_rmse_clip
    }
    experiment.log_metrics(metrics, epoch=epoch+1)
    wandb.log(metrics, step=epoch+1)
print("total time: ", time() - t0)

epoch 1 	 train_loss 8.933366718857423 	 train_rmse 1.6881575236094908 	 valid_rmse 1.7038445458673808
epoch 2 	 train_loss 4.655088340364788 	 train_rmse 1.4450904144208423 	 valid_rmse 1.6100743573638279
epoch 3 	 train_loss 3.2377958526245756 	 train_rmse 1.3471300982336327 	 valid_rmse 1.51621795128371
epoch 4 	 train_loss 2.672079413369309 	 train_rmse 1.3021274628328374 	 valid_rmse 1.4588657175762385
epoch 5 	 train_loss 2.414370315792137 	 train_rmse 1.2812029970316272 	 valid_rmse 1.407678667733789
epoch 6 	 train_loss 2.2103950205913825 	 train_rmse 1.2832135208535584 	 valid_rmse 1.3906814719889964
epoch 7 	 train_loss 2.0612856893129705 	 train_rmse 1.35500457637447 	 valid_rmse 1.4035377944999925
epoch 8 	 train_loss 1.920708702057842 	 train_rmse 1.3941718537298486 	 valid_rmse 1.4028372961834676
epoch 9 	 train_loss 1.819818534774843 	 train_rmse 1.3907471591817155 	 valid_rmse 1.3917491024657502
epoch 10 	 train_loss 1.737774088261215 	 train_rmse 1.3992142162815713 	 v

In [None]:
# Final evaluation on the held-out test ratings, run once after training
test_rmse_clip = test(model, testloader, device)
print('test rmse:', test_rmse_clip)

test rmse: 0.9012494303942292


In [None]:
# Report the test RMSE to both trackers
experiment.log_metric("test_rmse_clip", test_rmse_clip)
wandb.log({"test_rmse_clip": test_rmse_clip})

In [None]:
# Close both tracking sessions now that all metrics have been logged
wandb_run.finish()
experiment.end()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,0.9605
train_rmse_clip,0.9123
valid_rmse_clip,0.89611
_step,100.0
_runtime,135.0
_timestamp,1609949660.0
test_rmse_clip,0.90125


0,1
train_loss,█▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_rmse_clip,█▅▄▅▅▆▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁
valid_rmse_clip,█▆▅▅▅▅▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
test_rmse_clip,▁


COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/van26101998/deeprec-new-ml1m/d2fdb5df6f7b4c198ad05cae97f5a717
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     loss [120]            : (0.840811014175415, 10.928566932678223)
COMET INFO:     test_rmse_clip        : 0.9012494303942292
COMET INFO:     train_loss [100]      : (0.870921250303949, 8.933366718857423)
COMET INFO:     train_rmse_clip [100] : (0.9122967889613783, 1.6881575236094908)
COMET INFO:     valid_rmse_clip [100] : (0.8961099414195466, 1.7038445458673808)
COMET INFO:   Parameters:
COMET INFO:     activation     : selu
COMET INFO:     batch_size     : 512
COMET INFO:     device         : cuda
COMET INFO:     dropout        : 0.9
COMET INFO:     epochs         : 100
COMET INFO:     is_constrained : True
COMET INFO:     layer_

In [None]:
# Sum of the training ratings of the first user (unrated items contribute 0)
trainset[0].sum()

tensor(184.)

In [None]:

# Reconstruct the rating vector of the first user. eval() and no_grad() are set
# here explicitly so that the result does not depend on the mode an earlier cell
# happened to leave the model in (dropout must be off for inference) and no
# autograd graph is built. trainset[0] already is a float tensor.
model.eval()
with torch.no_grad():
    out = model(trainset[0].to(device))

In [None]:
# Print "actual rating, reconstructed rating" for every item the first user rated
for i, data in enumerate(trainset[0]):
    if data > 0:
        print(data.item(), out[i].item())

5.0 4.296086311340332
5.0 2.9972264766693115
4.0 4.591500759124756
5.0 4.787888050079346
4.0 3.6943352222442627
4.0 4.150211811065674
5.0 3.8260910511016846
3.0 2.81754469871521
3.0 4.17350435256958
3.0 4.108846187591553
4.0 3.1970226764678955
3.0 4.244326114654541
4.0 4.247074604034424
5.0 3.799407958984375
5.0 3.897704601287842
5.0 3.429720640182495
5.0 3.95233154296875
4.0 4.261758327484131
3.0 4.179635047912598
4.0 4.138547897338867
5.0 4.420731067657471
5.0 4.045830249786377
4.0 3.5138978958129883
4.0 3.5121426582336426
4.0 3.6441495418548584
5.0 3.080186605453491
4.0 3.5250449180603027
5.0 4.223467826843262
4.0 3.650820732116699
5.0 4.505273342132568
4.0 3.093581438064575
3.0 3.7022030353546143
3.0 3.10095477104187
5.0 3.8921313285827637
4.0 3.89554762840271
3.0 3.4256186485290527
4.0 3.575822353363037
4.0 3.828672409057617
5.0 4.424103260040283
4.0 4.306410789489746
5.0 3.786360025405884
4.0 4.239483833312988
4.0 3.4903810024261475
4.0 3.9164211750030518
