<a href="https://colab.research.google.com/github/van26101998/Recommendation-System/blob/main/NeuFM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install and import

In [1]:
# install required packages
!pip install -q comet_ml wandb

[K     |████████████████████████████████| 245kB 4.0MB/s 
[K     |████████████████████████████████| 1.8MB 53.5MB/s 
[K     |████████████████████████████████| 204kB 56.9MB/s 
[K     |████████████████████████████████| 512kB 55.9MB/s 
[K     |████████████████████████████████| 81kB 11.9MB/s 
[K     |████████████████████████████████| 163kB 55.8MB/s 
[K     |████████████████████████████████| 133kB 59.1MB/s 
[K     |████████████████████████████████| 102kB 13.9MB/s 
[K     |████████████████████████████████| 71kB 8.7MB/s 
[?25h  Building wheel for subprocess32 (setup.py) ... [?25l[?25hdone
  Building wheel for configobj (setup.py) ... [?25l[?25hdone


In [2]:
import wandb
# Start a Weights & Biases run under the "neumf" project. The run handle is
# kept so the run can be finished explicitly in the last cell.
wandb_run = wandb.init(project="neumf")

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [3]:
from typing import List, Dict, Text
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, Subset

from torchsummary import summary
import numpy as np
import pandas as pd
import zipfile
import random

from sklearn.model_selection import train_test_split

# MovieLens 1M Dataset

In [4]:
# Download the MovieLens 1M archive (-c resumes a partially downloaded file)
# and unpack it into the current working directory.
!wget -c http://files.grouplens.org/datasets/movielens/ml-1m.zip
import zipfile
with zipfile.ZipFile('ml-1m.zip', 'r') as zip_ref:
    zip_ref.extractall()

--2021-01-08 09:15:06--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip’


2021-01-08 09:15:08 (4.28 MB/s) - ‘ml-1m.zip’ saved [5917549/5917549]



In [5]:
# Load the ratings file. Fields are separated by "::" (a multi-character
# separator, hence the Python parsing engine) and column names are supplied
# explicitly through `names`.
ratings_df = pd.read_csv('./ml-1m/ratings.dat', sep="::", engine='python', names=['user', 'item', 'rating', 'timestamp'])
# Sorted unique raw user / item ids; these lists are handed to ML1MDataset,
# which maps each raw id to its position in the list.
users = sorted(ratings_df['user'].unique())
items = sorted(ratings_df['item'].unique())
ratings_df

Unnamed: 0,user,item,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [6]:
# train validate test split
train_df, test_df = train_test_split(ratings_df, test_size=0.1)
train_df, validate_df = train_test_split(train_df, test_size=0.1)

In [7]:
# Report how many ratings ended up in each of the three splits.
print("number of train-validate-test:", len(train_df), len(validate_df), len(test_df))

number of train-validate-test: 810169 90019 100021


In [8]:
class ML1MDataset(Dataset):
    """
    MovieLens 1M ratings exposed as sparse (index, value, rating) samples.

    Every user and every item owns one slot in a shared feature space of size
    ``num_users + num_items``: users occupy ``[0, num_users)`` and items occupy
    ``[num_users, num_users + num_items)``.

    Params:
        - ratings_df: DataFrame with 4 columns: user, item, rating, timestamp
        - users: list all users
        - items: list all items
    """
    def __init__(self, ratings_df: pd.DataFrame, users: List[int], items: List[int]):
        # One row per rating, in column order: user, item, rating, timestamp.
        self.data = ratings_df.to_numpy()

        self.users = users
        self.items = items

        self.num_users = len(self.users)
        self.num_items = len(self.items)

        # Size of the shared user + item feature space.
        self.features_dim = self.num_users + self.num_items

        # Raw id -> position in the corresponding list.
        self.user_to_id = {user: position for position, user in enumerate(self.users)}
        self.item_to_id = {item: position for position, item in enumerate(self.items)}

    def __len__(self):
        """Number of ratings in this split."""
        return len(self.data)

    def __getitem__(self, idx):
        """
        Returns:
            + features: integer numpy array ``[user_slot, item_slot]`` holding
              the indices of the two active features
            + feature_values: float32 numpy array ``[1.0, 1.0]``, the value of
              each active feature
            + target: numpy float32, the rating the user gave to the item

        Raises:
            KeyError: if the row's user or item is missing from the ``users`` /
                ``items`` lists given to the constructor.
        """
        user, item, rating, _timestamp = self.data[idx]

        user_id = self.user_to_id[user]
        item_id = self.item_to_id[item]

        # Item slots start right after the user slots. The offset must be the
        # number of USERS; offsetting by the number of items would make item
        # indices overlap user indices whenever num_items < num_users.
        features = np.array([user_id, self.num_users + item_id])
        feature_values = np.array([1.0, 1.0], dtype=np.float32)
        target = np.float32(rating)

        return features, feature_values, target

In [9]:
# Wrap each split in a dataset. All three receive the same full user/item
# lists, so they build the same raw-id -> index mappings.
trainset =  ML1MDataset(train_df, users, items)
testset = ML1MDataset(test_df, users, items)
validateset = ML1MDataset(validate_df, users, items)

In [10]:
# Inspect one sample: (feature indices, feature values, rating).
trainset[0]

(array([4226, 6148]), array([1., 1.], dtype=float32), 3.0)

# Model

In [11]:
class NFM(nn.Module):
    """
    Neural Factorization Machine (NFM) rating predictor.

    The active features are embedded, pooled by a Bi-Interaction layer,
    optionally passed through an MLP, projected to a scalar, and finally the
    per-feature biases and a global bias are added.

    Args:
        num_features: number of features,
        num_factors: number of hidden factors,
        act_function: activation function for MLP layer ('relu', 'sigmoid' or
            'tanh'; any other value adds no activation),
        layers: list of dimension of deep layers,
        batch_norm: bool type, whether to use batch norm or not,
        drop_prob: list of the dropout rate for FM and MLP (first entry is
            used for the FM part, last entry for the MLP),
        pretrain_FM: the pre-trained FM weights (a falsy value initialises the
            embeddings and biases from scratch).
    """

    def __init__(self, num_features, num_factors,
                 act_function, layers, batch_norm, drop_prob, pretrain_FM):
        super().__init__()

        self.num_features = num_features
        self.num_factors = num_factors
        self.act_function = act_function
        self.layers = layers
        self.batch_norm = batch_norm
        self.drop_prob = drop_prob
        # NOTE(review): if pretrain_FM is an nn.Module, this assignment also
        # registers it as a submodule of this model - confirm that is wanted.
        self.pretrain_FM = pretrain_FM

        # Feature embeddings, per-feature bias terms and one global bias.
        self.embeddings = nn.Embedding(num_features, num_factors)
        self.biases = nn.Embedding(num_features, 1)
        self.bias_ = nn.Parameter(torch.tensor([0.0]))

        # Post-processing applied to the Bi-Interaction vector.
        fm_stack = [nn.BatchNorm1d(num_factors)] if self.batch_norm else []
        fm_stack.append(nn.Dropout(drop_prob[0]))
        self.FM_layers = nn.Sequential(*fm_stack)

        # Hidden layers, each: Linear -> [BatchNorm] -> [activation] -> Dropout.
        known_activations = (('relu', nn.ReLU), ('sigmoid', nn.Sigmoid), ('tanh', nn.Tanh))
        mlp_stack = []
        width = num_factors
        for next_width in self.layers:
            mlp_stack.append(nn.Linear(width, next_width))
            width = next_width

            if self.batch_norm:
                mlp_stack.append(nn.BatchNorm1d(next_width))
            for name, activation in known_activations:
                if self.act_function == name:
                    mlp_stack.append(activation())
                    break

            mlp_stack.append(nn.Dropout(drop_prob[-1]))
        self.deep_layers = nn.Sequential(*mlp_stack)

        # Final projection to a scalar; its input is the last hidden layer, or
        # the Bi-Interaction vector itself when there are no hidden layers.
        self.prediction = nn.Linear(layers[-1] if layers else num_factors, 1, bias=False)

        self._init_weight_()

    def _init_weight_(self):
        """ Try to mimic the original weight initialization. """
        pretrained = self.pretrain_FM
        if pretrained:
            # Start from the pre-trained FM's embeddings and biases.
            self.embeddings.weight.data.copy_(pretrained.embeddings.weight)
            self.biases.weight.data.copy_(pretrained.biases.weight)
            self.bias_.data.copy_(pretrained.bias_)
        else:
            nn.init.normal_(self.embeddings.weight, std=0.01)
            nn.init.constant_(self.biases.weight, 0.0)

        if len(self.layers) == 0:
            # Without hidden layers the projection starts as a plain sum over
            # the factors.
            nn.init.constant_(self.prediction.weight, 1.0)
            return

        # for deep layers
        for module in self.deep_layers:
            if isinstance(module, nn.Linear):
                nn.init.xavier_normal_(module.weight)
        nn.init.xavier_normal_(self.prediction.weight)

    def forward(self, features, feature_values):
        """
        Score a batch of samples.

        Args:
            features: indices of the active features of each sample.
            feature_values: value of each active feature, same layout as
                ``features``.

        Returns:
            Flattened tensor of predictions.
        """
        values = feature_values.unsqueeze(dim=-1)
        embedded = self.embeddings(features) * values

        # Bi-Interaction layer: 0.5 * ((sum of embeddings)^2 - sum of squares),
        # reducing over dim 1.
        summed = embedded.sum(dim=1)
        pooled = 0.5 * (summed.pow(2) - embedded.pow(2).sum(dim=1))

        hidden = self.FM_layers(pooled)
        if self.layers:  # have deep layers
            hidden = self.deep_layers(hidden)
        score = self.prediction(hidden)

        # bias addition
        linear_bias = (self.biases(features) * values).sum(dim=1)
        return (score + linear_bias + self.bias_).view(-1)


# Train

In [12]:
def train(model, trainloader, optimizer, device, reg_lambda=None):
    """
    Run one training epoch and return the accumulated loss per datapoint.

    For every batch the loss is the summed squared error between predictions
    and targets plus ``reg_lambda`` times the norm of the embedding table.

    Args:
        model: module called as ``model(features, feature_values)`` that also
            exposes ``model.embeddings.weight``.
        trainloader: DataLoader yielding ``(features, feature_values, target)``
            batches; ``trainloader.dataset`` must support ``len()``.
        optimizer: optimizer stepping the model's parameters.
        device: device every batch is moved to.
        reg_lambda: weight of the embedding-norm penalty. When left as
            ``None`` the notebook-level ``lamda`` is used, so existing
            four-argument calls behave as before.

    Returns:
        Sum over all batches of the batch loss (penalty included) divided by
        the number of datapoints in the dataset.
    """
    if reg_lambda is None:
        # Fall back to the value configured in the hyperparameter cell.
        reg_lambda = lamda

    model.train()
    # Loop-invariant, so built once instead of once per batch.
    criterion = nn.MSELoss(reduction='sum')
    num_datapoints = len(trainloader.dataset)
    mse = 0.0

    for features, feature_values, target in trainloader:
        features = features.to(device)
        feature_values = feature_values.to(device)
        target = target.to(device)

        model.zero_grad()
        prediction = model(features, feature_values)
        loss = criterion(prediction, target)
        loss += reg_lambda * model.embeddings.weight.norm()
        loss.backward()
        optimizer.step()

        mse += loss.item() / num_datapoints

    return mse

    
  

In [13]:
def test(model, testloader, device):
    model.eval()
    # mse = 0.0
    # num_datapoints = len(testloader.dataset)
    RMSE = np.array([], dtype=np.float32)
    for features, feature_values, target in testloader:
        features = features.to(device)
        feature_values = feature_values.to(device)
        target = target.to(device)

        prediction = model(features, feature_values) 
        prediction = prediction.clamp(min=1.0, max=5.0) 
        
        SE = (prediction - target).pow(2) 
        # mse += se.item() / num_datapoints
        RMSE = np.append(RMSE, SE.detach().cpu().numpy())
        
    return np.sqrt(RMSE.mean()) #RMSE

In [14]:
# Model hyperparameters.
num_features = trainset.features_dim # num users + num items
num_factors = 64 # hidden factor
act_function = 'relu'
layers = [64]
batch_norm = True
drop_prob = [0.5, 0.2]  # [dropout for the FM part, dropout for the MLP layers]
pretrain_FM = False


# Optimisation settings.
lamda = 1e-6  # weight of the embedding-norm penalty added in train()
lr = 0.001
batch_size = 512
epochs = 100
# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs on CPU-only runtimes.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [15]:
# Build the NFM from the hyperparameters defined above and move it to the
# selected device.
model = NFM(num_features, num_factors, act_function, layers, batch_norm, drop_prob, pretrain_FM)
model = model.to(device)

In [16]:
# Register the model with the wandb run so wandb can track it during training.
wandb.watch(model, log="all")

[<wandb.wandb_torch.TorchGraph at 0x7f49b0554208>]

In [17]:
# Adam over every model parameter.
# NOTE(review): weight_decay is applied in addition to the explicit
# embedding-norm penalty that train() adds to the loss - confirm that
# regularising the embeddings through both paths is intended.
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=0.000001)

In [18]:
# Collect the run's hyperparameters and store them in the wandb run config.
# NOTE(review): "optimizer" holds the optimizer object itself rather than a
# name or settings dict - verify it shows up in the wandb config as expected.
params = {
    "num_features": num_features,
    "num_factors": num_factors,
    "act_function": act_function,
    "layers": layers,
    "batch_norm": batch_norm,
    "drop_prob": drop_prob,
    "pretrain_FM": pretrain_FM,
    "lamda": lamda,
    "lr": lr,
    "batch_size": batch_size,
    "epochs": epochs,
    "optimizer": optimizer,
}
wandb.config.update(params)

In [19]:
# Reshuffle the training data at the start of every epoch so the model does
# not see the same fixed batch order each time. The evaluation loaders keep a
# fixed order, which does not matter for an aggregate metric such as RMSE.
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)
validateloader = DataLoader(validateset, batch_size=batch_size)
testloader = DataLoader(testset, batch_size=batch_size)

In [None]:
from time import time

# Wall-clock start, used for the total-time report after the loop.
t0 = time()

# One pass per epoch: fit on the training split, then score the training and
# validation splits so both RMSE curves can be compared in wandb.
for epoch in range(epochs):
    train_loss = train(model, trainloader, optimizer, device)
    train_rmse = test(model, trainloader, device)
    valid_rmse = test(model, validateloader, device)

    metrics = {
        "train_loss": train_loss,
        "train_rmse": train_rmse,
        "valid_rmse": valid_rmse
    }
    # wandb steps are 1-based to match the printed epoch number.
    wandb.log(metrics, step=epoch+1)

    print('epoch {} \t train_loss {} \t train_rmse {} \t valid_rmse {}'.format(
        epoch+1, metrics["train_loss"], metrics["train_rmse"], metrics["valid_rmse"]))

elapsed = time() - t0
print("total time: ", elapsed)

epoch 1 	 train_loss 1.6900149186274718 	 train_rmse 1.3259941339492798 	 valid_rmse 1.341207504272461
epoch 2 	 train_loss 1.072664293821547 	 train_rmse 1.0673645734786987 	 valid_rmse 1.0854594707489014
epoch 3 	 train_loss 0.9760961064012592 	 train_rmse 0.9849750399589539 	 valid_rmse 1.0071234703063965
epoch 4 	 train_loss 0.9243025610906028 	 train_rmse 0.9511322379112244 	 valid_rmse 0.9767338633537292
epoch 5 	 train_loss 0.8844468250207924 	 train_rmse 0.9230720400810242 	 valid_rmse 0.9526275992393494
epoch 6 	 train_loss 0.8559193393746739 	 train_rmse 0.9133886694908142 	 valid_rmse 0.9459014534950256
epoch 7 	 train_loss 0.8368556482380439 	 train_rmse 0.9057212471961975 	 valid_rmse 0.9431531429290771
epoch 8 	 train_loss 0.8209736433891178 	 train_rmse 0.9001116752624512 	 valid_rmse 0.9413273334503174
epoch 9 	 train_loss 0.8065626189801666 	 train_rmse 0.8959820866584778 	 valid_rmse 0.9424015879631042
epoch 10 	 train_loss 0.7953985908808434 	 train_rmse 0.8936487436

In [None]:
# Final evaluation on the held-out test split.
test_rmse = test(model, testloader, device)
print('test rmse:', test_rmse)

In [None]:
# Record the test RMSE in the wandb run.
wandb.log({"test_rmse": test_rmse})

In [None]:
# Close the wandb run that was opened at the top of the notebook.
wandb_run.finish()