In [1]:
import os
import re
import sys

import lightning as L
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import wikipedia
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

sys.path.append(os.getcwd())
from data import ProcessMovies, ProcessRatings
from utils import read_data

np.random.seed(123)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
class MovieLensDataset(Dataset):
    def __init__(self, train_val='train'):
        '''
        part: train/val
        '''
        self.movies, self.users, self.train_ratings, self.val_ratings = self.__feature_engineering()
        self.train_val=train_val
        
    def __len__(self):
        if self.train_val == 'train':
            return len(self.train_ratings)
        else:
            return len(self.val_ratings)
  
    def __getitem__(self, idx):
        if self.train_val == 'train':
            data = self.train_ratings.iloc[idx]
        if self.train_val == 'val':
            data = self.val_ratings.iloc[idx]
        user_id, movie_id, label = data
        print(movie_id)
        user_data = self.users[self.users.user_id==user_id].values.squeeze()
        movie_data = self.movies[self.movies.movie_id==movie_id].values.squeeze()
        return torch.FloatTensor(user_data), torch.FloatTensor(movie_data), torch.FloatTensor([label])
    
    def __feature_engineering(self):
        if not os.path.exists(r'data\dataset\movies.pqt'):
            ProcessMovies(r'data\dataset').main()
        movies = pd.read_parquet(r'data\dataset\movies.pqt')
        train_ratings, val_ratings, ratings_ft = ProcessRatings(r'data\dataset').main()
        movies = movies.merge(ratings_ft)
        users = read_data('users',table_columns=['user_id','gender','age', 'occupation', 'zipcode'])
        users = users.drop('zipcode',axis=1)
        users['gender'] = [1.0 if i=='M' else 0.0 for i in users['gender']]
        return movies, users, train_ratings, val_ratings


In [8]:
MovieLensDataset('train').__getitem__(2830)

3952


(tensor([23.,  1., 35.,  0.]), tensor([], size=(0, 54)), tensor([1.]))

In [26]:
movie_list = pd.read_parquet(r'C:\Users\uyen\OneDrive\NeuralCF\data\dataset\movies.pqt', columns=['movie_id']).values.squeeze()
movie_list

array([   1,    2,    3, ..., 3947, 3948, 3949], dtype=int64)

In [20]:
ratings = read_data('ratings', ['user_id', 'movie_id', 'rating', 'timestamp'])

In [30]:
3952 in movie_list

False

In [29]:
[i for i in ratings.movie_id if i not in movie_list]

[3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3950,
 3952,
 3950,
 3951,
 3952,
 3952,
 3950,
 3951,
 3952,
 3952,
 3952,
 3952,
 3952,
 3950,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3951,
 3952,
 3952,
 3952,
 3950,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3951,
 3952,
 3952,
 3950,
 3952,
 3950,
 3952,
 3951,
 3952,
 3952,
 3952,
 3950,
 3950,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3950,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3951,
 3951,
 3952,
 3952,
 3952,
 3950,
 3951,
 3952,
 3951,
 3952,
 3952,
 3951,
 3952,
 3952,
 3952,
 3951,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3951,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3950,
 3951,
 3950,
 3951,
 3952,
 3952,
 3952,
 3952,
 3950,
 3952,
 3952,
 3952,
 3952,
 3952,
 3952,
 3951,
 3952,
 3952,
 3950,
 3952,
 3952,
 3952,
 3952,
 3950,
 3952,
 3952,
 3952,
 3951,
 3951,
 3952,
 3952,

In [None]:
ratings = read_data()

In [17]:
from torch import Tensor
from typing import Any, Optional
from lightning.pytorch.utilities.types import STEP_OUTPUT

class NCF(L.LightningModule):
    """ Neural Collaborative Filtering (NCF)    
    """
    
    def __init__(self):
        super().__init__()
        self.user_fc_1 = nn.Linear(in_features=4, out_features=16)
        self.item_fc_1 = nn.Linear(in_features=54, out_features=16)
        self.user_fc_2 = nn.Linear(in_features=16, out_features=1)
        self.item_fc_2 = nn.Linear(in_features=16, out_features=1)
        self.bilinear = nn.Bilinear(in1_features=16, in2_features=16, out_features=1)
        
    def forward(self, user_input, item_input):
        print(user_input.shape, item_input.shape)
        user_vector = nn.ReLU()(self.user_fc_1(user_input))
        item_vector = nn.ReLU()(self.item_fc_1(item_input))
        user_output = nn.ReLU()(self.user_fc_2(user_vector))
        item_output = nn.ReLU()(self.item_fc_2(item_vector))
        fusion_output = self.bilinear(user_vector, item_vector)
        print((user_output + item_output + fusion_output).shape)
        pred = nn.Sigmoid()(user_output + item_output + fusion_output)
        return pred
    
    def loss(self, preds: Tensor, labels: Optional[Tensor] = None) -> Tensor:
        print('Pred: ',preds, 'Label: ',labels)
        return nn.BCELoss()(preds, labels)

    def step(self, batch: Any) -> Tensor:
        print('Batch data: ',batch)
        user_input, item_input, labels = batch
        print('Label from NCF: ', labels)
        predicted_labels = self(user_input, item_input)
        loss = self.loss(predicted_labels, labels)
        return loss

    def training_step(self, batch: Any, batch_idx: int) -> STEP_OUTPUT:
        return {"loss": self.step(batch)}

    def validation_step(self, batch: Any, batch_idx: int) -> STEP_OUTPUT:
        return {"x": self.step(batch)}

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)
    
    def train_dataloader(self) -> DataLoader:
        return DataLoader(MovieLensDataset('train'))

    def val_dataloader(self) -> DataLoader:
        return DataLoader(MovieLensDataset('val'))

model = NCF()

In [18]:
trainer = L.Trainer(max_epochs=5, devices='auto', enable_progress_bar=True, logger=False)
trainer.fit(model)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name      | Type     | Params
---------------------------------------
0 | user_fc_1 | Linear   | 80    
1 | item_fc_1 | Linear   | 880   
2 | user_fc_2 | Linear   | 17    
3 | item_fc_2 | Linear   | 17    
4 | bilinear  | Bilinear | 257   
---------------------------------------
1.3 K     Trainable params
0         Non-trainable params
1.3 K     Total params
0.005     Total estimated model params size (MB)


Epoch 0:   0%|          | 1/982089 [09:50<161028:29:19,  0.00it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]Batch data:  [tensor([[ 1.,  0.,  1., 10.]]), tensor([[4.8000e+01, 1.9950e+03, 0.0000e+00, 0.0000e+00, 1.0000e+00, 1.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 1.2793e+00, 1.4268e+01, 6.1766e+00, 9.7921e+00,
         7.9547e+00, 5.0468e+00, 5.9323e+00, 7.0207e+00, 7.2744e+00, 5.8935e+00,
         4.5633e+00, 8.2570e+00, 3.4874e+00, 6.6938e+00, 5.3376e+00, 6.0674e+00,
         2.6647e+00, 4.9364e+00, 5.9470e+00, 5.2435e+00, 3.7170e+00, 3.0763e+00,
         6.1591e+00, 4.2392e+00, 4.3145e+00, 3.1931e+00, 3.4200e+00, 5.2906e+00,
         5.0871e+00, 4.5525e+00, 7.2238e+00, 8.8252e+00, 2.9764e+00, 3.8200e+02]]), tensor([[1.]])]
Label from NCF:  tensor([[1.]])
torch.Size([1, 4]) torch.Size([1, 54])
t

c:\Users\delic\anaconda3\envs\movie_rec\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.




In [None]:
load