In [1]:
module_path = '/Users/kakao/study/RecModel'
import sys
sys.path.append(module_path)

import argparse

import pandas as pd
import numpy as np
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader, random_split

from src.utils.load_data import EmbeddingIndexGenerater, RandomNegativeSampler
from src.utils.custom_dataset import NeuMFDataset
from src.lit_model.lit_NeuMF import LitNeuMF
from src.model.NeuMF import NeuralMatrixFactorization

In [2]:
def define_argparser():
    parser = argparse.ArgumentParser()

    parser.add_argument("--project", default="NCF")
    parser.add_argument(
        "--batch_size",
        type=int,
        default=256,
        help="input batch size for training (default: 256)",
    )
    parser.add_argument(
        "--gmf_emb_dim",
        type=int,
        default=16,
        help="input GMF embedding dimension for training (default: 16)",
    )
    parser.add_argument(
        "--mlp_emb_dim",
        type=int,
        default=16,
        help="input MLP embedding dimension for training (default: 16)",
    )
    parser.add_argument(
        "--mlp_hidden_dim_list",
        default=[32, 16, 8],
        help="MLP hidden layer dimension list (default: [32, 16, 8])",
    )
    parser.add_argument(
        "--epochs", type=int, default=10, help="number of epochs to train (default: 10)"
    )
    parser.add_argument("--cuda", type=int, default=0, help="0 for cpu -1 for all gpu")
    config = parser.parse_args(args=[]) # add [] in jupyter notebook
    if config.cuda == 0 or torch.cuda.is_available() is False:
        config.cuda = 0

    return config

In [3]:
config = define_argparser()

config

Namespace(project='NCF', batch_size=256, gmf_emb_dim=16, mlp_emb_dim=16, mlp_hidden_dim_list=[32, 16, 8], epochs=10, cuda=0)

In [4]:
data_path = "../data/kmrd_small/rates.csv"
data = pd.read_csv(data_path)

print(data.shape)

N_SAMPLES = 100000

data = data[:N_SAMPLES]

print(f'data.shape={data.shape}')
data = data.drop('time', axis=1)
data.columns = ['user', 'item', 'interaction']
data['interaction'] = 1
data.head()



(140710, 4)
data.shape=(100000, 4)


Unnamed: 0,user,item,interaction
0,0,10003,1
1,0,10004,1
2,0,10018,1
3,0,10021,1
4,0,10022,1


In [5]:
data.columns == ['user', 'item', 'interaction']

array([ True,  True,  True])

In [6]:
embedding_index_generator = EmbeddingIndexGenerater(data)

data['user'] = data['user'].map(embedding_index_generator.user_to_idx)
data['item'] = data['item'].map(embedding_index_generator.item_to_idx)

In [7]:
data.head()

Unnamed: 0,user,item,interaction
0,0,0,1
1,0,1,1
2,0,2,1
3,0,3,1
4,0,4,1


In [8]:
radom_negetive_sampler = RandomNegativeSampler(data, neg_samples_per_pos=1)
df = radom_negetive_sampler.negative_sampling(seed=0)

In [9]:
train_ratio = 0.8

NeuMF_dataset = NeuMFDataset(df[:, :2], df[:, 2])

seed = torch.Generator().manual_seed(42)
train_dataset, valid_dataset = random_split(
    NeuMF_dataset, [train_ratio, 1.0 - train_ratio], generator=seed
)

In [10]:
num_users = max(df[:, 0]) + 1
num_items = max(df[:, 1]) + 1

In [11]:
# return: user field index, item field index, target
train_dataset[0]

(tensor([12562,   238]), tensor(0.))

In [12]:
train_loader = DataLoader(train_dataset, batch_size=config.batch_size)
valid_loader = DataLoader(valid_dataset, batch_size=config.batch_size)

In [13]:
# torch model
torch_NeuMF = NeuralMatrixFactorization(
    num_users= num_users,
    num_items= num_items,
    gmf_emb_dim=config.gmf_emb_dim,
    mlp_emb_dim=config.mlp_emb_dim,
    mlp_hidden_dim_list=config.mlp_hidden_dim_list
)

In [14]:
torch_NeuMF

NeuralMatrixFactorization(
  (gmf_user_emb): Embedding(21351, 16)
  (gmf_item_emb): Embedding(592, 16)
  (mlp_user_emb): Embedding(21351, 16)
  (mlp_item_emb): Embedding(592, 16)
  (mlp): Sequential(
    (0): Linear(in_features=32, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=8, bias=True)
  )
  (NeuMF_layer): Linear(in_features=24, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [15]:
NeuMF_lit_model = LitNeuMF(torch_NeuMF, config)

In [16]:
early_stopping_callback = pl.callbacks.EarlyStopping(
    monitor="validation/loss", mode="min", patience=3
)

trainer = pl.Trainer(
    log_every_n_steps=10,
    max_epochs=config.epochs,
    deterministic=True,
    callbacks=[early_stopping_callback],
    accelerator='cpu'
)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(


In [17]:
# fit the model
trainer.fit(model=NeuMF_lit_model, train_dataloaders=train_loader, val_dataloaders=valid_loader)


  | Name      | Type                      | Params
--------------------------------------------------------
0 | model     | NeuralMatrixFactorization | 702 K 
1 | loss_fn   | BCELoss                   | 0     
2 | train_acc | BinaryAccuracy            | 0     
3 | valid_acc | BinaryAccuracy            | 0     
4 | test_acc  | BinaryAccuracy            | 0     
--------------------------------------------------------
702 K     Trainable params
0         Non-trainable params
702 K     Total params
2.811     Total estimated model params size (MB)


                                                                           

  rank_zero_warn(
  rank_zero_warn(


Epoch 8: 100%|██████████| 645/645 [00:04<00:00, 145.39it/s, v_num=4, train/loss=0.128, validation/loss=0.356, validation/acc=0.854, train/acc=0.907]


In [18]:
from sklearn.metrics import accuracy_score

In [19]:
pred = []
true = []
for x, y in valid_loader:
    result = NeuMF_lit_model(x)
    pred += result.tolist()
    true += y.tolist()
    
pred = np.where(np.array(pred) > 0.5, 1, 0)

acc = accuracy_score(true, pred)

print(f"Acc = {round(acc, 4)}")

Acc = 0.854
