# Train PMF

In [None]:
from __future__ import print_function

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.utils.data

import matplotlib.pyplot as plt

from sklearn.model_selection import KFold

import sys

sys.path.append("..")

from src.model.pmf import PMF

In [None]:
def RMSE(preds, truth):
    """Return the root-mean-square error between ``preds`` and ``truth``.

    Both arguments are subtracted element-wise, so they must be
    broadcast-compatible (e.g. two NumPy arrays of the same shape).
    """
    residuals = preds - truth
    mean_squared_error = np.mean(residuals * residuals)
    return np.sqrt(mean_squared_error)

In [None]:
# Training hyperparameters shared by the cells below.
batch_size = 1000  # rows per batch for both the train and the validation DataLoader
epoches = 1000  # upper bound on training epochs (early stopping may end sooner)
no_cuda = False  # forwarded to PMF and consulted when choosing the torch device
seed = 1  # seed passed to torch.manual_seed / torch.cuda.manual_seed
weight_decay = 0.1  # weight decay handed to the SGD optimizer
embedding_feature_size = 100  # passed to PMF as n_factors
ratio = 0.8  # only printed in the training banner; split_train_test uses its own default
lr = 1e-4  # SGD learning rate
momentum = 0.9  # SGD momentum
k_folds = 5  # only referenced by the commented-out KFold code in the training cell

In [None]:
# Dataset tag; used to name the saved checkpoint file at the end of the notebook.
# NOTE(review): the CSV path in the loading cell is hard-coded and does not read this value.
DATASET = "yahoo"

In [None]:
# Load the ratings, normalise the column names, make ratings floating point and
# order every user's interactions chronologically.
# NOTE(review): the original cell carried commented-out hints for
# `.reset_index()` and an "index" -> "timestamp" rename, presumably for files
# without a timestamp column — confirm before re-enabling.
df = (
    pd.read_csv("../data/yahoo/ratings.csv")
    .rename(columns={"userId": "user_id", "movieId": "item_id"})
    .assign(rating=lambda frame: frame["rating"].astype("float"))
    .sort_values(["user_id", "timestamp"])
)
df.head()

In [None]:
import pandas as pd
from math import ceil


def split_train_test(data, train_ratio=0.8):
    """Split every user's interactions chronologically into train and test.

    Args:
        data: DataFrame with at least ``user_id`` and ``timestamp`` columns.
        train_ratio: Fraction of each user's interactions (rounded up) that
            goes to the training part; the remainder goes to the test part.

    Returns:
        A ``(train_data, test_data)`` tuple of DataFrames, each the
        concatenation of the per-user slices in ``groupby`` order.
    """

    def _cut_by_time(interactions):
        # Oldest interactions first, so the test part holds the most recent ones.
        ordered = interactions.sort_values("timestamp")
        cut = ceil(len(ordered) * train_ratio)
        return ordered.iloc[:cut], ordered.iloc[cut:]

    per_user = [_cut_by_time(group) for _, group in data.groupby("user_id")]

    train_parts = [head for head, _ in per_user]
    test_parts = [tail for _, tail in per_user]

    return pd.concat(train_parts), pd.concat(test_parts)


# Example usage:
# df is the DataFrame with the columns ['user_id', 'item_id', 'rating', 'timestamp']
train_df, test_df = split_train_test(df)
print(train_df.shape, test_df.shape)

# Normalize rewards to [-1, 1]
# 0.5 * (rating - 3) maps 1 -> -1 and 5 -> 1 (assumes a 1-5 rating scale — TODO confirm).

# Build the training matrix from the training split only. Using the full `df`
# here would put every test interaction into the training set and make the
# reported validation/test RMSE meaningless.
train_data = train_df[["user_id", "item_id", "rating"]].values
train_data[:, 2] = 0.5 * (train_data[:, 2] - 3)

test_data = test_df[["user_id", "item_id", "rating"]].values
test_data[:, 2] = 0.5 * (test_data[:, 2] - 3)

# Shuffle rows in place so batches are not ordered by user
np.random.shuffle(train_data)
np.random.shuffle(test_data)

In [None]:
# Largest id plus one for each entity, computed over the full ratings table.
# Presumably sizes the PMF embedding tables so raw ids can be used as indices —
# confirm against the PMF implementation.
NUM_USERS = df["user_id"].max() + 1
NUM_ITEMS = df["item_id"].max() + 1

print(NUM_USERS, NUM_ITEMS)

In [None]:
# Get CUDA device if available
cuda = torch.cuda.is_available()
print(cuda)

# Set device to CUDA or CPU, depending on availability and desire.
# The GPU is selected only when one is present AND the user has not opted out
# via `no_cuda` (the previous condition was inverted and always chose the CPU
# unless `no_cuda` was True).
# NOTE: the other cells move tensors based on the `cuda` flag alone.
device = torch.device("cuda" if cuda and not no_cuda else "cpu")

# Generate and apply seeds
torch.manual_seed(seed=seed)
if cuda:
    torch.cuda.empty_cache()
    torch.cuda.manual_seed(seed=seed)

# Specify number of workers for cuda
kwargs = {"num_workers": 1, "pin_memory": True} if cuda else {}

In [None]:
# Build the PMF model from the dataset dimensions and the configured factor count
model = PMF(
    n_users=NUM_USERS,
    n_items=NUM_ITEMS,
    n_factors=embedding_feature_size,
    no_cuda=no_cuda,
)

# Put the model on the GPU whenever one is available
if cuda:
    model.cuda()
    print("Model moved to CUDA")

# Summed squared error; train() divides the epoch total by the dataset size
loss_function = nn.MSELoss(reduction="sum")

# SGD with momentum is the active optimizer; Adam is kept below as an alternative
optimizer = optim.SGD(
    model.parameters(),
    lr=lr,
    momentum=momentum,
    weight_decay=weight_decay,
)
# optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

In [None]:
# Function for training one epoch
def train(epoch, train_data_loader):
    """Run one optimisation pass over ``train_data_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_function`` and
    ``cuda`` flag.

    Args:
        epoch: Index of the current epoch. Accepted for the caller's
            convenience; not used inside the function.
        train_data_loader: Iterable yielding 2-D tensor batches whose columns
            0, 1 and 2 are read as user id, item id and rating. Its
            ``dataset`` attribute must expose ``shape``.

    Returns:
        The sum of all batch losses divided by
        ``train_data_loader.dataset.shape[0]``. This is a tensor on the
        loss's device whenever at least one batch was processed.
    """
    model.train()
    epoch_loss = 0.0

    for batch in train_data_loader:
        # Gradients accumulate by default, so clear them before every step
        optimizer.zero_grad()

        # Column 0: user ids, column 1: item ids, column 2: ratings
        users = batch[:, 0].long()
        items = batch[:, 1].long()
        ratings = batch[:, 2].float()

        # Move data to CUDA
        if cuda:
            users = users.cuda()
            items = items.cuda()
            ratings = ratings.cuda()

        # Forward pass, backward pass and parameter update
        preds = model.forward(users, items)
        loss = loss_function(preds, ratings)
        loss.backward()
        optimizer.step()

        # Accumulate a graph-free copy so the autograd graph can be released
        epoch_loss += loss.detach()

    # The loss is summed per batch, so normalise by the number of samples
    epoch_loss /= train_data_loader.dataset.shape[0]
    return epoch_loss

In [None]:
class EarlyStopping:
    """Signal when a lower-is-better score has stopped improving.

    Args:
        patience: Number of consecutive non-improving calls after which
            ``__call__`` returns ``True``.
        min_delta: Minimum amount by which a score must fall below the best
            score seen so far to count as an improvement.
    """

    def __init__(self, patience=5, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.best_score = None
        self.epochs_without_improvement = 0

    def __call__(self, current_score):
        """Record ``current_score`` and return ``True`` when training should stop."""
        # An improvement must beat the best score by more than min_delta.
        # (The previous check, `current - best < min_delta`, accepted scores
        # that got *worse* by less than min_delta and overwrote the best score
        # with them.)
        improved = (
            self.best_score is None
            or current_score < self.best_score - self.min_delta
        )

        if improved:
            self.best_score = current_score
            self.epochs_without_improvement = 0
        else:
            self.epochs_without_improvement += 1

        # True tells the caller that training should be interrupted
        return self.epochs_without_improvement >= self.patience

In [None]:
# Re-assigns the epoch limit right before training, so it can be changed here
# without re-running the hyperparameter cell.
epoches = 1000

In [None]:
# Training Model
train_loss_list = []
train_rmse_list = []
vali_rmse_list = []
print(
    "parameters are: train ratio:{:f},batch_size:{:d}, epoches:{:d}, weight_decay:{:f}".format(
        ratio, batch_size, epoches, weight_decay
    )
)
print(model)

# kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
# for fold, (train_idx, test_idx) in enumerate(kf.split(train_data)):
#     print(f"Fold {fold + 1}")
print("-------")
early_stopping = EarlyStopping()

# Define the data loaders (the K-fold samplers are currently disabled)
train_loader = torch.utils.data.DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    # sampler=torch.utils.data.SubsetRandomSampler(train_idx),
)
test_loader = torch.utils.data.DataLoader(
    dataset=test_data,
    batch_size=batch_size,
    # sampler=torch.utils.data.SubsetRandomSampler(test_idx),
)

# Go through epochs
for epoch in range(1, epoches + 1):
    # Train epoch
    train_epoch_loss = train(epoch, train_loader)

    # Get epoch loss
    train_loss_list.append(train_epoch_loss.cpu())

    # Validate on the whole held-out set. Predictions from every batch are
    # collected so the RMSE covers all samples instead of only the last batch.
    model.eval()
    vali_pred_batches = []
    vali_truth_batches = []
    with torch.no_grad():
        for data in test_loader:
            # Follow the notebook-wide `cuda` flag so this also runs on CPU
            if cuda:
                data = data.cuda()
            vali_preds = model.predict(data[:, 0].long(), data[:, 1].long())
            vali_pred_batches.append(vali_preds.detach().cpu().numpy())
            vali_truth_batches.append(data[:, 2].cpu().numpy())

    vali_rmse = RMSE(
        np.concatenate(vali_pred_batches), np.concatenate(vali_truth_batches)
    )

    # train() returns the mean squared error, so its square root is the RMSE
    train_rmse = np.sqrt(train_epoch_loss.cpu())
    train_rmse_list.append(train_rmse)
    vali_rmse_list.append(vali_rmse)

    print(
        "Training epoch:{: d}, training rmse:{: .6f}, vali rmse:{:.6f}".format(
            epoch, train_rmse, vali_rmse
        )
    )

    # Early stop condition
    if early_stopping(vali_rmse):
        break

In [None]:
# Testing Model
# Build index tensors for the test users and items
test_row = Variable(torch.from_numpy(test_data[:, 0]).long())
test_col = Variable(torch.from_numpy(test_data[:, 1]).long())

# Move test set to CUDA
if cuda:
    test_row, test_col = test_row.cuda(), test_col.cuda()

# Get test predictions
preds = model.predict(test_row, test_col)

# Get test rmse loss (.cpu() returns the tensor itself when it already lives on the CPU)
test_rmse = RMSE(preds.cpu().data.numpy(), test_data[:, 2])
print("Test rmse: {:f}".format(test_rmse))

In [None]:
# rmse: 0.968853
# Test rmse: 0.477737

In [None]:
# Save model
import os

path_to_trained_pmf = f"../model/pmf/{DATASET}_trained_pmf.pt"

# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing the checkpoint.
os.makedirs(os.path.dirname(path_to_trained_pmf), exist_ok=True)
torch.save(model.state_dict(), path_to_trained_pmf)