# Train PMF

In [1]:
from __future__ import print_function

import numpy as np
import pandas as pd

from math import ceil

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.utils.data

import matplotlib.pyplot as plt

from sklearn.model_selection import KFold

import sys

sys.path.append("..")

from src.model.dien import DIEN

In [2]:
def RMSE(preds, truth):
    return np.sqrt(np.mean(np.square(preds - truth)))

In [3]:
batch_size = 128
epoches = 15
no_cuda = False
# seed = 1
# weight_decay = 0.1
# embedding_feature_size = 100
# ratio = 0.8
# lr = 1e-5
# momentum = 0.9
# k_folds = 5

In [4]:
df = pd.read_csv("../data/ml-100k/ratings.csv")
df["rating"] = df["rating"].astype("float")
df = df.sort_values(["user_id", "timestamp"])
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
59972,0,167,5.0,874965478
92487,0,171,5.0,874965478
74577,0,164,5.0,874965518
48214,0,155,4.0,874965556
15764,0,195,5.0,874965677


In [5]:
# Ordenar por user_id e timestamp
df = df.sort_values(by=["user_id", "timestamp"])

# Inicializar um dicionário para manter o histórico de cada usuário
user_histories = {}

# Lista para armazenar as linhas do novo DataFrame
new_rows = []

for index, row in df.iterrows():
    user_id = row["user_id"]
    item_id = row["item_id"]
    rating = row["rating"]
    timestamp = row["timestamp"]

    # Inicializar o histórico do usuário, se necessário
    if user_id not in user_histories:
        user_histories[user_id] = []
        last_item = -1
        last_rating = 0

    # Adicionar o item atual ao histórico, se a nota for >= 4
    if last_rating >= 4 and last_item != 1:
        user_histories[user_id].append(last_item)
        # Garantir que apenas os últimos 5 itens sejam mantidos
        user_histories[user_id] = user_histories[user_id][-5:]
        last_item = item_id
        last_rating = rating
    else:
        last_item = item_id
        last_rating = rating

    # Criar uma nova linha com o user_history atual
    new_row = {
        "user_id": user_id,
        "user_history": list(user_histories[user_id]),
        "item_id": item_id,
        "rating": rating,
        "timestamp": timestamp,
    }
    new_rows.append(new_row)

# Criar um novo DataFrame a partir das novas linhas
df_final = pd.DataFrame(new_rows)

df_final["len_history"] = df_final["user_history"].apply(lambda x: len(x))
df_final = df_final[df_final["len_history"] == 5]
df_final = df_final.reset_index().drop(columns="index")

df_final = pd.concat(
    [
        df_final,
        pd.DataFrame(
            df_final["user_history"].to_list(),
            columns=[
                "user_history_1",
                "user_history_2",
                "user_history_3",
                "user_history_4",
                "user_history_5",
            ],
        ),
    ],
    axis=1,
)

df_final = df_final.drop(columns=["user_history", "len_history"])
df_final.head()

Unnamed: 0,user_id,item_id,rating,timestamp,user_history_1,user_history_2,user_history_3,user_history_4,user_history_5
0,0.0,165.0,5.0,874965677.0,167.0,171.0,164.0,155.0,195.0
1,0.0,186.0,4.0,874965678.0,171.0,164.0,155.0,195.0,165.0
2,0.0,13.0,5.0,874965706.0,164.0,155.0,195.0,165.0,186.0
3,0.0,249.0,4.0,874965706.0,155.0,195.0,165.0,186.0,13.0
4,0.0,126.0,5.0,874965706.0,195.0,165.0,186.0,13.0,249.0


In [6]:
def split_train_test(data, train_ratio=0.8):
    # Lista para armazenar os subsets de treino e teste
    train_list = []
    test_list = []

    for _, group in data.groupby("user_id"):
        # Ordena as interações por timestamp
        group = group.sort_values("timestamp")

        # Calcula o ponto de corte para o treino (80% das interações)
        split_point = ceil(len(group) * train_ratio)

        # Separa o conjunto de treino e teste
        train_list.append(group.iloc[:split_point])
        test_list.append(group.iloc[split_point:])

    # Concatena todos os subsets de treino e teste
    train_data = pd.concat(train_list)
    test_data = pd.concat(test_list)

    return train_data, test_data


train_df, test_df = split_train_test(df_final)
print(train_df.shape, test_df.shape)

# # # Normalize rewards to [-1, 1]
# train_data = train_df[["user_id", "item_id", "rating"]].values
# # train_data[:, 2] = 0.5 * (train_data[:, 2] - 3)

# test_data = test_df[["user_id", "item_id", "rating"]].values
# # test_data[:, 2] = 0.5 * (test_data[:, 2] - 3)

# # # Shuffle data
# np.random.shuffle(train_data)
# np.random.shuffle(test_data)

(73201, 9) (17846, 9)


In [7]:
NUM_ITEMS = df.item_id.max() + 1
NUM_USERS = df.user_id.max() + 1

print(NUM_USERS, NUM_ITEMS)

943 1682


In [8]:
from torch.utils.data import Dataset


class build_din_dataset(Dataset):
    def __init__(
        self,
        data,
        user_col=["user_id"],
        candidate_col=["item_id"],
        recent_rate_col=[
            "user_history_1",
            "user_history_2",
            "user_history_3",
            "user_history_4",
            "user_history_5",
        ],
    ):
        self.data = data
        self.user_col = user_col
        self.candidate_col = candidate_col
        self.recent_rate_col = recent_rate_col

    def __getitem__(self, idx):
        idx = [idx]
        user_features = self.data.iloc[idx][self.user_col].values
        candidate_features = self.data.iloc[idx][self.candidate_col].values
        recent_rate_features = (
            self.data.iloc[idx][self.recent_rate_col].astype(int).values
        )
        label = self.data.iloc[idx]["rating"].values.reshape(-1, 1)
        return (
            torch.from_numpy(user_features),
            torch.from_numpy(candidate_features),
            torch.from_numpy(recent_rate_features),
            torch.from_numpy(label),
        )

    def __len__(self):
        return len(self.data)

In [9]:
import collections
from torch.utils.data import DataLoader

user_col = ["user_id"]
candidate_col = ["item_id"]
recent_rate_col = [
    "user_history_1",
    "user_history_2",
    "user_history_3",
    "user_history_4",
    "user_history_5",
]

train_data = build_din_dataset(train_df)
loader_train = DataLoader(
    train_data, batch_size=batch_size, shuffle=True, pin_memory=True
)
test_data = build_din_dataset(test_df)
loader_test = DataLoader(test_data, batch_size=batch_size)

In [10]:
len(train_data)

73201

In [11]:
def RMSE(preds, truth):
    return np.sqrt(np.mean(np.square(preds - truth)))

In [12]:
train_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,user_history_1,user_history_2,user_history_3,user_history_4,user_history_5
0,0.0,165.0,5.0,874965677.0,167.0,171.0,164.0,155.0,195.0
1,0.0,186.0,4.0,874965678.0,171.0,164.0,155.0,195.0,165.0
2,0.0,13.0,5.0,874965706.0,164.0,155.0,195.0,165.0,186.0
3,0.0,249.0,4.0,874965706.0,155.0,195.0,165.0,186.0,13.0
4,0.0,126.0,5.0,874965706.0,195.0,165.0,186.0,13.0,249.0


In [13]:
len(loader_train)

572

In [14]:
def train(epoch, model, optimizer, device):
    model.train()
    epoch_loss = 0.0
    for batch_idx, (x0, x1, x2, y) in enumerate(loader_train):
        x0, x1, x2, y = (
            torch.squeeze(x0, 1).to(torch.float32),
            torch.squeeze(x1, 1).to(torch.float32),
            torch.squeeze(x2, 1).to(torch.float32),
            torch.squeeze(y).to(torch.float32),
        )
        x0, x1, x2, y = (
            x0.to(device),
            x1.to(device),
            x2.to(device),
            y.to(device),
        )
        optimizer.zero_grad()
        out = model(x0, x1, x2)
        loss = nn.MSELoss(reduction="sum")(torch.squeeze(out, dim=1), y)
        loss.backward()
        optimizer.step()

        # Update epoch loss
        epoch_loss += loss.data

    epoch_loss /= len(train_data)
    return epoch_loss


def test(epoch, model, device, best_acc=0):
    model.eval()
    test_loss = 0.0  # cost function error
    correct = 0.0
    for batch_idx, (x0, x1, x2, y) in enumerate(loader_test):
        x0, x1, x2, y = (
            torch.squeeze(x0, 1).to(torch.float32),
            torch.squeeze(x1, 1).to(torch.float32),
            torch.squeeze(x2, 1).to(torch.float32),
            torch.squeeze(y).to(torch.float32),
        )
        x0, x1, x2, y = (
            x0.to(device),
            x1.to(device),
            x2.to(device),
            y.to(device),
        )
        out = model(x0, x1, x2)
        test_loss += nn.MSELoss(reduction="sum")(torch.squeeze(out, dim=1), y).item()

        correct += RMSE(
            torch.squeeze(out, dim=1).cpu().data.numpy(), y.cpu().data.numpy()
        )

    correct /= len(loader_test)
    if correct < best_acc:
        best_acc = correct
        torch.save(model, "checkpoint/DIEN/DIEN_best.pth")

    print("epoch {}, test loss {}".format(epoch, correct))
    return correct

In [15]:
class EarlyStopping:
    def __init__(self, patience=5, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.best_score = None
        self.epochs_without_improvement = 0

    def __call__(self, current_score):
        if (
            self.best_score is None
            or (current_score - self.best_score) < self.min_delta
        ):
            self.best_score = current_score
            self.epochs_without_improvement = 0
        else:
            self.epochs_without_improvement += 1

        if self.epochs_without_improvement >= self.patience:
            return True  # Indica que o treinamento deve ser interrompido
        return False

In [16]:
epoches = 1000

In [17]:
cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda and no_cuda else "cpu")
# train model NUM_USERS, NUM_ITEMS
model = DIEN(
    candidate_num=NUM_ITEMS,
    user_num=NUM_USERS,
    history_num=5,
    embed_dim=100,
    dynamic_dim=64,
)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=0.1)
# optimizer = optim.SGD(model.parameters(), lr=1e-4, weight_decay=0.1, momentum=0.9)
vali_rmse = 2
early_stopping = EarlyStopping()

for ep in range(epoches):
    train_loss = train(ep, model, optimizer, device)
    train_rmse = np.sqrt(train_loss.cpu())
    print("Training epoch:{: d}, training rmse:{: .6f}".format(ep, train_rmse))
    vali_rmse = test(ep, model, device, vali_rmse)

    # Early stop condition
    if early_stopping(vali_rmse):
        break

Training epoch: 0, training rmse: 3.337918
epoch 0, test loss 2.4864054773535047
Training epoch: 1, training rmse: 2.042906
epoch 1, test loss 1.4048377539430346
Training epoch: 2, training rmse: 1.186036
epoch 2, test loss 1.094992790051869
Training epoch: 3, training rmse: 0.996307
epoch 3, test loss 1.080148679443768
Training epoch: 4, training rmse: 0.966485
epoch 4, test loss 1.0698557078838349
Training epoch: 5, training rmse: 0.948403
epoch 5, test loss 1.0637366924967084
Training epoch: 6, training rmse: 0.934821
epoch 6, test loss 1.0592470135007586
Training epoch: 7, training rmse: 0.921926
epoch 7, test loss 1.0565562009811402
Training epoch: 8, training rmse: 0.910407
epoch 8, test loss 1.05254308283329
Training epoch: 9, training rmse: 0.901390
epoch 9, test loss 1.0509365098817007
Training epoch: 10, training rmse: 0.891615
epoch 10, test loss 1.0530388631990977
Training epoch: 11, training rmse: 0.884102
epoch 11, test loss 1.0549419458423341
Training epoch: 12, training

In [18]:
test_rmse = test(-1, model, device)
print("Test rmse: {:f}".format(test_rmse))

epoch -1, test loss 1.0495388571705138
Test rmse: 1.049539


In [19]:
# Test rmse: 1.082658

In [20]:
# # Save model
# path_to_trained_pmf = "../model/pmf/ml_100k_emb_{:d}_ratio_{:f}_bs_{:d}_e_{:d}_wd_{:f}_lr_{:f}_trained_pmf.pt".format(
#     embedding_feature_size, ratio, batch_size, len(train_rmse_list), weight_decay, lr
# )
# torch.save(model.state_dict(), path_to_trained_pmf)