# Train BPMF

In [None]:
from __future__ import print_function

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.utils.data

import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
import pickle

import sys

sys.path.append("..")

from src.model.bpmf import BPMF

In [None]:
"""
Options: 
- ml-100k
- ml-1m
- ml-10m
- ml-20m
- ml-25m
"""

# Dataset identifier; expected to be one of the options listed above.
# It is interpolated into the ratings path and the saved model filename.
DATASET = "ml-100k"

In [None]:
def get_dataset(dataset_name):
    """Load the ratings table for ``dataset_name`` from ``../data``.

    ``ml-1m`` and ``ml-10m`` are read from a ``::``-separated, header-less
    ``ratings.dat``; every other name is read from ``ratings.csv`` and its
    ``userId`` / ``movieId`` columns are renamed to ``user_id`` / ``item_id``.

    Args:
        dataset_name: Name of the dataset directory under ``../data``.

    Returns:
        DataFrame with ``rating`` cast to float, sorted by ``user_id`` and
        then ``timestamp``.
    """
    if dataset_name not in ("ml-1m", "ml-10m"):
        ratings = pd.read_csv(f"../data/{dataset_name}/ratings.csv").rename(
            columns={"userId": "user_id", "movieId": "item_id"}
        )
    else:
        # The multi-character separator requires the python parsing engine.
        ratings = pd.read_table(
            f"../data/{dataset_name}/ratings.dat",
            sep="::",
            header=None,
            names=["user_id", "item_id", "rating", "timestamp"],
            engine="python",
        )

    ratings["rating"] = ratings["rating"].astype("float")
    return ratings.sort_values(["user_id", "timestamp"])


# Load the ratings for the configured dataset and preview the first rows.
df = get_dataset(dataset_name=DATASET)
df.head()

In [None]:
import pandas as pd
from math import ceil


def split_train_test(data, train_ratio=0.8):
    """Split interactions per user into chronological train and test sets.

    For every ``user_id`` group the rows are ordered by ``timestamp``; the
    first ``ceil(len(group) * train_ratio)`` rows go to the training set and
    the remaining rows go to the test set.

    Args:
        data: DataFrame with at least ``user_id`` and ``timestamp`` columns.
        train_ratio: Fraction of each user's interactions kept for training.

    Returns:
        A ``(train_data, test_data)`` tuple of DataFrames. When ``data``
        yields no user groups (e.g. it is empty), both are empty frames that
        keep ``data``'s columns.
    """
    # Per-user slices, collected and concatenated at the end.
    train_parts = []
    test_parts = []

    for _, user_rows in data.groupby("user_id"):
        # Order this user's interactions by timestamp.
        user_rows = user_rows.sort_values("timestamp")

        # Cut-off index: rounds up, so a user with a single interaction
        # contributes it to the training set and nothing to the test set.
        cut = ceil(len(user_rows) * train_ratio)

        train_parts.append(user_rows.iloc[:cut])
        test_parts.append(user_rows.iloc[cut:])

    if not train_parts:
        # pd.concat raises ValueError on an empty list; return empty frames
        # with the input's columns instead of crashing.
        return data.iloc[0:0].copy(), data.iloc[0:0].copy()

    return pd.concat(train_parts), pd.concat(test_parts)


# Example usage:
# df is the ratings DataFrame with columns ['user_id', 'item_id', 'rating', 'timestamp']
train_df, test_df = split_train_test(df)
print(train_df.shape, test_df.shape)

# Normalize ratings with 0.5 * (rating - 3), which maps a 1..5 scale to [-1, 1].
# NOTE(review): datasets with half-star ratings (0.5) would map to -1.25,
# outside that range - confirm for the chosen dataset.
#
# The training matrix is built from train_df only. Building it from the full
# df would include the held-out test interactions in training and make the
# reported test RMSE meaningless.
train_data = train_df[["user_id", "item_id", "rating"]].values
train_data[:, 2] = 0.5 * (train_data[:, 2] - 3)

test_data = test_df[["user_id", "item_id", "rating"]].values
test_data[:, 2] = 0.5 * (test_data[:, 2] - 3)

# Shuffle the training rows in place; test_data keeps the order produced by
# split_train_test.
np.random.shuffle(train_data)

In [None]:
# Sizes are one more than the largest id present in the full dataset
# (presumably so raw ids can be used directly as indices - confirm against BPMF).
NUM_USERS = df["user_id"].max() + 1
NUM_ITEMS = df["item_id"].max() + 1

print(NUM_USERS, NUM_ITEMS)

In [None]:
# Make sure the training matrix is float64 before handing it to the model.
train_data = train_data.astype(np.float64)

In [None]:
# Build the model; the rating bounds match the [-1, 1] normalization applied
# to the ratings above.
bpmf = BPMF(
    n_user=NUM_USERS,
    n_item=NUM_ITEMS,
    n_feature=100,
    min_rating=-1.0,
    max_rating=1.0,
    seed=0,
)

# Fit for 300 iterations; the return value is kept for inspection
# (presumably the training RMSE per iteration - confirm against BPMF.fit).
train_rmse_list = bpmf.fit(train_data, n_iters=300)

In [None]:
def RMSE(preds, truth):
    """Return the root-mean-square error between ``preds`` and ``truth``.

    Args:
        preds: Predicted values (NumPy array or scalar).
        truth: Ground-truth values, broadcastable against ``preds``.

    Returns:
        The square root of the mean squared difference.
    """
    residuals = preds - truth
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)

In [None]:
# Score the held-out interactions: column 2 of test_data holds the
# normalized ratings used as ground truth.
test_truth = test_data[:, 2]
preds = bpmf.predict(test_data)
test_rmse = RMSE(preds, test_truth)
print("Test rmse: {:f}".format(test_rmse))

In [None]:
# Persist the fitted model, named after the dataset it was trained on.
model_path = f"../model/bpmf/bpmf_{DATASET}.pickle"
with open(model_path, "wb") as model_file:
    pickle.dump(bpmf, model_file, protocol=pickle.HIGHEST_PROTOCOL)