In [None]:
%load_ext autoreload
%autoreload 2

import os
os.chdir("../../")
print(os.getcwd())

In [None]:
import argparse
import pickle
import os
from tqdm.notebook import tqdm
from datetime import datetime
from itertools import cycle

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from dataset.deep import DeepDatasetIterable, FeaturelessDatasetIterable, collate_fn
from features.store import FeatureStore
from models import DeepFM
from utils import write_scalars

In [None]:
from utils import load_model

In [None]:
# Show tensors with three decimals in fixed-point (non-scientific) notation.
torch.set_printoptions(sci_mode=False, precision=3)

In [None]:
def train_epoch(model, criterion, optimizer, train_loader, device):
    """Run one optimisation pass over ``train_loader`` and report its metrics.

    Args:
        model: Module called as ``model(batch)``. Its output is fed to
            ``criterion`` and passed through a sigmoid for the ROC AUC,
            i.e. it is treated as logits.
        criterion: Callable ``criterion(y_pred, y_true)`` returning a scalar
            loss tensor.
        optimizer: Optimizer that is zeroed and stepped once per batch.
        train_loader: Iterable yielding ``(batch, y_true)`` tensor pairs.
            It does not need to implement ``__len__``.
        device: Device every batch and target is moved to.

    Returns:
        tuple: ``(train_loss, train_roc_auc)`` where ``train_loss`` is the
        mean of the per-batch losses and ``train_roc_auc`` is computed over
        all samples seen during the epoch.
    """
    model.train()

    running_loss = 0.
    n_batches = 0
    preds, ground_truths = [], []
    for batch, y_true in tqdm(train_loader):
        batch, y_true = batch.to(device), y_true.to(device)

        y_pred = model(batch)
        loss = criterion(y_pred, y_true)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Store detached CPU copies only: keeping the raw outputs would hold
        # on to their autograd history and pile up device memory until the
        # end of the epoch.
        preds.append(y_pred.detach().cpu())
        ground_truths.append(y_true.detach().cpu())
        running_loss += loss.item()
        n_batches += 1

    pred = torch.cat(preds, dim=0).sigmoid().numpy()
    ground_truth = torch.cat(ground_truths, dim=0).numpy()
    # Average over the batches actually processed instead of len(loader),
    # which is unavailable for length-less iterables.
    train_loss = running_loss / n_batches
    train_roc_auc = roc_auc_score(ground_truth, pred)

    return train_loss, train_roc_auc

@torch.no_grad()
def test(model, criterion, val_loader, device):
    """Evaluate ``model`` on ``val_loader`` without tracking gradients.

    Args:
        model: Module called as ``model(batch)``. Its output is fed to
            ``criterion`` and passed through a sigmoid for the ROC AUC,
            i.e. it is treated as logits.
        criterion: Callable ``criterion(y_pred, y_true)`` returning a scalar
            loss tensor.
        val_loader: Iterable yielding ``(batch, y_true)`` tensor pairs.
            It does not need to implement ``__len__``.
        device: Device every batch and target is moved to.

    Returns:
        tuple: ``(test_loss, test_roc_auc)`` where ``test_loss`` is the mean
        of the per-batch losses and ``test_roc_auc`` is computed over all
        samples yielded by the loader.
    """
    model.eval()

    running_loss = 0.
    n_batches = 0
    preds, ground_truths = [], []

    for batch, y_true in val_loader:
        batch, y_true = batch.to(device), y_true.to(device)

        y_pred = model(batch)
        loss = criterion(y_pred, y_true)

        # Collect on the CPU so device memory does not grow with the number
        # of evaluated batches.
        preds.append(y_pred.cpu())
        ground_truths.append(y_true.cpu())
        running_loss += loss.item()
        n_batches += 1

    pred = torch.cat(preds, dim=0).sigmoid().numpy()
    ground_truth = torch.cat(ground_truths, dim=0).numpy()

    # Average over the batches actually processed instead of len(loader),
    # which is unavailable for length-less iterables.
    test_loss = running_loss / n_batches
    test_roc_auc = roc_auc_score(ground_truth, pred)

    return test_loss, test_roc_auc

In [None]:
# Run configuration: artefact directory, number of epochs and compute device
# (CUDA when available, CPU otherwise).
n_epochs = 50
dir_art = 'data/steam'
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Read the pickled artefact bundle from the artefact directory.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
data_path = os.path.join(dir_art, 'data.pkl')
with open(data_path, "rb") as f:
    data = pd.read_pickle(f)

relations_store = data['relations_datastore']
items_store = data['items_datastore']
users_store = data['users_datastore']

# Schemes describing each datastore; they are handed to the feature store.
scheme_relations = relations_store.scheme
scheme_items = items_store.scheme
scheme_users = users_store.scheme

# Attribute tables for items and users.
item_attr = items_store.dataframe.df
user_attr = users_store.dataframe.df

# Relation splits as arrays; the supervision split is appended to the
# training split so both are used for fitting.
relations_frame = relations_store.dataframe
supervision_set = relations_frame.supervision.values
valid_set = relations_frame.valid.values
train_set = np.concatenate((relations_frame.train.values, supervision_set), axis=0)

# Entity counts are taken from the number of rows of the attribute tables.
n_users = user_attr.shape[0]
n_items = item_attr.shape[0]

In [None]:
# Build the feature store from the three schemes; the "sparse" and "varlen"
# entries of emb_dims are both set to 16.
embedding_dims = {"sparse": 16, "varlen": 16}
feature_store = FeatureStore(
    scheme_relations,
    scheme_items,
    scheme_users,
    emb_dims=embedding_dims,
)

In [None]:
# Restore a previously saved DeepFM model from its checkpoint file, passing
# the constructor arguments it is to be built with.
deepfm_kwargs = {
    "feature_store": feature_store,
    "hidden_dim": [128, 64, 16],
    "device": device,
}
model = load_model(
    cls=DeepFM,
    model_path="runs/DeepFM/2023-11-16_20-30-00/model.pth",
    model_kwargs=deepfm_kwargs,
)

In [None]:
# Look up the embedding stored for user id 1. The index tensor is created on
# the same device as the embedding table so the lookup does not fail with a
# device mismatch when the model lives on CUDA.
# NOTE(review): assumes the 'user_id' entry exposes `.weight` - confirm.
user_embedding = model.V.embeddings['user_id']
user_embedding(torch.tensor(1, device=user_embedding.weight.device))

In [None]:
# Scratch cell: a 2x1 integer tensor. The original expression had an unclosed
# parenthesis and passed two positional arguments, which torch.tensor does
# not accept; the nested-list form below is the presumed intent.
torch.tensor([[0], [0]])

In [None]:
import torch
from torch import nn

from layers import EmbeddingNet


class MF(nn.Module):
    """Matrix-factorisation scorer built on top of an ``EmbeddingNet``.

    The score of a row is the dot product between the first ``emb_dim``
    columns of the embedded input and the remaining columns.
    """

    def __init__(self, feature_store, device):
        """Create the embedding network and record its embedding width.

        Args:
            feature_store: Feature store handed to ``EmbeddingNet``.
            device: Device handed to ``EmbeddingNet`` and kept on the module.
        """
        super().__init__()
        self.feature_store = feature_store
        self.device = device

        self.V = EmbeddingNet(feature_store, device=device)
        # Width of one embedding, read from the 'user_id' table.
        user_table = self.V.embeddings['user_id']
        self.emb_dim = user_table.weight.shape[1]

    def forward(self, x):
        """Embed ``x`` and return one score per row as a column tensor.

        NOTE(review): the split assumes the embedded input is laid out as two
        blocks of ``emb_dim`` columns (presumably user then item) - confirm.
        """
        embedded = self.V(x).to(torch.float)
        head = embedded[:, :self.emb_dim]
        tail = embedded[:, self.emb_dim:]
        # Row-wise dot product, keeping a trailing dimension of size one.
        return (head * tail).sum(dim=1, keepdim=True)

In [None]:
# Datasets for the training and validation relations; both use the same
# user batch size and neg_sampl setting.
sampling_kwargs = dict(user_batch_size=int(1e4), neg_sampl=2)
train_dataset = FeaturelessDatasetIterable(train_set, n_users, n_items, **sampling_kwargs)
val_dataset = FeaturelessDatasetIterable(valid_set, n_users, n_items, **sampling_kwargs)

# Fresh MF model; from here on `model` no longer refers to the DeepFM
# checkpoint loaded earlier.
model = MF(feature_store, device=device).to(device)

# Loaders differ only in whether shuffling is requested.
loader_kwargs = dict(batch_size=1, collate_fn=collate_fn, drop_last=False)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

# Binary cross-entropy on logits, optimised with RMSprop.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-4, momentum=0.9)

In [None]:
# Alternate one training pass and one validation pass per epoch and print the
# loss and ROC AUC of both after every epoch.
print(f"> Training model[{model.__class__.__name__}] on device[{device}] begins...")
for epoch in tqdm(range(n_epochs)):
    train_loss, train_roc_auc = train_epoch(model, criterion, optimizer, train_loader, device)
    test_loss, test_roc_auc = test(model, criterion, val_loader, device)
    print(f"""Epoch <{epoch}>\ntrain_loss: {train_loss} - train_roc_auc: {train_roc_auc}
test_loss: {test_loss} - test_roc_auc: {test_roc_auc}\n""")

In [None]:
# Persist the trained weights. The run directory is created first so that a
# missing folder cannot make the save fail after the whole training run.
model_path = "runs/MF/2023-11-16_22-00-00/model.pth"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model.state_dict(), model_path)