In [1]:
%load_ext autoreload
%autoreload 2

import os
# Move two directory levels up from the notebook's starting directory, but only
# on the first execution: a bare relative chdir climbs two more levels every
# time this cell is re-run. The resolved path is remembered in PROJECT_ROOT so
# later re-runs land in the same place.
if "PROJECT_ROOT" not in globals():
    os.chdir("../../")
    PROJECT_ROOT = os.getcwd()
else:
    os.chdir(PROJECT_ROOT)
print(os.getcwd())

C:\Users\Milosz\Desktop\python\thesis-recsys


In [2]:
import pandas as pd
import numpy as np
import functools
import operator
import json
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from sklearn.metrics import roc_auc_score

In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [4]:
def load_data_from_csv(path: str) -> pd.DataFrame:
    """
    Read an interaction table from a CSV file.

    The first column of the file is used as the DataFrame index. The file is
    expected to provide these columns:
        - `user_id` - int
        - `app_id` - int
        - `is_recommended` - int [0/1]

    Parameters:
    - path (str): Location of the CSV file.

    Returns:
    - pd.DataFrame: The parsed table, indexed by the file's first column.
    """
    return pd.read_csv(path, index_col=[0])

In [5]:
# Read the train and test interaction tables from their CSV files.
train_df, test_df = (
    load_data_from_csv("data/graph_train.csv"),
    load_data_from_csv("data/graph_test.csv"),
)

## Matrix factorization benchmark

In [6]:
# class MFDataset(Dataset):
#     def __init__(self, data):
#         self.data = data
    
#     def __len__(self):
#         return len(self.data)
    
#     def __getitem__(self, idx):
#         row = self.data.iloc[idx]

#         user_id = torch.tensor([row['user_id']], dtype=torch.int)
#         item_id = torch.tensor([row['app_id']],  dtype=torch.int)
#         rating = torch.Tensor([row['is_recommended']])
        
#         return user_id, item_id, rating
    
    
class MFDataset(Dataset):
    """Dataset of (user, item, label) triples backed by whole-column tensors.

    Each column is converted to a tensor once in the constructor, so fetching
    a sample is plain tensor indexing.
    """

    def __init__(self, data):
        """
        Parameters:
        - data: DataFrame with `user_id`, `app_id` and `is_recommended` columns.
        """
        self.data = data

        def column(name):
            # Wrap the column's underlying numpy array in a tensor.
            return torch.from_numpy(data[name].values)

        self.users = column('user_id')
        self.items = column('app_id')
        # Labels are stored as floats.
        self.is_recommended = column('is_recommended').float()

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the `(user, item, is_recommended)` entries at position `idx`."""
        sample = (self.users[idx], self.items[idx], self.is_recommended[idx])
        return sample

In [7]:
class MF(nn.Module):
    """Matrix-factorization scorer: dot product of a user and an item embedding.

    Parameters:
    - n_users (int): number of rows in the user embedding table.
    - n_items (int): number of rows in the item embedding table.
    - emb_size (int): dimensionality of both embedding tables.
    """

    def __init__(self, n_users, n_items, emb_size):
        super().__init__()
        self.n_users = n_users
        self.n_items = n_items
        self.u = nn.Embedding(n_users, emb_size)
        self.i = nn.Embedding(n_items, emb_size)

    def forward(self, ux, ix):
        """Score user/item index pairs.

        Parameters:
        - ux: integer tensor of user indices.
        - ix: integer tensor of item indices, same shape as `ux`.

        Returns:
        - Tensor with the same shape as `ux`, holding one raw (un-squashed)
          score per pair.
        """
        # Reduce over the embedding axis, which is always the last one. This
        # yields one score per pair both for 1-D index batches of shape (B,)
        # and for column-shaped batches of shape (B, 1).
        return torch.sum(self.u(ux) * self.i(ix), dim=-1)

In [8]:
def approx_negative_sampling(ix):
    """Build approximate negative items by shuffling the batch's own items.

    The result is a random permutation of `ix` along its first dimension, so
    each position receives an item drawn from the same batch. This is only an
    approximation: a position may keep its original item.

    Parameters:
    - ix: tensor of item indices whose first dimension is the batch.

    Returns:
    - Tensor with the same shape, dtype and device as `ix`, holding the
      permuted items.
    """
    bs = ix.size(0)
    # Create the permutation on the same device as `ix` so indexing a GPU
    # tensor does not go through a CPU index tensor on every batch.
    random_indices = torch.randperm(bs, device=ix.device)
    return ix[random_indices]

In [21]:
def train(n_epochs, train_loader, test_loader, print_loss=500):
    """Train the module-level `model` with in-batch negative sampling.

    Relies on the module-level `model`, `optimizer`, `criterion` and `device`.
    Every batch is doubled: the original (user, item, label) triples are
    followed by the same users paired with shuffled items, labelled 0.

    Parameters:
    - n_epochs (int): number of passes over `train_loader`.
    - train_loader: iterable of `(user_idx, item_idx, y_true)` tensor batches.
    - test_loader: loader handed to `test()` for the periodic evaluation.
    - print_loss (int): report the running train loss and run a full
      evaluation every `print_loss` batches.

    Returns:
    - The trained module-level `model`.
    """
    model.train()

    for epoch in range(n_epochs):
        running_loss = 0.
        batch_print_loss = 0.
        for i_batch, (user_idx, item_idx, y_true) in enumerate(tqdm(train_loader)):
            user_idx, item_idx, y_true = user_idx.to(device), item_idx.to(device), y_true.to(device)

            neg_item_idx = approx_negative_sampling(item_idx)
            # Size the negative labels from the batch actually received, not
            # from `train_loader.batch_size`, so a short final batch or a
            # loader without a fixed batch size still lines up.
            neg_labels = torch.zeros_like(y_true)

            user_idx = torch.cat([user_idx, user_idx])
            item_idx = torch.cat([item_idx, neg_item_idx])
            y_true = torch.cat([y_true, neg_labels])

            optimizer.zero_grad()
            y_pred = model(user_idx, item_idx)
            loss = criterion(y_pred, y_true)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            batch_print_loss += loss.item()

            if not ((i_batch+1) % print_loss):
                last_loss = batch_print_loss / print_loss
                print(f"batch <{i_batch}> - loss: {last_loss}")
                batch_print_loss = 0.

                test_loss, test_roc_auc = test(test_loader)
                # The evaluation switches the model to eval mode; switch back
                # before the next optimisation step.
                model.train()
                print(f"\tTest loss: {test_loss} \t Test ROC AUC: {test_roc_auc}")

        print(f'Epoch {epoch+1}, Loss: {running_loss / len(train_loader)}')

    return model

In [24]:
@torch.no_grad()
def test(test_loader):
    """Evaluate the module-level `model` on `test_loader`.

    Relies on the module-level `model`, `criterion` and `device`. As in
    training, each batch is extended with shuffled-item negatives labelled 0.
    The negatives are drawn at random, so the reported numbers can vary
    between calls.

    Parameters:
    - test_loader: iterable of `(user_idx, item_idx, y_true)` tensor batches.

    Returns:
    - (test_loss, test_score): mean per-batch loss and the ROC AUC computed
      over all scored pairs.
    """
    # Remember the current mode so evaluation does not leave a model that is
    # being trained stuck in eval mode.
    was_training = model.training
    model.eval()
    running_loss = 0.
    preds, ground_truths = [], []

    for user_idx, item_idx, y_true in test_loader:
        user_idx, item_idx, y_true = user_idx.to(device), item_idx.to(device), y_true.to(device)

        neg_item_idx = approx_negative_sampling(item_idx)
        # Size the negative labels from the batch actually received rather
        # than from `test_loader.batch_size`.
        neg_labels = torch.zeros_like(y_true)

        user_idx = torch.cat([user_idx, user_idx])
        item_idx = torch.cat([item_idx, neg_item_idx])
        y_true = torch.cat([y_true, neg_labels])

        y_pred = model(user_idx, item_idx)
        loss = criterion(y_pred, y_true)

        # Move results off the device right away so the whole pass is not
        # held in accelerator memory.
        preds.append(y_pred.cpu())
        ground_truths.append(y_true.cpu())

        running_loss += loss.item()

    model.train(was_training)

    pred = torch.cat(preds, dim=0).numpy()
    ground_truth = torch.cat(ground_truths, dim=0).numpy()

    test_loss = running_loss / len(test_loader)
    test_score = roc_auc_score(ground_truth, pred)

    return test_loss, test_score

In [11]:
# Wrap both splits in tensor-backed datasets first, then build the loaders.
train_dataset = MFDataset(train_df)
test_dataset = MFDataset(test_df)

# Only the training loader shuffles; both discard the final incomplete batch.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=1024,
    shuffle=True,
    drop_last=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=1024,
    shuffle=False,
    drop_last=True,
)

In [12]:
# Size the embedding tables from the largest id seen in either split.
# `nunique()` only equals the required table size when the ids are exactly
# 0..n-1 and the test split contains no id missing from the train split; any
# other layout makes the embedding lookup go out of range. For ids that do
# satisfy that, the sizes below are the same as before.
n_users = int(max(train_df['user_id'].max(), test_df['user_id'].max())) + 1
n_items = int(max(train_df['app_id'].max(), test_df['app_id'].max())) + 1

model = MF(n_users, n_items, emb_size=32)
model = model.to(device)

In [13]:
# Binary cross-entropy applied to raw scores (the sigmoid is part of the loss).
criterion = torch.nn.BCEWithLogitsLoss()
# Adam over every parameter of the model.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [25]:
# Ten epochs, reporting the train loss and evaluating every 50 batches.
train(
    n_epochs=10,
    train_loader=train_loader,
    test_loader=test_loader,
    print_loss=50,
)

  1%|▍                                                                               | 49/8866 [00:03<06:48, 21.58it/s]

batch <49> - loss: 0.7704195237159729


  1%|▍                                                                               | 52/8866 [00:06<54:23,  2.70it/s]

	Test loss: 0.8593045313479537 	 Test ROC AUC: 0.4991752167098722


  1%|▉                                                                               | 97/8866 [00:08<06:34, 22.21it/s]

batch <99> - loss: 0.7635062205791473


  1%|▉                                                                              | 102/8866 [00:12<44:41,  3.27it/s]

	Test loss: 0.8596402190499387 	 Test ROC AUC: 0.4989371421658351


  2%|█▎                                                                             | 147/8866 [00:14<06:30, 22.34it/s]

batch <149> - loss: 0.7692069864273071


  2%|█▎                                                                             | 152/8866 [00:17<43:47,  3.32it/s]

	Test loss: 0.8578991089837026 	 Test ROC AUC: 0.4991600017333914


  2%|█▊                                                                             | 197/8866 [00:19<07:47, 18.55it/s]

batch <199> - loss: 0.7714810705184937


  2%|█▊                                                                             | 203/8866 [00:23<40:36,  3.56it/s]

	Test loss: 0.8587203460224604 	 Test ROC AUC: 0.49897007704671525


  3%|██▏                                                                            | 248/8866 [00:25<06:34, 21.87it/s]

batch <249> - loss: 0.7720768058300018


  3%|██▏                                                                            | 249/8866 [00:27<16:01,  8.97it/s]


KeyboardInterrupt: 

In [None]:
# Full evaluation pass over the test loader; displays (mean loss, ROC AUC).
test(test_loader)

In [None]:
# Score one training batch without tracking gradients and show the predicted
# probabilities next to the labels.
with torch.no_grad():
    u_id, i_id, y_true = (t.to(device) for t in next(iter(train_loader)))
    y_pred = model(u_id, i_id)

    print(torch.sigmoid(y_pred), y_true)

In [None]:
# Loss on the `y_pred` / `y_true` pair currently bound at module level.
criterion(y_pred, y_true)

In [14]:
def save_model(model, path):
    """Write the model's parameters (its state dict) to `path`.

    Parameters:
    - model: module whose `state_dict()` is saved.
    - path: destination handed to `torch.save`.
    """
    weights = model.state_dict()
    torch.save(weights, path)
    
def load_model(path, emb_size=32):
    """Rebuild an `MF` model and restore its weights from a state-dict file.

    The model is sized with the module-level `n_users` / `n_items`, so these
    must match the values used when the checkpoint was written.

    Parameters:
    - path: location of a file written by `save_model`.
    - emb_size (int): embedding dimensionality of the stored model
      (defaults to 32).

    Returns:
    - The restored model, moved to the module-level `device`.
    """
    model = MF(n_users, n_items, emb_size=emb_size)
    # map_location remaps stored tensors to the current device, so a
    # checkpoint written on a GPU can still be loaded on a CPU-only machine.
    state_dict = torch.load(path, map_location=device)
    model.load_state_dict(state_dict)
    model = model.to(device)
    return model

In [None]:
#save_model(model, "models/mf_01.pth")

In [15]:
# Rebind `model` to the weights stored in the checkpoint file; this replaces
# whatever model object was bound to the name before.
model = load_model("models/mf_01.pth")

In [None]:
from reco_env import RecoEnv
from utils import import_data_for_env
import gym

In [None]:
# Build the gym environment identified by `RecoEnv.id`, forwarding whatever
# `import_data_for_env()` returns as keyword arguments.
env = gym.make(RecoEnv.id, **import_data_for_env())

In [None]:
# Number of rows per user in `rec`.
# NOTE(review): `rec` is not defined in any cell visible in this notebook, so
# this raises NameError on a fresh kernel - presumably a DataFrame with a
# `user_id` column that was loaded in a since-removed cell; confirm and add
# the load.
vc = rec.user_id.value_counts()

In [None]:
# Display the per-user counts.
vc

In [None]:
# Keep only the users that occur at least three times.
vc[vc >= 3]

In [None]:
# Line plot of the per-user counts.
# NOTE(review): a Series passed to plt.plot is presumably drawn against its
# index (the user ids) rather than against rank - confirm this is the intended
# x-axis.
plt.plot(vc)