In [1]:
%load_ext autoreload
%autoreload 2

import os
# Move the working directory two levels up so that the relative paths and the
# `src` package used by later cells resolve — presumably this is the repository
# root; TODO confirm.
# NOTE(review): not idempotent — re-running this cell climbs two more levels.
os.chdir("../../")
print(os.getcwd())

C:\Users\Milosz\Desktop\python\thesis-recsys


In [32]:
import pandas as pd
import numpy as np
import functools
import operator
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import namedtuple

import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.nn.functional import pad
from torch.utils.tensorboard import SummaryWriter

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

from src.models import NCF, DeepFM
from src.feature_store import FeatureStore, SparseFeat, DenseFeat
from src.utils import attr2tensor

# Print tensors with two decimals and without scientific notation.
torch.set_printoptions(precision=2, sci_mode=False)
# Seed torch's global RNG (the dataset's negative sampling below draws from it).
torch.manual_seed(0)

<torch._C.Generator at 0x1591a8d14b0>

In [33]:
# Use the GPU whenever CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [34]:
import pickle
# Load the pre-processed dataset bundle (indexed by split / attribute name below).
# NOTE: unpickling can execute arbitrary code — only load files you trust.
with open("data/steam/data.pkl", mode="rb") as pickle_file:
    data = pd.read_pickle(pickle_file)

In [35]:
user_col, item_col = 'user_id', 'app_id'

# Every split is reduced to its two id columns and transposed to shape (2, n_edges):
# row 0 holds the user ids, row 1 the item ids.
train_set, supervision_set, valid_set = (
    data[split_name][[user_col, item_col]].values.T
    for split_name in ('train_set', 'supervision_set', 'valid_set')
)
user_attr, item_attr = data['user_attr'], data['item_attr']

# Fold the supervision edges into the training edges.
train_set = np.concatenate((train_set, supervision_set), axis=1)

In [None]:
# scaler_user = StandardScaler()
# user_attr_preprocess = scaler_user.fit_transform(user_attr)

# scaler_item = StandardScaler()
# item_attr_preprocess = np.copy(item_attr)
# item_attr_preprocess[:, 435:] = scaler_item.fit_transform(item_attr[:, 435:])

In [6]:
def temp_item_attr_rebuild(item_attr):
    """Condense each raw item row into ``[tags, OS, avg_rat, price_original]``.

    Per row:
      * ``tags``            - the value stored in column 15, taken as is;
      * ``OS``              - list of the non-zero positions within columns 0..2;
      * ``avg_rat``         - position of the first non-zero entry within columns 4..9
                              (raises ``IndexError`` when all of them are zero);
      * ``price_original``  - the value stored in column 12.

    Returns:
        An object-dtype numpy array built from the list of rebuilt rows.
    """
    rebuilt_rows = [
        [
            record[15],
            list(record[0:3].nonzero()[0]),
            record[4:10].nonzero()[0][0],
            record[12],
        ]
        for record in item_attr
    ]
    return np.array(rebuilt_rows, dtype=object)

In [7]:
# Condensed item attributes, one [tags, OS, avg_rat, price_original] row per item.
item_attr2 = temp_item_attr_rebuild(item_attr)

In [8]:
# TODO: move this scheme to a JSON file, store the attribute names inside the
# dictionaries, and turn it into a list of dictionaries.
# Feature scheme handed to FeatureStore: one entry per feature, grouped into the
# id columns ("edge_index"), user features and item features.
scheme = {
    # NOTE(review): DeepFMDataset shifts all ids by +1, so the largest id equals
    # num_emb — confirm that the embedding tables are sized accordingly.
    "edge_index": {
        "user_id": {
            "emb_dim": 8,
            "num_emb": user_attr.shape[0],
            "max_len": 1,
            "type": 'sparse'
        },
        "item_id": {
            "emb_dim": 8,
            "num_emb": item_attr.shape[0],
            "max_len": 1,
            "type": 'sparse'
        }
    },
    # No user-side features are used at the moment.
    "user_feat": {},
    # Presumably listed in the column order produced by temp_item_attr_rebuild
    # ([tags, OS, avg_rat, price_original]) — TODO confirm.
    "item_feat": {
        "tags": {
            "emb_dim": 4,
            "num_emb": 425,
            "max_len": 20,
            "type": 'sparse'
        },
        "OS": {
            "emb_dim": 4,
            "num_emb": 3,
            "max_len": 3,
            "type": 'sparse'
        },
        "AvgCatRating": {
            "emb_dim": 4,
            "num_emb": 6,
            "max_len": 1,
            "type": 'sparse'
        },
        "PriceOriginal": {
            "type": 'dense'
        }
    }
}

In [42]:
class DeepFMDataset(Dataset):
    """Observed user-item edges with on-the-fly random negative sampling.

    Each sample expands one observed (user, item) edge into ``neg_sampl + 1``
    rows: the positive pair first, followed by ``neg_sampl`` rows pairing the
    same user with uniformly drawn item ids.  All ids are shifted by +1 on
    construction (presumably so that 0 stays free as a padding index
    downstream — TODO confirm), i.e. stored item ids live in ``[1, n_items]``.

    Args:
        feature_store: object exposing ``item_feat``; forwarded to ``attr2tensor``.
        edge_index: array-like of shape (n_edges, 2) holding 0-based
            ``[user_id, item_id]`` pairs.
        user_attr: accepted for interface symmetry; currently unused.
        item_attr: raw item attributes, converted with ``attr2tensor`` and
            looked up by 0-based item id.
        neg_sampl: number of negative rows generated per positive edge.
    """

    def __init__(self, feature_store, edge_index, user_attr, item_attr, neg_sampl):
        self.edge_index = torch.tensor(edge_index) + 1
        self.user_attr = None #attr2tensor(feature_store.user_feat, user_attr)
        self.item_attr = attr2tensor(feature_store.item_feat, item_attr)

        self.users = self.edge_index[:, 0]
        self.items = self.edge_index[:, 1]

        # NOTE(review): hard-coded user count that nothing in this class reads;
        # consider deriving it from the data instead.
        self.n_users = 3066721
        self.n_items = self.item_attr.shape[0]

        self.neg_sampl = neg_sampl

    def __len__(self):
        """Number of positive edges (one sample per edge)."""
        return self.edge_index.shape[0]

    def __getitem__(self, idx):
        """Return ``(x, y)`` for edge ``idx``.

        ``x`` has ``neg_sampl + 1`` rows of ``[user_id, item_id, *item_attr]``
        and ``y`` is ``[1, 0, ..., 0]`` (positive row first).
        """
        u_id = self.users[idx].repeat(self.neg_sampl + 1)
        i_id = torch.cat([self.items[idx].unsqueeze(0), self._approx_neg_sampl()])

        # Stored ids are 1-based, the attribute table is 0-based.
        i_attr = self.item_attr[i_id - 1]

        x = torch.column_stack((u_id, i_id, i_attr))
        y = torch.tensor([1] + [0] * self.neg_sampl)

        return x, y

    def _approx_neg_sampl(self):
        """Draw ``neg_sampl`` item ids uniformly from the shifted range ``[1, n_items]``.

        "Approximate" because a drawn id may coincide with an item the user
        actually interacted with.  The range must match the +1 shift applied to
        the positives: sampling from ``[0, n_items)`` would emit id 0 (whose
        attribute lookup ``item_attr[-1]`` silently wraps to the last item) and
        would never emit the last item id.
        """
        neg_i_id = torch.randint(low=1, high=self.n_items + 1, size=(self.neg_sampl,))
        return neg_i_id

def collate_fn(batch):
    """Merge per-edge ``(x, y)`` samples into one flat batch.

    The feature blocks are concatenated along dim 0; the labels are
    concatenated, cast to float and returned as a column of shape (N, 1).
    """
    samples = [(features, labels) for features, labels in batch]
    stacked_x = torch.cat([features for features, _ in samples])
    stacked_y = torch.cat([labels for _, labels in samples]).float()
    return stacked_x, stacked_y.unsqueeze(1)

In [10]:
# def pad_sequence_max_len(sequence, max_len):
#     sequence[0] = pad(sequence[0], (0, max_len-sequence[0].shape[0]), value=-1)
#     return pad_sequence(sequence, batch_first=True, padding_value=-1) + 1

# def collate_fn(batch):
#     xs = [[] for f in range(feature_store.n_features)]
#     for (_sparse, _dense) in batch:
#         for i, f in enumerate(feature_store.features['sparse']):
#             xs[f.index].append(torch.tensor(_sparse[i], dtype=torch.float32))

#         for i, f in enumerate(feature_store.features['dense']):
#             xs[f.index].append(torch.tensor(_dense[i], dtype=torch.float32))

#     for i, f in enumerate(feature_store.features['sparse']):
#         if f.max_len > 1:
#             xs[f.index] = pad_sequence_max_len(xs[f.index], f.max_len)
#         else:
#             xs[f.index] = torch.tensor(xs[f.index]) + 1
        
#     for i, f in enumerate(feature_store.features['dense']):
#         xs[f.index] = torch.tensor(xs[f.index])
    
#     xs = torch.column_stack(xs)

#     return xs.to(device)

In [11]:
# class MF(nn.Module):
#     def __init__(self, n_users, n_items, emb_size):
#         super().__init__()
#         self.n_users = n_users
#         self.n_items = n_items
#         self.u = nn.Embedding(n_users, emb_size)
#         self.i = nn.Embedding(n_items, emb_size)
        
#     def forward(self, ux, ix):
#         return torch.sum(self.u(ux) * self.i(ix), dim=1)
#         #return torch.sum((self.u(ux) * self.i(ix)).squeeze(1), dim=1)

In [12]:
# def approx_negative_sampling(batch, n_samples, bs):
#     ind = torch.arange(bs).repeat(n_samples)
#     perm =  torch.randperm(ind.shape[0])
#     random_indices = ind[perm]
#     i_id_neg = batch[random_indices, 1:] # apply indexing batch[random_indices, 0] to get only item_id 
#                                          # column, else get all columns (currently all item features)
    
#     u_id = batch[:, 0].repeat(n_samples+1).unsqueeze(1)
#     i_id = torch.cat([batch[:, 1:], i_id_neg])
#     batch = torch.cat([u_id, i_id], dim=1)
    
#     y_true = torch.cat([torch.ones(bs).to(device),
#                        torch.zeros(bs*n_samples).to(device)]).unsqueeze(1)
    
#     return batch, y_true

In [43]:
def train(n_epochs, print_loss=500):
    """Train the notebook-global ``model`` and report metrics periodically.

    Uses the globals ``model``, ``train_loader``, ``criterion``, ``optimizer``,
    ``device`` and ``test``.

    Args:
        n_epochs: number of passes over ``train_loader``.
        print_loss: reporting interval in batches.  Every ``print_loss`` batches
            the mean training loss and ROC AUC of that window are printed
            together with the result of a full validation pass (``test()``).
    """
    for epoch in range(n_epochs):
        # (Re-)enter training mode at the start of every epoch.
        model.train()

        epoch_loss = 0.    # accumulated over the whole epoch, never reset
        running_loss = 0.  # accumulated over the current reporting window
        preds, ground_truths = [], []
        for i_batch, (batch, y_true) in enumerate(tqdm(train_loader)):
            batch, y_true = batch.to(device), y_true.to(device)

            y_pred = model(batch)
            loss = criterion(y_pred, y_true)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Detach before buffering so the autograd graph of every batch in
            # the window is not kept alive until the next report.
            preds.append(y_pred.detach())
            ground_truths.append(y_true)
            batch_loss = loss.item()
            running_loss += batch_loss
            epoch_loss += batch_loss

            if not ((i_batch+1) % print_loss):
                pred = torch.cat(preds, dim=0).sigmoid().cpu().numpy()
                ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
                last_loss = running_loss / print_loss

                train_roc_auc = roc_auc_score(ground_truth, pred)
                test_loss, test_roc_auc = test()
                # Evaluation runs under model.eval(); make sure training mode is
                # active again before the next optimisation step.
                model.train()

                preds, ground_truths = [], []
                running_loss = 0.

                print(f"batch <{i_batch}>\ntrain_loss: {last_loss} - train_roc_auc: {train_roc_auc}\ntest_loss: {test_loss} - test_roc_auc: {test_roc_auc}\n")
        # Report the mean over all batches of the epoch (the window counter is
        # reset at every report and therefore cannot be used here).
        print(f"Epoch: {epoch}, Loss: {epoch_loss / max(len(train_loader), 1):.4f}")

In [44]:
@torch.no_grad()
def test():
    """Evaluate the notebook-global ``model`` on ``val_loader``.

    Uses the globals ``model``, ``val_loader``, ``criterion`` and ``device``.
    Switches the model to eval mode and leaves it there, so a caller that
    continues training afterwards has to call ``model.train()`` itself.

    Returns:
        ``(mean loss per batch, ROC AUC over all validation rows)``.
    """
    model.eval()

    logits, labels = [], []
    loss_sum = 0.
    for features, target in tqdm(val_loader):
        features = features.to(device)
        target = target.to(device)

        output = model(features)
        loss_sum += criterion(output, target).item()

        logits.append(output)
        labels.append(target)

    # Probabilities and labels are moved to the CPU once, after the loop.
    scores = torch.cat(logits, dim=0).sigmoid().cpu().numpy()
    truth = torch.cat(labels, dim=0).cpu().numpy()

    return loss_sum / len(val_loader), roc_auc_score(truth, scores)

In [45]:
feature_store = FeatureStore(scheme)

# The edge arrays are stored as (2, n_edges); the dataset expects (n_edges, 2).
train_dataset = DeepFMDataset(feature_store, train_set.T, user_attr, item_attr2, neg_sampl=2)
train_loader = DataLoader(
    train_dataset,
    batch_size=1024,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_fn,
)
val_dataset = DeepFMDataset(feature_store, valid_set.T, user_attr, item_attr2, neg_sampl=2)
val_loader = DataLoader(
    val_dataset,
    batch_size=1024,
    shuffle=False,
    drop_last=True,
    collate_fn=collate_fn,
)

# Alternative model kept for reference: NCF(feature_store, hidden_dim=[64, 32, 8])
model = DeepFM(feature_store, hidden_dim=[128, 64], device=device).to(device)

In [46]:
# Binary cross-entropy computed directly on the model's raw logits.
criterion = nn.BCEWithLogitsLoss()

# RMSprop with momentum; alternative kept for reference: Adam with lr=1e-3.
optimizer = torch.optim.RMSprop(
    model.parameters(),
    lr=1e-4,
    momentum=0.9,
)

In [47]:
# NOTE(review): with print_loss=10 a full validation pass (test()) runs every
# 10 training batches; the recorded output shows a 422-batch validation loop
# starting after 9 of 5310 training batches before the run was interrupted.
# Consider a much larger reporting interval.
train(n_epochs=10, print_loss=10)

  0%|                                                                       | 9/5310 [00:10<1:27:55,  1.00it/s]
  0%|                                                                                  | 0/422 [00:00<?, ?it/s][A
  0%|▏                                                                         | 1/422 [00:00<05:01,  1.39it/s][A
  0%|▎                                                                         | 2/422 [00:01<05:04,  1.38it/s][A
  1%|▌                                                                         | 3/422 [00:02<05:08,  1.36it/s][A
  1%|▋                                                                         | 4/422 [00:03<05:27,  1.28it/s][A
  0%|                                                                       | 9/5310 [00:14<2:20:30,  1.59s/it]


KeyboardInterrupt: 

In [48]:
# Full validation pass; displays (mean loss per batch, ROC AUC).
test()

100%|████████████████████████████████████████████████████████████████████████| 422/422 [04:41<00:00,  1.50it/s]


(4.617467509626778, 0.5079692237174365)

In [None]:
# Sanity check: score one training batch without tracking gradients.
# train_loader (through collate_fn) already yields a (features, labels) pair
# that contains the sampled negatives, so the batch is fed to the model exactly
# as in train(); no separate negative-sampling step is needed.
# `y_pred` and `y_true` stay in scope for the loss check in the next cell.
with torch.no_grad():
    batch, y_true = next(iter(train_loader))
    batch, y_true = batch.to(device), y_true.to(device)

    y_pred = model(batch)

    print(y_pred.sigmoid(), y_true)

In [None]:
# Loss for the `y_pred` / `y_true` pair produced by the sanity-check cell above.
criterion(y_pred, y_true)

In [None]:
def save_model(model, path):
    """Write the model's ``state_dict`` (not the module object itself) to ``path``."""
    weights = model.state_dict()
    torch.save(weights, path)
    
def load_model(path):
    """Rebuild an ``MF`` model and load the weights stored at ``path``.

    The checkpoint is expected to be a ``state_dict`` as written by
    ``save_model``.  The returned model lives on the global ``device``.

    NOTE(review): ``MF``, ``n_users`` and ``n_items`` are not defined by any
    live cell visible in this notebook (``MF`` only appears in a commented-out
    cell) — they must be in scope before this function is called.
    """
    model = MF(n_users, n_items, emb_size=32)
    # map_location lets a checkpoint that was saved from a GPU be loaded on a
    # machine without CUDA instead of failing during deserialisation.
    state_dict = torch.load(path, map_location=device)
    model.load_state_dict(state_dict)
    model = model.to(device)
    return model

In [None]:
#save_model(model, "models/mf_01.pth")

In [None]:
# NOTE(review): this rebinds the global `model` that train() and test() use to
# whatever load_model() builds (an MF model), replacing the DeepFM instance.
model = load_model("models/mf_01.pth")

In [None]:
from reco_env import RecoEnv
from utils import import_data_for_env
import gym

In [None]:
# Create the gym environment registered under RecoEnv.id; its keyword
# arguments are whatever import_data_for_env() returns.
env = gym.make(RecoEnv.id, **import_data_for_env())

In [None]:
# Number of occurrences of each user_id in `rec`.
# NOTE(review): `rec` is not defined in any visible cell of this notebook.
vc = rec.user_id.value_counts()

In [None]:
# Display the per-user counts.
vc

In [None]:
# Keep only the user_ids that occur at least 3 times.
vc[vc >= 3]

In [None]:
# Plot the per-user counts.
# NOTE(review): matplotlib uses the Series index (the user ids) as x values —
# presumably a rank plot of the counts was intended; confirm. The figure also
# has no title or axis labels.
plt.plot(vc)