In [1]:
%load_ext autoreload
%autoreload 2

import os
os.chdir("../../")
print(os.getcwd())

C:\Users\miha\Projects\gnn-rl-recsys


In [2]:
import pandas as pd
import numpy as np
import functools
import operator
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import namedtuple

import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.nn.functional import pad
from torch.utils.tensorboard import SummaryWriter

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

torch.set_printoptions(precision=2, sci_mode=False)
torch.manual_seed(0)

<torch._C.Generator at 0x213e55f4ed0>

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cpu


In [4]:
import pickle
with open("data/steam/data.pkl", "rb") as f:
    data = pd.read_pickle(f)
    #data = pickle.load(f)

In [5]:
user_col = 'user_id'
item_col = 'app_id'
train_set = data['train_set'][[user_col, item_col]].values.T
supervision_set = data['supervision_set'][[user_col, item_col]].values.T
valid_set = data['valid_set'][[user_col, item_col]].values.T
user_attr = data['user_attr']
item_attr = data['item_attr']

train_set = np.concatenate([train_set, supervision_set], axis=1)

In [6]:
# scaler_user = StandardScaler()
# user_attr_preprocess = scaler_user.fit_transform(user_attr)

# scaler_item = StandardScaler()
# item_attr_preprocess = np.copy(item_attr)
# item_attr_preprocess[:, 435:] = scaler_item.fit_transform(item_attr[:, 435:])

In [7]:
def temp_item_attr_rebuild(item_attr):
    new_item_attr = []
    for row in item_attr:
        tags = row[15]
        OS = list(row[0:3].nonzero()[0])
        avg_rat = row[4:10].nonzero()[0][0]
        price_original = row[12]
        new_item_attr.append([tags, OS, avg_rat, price_original])
    return np.array(new_item_attr, dtype=object)

In [8]:
item_attr2 = temp_item_attr_rebuild(item_attr)

In [9]:
scheme = {
    "user_id": {
        "emb_dim": 8,
        "num_emb": user_attr.shape[0],
        "max_len": 1,
        "type": 'sparse'
    },
    "item_id": {
        "emb_dim": 8,
        "num_emb": item_attr.shape[0],
        "max_len": 1,
        "type": 'sparse'
    },
    # "tags": {
    #     "emb_dim": 4,
    #     "num_emb": 425,
    #     "max_len": 20,
    #     "type": 'sparse'
    # },
    # "OS": {
    #     "emb_dim": 4,
    #     "num_emb": 3,
    #     "max_len": 3,
    #     "type": 'sparse'
    # },
    # "AvgCatRating": {
    #     "emb_dim": 4,
    #     "num_emb": 6,
    #     "max_len": 1,
    #     "type": 'sparse'
    # },
    # "PriceOriginal": {
    #     "type": 'dense'
    # }
}

class SparseFeat(namedtuple('SparseFeat', ['name', 'emb_dim', 'num_emb', "max_len", "index", 'pad_index'])):
    def __new__(cls, name, emb_dim, num_emb, max_len, index, pad_index):
        return super(SparseFeat, cls).__new__(cls, name, emb_dim, num_emb, max_len, index, pad_index)
    
class DenseFeat(namedtuple('DenseFeat', ['name', "index", 'pad_index'])):
    def __new__(cls, name, index, pad_index):
        return super(DenseFeat, cls).__new__(cls, name, index, pad_index)    

# TODO: start+=feat['max_len'] powoduje blad, do sprawdzenia
class FeatureStore:
    def __init__(self, scheme):
        self.features = {'sparse': [], 'dense': []}
        self.n_features = len(scheme.keys())
        
        start = 0
        for i, (feat_name, feat) in enumerate(scheme.items()):
            if feat['type'] == 'sparse':
                self.features['sparse'].append(
                    SparseFeat(feat_name, feat['emb_dim'], feat['num_emb'], feat['max_len'], i, (start, start+feat['max_len']))
                )
                start = start + feat['max_len']
            elif feat['type'] == 'dense':
                self.features['dense'].append(
                    DenseFeat(feat_name, i, (start, start+1))
                )
                start += 1

        self.sparse_index = self.get_feature_index('sparse')
        self.dense_index = self.get_feature_index('dense')

    def input_len(self):
        sparse_len = sum([f.emb_dim for f in self.features['sparse']])
        dense_len = len(self.features['dense'])
        return sparse_len + dense_len
    
    def emb_dim(self):
        return 8
    
    def num_emb(self):
        return sum([f.num_emb for f in self.features['sparse']])

    def get_feature_index(self, feat_type):
        return torch.hstack([torch.arange(i[0], i[1]) for i in [f.pad_index for f in self.features[feat_type]]])

    def get_input_dim(self):
        return sum([f.emb_dim for f in self.features['sparse']]) + len(self.features['dense'])

In [10]:
feature_store = FeatureStore(scheme)

RuntimeError: hstack expects a non-empty TensorList

In [None]:
class DeepFMDataset(Dataset):
    def __init__(self, edge_index, user_attr, item_attr):
        self.edge_index = edge_index
        self.user_attr = user_attr
        self.item_attr = item_attr
        
        self.users = edge_index[0]
        self.items = edge_index[1]
    
    def __len__(self):
        return self.edge_index.shape[1]
    
    def __getitem__(self, idx):
        u_id = self.users[idx]
        i_id = self.items[idx]
        
        #u_attr = self.user_attr[u_id]
        u_attr = []
        i_attr = self.item_attr[i_id]

        x = np.concatenate([[u_id], [i_id], u_attr, i_attr])
        
        sparse_features = self._get_sparse_features(x)
        dense_features = self._get_dense_features(x)
        
        return sparse_features, dense_features
    
    def _get_sparse_features(self, x):
        indices = [f.index for f in feature_store.features['sparse']]
        return x[indices]
    
    def _get_dense_features(self, x):
        indices = [f.index for f in feature_store.features['dense']]
        return x[indices]

def pad_sequence_max_len(sequence, max_len):
    sequence[0] = pad(sequence[0], (0, max_len-sequence[0].shape[0]), value=-1)
    return pad_sequence(sequence, batch_first=True, padding_value=-1) + 1

def collate_fn(batch):
    xs = [[] for f in range(feature_store.n_features)]
    for (_sparse, _dense) in batch:
        for i, f in enumerate(feature_store.features['sparse']):
            xs[f.index].append(torch.tensor(_sparse[i], dtype=torch.float32))

        for i, f in enumerate(feature_store.features['dense']):
            xs[f.index].append(torch.tensor(_dense[i], dtype=torch.float32))

    for i, f in enumerate(feature_store.features['sparse']):
        if f.max_len > 1:
            xs[f.index] = pad_sequence_max_len(xs[f.index], f.max_len)
        else:
            xs[f.index] = torch.tensor(xs[f.index]) + 1
        
    for i, f in enumerate(feature_store.features['dense']):
        xs[f.index] = torch.tensor(xs[f.index])
    
    xs = torch.column_stack(xs)

    return xs.to(device)

In [None]:
class MF(nn.Module):
    def __init__(self, n_users, n_items, emb_size):
        super().__init__()
        self.n_users = n_users
        self.n_items = n_items
        self.u = nn.Embedding(n_users, emb_size)
        self.i = nn.Embedding(n_items, emb_size)
        
    def forward(self, ux, ix):
        return torch.sum(self.u(ux) * self.i(ix), dim=1)
        #return torch.sum((self.u(ux) * self.i(ix)).squeeze(1), dim=1)

In [None]:
class CrossProductNet(nn.Module):
    def forward(self, emb_x):
        cross = torch.bmm(emb_x.unsqueeze(2), emb_x.unsqueeze(1))
        return torch.tensor([torch.tril(i, diagonal=-1).sum() for i in cross], device=device).unsqueeze(1)

class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, dropout=0.1, batch_norm=True):
        super(MLP, self).__init__()
        
        _layers = []
        dims = [input_dim] + hidden_dim + [1]
        for in_dim, out_dim in zip(dims, dims[1:]):
            _layers.append(nn.Linear(in_dim, out_dim))
            if batch_norm:
                _layers.append(nn.BatchNorm1d(out_dim))
            _layers.append(nn.ReLU())
            _layers.append(nn.Dropout(p=dropout))

        self.layers = nn.Sequential(*_layers)
        
    def forward(self, emb_x):
        return self.layers(emb_x)
    
class EmbeddingNet(nn.Module):
    def __init__(self, feature_store):
        super(EmbeddingNet, self).__init__()
        _embeddings = {feat.name: nn.Embedding(feat.num_emb+1, feat.emb_dim, padding_idx=0, device=device) for feat in feature_store.features['sparse']}
        self._feature_store = feature_store
        self.embeddings = nn.ModuleDict(_embeddings)
        
    def forward(self, x):
        x_emb = []
        for name, embed_matrix in self.embeddings.items():
            feat = [i for i in self._feature_store.features['sparse'] if i.name == name][0]
            x_feat = x[:, feat.pad_index[0]:feat.pad_index[1]].to(torch.long)
            emb = embed_matrix(x_feat)
            emb_agg = torch.mean(emb, axis=1)
            x_emb.append(emb_agg)
        x_emb = torch.cat(x_emb, axis=1)
        return x_emb   

class DeepFM(nn.Module):
    def __init__(self, feature_store, hidden_dim):
        super(DeepFM, self).__init__()
        self.feature_store = feature_store

        self.V = EmbeddingNet(feature_store)
        self.fm = CrossProductNet()
        self.dnn = MLP(feature_store.get_input_dim(), hidden_dim)
        
    def forward(self, x):
        x_dense = x[:, feature_store.dense_index]
        
        x_sparse = self.V(x)
        x_sparse_dense = torch.cat([x_sparse, x_dense], dim=1)
        
        x = self.fm(x_sparse) + self.dnn(x_sparse_dense)
        return x

In [None]:
def approx_negative_sampling(batch, n_samples, bs):
    ind = torch.arange(bs).repeat(n_samples)
    perm =  torch.randperm(ind.shape[0])
    random_indices = ind[perm]
    i_id_neg = batch[random_indices, 1:] # apply indexing batch[random_indices, 0] to get only item_id 
                                         # column, else get all columns (currently all item features)
    
    u_id = batch[:, 0].repeat(n_samples+1).unsqueeze(1)
    i_id = torch.cat([batch[:, 1:], i_id_neg])
    batch = torch.cat([u_id, i_id], dim=1)
    
    y_true = torch.cat([torch.ones(bs).to(device),
                       torch.zeros(bs*n_samples).to(device)]).unsqueeze(1)
    
    return batch, y_true

In [11]:
def train(n_epochs, print_loss=500):
    model.train()
    batch_size = train_loader.batch_size
    
    for epoch in range(n_epochs):
        running_loss = 0.
        preds, ground_truths = [], []
        batch_print_loss = 0.
        for i_batch, batch in enumerate(tqdm(train_loader)):
            batch, y_true = approx_negative_sampling(batch, n_samples=2, bs=batch_size)
            
            y_pred = model(batch)
            loss = criterion(y_pred, y_true)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            preds.append(y_pred)
            ground_truths.append(y_true)
            running_loss += loss.item()
            
            if not ((i_batch+1) % print_loss):
                pred = torch.cat(preds, dim=0).detach().sigmoid().cpu().numpy()
                ground_truth = torch.cat(ground_truths, dim=0).detach().cpu().numpy()
                last_loss = running_loss / print_loss
                
                train_roc_auc = roc_auc_score(ground_truth, pred)
                #test_loss, test_roc_auc = test()
                test_loss, test_roc_auc = -1, -1
                
                preds, ground_truths = [], []
                running_loss = 0.
                
                print(f"batch <{i_batch}>\ntrain_loss: {last_loss} - train_roc_auc: {train_roc_auc}\ntest_loss: {test_loss} - test_roc_auc: {test_roc_auc}\n")
        print(f"Epoch: {epoch}, Loss: {running_loss / len(train_loader):.4f}")

In [12]:
@torch.no_grad()
def test():
    model.eval()
    batch_size = val_loader.batch_size
    
    running_loss = 0.
    preds, ground_truths = [], []

    for i_batch, batch in enumerate(tqdm(val_loader)):
        batch = [i.to(device) for i in batch]
        u_id, i_id, u_attr, i_attr = batch
        
        u_id, i_id, y_true = approx_negative_sampling(u_id, i_id, n_samples=2, bs=batch_size)

        y_pred = model(u_id, i_id)
        loss = criterion(y_pred, y_true)
        
        preds.append(y_pred)
        ground_truths.append(y_true)
        running_loss += loss.item()
        
    pred = torch.cat(preds, dim=0).cpu().numpy()
    ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
    
    test_loss = running_loss / len(val_loader)
    test_score = roc_auc_score(ground_truth, pred)

    return test_loss, test_score

In [None]:
# train_dataset = MFDataset(train_set, data['user_attr'], data['item_attr'])
# train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True, drop_last=True)

# val_dataset = MFDataset(valid_set, data['user_attr'], data['item_attr'])
# val_loader = DataLoader(val_dataset, batch_size=1024, shuffle=False, drop_last=True)

# n_users, n_items = user_attr.shape[0], item_attr.shape[0]
# model = MF(n_users, n_items, emb_size=32)
# model = model.to(device)

In [13]:
dataset = DeepFMDataset(train_set, user_attr, item_attr2)
dataloader = DataLoader(dataset, shuffle=False, batch_size=1024, collate_fn=collate_fn, drop_last=True)
train_loader = dataloader

model = DeepFM(feature_store, hidden_dim=[128, 64])
model = model.to(device)

NameError: name 'DeepFMDataset' is not defined

In [None]:
criterion = nn.BCEWithLogitsLoss()
#optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
optimizer = torch.optim.RMSprop(params=model.parameters(), lr=1e-4, momentum=0.9)

In [None]:
train(n_epochs=10, print_loss=1000)

In [None]:
test()

In [None]:
with torch.no_grad():
    batch_size = train_loader.batch_size
    batch = next(iter(train_loader))
    batch = [i.to(device) for i in batch]
    u_id, i_id, u_attr, i_attr = batch
    
    neg_item_idx = approx_negative_sampling(i_id, bs=batch_size)

    u_id = torch.cat([u_id, u_id])
    i_id = torch.cat([i_id, neg_item_idx])
    y_true = torch.cat([torch.ones(batch_size).to(device),
                        torch.zeros(batch_size).to(device)])

    y_pred = model(u_id, i_id)
    
    print(y_pred.sigmoid(), y_true)

In [None]:
criterion(y_pred, y_true)

In [None]:
def save_model(model, path):
    torch.save(model.state_dict(), path)
    
def load_model(path):
    model = MF(n_users, n_items, emb_size=32)
    model.load_state_dict(torch.load(path))
    model = model.to(device)
    return model

In [None]:
#save_model(model, "models/mf_01.pth")

In [None]:
model = load_model("models/mf_01.pth")

In [None]:
from reco_env import RecoEnv
from utils import import_data_for_env
import gym

In [None]:
env = gym.make(RecoEnv.id, **import_data_for_env())

In [None]:
vc = rec.user_id.value_counts()

In [None]:
vc

In [None]:
vc[vc >= 3]

In [None]:
plt.plot(vc)