In [1]:
%load_ext autoreload
%autoreload 2

import os
os.chdir("../../")
print(os.getcwd())

C:\Users\miha\Projects\gnn-rl-recsys


In [174]:
import pandas as pd
import numpy as np
import functools
import operator
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import namedtuple

import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.nn.functional import pad
from torch.utils.tensorboard import SummaryWriter

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

torch.set_printoptions(precision=2, sci_mode=False)
torch.manual_seed(0)

<torch._C.Generator at 0x1f8c1064ef0>

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cpu


In [6]:
import pickle
with open("data/steam/data.pkl", "rb") as f:
    data = pd.read_pickle(f)
    #data = pickle.load(f)

In [7]:
# Column names used to pull the (user, item) pairs out of each split in `data`.
user_col = 'user_id'
item_col = 'app_id'
# Each split is reduced to its two id columns and transposed, so row 0 holds the
# user_col values and row 1 the item_col values (one column per interaction).
train_set = data['train_set'][[user_col, item_col]].values.T
supervision_set = data['supervision_set'][[user_col, item_col]].values.T
valid_set = data['valid_set'][[user_col, item_col]].values.T
# Attribute tables; later cells index them by id and read .shape[0] as the entity count.
user_attr = data['user_attr']
item_attr = data['item_attr']

# Fold the supervision interactions into the training interactions (joined along the interaction axis).
train_set = np.concatenate([train_set, supervision_set], axis=1)

In [None]:
# scaler_user = StandardScaler()
# user_attr_preprocess = scaler_user.fit_transform(user_attr)

# scaler_item = StandardScaler()
# item_attr_preprocess = np.copy(item_attr)
# item_attr_preprocess[:, 435:] = scaler_item.fit_transform(item_attr[:, 435:])

In [44]:
def temp_item_attr_rebuild(item_attr):
    """Collapse every wide item-attribute row into four compact fields.

    For each row the output holds, in this order:

    * ``row[15]`` unchanged (named ``tags`` in the code),
    * the positions of the non-zero entries of ``row[0:3]`` as a list (``OS``),
    * the position of the first non-zero entry of ``row[4:10]`` (``avg_rat``),
    * ``row[12]`` unchanged (``price_original``).

    Args:
        item_attr: iterable of rows; each row must support slicing with
            ``.nonzero()`` (e.g. a NumPy array) and have at least 16 entries.

    Returns:
        An object-dtype NumPy array with one 4-element record per input row.

    Raises:
        IndexError: if ``row[4:10]`` contains no non-zero entry.
    """
    def _compact(row):
        # Field order matters: downstream code addresses these by position.
        tag_field = row[15]
        os_positions = row[0:3].nonzero()[0]
        rating_positions = row[4:10].nonzero()[0]
        return [tag_field, list(os_positions), rating_positions[0], row[12]]

    return np.array([_compact(row) for row in item_attr], dtype=object)

In [55]:
item_attr2 = temp_item_attr_rebuild(item_attr)

In [78]:
item_attr2[0]

array([list([382, 345, 380, 228, 169, 12, 21, 322, 167, 157, 295, 74, 120, 356, 110, 222, 321, 72, 158, 370]),
       list([0, 1, 2]), 3, 29.99], dtype=object)

In [380]:
# Feature scheme consumed by FeatureStore: one entry per input feature, listed in
# the order in which the features are laid out in a collated input row.
#   type    -- 'sparse' or 'dense'; FeatureStore branches on this value
#   emb_dim -- (sparse only) kept on SparseFeat.emb_dim and summed by FeatureStore.input_len()
#   num_emb -- (sparse only) kept on SparseFeat.num_emb and summed by FeatureStore.num_emb();
#              presumably the number of distinct ids of the feature -- TODO confirm
#   max_len -- (sparse only) number of input columns reserved for the feature;
#              collate_fn pads features with max_len > 1 up to this width
scheme = {
    "user_id": {
        "emb_dim": 8,
        "num_emb": user_attr.shape[0],
        "max_len": 1,
        "type": 'sparse'
    },
    "item_id": {
        "emb_dim": 8,
        "num_emb": item_attr.shape[0],
        "max_len": 1,
        "type": 'sparse'
    },
    "tags": {
        "emb_dim": 8,
        "num_emb": 425,
        "max_len": 20,
        "type": 'sparse'
    },
    "OS": {
        "emb_dim": 8,
        "num_emb": 3,
        "max_len": 3,
        "type": 'sparse'
    },
    "AvgCatRating": {
        "emb_dim": 8,
        "num_emb": 6,
        "max_len": 1,
        "type": 'sparse'
    },
    # Dense features carry no embedding settings; FeatureStore reserves one column for each.
    "PriceOriginal": {
        "type": 'dense'
    }
}

class SparseFeat(namedtuple('SparseFeat', 'name emb_dim num_emb max_len index pad_index')):
    """Immutable record describing one sparse (embedded) feature.

    Fields, in order: ``name``, ``emb_dim``, ``num_emb``, ``max_len``,
    ``index`` and ``pad_index``. All six are required.
    """

    def __new__(cls, name, emb_dim, num_emb, max_len, index, pad_index):
        values = (name, emb_dim, num_emb, max_len, index, pad_index)
        return super().__new__(cls, *values)
    
class DenseFeat(namedtuple('DenseFeat', 'name index pad_index')):
    """Immutable record describing one dense (numeric) feature.

    Fields, in order: ``name``, ``index`` and ``pad_index``. All three are required.
    """

    def __new__(cls, name, index, pad_index):
        values = (name, index, pad_index)
        return super().__new__(cls, *values)
    
# class VarLenSparseFeat(namedtuple('VarLenSparseFeat', ['name', 'emb_dim', 'num_emb', "max_len", 'index'])):
#     def __new__(cls, name, emb_dim, num_emb, max_len, index):
#         return super(VarLenSparseFeat, cls).__new__(cls, name, emb_dim, num_emb, max_len, index)

# TODO: start += feat['max_len'] raises an error; needs investigation
class FeatureStore:
    """Registry of model input features built from a scheme dict.

    The scheme maps feature name -> settings dict. Every settings dict needs a
    ``'type'`` of ``'sparse'`` or ``'dense'``; sparse entries additionally need
    ``'emb_dim'``, ``'num_emb'`` and ``'max_len'``.

    Each feature receives two positions:

    * ``index``     -- its ordinal position in the scheme,
    * ``pad_index`` -- the half-open ``(start, end)`` span of columns it occupies
      once every sparse feature takes ``max_len`` columns and every dense
      feature takes one.

    Attributes:
        features: ``{'sparse': [SparseFeat, ...], 'dense': [DenseFeat, ...]}``.
        n_features: number of entries in the scheme.
        sparse_index / dense_index: 1-D long tensors with the column positions
            of all sparse / dense features (empty when there are none).
    """

    def __init__(self, scheme):
        """Build the feature records.

        Raises:
            ValueError: if a feature's ``'type'`` is neither ``'sparse'`` nor
                ``'dense'`` (previously such entries were skipped silently and
                left a hole in the feature list).
        """
        self.features = {'sparse': [], 'dense': []}
        self.n_features = len(scheme)

        start = 0
        for i, (feat_name, feat) in enumerate(scheme.items()):
            feat_type = feat['type']
            if feat_type == 'sparse':
                end = start + feat['max_len']
                self.features['sparse'].append(
                    SparseFeat(feat_name, feat['emb_dim'], feat['num_emb'], feat['max_len'], i, (start, end))
                )
            elif feat_type == 'dense':
                end = start + 1
                self.features['dense'].append(
                    DenseFeat(feat_name, i, (start, end))
                )
            else:
                raise ValueError(
                    f"Feature {feat_name!r} has unknown type {feat_type!r}; expected 'sparse' or 'dense'"
                )
            # The next feature starts right after the columns just reserved.
            start = end

        self.sparse_index = self.get_feature_index('sparse')
        self.dense_index = self.get_feature_index('dense')

    def input_len(self):
        """Return the sum of all sparse ``emb_dim`` values plus the number of dense features."""
        sparse_len = sum(f.emb_dim for f in self.features['sparse'])
        dense_len = len(self.features['dense'])
        return sparse_len + dense_len

    def emb_dim(self):
        """Return the embedding width shared by the sparse features.

        Falls back to 8 (the value this method used to return unconditionally)
        when the scheme has no sparse feature.

        Raises:
            ValueError: if the sparse features do not all use the same ``emb_dim``.
        """
        dims = {f.emb_dim for f in self.features['sparse']}
        if not dims:
            return 8
        if len(dims) > 1:
            raise ValueError(f"Sparse features use different emb_dim values: {sorted(dims)}")
        return dims.pop()

    def num_emb(self):
        """Return the sum of ``num_emb`` over all sparse features."""
        return sum(f.num_emb for f in self.features['sparse'])

    def get_feature_index(self, feat_type):
        """Return the column positions covered by all features of ``feat_type``.

        Args:
            feat_type: ``'sparse'`` or ``'dense'``.

        Returns:
            A 1-D long tensor; empty when no feature of that type exists
            (``torch.hstack`` rejects an empty list, so that case is handled
            explicitly).
        """
        spans = [torch.arange(start, end) for start, end in (f.pad_index for f in self.features[feat_type])]
        if not spans:
            return torch.empty(0, dtype=torch.long)
        return torch.hstack(spans)

In [384]:
feature_store = FeatureStore(scheme)
feature_store.dense_index

tensor([26])

In [339]:
class DeepFMDataset(Dataset):
    """Dataset yielding one (sparse_features, dense_features) pair per interaction.

    Args:
        edge_index: array of shape (2, N); row 0 holds user ids, row 1 item ids.
        user_attr: user attribute table. Stored but currently unused, because
            user attributes are disabled in ``__getitem__``.
        item_attr: item attribute table indexed by item id; each row is
            appended after the two ids to form the feature vector.
        feature_store: optional object exposing
            ``features['sparse']`` / ``features['dense']`` whose entries carry
            an ``index`` attribute. When omitted, the notebook-level
            ``feature_store`` global is looked up at access time, which is what
            this class always did before the parameter existed.
    """

    def __init__(self, edge_index, user_attr, item_attr, feature_store=None):
        self.edge_index = edge_index
        self.user_attr = user_attr
        self.item_attr = item_attr
        self.feature_store = feature_store

        self.users = edge_index[0]
        self.items = edge_index[1]

    def __len__(self):
        """Number of interactions (columns of ``edge_index``)."""
        return self.edge_index.shape[1]

    def __getitem__(self, idx):
        """Return ``(sparse_features, dense_features)`` for interaction ``idx``."""
        u_id = self.users[idx]
        i_id = self.items[idx]

        # User attributes are switched off for now; an empty list keeps the
        # concatenation below valid without contributing any entry.
        u_attr = []
        i_attr = self.item_attr[i_id]

        # Position k of x corresponds to the k-th entry of the feature scheme.
        x = np.concatenate([[u_id], [i_id], u_attr, i_attr])

        sparse_features = self._get_sparse_features(x)
        dense_features = self._get_dense_features(x)

        return sparse_features, dense_features

    def _resolve_store(self):
        """Return the injected feature store, else the notebook-level global."""
        if self.feature_store is not None:
            return self.feature_store
        return feature_store

    def _select(self, x, feat_type):
        """Pick the entries of ``x`` that belong to features of ``feat_type``."""
        indices = [f.index for f in self._resolve_store().features[feat_type]]
        return x[indices]

    def _get_sparse_features(self, x):
        """Entries of ``x`` at the scheme positions of the sparse features."""
        return self._select(x, 'sparse')

    def _get_dense_features(self, x):
        """Entries of ``x`` at the scheme positions of the dense features."""
        return self._select(x, 'dense')

def pad_sequence_max_len(sequence, max_len):
    """Stack 1-D tensors into a (batch, width) tensor right-padded with -1.

    ``pad_sequence`` only pads up to the longest sequence in the batch, so the
    first sequence is pre-padded to ``max_len`` to force a width of at least
    ``max_len``.

    Unlike the previous version, the caller's ``sequence`` is left untouched:
    the pre-padded tensor is placed in a copy of the list rather than written
    back into element 0 of the argument.

    Args:
        sequence: non-empty list or tuple of 1-D tensors. Callers should make
            sure no tensor is longer than ``max_len``.
        max_len: target number of columns.

    Returns:
        Tensor of shape (len(sequence), max_len) when every input has at most
        ``max_len`` elements.

    Raises:
        IndexError: if ``sequence`` is empty.
    """
    padded = list(sequence)
    first = padded[0]
    padded[0] = pad(first, (0, max_len - first.shape[0]), value=-1)
    return pad_sequence(padded, batch_first=True, padding_value=-1)

def collate_fn(batch):
    """Collate (sparse, dense) samples into one 2-D float tensor on ``device``.

    Reads the notebook-level ``feature_store`` and ``device``. Each feature's
    values are gathered into a bucket addressed by the feature's scheme
    position; sparse features with ``max_len > 1`` are padded with -1 through
    ``pad_sequence_max_len``; all buckets are finally stacked column-wise in
    scheme order.

    Args:
        batch: iterable of ``(sparse_values, dense_values)`` pairs, where the
            k-th sparse / dense value belongs to the k-th sparse / dense
            feature of ``feature_store``.

    Returns:
        The column-stacked batch tensor, moved to ``device``.
    """
    sparse_feats = feature_store.features['sparse']
    dense_feats = feature_store.features['dense']

    # One bucket per scheme position, filled sample by sample.
    columns = [[] for _ in range(feature_store.n_features)]
    for sparse_values, dense_values in batch:
        for pos, feat in enumerate(sparse_feats):
            columns[feat.index].append(torch.tensor(sparse_values[pos], dtype=torch.float32))

        for pos, feat in enumerate(dense_feats):
            columns[feat.index].append(torch.tensor(dense_values[pos], dtype=torch.float32))

    # Turn every bucket into a tensor; multi-valued sparse features get padded.
    for feat in sparse_feats:
        bucket = columns[feat.index]
        if feat.max_len > 1:
            columns[feat.index] = pad_sequence_max_len(bucket, feat.max_len)
        else:
            columns[feat.index] = torch.tensor(bucket)

    for feat in dense_feats:
        columns[feat.index] = torch.tensor(columns[feat.index])

    stacked = torch.column_stack(columns)
    return stacked.to(device)

In [340]:
# l = [torch.tensor([1]), torch.tensor([4,5])]
# def pad_sequence_max_len(sequence, max_len):
#     sequence[0] = pad(sequence[0], (0, max_len-sequence[0].shape[0]), value=-1)
#     return pad_sequence(sequence, batch_first=True, padding_value=-1)
# pad_sequence_max_len(l, 20)

In [352]:
dataset = DeepFMDataset(train_set, user_attr, item_attr2)
dataloader = DataLoader(dataset, shuffle=False, batch_size=1024, collate_fn=collate_fn)

In [362]:
batch = next(iter(dataloader))

In [None]:
class MF(nn.Module):
    """Matrix-factorisation scorer: dot product of a user and an item embedding.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        emb_size: width of both embedding tables.
    """

    def __init__(self, n_users, n_items, emb_size):
        super().__init__()
        self.n_users, self.n_items = n_users, n_items
        # Creation order (users, then items) is kept so seeded initialisation is unchanged.
        self.u = nn.Embedding(n_users, emb_size)
        self.i = nn.Embedding(n_items, emb_size)

    def forward(self, ux, ix):
        """Return one score per (user id, item id) pair, reduced over dim 1."""
        user_vecs = self.u(ux)
        item_vecs = self.i(ix)
        return (user_vecs * item_vecs).sum(dim=1)

In [None]:
class EmbeddingBagCollection:
    """
    NOTE(review): this class is unfinished and cannot run as written. It looks
    like an adaptation of torchrec's `EmbeddingBagCollection` (TODO confirm):
    `__init__` now takes a `FeatureStore`, but the body and the text below still
    refer to `tables`, `is_weighted` and `device`. None of `tables`, `DataType`,
    `pooling_type_to_str`, `get_embedding_names_by_table`, `KeyedJaggedTensor`,
    `KeyedTensor`, `EmbeddingBagConfig` or `List` is defined or imported in the
    visible part of this notebook, and `self._device`, `self._is_weighted` and
    `self._embedding_bag_configs` are read but never assigned. The class also
    does not inherit from `nn.Module` although it defines `forward` and holds an
    `nn.ModuleDict`. The remaining text is the original description.

    EmbeddingBagCollection represents a collection of pooled embeddings (`EmbeddingBags`).

    It processes sparse data in the form of `KeyedJaggedTensor` with values of the form
    [F X B X L] where:

    * F: features (keys)
    * B: batch size
    * L: length of sparse features (jagged)

    and outputs a `KeyedTensor` with values of the form [B * (F * D)] where:

    * F: features (keys)
    * D: each feature's (key's) embedding dimension
    * B: batch size

    Args:
        tables (List[EmbeddingBagConfig]): list of embedding tables.
        is_weighted (bool): whether input `KeyedJaggedTensor` is weighted.
        device (Optional[torch.device]): default compute device.

    Example::

        table_0 = EmbeddingBagConfig(
            name="t1", embedding_dim=3, num_embeddings=10, feature_names=["f1"]
        )
        table_1 = EmbeddingBagConfig(
            name="t2", embedding_dim=4, num_embeddings=10, feature_names=["f2"]
        )

        ebc = EmbeddingBagCollection(tables=[table_0, table_1])

        #        0       1        2  <-- batch
        # "f1"   [0,1] None    [2]
        # "f2"   [3]    [4]    [5,6,7]
        #  ^
        # feature

        features = KeyedJaggedTensor(
            keys=["f1", "f2"],
            values=torch.tensor([0, 1, 2, 3, 4, 5, 6, 7]),
            offsets=torch.tensor([0, 2, 2, 3, 4, 5, 8]),
        )

        pooled_embeddings = ebc(features)
        print(pooled_embeddings.values())
        tensor([[-0.8899, -0.1342, -1.9060, -0.0905, -0.2814, -0.9369, -0.7783],
            [ 0.0000,  0.0000,  0.0000,  0.1598,  0.0695,  1.3265, -0.1011],
            [-0.4256, -1.1846, -2.1648, -1.0893,  0.3590, -1.9784, -0.7681]],
            grad_fn=<CatBackward0>)
        print(pooled_embeddings.keys())
        ['f1', 'f2']
        print(pooled_embeddings.offset_per_key())
        tensor([0, 3, 7])
    """

    def __init__(self, feature_store: FeatureStore) -> None:
        # Builds one nn.EmbeddingBag per entry of `tables`, keyed by table name.
        super().__init__()
        self.embedding_bags: nn.ModuleDict = nn.ModuleDict()
        self._feature_store = feature_store
        self._lengths_per_embedding: List[int] = []

        # NOTE(review): `tables` is not a parameter of this method and is not
        # defined in the visible notebook; `feature_store` is stored above but
        # never consulted when the bags are built.
        table_names = set()
        for embedding_config in tables:
            # Table names double as ModuleDict keys, so duplicates are rejected.
            if embedding_config.name in table_names:
                raise ValueError(f"Duplicate table name {embedding_config.name}")
            table_names.add(embedding_config.name)
            dtype = (
                torch.float32
                if embedding_config.data_type == DataType.FP32
                else torch.float16
            )
            self.embedding_bags[embedding_config.name] = nn.EmbeddingBag(
                num_embeddings=embedding_config.num_embeddings,
                embedding_dim=embedding_config.embedding_dim,
                mode=pooling_type_to_str(embedding_config.pooling),
                device=self._device,
                include_last_offset=True,
                dtype=dtype,
            )

            # A table without explicit feature names serves one feature named after the table.
            if not embedding_config.feature_names:
                embedding_config.feature_names = [embedding_config.name]
            # One embedding_dim entry is recorded per feature served by the table.
            self._lengths_per_embedding.extend(
                len(embedding_config.feature_names) * [embedding_config.embedding_dim]
            )

        self._embedding_names: List[str] = [
            embedding
            for embeddings in get_embedding_names_by_table(tables)
            for embedding in embeddings
        ]
        self._feature_names: List[List[str]] = [table.feature_names for table in tables]
        self.reset_parameters()

    def forward(self, features: KeyedJaggedTensor) -> KeyedTensor:
        """
        Args:
            features (KeyedJaggedTensor): KJT of form [F X B X L].

        Returns:
            KeyedTensor
        """

        pooled_embeddings: List[torch.Tensor] = []

        feature_dict = features.to_dict()
        # Bags are visited in insertion order; self._feature_names[i] lists the
        # features looked up in the i-th bag.
        for i, embedding_bag in enumerate(self.embedding_bags.values()):
            for feature_name in self._feature_names[i]:
                f = feature_dict[feature_name]
                res = embedding_bag(
                    input=f.values(),
                    offsets=f.offsets(),
                    per_sample_weights=f.weights() if self._is_weighted else None,
                ).float()
                pooled_embeddings.append(res)
        # Pooled results of all features are joined along dim 1.
        data = torch.cat(pooled_embeddings, dim=1)
        return KeyedTensor(
            keys=self._embedding_names,
            values=data,
            length_per_key=self._lengths_per_embedding,
        )

    def is_weighted(self) -> bool:
        # NOTE(review): self._is_weighted is never assigned in this class.
        return self._is_weighted

    def embedding_bag_configs(self) -> List[EmbeddingBagConfig]:
        # NOTE(review): self._embedding_bag_configs is never assigned in this class.
        return self._embedding_bag_configs

    @property
    def device(self) -> torch.device:
        # NOTE(review): self._device is never assigned in this class.
        return self._device

    def reset_parameters(self) -> None:
        # Nothing is initialised when the device is "meta" (given either as a
        # torch.device or as a plain string).
        if (isinstance(self.device, torch.device) and self.device.type == "meta") or (
            isinstance(self.device, str) and self.device == "meta"
        ):
            return
        # Initialize embedding bags weights with init_fn
        for table_config in self._embedding_bag_configs:
            assert table_config.init_fn is not None
            param = self.embedding_bags[f"{table_config.name}"].weight
            # pyre-ignore
            table_config.init_fn(param)


In [None]:
class CrossProductNet(nn.Module):
    """Sum of the pairwise dot products between distinct rows of a 2-D tensor."""

    def forward(self, emb_x):
        """Return a scalar tensor: sum over i > j of <emb_x[i], emb_x[j]>.

        Args:
            emb_x: 2-D tensor with one vector per row.
        """
        # Gram matrix of all row-by-row dot products.
        gram = torch.matmul(emb_x, emb_x.permute(1, 0))
        # Strictly lower triangle: each unordered pair once, self-products excluded.
        return gram.tril(diagonal=-1).sum()

class MLP(nn.Module):
    """Feed-forward network ending in a single linear output unit.

    Every hidden layer is ``Linear -> [BatchNorm1d] -> ReLU -> Dropout``. The
    output layer is a bare ``Linear(..., 1)`` so the network returns an
    unbounded value that can be used as a logit (the notebook trains with
    ``BCEWithLogitsLoss``).

    Fixes over the previous version:

    * the layers live in ``nn.Sequential``; an ``nn.ModuleList`` is not
      callable, so ``forward`` used to raise ``NotImplementedError``;
    * BatchNorm/ReLU/Dropout are no longer applied after the output unit,
      where ReLU clamped every output to be non-negative;
    * ``hidden_dim`` may be any sequence (list or tuple).

    Args:
        input_dim: number of input features.
        hidden_dim: sequence with the width of each hidden layer; may be empty.
        dropout: dropout probability used after every hidden layer.
        batch_norm: whether to add ``BatchNorm1d`` after every hidden ``Linear``.
    """

    def __init__(self, input_dim, hidden_dim, dropout=0.1, batch_norm=True):
        super(MLP, self).__init__()

        dims = [input_dim] + list(hidden_dim)
        _layers = []
        for in_dim, out_dim in zip(dims, dims[1:]):
            _layers.append(nn.Linear(in_dim, out_dim))
            if batch_norm:
                _layers.append(nn.BatchNorm1d(out_dim))
            _layers.append(nn.ReLU())
            _layers.append(nn.Dropout(p=dropout))
        # Output head: plain linear projection to one unit, no activation.
        _layers.append(nn.Linear(dims[-1], 1))

        self.layers = nn.Sequential(*_layers)

    def forward(self, emb_x):
        """Map a (batch, input_dim) tensor to a (batch, 1) tensor."""
        return self.layers(emb_x)
    
# class EmbeddingNet(nn.Module):
#     def __init__(self, features):
#         super(EmbeddingNet, self).__init__()
#         _embeddings = {feat.name: nn.Embedding(feat.emb_values, feat.emb_dim) for feat in features}
#         self.features = features
#         self.embeddings = nn.ModuleDict(_embeddings)
        
#     def forward(self, x):
#         emb = []
#         for name, embed_matrix in self.embeddings.items():
#             if isinstance(x[name], list):
#                 emb.append(torch.sum(embed_matrix(torch.tensor(x[name])), axis=0))
#             else:
#                 emb.append(embed_matrix(torch.tensor(x[name])))
#         return emb

# class EmbeddingNet(nn.Module):
#     def __init__(self, sparse_feat, varlen_feat, emb_dim):
#         super(EmbeddingNet, self).__init__()
#         self.sparse_emb = nn.Embedding(sparse_feat, emb_dim)
#         self.varlen_sparse_emb = nn.Embedding(varlen_feat, emb_dim)
        
#         self.offsets_sparse = np.array((0, *np.cumsum(sparse_feat)[:-1]))
#         self.offsets_varlen = np.array((0, *np.cumsum(varlen_feat)[:-1]))
        
#     def forward(self, x):
#         x = x + x.new_tensor(self.offsets).unsqueeze(0)
#         return self.embedding(x)       

class DeepFM(nn.Module):
    """Pairwise-interaction term plus MLP over embedded sparse and raw dense features.

    The previous version could not be constructed or run: it referenced names
    that are not defined in the notebook (``EmbeddingNet``, ``sparse_num_emb``,
    ``varlen_num_emb``, ``emb_dim``, ``input_dim``, ``x_sparse_emb``), read the
    global ``feature_store`` instead of the one passed in, and indexed the
    batch by rows although the feature positions are column positions.

    Design used here (NOTE(review): the original embedding layer is not
    available, so these choices should be confirmed):

    * one ``nn.Embedding`` per sparse feature, with one extra row used as an
      all-zero padding row;
    * a multi-valued feature is pooled by summing the embeddings of its ids;
    * the interaction term applies ``CrossProductNet`` to each sample's
      (n_sparse, emb_dim) matrix of pooled embeddings;
    * the deep term feeds the flattened pooled embeddings concatenated with the
      dense columns into ``MLP``; its width equals ``feature_store.input_len()``.

    Args:
        feature_store: object exposing ``features['sparse']`` (entries with
            ``num_emb``, ``emb_dim``, ``pad_index``), ``features['dense']``
            (entries with ``pad_index``) and ``input_len()``. At least one
            sparse feature is required.
        hidden_dim: hidden layer widths passed on to ``MLP``.

    Raises:
        ValueError: if the sparse features do not share one ``emb_dim`` (the
            pairwise dot products need equal widths).
    """

    def __init__(self, feature_store, hidden_dim):
        super(DeepFM, self).__init__()
        self.feature_store = feature_store

        sparse_feats = feature_store.features['sparse']
        dense_feats = feature_store.features['dense']
        if len({f.emb_dim for f in sparse_feats}) > 1:
            raise ValueError("DeepFM requires all sparse features to share the same emb_dim")

        # (start, end) column span of every feature inside a collated batch row.
        self._sparse_spans = [f.pad_index for f in sparse_feats]
        self._dense_spans = [f.pad_index for f in dense_feats]

        # Row `num_emb` of each table is the padding row: it stays zero and
        # receives no gradient, so padded slots add nothing to the pooled sum.
        self.V = nn.ModuleList(
            [nn.Embedding(f.num_emb + 1, f.emb_dim, padding_idx=f.num_emb) for f in sparse_feats]
        )
        self.fm = CrossProductNet()
        self.dnn = MLP(feature_store.input_len(), hidden_dim)

    def _embed_sparse(self, x):
        """Return pooled embeddings of shape (batch, n_sparse, emb_dim)."""
        pooled = []
        for table, (start, end) in zip(self.V, self._sparse_spans):
            # collate_fn delivers ids as float32 and pads short lists with -1.
            ids = x[:, start:end].long()
            ids = ids.masked_fill(ids < 0, table.padding_idx)
            pooled.append(table(ids).sum(dim=1))
        return torch.stack(pooled, dim=1)

    def forward(self, x):
        """Score a collated batch.

        Args:
            x: 2-D tensor (batch, n_columns) laid out as produced by
                ``collate_fn`` (columns follow each feature's ``pad_index``).

        Returns:
            1-D tensor with one unnormalised score per sample.
        """
        field_emb = self._embed_sparse(x)
        dense_cols = [x[:, start:end] for start, end in self._dense_spans]

        deep_in = torch.cat([field_emb.flatten(start_dim=1)] + dense_cols, dim=1)
        # CrossProductNet reduces one (n_sparse, emb_dim) matrix to a scalar,
        # so it is applied sample by sample.
        fm_term = torch.stack([self.fm(sample_emb) for sample_emb in field_emb])
        deep_term = self.dnn(deep_in).reshape(-1)
        return fm_term + deep_term

In [None]:
model = DeepFM(feature_store, hidden_dim=[128,64])

In [None]:
torch.tensor([[1,2,3], [4,5,6], [7,8,9]])[[1,2]]

In [None]:
model

In [None]:
features.features

In [None]:
train_dataset[9432]

In [None]:
filter_input_types(train_dataset[9432], features)

In [None]:
e = EmbeddingNet([f.num_emb for f in features], 8)

In [None]:
np.array((0, *np.cumsum([f.num_emb for f in features])[:-1]))

In [None]:
[f.num_emb for f in features]

In [None]:
e.offsets

In [None]:
torch.arange(20).new_tensor([0, 5, 10]).unsqueeze(0)

In [None]:
x_input = {
    "user_id": 5,
    "item_id": 20,
    "tags": [0, 5, 8, 100],
    "OS": [0, 1],
    "AvgCatRating": 3 
}
x_input_tensor = torch.tensor([5, 
                               20, 
                               0, 5, 8, 100,
                               0, 1,
                               3
                              ])

In [None]:
e(x_input)

In [None]:
def approx_negative_sampling(u_id, i_id, n_samples, bs):
    """Extend a batch of positive pairs with shuffled in-batch negatives.

    Negatives are drawn by shuffling the batch's own item ids, so a "negative"
    can coincide with the user's true item or with another item the user
    interacted with -- hence "approximate".

    Labels are now created on the device of ``i_id`` instead of the
    notebook-level ``device`` global, so the function no longer depends on
    global state and cannot produce labels on a different device than its
    inputs.

    Args:
        u_id: 1-D tensor of ``bs`` user ids.
        i_id: 1-D tensor of ``bs`` item ids (the positives).
        n_samples: number of negatives per positive.
        bs: batch size; must equal the length of ``u_id`` and ``i_id``.

    Returns:
        ``(u_id, i_id, y_true)``, each of length ``bs * (n_samples + 1)``: the
        first ``bs`` entries are the positives (label 1), the rest negatives
        (label 0).

    Raises:
        ValueError: if ``u_id`` or ``i_id`` does not hold exactly ``bs`` entries
            (e.g. a short last batch without ``drop_last=True``).
    """
    if u_id.shape[0] != bs or i_id.shape[0] != bs:
        raise ValueError(
            f"Expected u_id and i_id of length bs={bs}, got {u_id.shape[0]} and {i_id.shape[0]}"
        )

    # Every batch position appears n_samples times; shuffling decides which
    # item lands next to which user.
    slots = torch.arange(bs).repeat(n_samples)
    random_indices = slots[torch.randperm(slots.shape[0])]
    i_id_neg = i_id[random_indices]

    u_id = u_id.repeat(n_samples + 1)
    i_id = torch.cat([i_id, i_id_neg])
    y_true = torch.cat([
        torch.ones(bs, device=i_id.device),
        torch.zeros(bs * n_samples, device=i_id.device),
    ])

    return u_id, i_id, y_true

In [None]:
def train(n_epochs, print_loss=500):
    """Train the notebook-level ``model`` on ``train_loader``.

    Uses the notebook globals ``model``, ``train_loader``, ``criterion``,
    ``optimizer``, ``device`` and ``test``. Every ``print_loss`` batches the
    mean training loss and ROC AUC of that window are printed together with the
    validation metrics returned by ``test()``.

    Fixes over the previous version:

    * ``test()`` switches the model to eval mode; training mode is restored
      right after it returns;
    * the per-epoch loss is accumulated separately from the windowed loss, so
      the epoch summary is the mean over all batches rather than the remainder
      since the last window reset;
    * predictions are detached before being stored, so autograd graphs are not
      kept alive for a whole window;
    * the unused ``batch_print_loss`` local is gone.

    Args:
        n_epochs: number of passes over ``train_loader``.
        print_loss: window size, in batches, between progress reports.
    """
    model.train()
    batch_size = train_loader.batch_size

    for epoch in range(n_epochs):
        epoch_loss = 0.
        window_loss = 0.
        preds, ground_truths = [], []
        for i_batch, batch in enumerate(tqdm(train_loader)):
            batch = [i.to(device) for i in batch]
            # Attribute tensors are part of the batch but unused by this model.
            u_id, i_id, u_attr, i_attr = batch

            u_id, i_id, y_true = approx_negative_sampling(u_id, i_id, n_samples=2, bs=batch_size)

            y_pred = model(u_id, i_id)
            loss = criterion(y_pred, y_true)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            preds.append(y_pred.detach())
            ground_truths.append(y_true)
            batch_loss = loss.item()
            epoch_loss += batch_loss
            window_loss += batch_loss

            if not ((i_batch+1) % print_loss):
                pred = torch.cat(preds, dim=0).sigmoid().cpu().numpy()
                ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
                last_loss = window_loss / print_loss

                train_roc_auc = roc_auc_score(ground_truth, pred)
                test_loss, test_roc_auc = test()
                # test() leaves the model in eval mode.
                model.train()

                preds, ground_truths = [], []
                window_loss = 0.

                print(f"batch <{i_batch}>\ntrain_loss: {last_loss} - train_roc_auc: {train_roc_auc}\ntest_loss: {test_loss} - test_roc_auc: {test_roc_auc}\n")
        print(f"Epoch: {epoch}, Loss: {epoch_loss / len(train_loader):.4f}")

In [None]:
@torch.no_grad()
def test():
    """Evaluate the notebook-level ``model`` on ``val_loader``.

    Uses the notebook globals ``model``, ``val_loader``, ``criterion`` and
    ``device``. The model is put in eval mode and is left in eval mode.

    Returns:
        ``(test_loss, test_score)``: the mean loss per batch and the ROC AUC
        computed from the raw model outputs over all batches.
    """
    model.eval()
    bs = val_loader.batch_size

    total_loss = 0.
    all_preds = []
    all_targets = []

    for raw_batch in tqdm(val_loader):
        on_device = [tensor.to(device) for tensor in raw_batch]
        u_id, i_id, u_attr, i_attr = on_device

        # Same 2-negatives-per-positive sampling as in training.
        u_id, i_id, y_true = approx_negative_sampling(u_id, i_id, n_samples=2, bs=bs)

        y_pred = model(u_id, i_id)
        total_loss += criterion(y_pred, y_true).item()

        all_preds.append(y_pred)
        all_targets.append(y_true)

    scores = torch.cat(all_preds, dim=0).cpu().numpy()
    labels = torch.cat(all_targets, dim=0).cpu().numpy()

    test_loss = total_loss / len(val_loader)
    test_score = roc_auc_score(labels, scores)
    return test_loss, test_score

In [None]:
# NOTE(review): `MFDataset` is not defined in the visible part of this notebook.
# The training and evaluation loops unpack every batch as
# (u_id, i_id, u_attr, i_attr), so it presumably yields those four fields -- TODO confirm.
train_dataset = MFDataset(train_set, data['user_attr'], data['item_attr'])
# drop_last=True keeps every batch at exactly batch_size rows; approx_negative_sampling
# is called with bs=loader.batch_size and builds its labels from that number.
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True, drop_last=True)

val_dataset = MFDataset(valid_set, data['user_attr'], data['item_attr'])
val_loader = DataLoader(val_dataset, batch_size=1024, shuffle=False, drop_last=True)

In [None]:
n_users, n_items = user_attr.shape[0], item_attr.shape[0]

model = MF(n_users, n_items, emb_size=32)
model = model.to(device)

In [None]:
criterion = nn.BCEWithLogitsLoss()
#optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
optimizer = torch.optim.RMSprop(params=model.parameters(), lr=1e-4, momentum=0.9)

In [None]:
train(n_epochs=10, print_loss=1000)

In [None]:
model

In [None]:
test()

In [None]:
# Sanity check: score one training batch with one sampled negative per positive.
with torch.no_grad():
    batch_size = train_loader.batch_size
    batch = next(iter(train_loader))
    batch = [i.to(device) for i in batch]
    u_id, i_id, u_attr, i_attr = batch

    # approx_negative_sampling takes (u_id, i_id, n_samples, bs) and already
    # returns the extended user ids, item ids and labels. The earlier call
    # `approx_negative_sampling(i_id, bs=batch_size)` no longer matches that
    # signature; n_samples=1 reproduces the one-negative-per-positive layout
    # that was previously assembled by hand.
    u_id, i_id, y_true = approx_negative_sampling(u_id, i_id, n_samples=1, bs=batch_size)

    y_pred = model(u_id, i_id)

    print(y_pred.sigmoid(), y_true)

In [None]:
criterion(y_pred, y_true)

In [None]:
def save_model(model, path):
    """Serialise ``model.state_dict()`` to ``path`` with ``torch.save``.

    Args:
        model: object exposing ``state_dict()``.
        path: destination accepted by ``torch.save``.
    """
    weights = model.state_dict()
    torch.save(weights, path)
    
def load_model(path):
    """Rebuild an ``MF`` model and load its weights from ``path``.

    Uses the notebook globals ``n_users``, ``n_items`` and ``device``; the
    embedding size is fixed at 32 and must match the saved checkpoint.

    The checkpoint is loaded with ``map_location=device`` so that weights saved
    on a GPU can be restored on a CPU-only machine; without it ``torch.load``
    tries to restore tensors onto the device they were saved from.

    Args:
        path: checkpoint written by ``save_model``.

    Returns:
        The model with loaded weights, moved to ``device``.
    """
    model = MF(n_users, n_items, emb_size=32)
    # NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
    state = torch.load(path, map_location=device)
    model.load_state_dict(state)
    model = model.to(device)
    return model

In [None]:
#save_model(model, "models/mf_01.pth")

In [None]:
model = load_model("models/mf_01.pth")

In [None]:
from reco_env import RecoEnv
from utils import import_data_for_env
import gym

In [None]:
env = gym.make(RecoEnv.id, **import_data_for_env())

In [None]:
vc = rec.user_id.value_counts()

In [None]:
vc

In [None]:
vc[vc >= 3]

In [None]:
plt.plot(vc)