<a href="https://colab.research.google.com/github/omomer16/EPFL_AMLD_2024/blob/main/cleora_and_emde.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!git clone https://github.com/Synerise/predicting-user-behavior-workshop.git
%cd predicting-user-behavior-workshop

Cloning into 'predicting-user-behavior-workshop'...
remote: Enumerating objects: 72, done.[K
remote: Counting objects: 100% (29/29), done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 72 (delta 15), reused 20 (delta 7), pack-reused 43[K
Receiving objects: 100% (72/72), 27.41 MiB | 14.30 MiB/s, done.
Resolving deltas: 100% (32/32), done.
Updating files: 100% (12/12), done.
/content/predicting-user-behavior-workshop


In [4]:
!pip install lightning
!pip install torchmetrics
!pip install cleora_saas_api

Collecting lightning
  Downloading lightning-2.2.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities<2.0,>=0.8.0 (from lightning)
  Downloading lightning_utilities-0.11.0-py3-none-any.whl (25 kB)
Collecting torchmetrics<3.0,>=0.7.0 (from lightning)
  Downloading torchmetrics-1.3.2-py3-none-any.whl (841 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m841.5/841.5 kB[0m [31m51.9 MB/s[0m eta [36m0:00:00[0m
Collecting pytorch-lightning (from lightning)
  Downloading pytorch_lightning-2.2.1-py3-none-any.whl (801 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m801.6/801.6 kB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch<4.0,>=1.13.0->lightning)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch import nn, optim, Tensor
from torchmetrics import AveragePrecision, AUROC
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger

from emde import calculate_absolute_emde_codes
from cleora_saas_api import CLI


# Constants
First we define all constants that will be used in this notebook.

In [14]:
DATA_DIR = "./data/"
TRAIN_PD_PATH = os.path.join(DATA_DIR, "2019-Oct_small.csv")
TARGETS_PATH = os.path.join(DATA_DIR,"train_target.npy")
VALIDATION_TARGETS_PATH = os.path.join(DATA_DIR,"test_target.npy")
USER_IDS = os.path.join(DATA_DIR, "user_ids.npy")
CLEORA_INPUT_FILE = os.path.join(DATA_DIR,"cleora_input.tsv")
EMBEDDINGS_NPZ = "embeddings.npz"
SKETCH_DEPTH= 20
SKETCH_WIDTH = 64
BATCH_SIZE = 128
LEARNING_RATE = 0.0001
MAX_EPOCH = 1
ACCELERATOR = "gpu"
DEVICES = 1
NUM_WORKERS = 8
EXPERIMENT_NAME = "experiment_with_brands"
CLEORA_API_TOKEN = "AMf-vBzu2gRj3CHurpiW47XDIZRvlo8b2lzNAJIQMp3b8W9M5OSMJTn-C7aq97E20Hv6dqx4XMhi82RdCIWgxadLlffWznlV-ewqStw4HUa5SIZgH76hmhMl9P_OuFa0_PvJyj3ogtqh_0w5Id7RlD-i37fI5s0rnbmRgRomH0ZtorIvEYtd4lQGt_RCXbKiSoGKOyYBSbUEisAqqSpGC502zPN4XcNq6_0Guhu4gN7xd3QvddHqrO9ch7UcDjO43jzmLhQdldVEKzoPGhdoIVW8fh9hfsC2cSo9HSb7erMoai9GDApLucmUy0FR3y9rsG1tQeVD9lEz0EMi_OuVOayDiIMGGvN2cHjTa1-Q_fkDRaPNZHEOTpkaqIsT-262VW3Uybt3E8hs1BpbSKGAHrRZIOPuAMfpbcFXxEwZvcH-JO10S-GjpKJT2Mfdac20S-mg137uqaQ0"



# Calculating embeddings with Cleora
The following function prepares input for cleora. First we load DataFrame with training data. Note that cleora works with timestamps as well. However, in our case we drop timestamps for simplicity. Finally we save the result as a tsv file, which is required input format for cleora.

In [7]:
train_df = pd.read_csv(TRAIN_PD_PATH)
train_df.head(10)

Unnamed: 0,event_time,brand,user_id
0,2019-10-01 00:02:14 UTC,samsung,543272936
1,2019-10-01 00:04:37 UTC,apple,551377651
2,2019-10-01 00:05:14 UTC,xiaomi,550121407
3,2019-10-01 00:06:02 UTC,xiaomi,514591159
4,2019-10-01 00:07:07 UTC,santeri,555332717
5,2019-10-01 00:09:26 UTC,apple,524601178
6,2019-10-01 00:09:33 UTC,apple,524325294
7,2019-10-01 00:09:54 UTC,apple,551377651
8,2019-10-01 00:10:08 UTC,apple,524325294
9,2019-10-01 00:10:56 UTC,oasis,548691404


In [8]:
def prepare_cleora_input_file():
    train_df = pd.read_csv(TRAIN_PD_PATH)
    train_df.drop("event_time", axis=1, inplace=True)
    train_df = train_df[["user_id", "brand"]]
    # train_df["event_time"] = pd.to_datetime(train_df["event_time"])
    with open(CLEORA_INPUT_FILE, "w") as tsv_file:
        train_df.to_csv(tsv_file, sep="\t", header=True, index=False)

In [12]:
prepare_cleora_input_file()

In [9]:
cleora = CLI()
cleora.login(CLEORA_API_TOKEN)

logged in successfully


In [13]:

cleora.run(256, 3, input_path=os.path.join(DATA_DIR, "cleora_input.tsv"), run_name="colab_clustering")

-- Start --


Exception: Provided input_path do not exist

# Exercise 1

Instead of using 3 iterations with dimension 256 set the number of iterations to 4 and dimension to 128.

In [15]:
# @title Solution

cleora.run(128, 4, input_path=os.path.join(DATA_DIR, "cleora_input.tsv"), run_name="colab_clustering")

-- Start --
-- Config to trigger run prepared --
-- Run started --
-- Logs: --
Downloading input file.
Number of rows in original data: 1592254
Initializing Cleora.
Iteration 1/4 done
Iteration 2/4 done
Iteration 3/4 done
Iteration 4/4 done
-- Result download started --
-- Result download finished --


The following function is used to load embeddings.

In [16]:
def load_embeddings(embeddings_path: str):
    embeddings = np.load(embeddings_path)
    return embeddings["entity_id"], embeddings["vectors"]

We explain now the output of cleora.ai app.

In [17]:
brands_ids, embeddings = load_embeddings(embeddings_path=EMBEDDINGS_NPZ)
print(f"embeddings shape: {embeddings.shape}, embeddings dtype: {embeddings.dtype}")
print(f"brands_ids shape: {brands_ids.shape}")

embeddings shape: (2012, 128), embeddings dtype: float32
brands_ids shape: (2012,)


In [18]:
brands_ids[:10]

array(['ballu', 'marcomen', 'marmiton', 'specialized', 'doona',
       'prestige', 'besty', 'creed', 'sho-me', 'kapous'], dtype='<U28')

Let us find brand which corresponds to some index and then print its embedding.

In [19]:
idx = 2
brands_ids[idx]

'marmiton'

In [20]:
embeddings[idx]

array([ 1.0230306 ,  1.9514207 ,  0.24575491, -0.9504516 ,  1.4293667 ,
       -1.6212693 , -1.7155627 ,  2.0747602 , -0.9816355 ,  0.9636677 ,
       -0.6516863 , -0.6538    ,  0.5812629 , -1.0369552 , -0.6494597 ,
       -2.378841  , -0.4666721 , -1.4923645 , -0.67555815,  0.9322502 ,
        0.04595158,  0.45137691,  1.2068661 ,  1.0692046 , -0.16219157,
       -1.5978843 ,  0.9527491 ,  0.6328959 , -0.26497874, -1.2235662 ,
       -1.1258589 ,  0.3093083 ,  1.5253558 ,  0.1168129 , -0.40584928,
        0.53664815,  1.6411399 , -0.7492391 , -0.71723795, -0.33115712,
       -0.01526039,  0.9630989 , -1.7954491 , -0.8545243 , -0.19607781,
        1.2980661 , -0.4702821 ,  1.5850583 , -0.34788403,  0.4469458 ,
        0.37845707,  1.524986  ,  1.3873322 ,  1.8265886 ,  0.7130445 ,
       -0.6066848 , -0.23600581,  0.5601566 ,  0.46953318, -0.40649077,
       -0.8146219 , -0.36442253,  0.9135999 , -1.2652913 ,  2.3965256 ,
        1.3300005 , -1.2466861 ,  0.31673935, -0.5854638 , -0.27

# Implementing Dataset class

We explain here some details related to our implementation of Dataset class.

First we investigate the contents of training DataFrame.

In [21]:
train_df = pd.read_csv(TRAIN_PD_PATH)
train_df.head(10)

Unnamed: 0,event_time,brand,user_id
0,2019-10-01 00:02:14 UTC,samsung,543272936
1,2019-10-01 00:04:37 UTC,apple,551377651
2,2019-10-01 00:05:14 UTC,xiaomi,550121407
3,2019-10-01 00:06:02 UTC,xiaomi,514591159
4,2019-10-01 00:07:07 UTC,santeri,555332717
5,2019-10-01 00:09:26 UTC,apple,524601178
6,2019-10-01 00:09:33 UTC,apple,524325294
7,2019-10-01 00:09:54 UTC,apple,551377651
8,2019-10-01 00:10:08 UTC,apple,524325294
9,2019-10-01 00:10:56 UTC,oasis,548691404


We group train Dataframe by user and aggregate obtained groups by applying list construtor. This constructs Series that contains list of interactions of every user.

In [22]:
brands = train_df.groupby("user_id", group_keys=True)["brand"].apply(list)
brands.head(10)

user_id
264649825        [kiturami, kiturami]
284344819                     [apple]
293957954                    [xiaomi]
303160429                    [garmin]
304325717    [huawei, huawei, huawei]
318611205              [huawei, zeta]
336595257          [samsung, samsung]
340041246        [lg, lg, lg, lg, lg]
348815209                   [samsung]
362327778                     [apple]
Name: brand, dtype: object

Now we are ready to implement our custom dataset class.

In [23]:
class UsersBrandsDataset(Dataset):
    def __init__(
        self,
        absolute_codes: np.ndarray,
        brands_ids: np.ndarray,
        train_df_path: str,
        targets_path: str,
        user_ids_path: str,
        sketch_width: int,
        sketch_depth: int,
        sketch_decay: float = 0.94,
    ):
        """
        Args:
            absolute_codes (np.ndarray): Array of shape (num_brands, sketch_depth) containing the absolute codes for each item
            brands_ids (np.ndarray): Array of shape (num_brands) mapping each idx to corresponding brand
            inputs_df_path (str): path to train dataframe
            targets_path (str): path to targets array
            sketch_width (int): width of the sketch
            sketch_depth (int): depth of the sketch
            sketch_decay (float): Decay factor for the sketch
        """
        self.absolute_codes = absolute_codes
        self.sketch_depth = sketch_depth
        self.sketch_width = sketch_width
        self.sketch_decay = sketch_decay

        self.brand_to_ids = {brands_ids[idx]: idx for idx in range(len(brands_ids))}

        train_df = pd.read_csv(train_df_path)

        self.brands = train_df.groupby("user_id")["brand"].apply(list)
        self.users_ids = np.load(user_ids_path)
        self.target_brands = np.load(targets_path)

    def __len__(self):
        return len(self.users_ids)

    def __getitem__(self, idx: int):
        user_brands = self.brands.iloc[idx]
        user_brands = [self.brand_to_ids[brand] for brand in user_brands]
        user_brands_codes = torch.from_numpy(self.absolute_codes[user_brands])
        # Assume that user interacted with 5 brands. Then user_brands_codes is a list of 5 emde_codes
        # and emde_code is itself a list of sketch_depth integers.
        user_sketch = torch.zeros(self.sketch_depth * self.sketch_width, dtype=torch.float32)
        for brand_codes in user_brands_codes:
            user_sketch *= self.sketch_decay
            user_sketch[brand_codes] += 1

        target = self.target_brands[idx]
        return user_sketch, target

Since we are using PyTorch Lightning, we need to wrap our dataset in LightningDataModule.

In [24]:
class UserBrandDataModule(pl.LightningDataModule):
    def __init__(
        self,
        brands_ids: np.array,
        embeddings: np.array,
        train_df_path: str,
        targets_path: str,
        validation_targets_path: str,
        user_ids_path: str,
        sketch_width: int,
        sketch_depth: int,
        batch_size: int,
        num_workers: int,
    ) -> None:
        super().__init__()
        self.brands_ids = brands_ids
        self.embeddings = embeddings
        self.train_df_path = train_df_path
        self.targets_path = targets_path
        self.validation_targets_path = validation_targets_path
        self.user_ids_path = user_ids_path
        self.sketch_depth = sketch_depth
        self.sketch_width = sketch_width
        self.batch_size = batch_size
        self.num_workers = num_workers

    def setup(self, stage) -> None:
        if stage == "fit":
            absolute_emde_codes = calculate_absolute_emde_codes(self.sketch_depth, self.sketch_width, self.embeddings)
            self.train_data = UsersBrandsDataset(
                absolute_codes=absolute_emde_codes,
                brands_ids=self.brands_ids,
                train_df_path=self.train_df_path,
                targets_path=self.targets_path,
                user_ids_path=self.user_ids_path,
                sketch_depth=self.sketch_depth,
                sketch_width=self.sketch_width,
            )
            self.validation_data = UsersBrandsDataset(
                absolute_codes=absolute_emde_codes,
                brands_ids=self.brands_ids,
                train_df_path=self.train_df_path,
                targets_path=self.validation_targets_path,
                user_ids_path=self.user_ids_path,
                sketch_depth=self.sketch_depth,
                sketch_width=self.sketch_width,
            )

    def train_dataloader(self) -> DataLoader:
        return DataLoader(self.train_data, batch_size=self.batch_size, num_workers=self.num_workers)

    def val_dataloader(self) -> DataLoader:
        return DataLoader(
            self.validation_data,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
        )

# Defining simple feedforward Neural Network

Below we implement simple feedforward neural network with binary cross entropy loss and multilabel auroc as validation score.

In [25]:
class Net(torch.nn.Module):
    def __init__(
        self,
        input_dim: int,
        hidden_size: int,
        output_dim: int,
    ) -> None:
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_dim, hidden_size),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_size),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_size),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_size),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_size),
            nn.Linear(hidden_size, output_dim),
        )

    def forward(self, x) -> torch.Tensor:
        return self.linear_relu_stack(x)



In [26]:
class Model(pl.LightningModule):
    def __init__(
        self,
        input_dim: int,
        hidden_size: int,
        output_dim: int,
        learning_rate: float,
    ) -> None:
        super().__init__()
        self.learning_rate = learning_rate
        self.net = Net(hidden_size=hidden_size, input_dim=input_dim, output_dim=output_dim)
        self.val_auroc = AUROC(task="multilabel", num_labels=output_dim)

    def forward(self, x) -> Tensor:
        return self.net(x)

    def configure_optimizers(self) -> optim.Optimizer:
        optimizer = optim.AdamW(self.parameters(), lr=self.learning_rate)
        return optimizer

    def training_step(self, train_batch, batch_idx) -> Tensor:
        x, y = train_batch
        preds = self.forward(x)
        loss = F.binary_cross_entropy_with_logits(preds, y)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, val_batch, batch_idx) -> None:
        x, y = val_batch
        preds = self.forward(x)
        loss = F.binary_cross_entropy_with_logits(preds, y)
        self.val_auroc(preds, y.long())
        self.log("val_loss", loss, prog_bar=True, on_epoch=True, logger=True)

    def on_validation_epoch_end(self) -> None:
        self.log("val_auroc", self.val_auroc, prog_bar=True, on_epoch=True, logger=True)

# Training and results

Now we combine all these elements together into a piece of code which trains our model.

First we need calculate number of target brands, since this is the ouput size of our model.

In [27]:
num_target_brands = np.load(TARGETS_PATH).shape[1]

Next we load embeddings and brands_ids.

In [28]:
brands_ids, embeddings = load_embeddings(embeddings_path=EMBEDDINGS_NPZ)

Now we are able to construct data module and model.

In [29]:
data = UserBrandDataModule(
    brands_ids=brands_ids,
    embeddings=embeddings,
    train_df_path=TRAIN_PD_PATH,
    targets_path=TARGETS_PATH,
    validation_targets_path=VALIDATION_TARGETS_PATH,
    user_ids_path=USER_IDS,
    sketch_width=SKETCH_WIDTH,
    sketch_depth=SKETCH_DEPTH,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
)

model = Model(
    input_dim=SKETCH_DEPTH * SKETCH_WIDTH, hidden_size=2048, output_dim=num_target_brands, learning_rate=LEARNING_RATE
)

We also want to print some useful messages concerning training progress, current loss and validation scores. In order to to do this we add some basic logger.

In [30]:
logger = TensorBoardLogger(save_dir="logs", name=f"{EXPERIMENT_NAME}")

Finally we employ PyTorch Lightning Trainer class to wrap all configurations concerning training and validation together.

In [1]:
trainer = pl.Trainer(
    accelerator=ACCELERATOR,
    devices=DEVICES,
    max_epochs=MAX_EPOCH,
    logger=logger,
)

NameError: name 'pl' is not defined

Now by call to fit method on trainer with model and data as arguments in order to train and validate our pipeline.

In [32]:
trainer.fit(model, data)

NameError: name 'trainer' is not defined

# Exercise 2
Consider the following code, which adds additional linear layer on top of previously defined model.

    class DeepNet(torch.nn.Module):
        def __init__(
            self,
            input_dim: int,
            hidden_size: int,
            output_dim: int,
        ) -> None:
            super().__init__()
            self.linear_relu_stack = nn.Sequential(
                nn.Linear(input_dim, hidden_size),
                nn.ReLU(),
                nn.BatchNorm1d(hidden_size),
                nn.Linear(hidden_size, hidden_size),
                nn.ReLU(),
                nn.BatchNorm1d(hidden_size),
                nn.Linear(hidden_size, hidden_size),
                nn.ReLU(),
                nn.BatchNorm1d(hidden_size),
                nn.Linear(hidden_size, hidden_size),
                nn.ReLU(),
                nn.BatchNorm1d(hidden_size),
                nn.Linear(hidden_size, hidden_size),
                nn.ReLU(),
                nn.BatchNorm1d(hidden_size),
            )

        def forward(self, x) -> torch.Tensor:
            return self.linear_relu_stack(x)

Replace Net with DeepNet in appropriate cell above and try to run the the training.

**Q1:** Do you know what went wrong?

**Q2:** Can you fix it?

In [None]:
# @title Solution

# output of the sequential network has incorrect dimension!!!


class DeepNet(torch.nn.Module):
    def __init__(
        self,
        input_dim: int,
        hidden_size: int,
        output_dim: int,
    ) -> None:
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_dim, hidden_size),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_size),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_size),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_size),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_size),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_size),
            nn.Linear(hidden_size, output_dim),
        )

    def forward(self, x) -> torch.Tensor:
        return self.linear_relu_stack(x)