In [1]:
import sys
import os
# Make the parent directory importable so that `runs.*` and `src.*` resolve.
# Guarded so that re-running this cell does not append the same entry again.
if '../' not in sys.path:
    sys.path.append('../')
# Data directory exported through the environment
# (presumably read by the project code — TODO confirm where).
os.environ['SEQ_SPLITS_DATA_PATH'] = "../data/"

In [2]:
import os
from hydra import compose, initialize
from omegaconf import OmegaConf


In [3]:
from runs.train import prepare_data, create_dataloaders


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [4]:
# Build the training config with Hydra's Compose API. No separate
# `OmegaConf.load` of train.yaml is needed: `compose` produces the full config
# (defaults plus the overrides listed below).
# `version_base="1.1"` states explicitly the compatibility level Hydra falls
# back to when the argument is omitted, so behaviour is unchanged while the
# "version_base parameter is not specified" warning goes away.
with initialize(version_base="1.1", config_path="../runs/configs/"):
    config = compose(
        config_name="train",
        overrides=[
            "quantile=0.9",
            "split_subtype=val_by_time",
            "dataset=Beauty",
            "model=GPT2",
            "trainer_params.max_epochs=10",
        ],
        return_hydra_config=False,
    )

# Print the fully resolved config so the run settings are visible in the notebook.
print(OmegaConf.to_yaml(config, resolve=True))

The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  with initialize(config_path="../runs/configs/"):


cuda_visible_devices: 0
random_state: 101
clearml_project_folder: null
clearml_task_name: null
use_pretrained_embeddings: false
pretrained_embeddings:
  add_padding_emb: true
  freeze: false
use_semantic_ids: false
semantic_ids_map_path: data/SemanticID/PATH.pkl
split_type: global_timesplit
split_subtype: val_by_time
quantile: 0.9
validation_quantile: 0.9
dataset_params:
  max_length: 128
dataloader:
  batch_size: 64
  test_batch_size: 64
  num_workers: 8
  validation_size: 2048
seqrec_module:
  lr: 0.001
  predict_top_k: 10
  filter_seen: true
trainer_params:
  max_epochs: 10
  accelerator: gpu
patience: 20
load_if_possible: false
evaluator:
  successive_val: false
  successive_test: false
  successive_test_retrained: false
  calc_successive_metrics_val: true
  calc_successive_metrics_test: true
  calc_successive_metrics_test_retrained: true
  successive_replay_metrics: false
  metrics:
  - NDCG
  - HitRate
  - MRR
  - Coverage
  top_k:
  - 1
  - 5
  - 10
  - 20
  - 50
  - 100
retrain

In [5]:
import os
import time

import hydra
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from clearml import Task
from omegaconf import OmegaConf
from pytorch_lightning import seed_everything
from pytorch_lightning.callbacks import (EarlyStopping, ModelCheckpoint,
                                         ModelSummary, TQDMProgressBar)
from torch.utils.data import DataLoader

from src.datasets import (CausalLMDataset, CausalLMPredictionDataset,
                          PaddingCollateFn)
from src.metrics import Evaluator
from src.models import SASRec
from src.modules import SeqRec, SeqRecHuggingface
from src.postprocess import preds2recs
from src.prepr import last_item_split

import itertools
import pickle
from tqdm.auto import tqdm

### Prepare data

In [6]:
# Export the GPU selection from the config as CUDA_VISIBLE_DEVICES, when the key is present.
if hasattr(config, 'cuda_visible_devices'):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(config.cuda_visible_devices)


# Load the three data splits plus the max item id and the two global timepoints.
train, validation, test, max_item_id, global_timepoint, global_timepoint_val = prepare_data(config)

# Build the training and evaluation loaders from the train and validation splits.
train_loader, eval_loader = create_dataloaders(train, validation, config)

train shape (160178, 4)
validation shape (66297, 4)
test shape (70263, 4)
Test global timepoint 1399939200.0
Validation global timepoint 1394668800.0


In [7]:
# Show max_item_id, one of the values returned by prepare_data(config).
max_item_id

12101

### Batch example

In [8]:
# Take the first batch from the evaluation loader as an example.
batch = next(iter(eval_loader))

# Print the raw batch to inspect its contents.
print(batch)

{'input_ids': tensor([[ 2728,   310,  7384,  ...,     0,     0,     0],
        [ 3357,  9568,  2192,  ...,     0,     0,     0],
        [ 9289, 10033, 12011,  ...,     0,     0,     0],
        ...,
        [ 2102,  2983,   481,  ...,     0,     0,     0],
        [   40,  3282,  9476,  ...,     0,     0,     0],
        [ 2143,  1887,  4197,  ...,     0,     0,     0]]), 'user_id': tensor([  5,  11,  27,  37,  42,  46,  66,  69,  71,  75,  79,  82,  99, 102,
        115, 123, 133, 144, 151, 153, 184, 198, 200, 205, 221, 231, 248, 257,
        277, 289, 322, 333, 334, 343, 354, 358, 359, 360, 370, 375, 378, 387,
        404, 419, 424, 438, 450, 479, 495, 497, 500, 502, 560, 565, 575, 576,
        578, 580, 605, 616, 621, 622, 630, 633]), 'seen_ids': tensor([[ 2728,   310,  7384,  ...,     0,     0,     0],
        [ 3357,  9568,  2192,  ...,     0,     0,     0],
        [ 9289, 10033, 12011,  ...,     0,     0,     0],
        ...,
        [ 2102,  2983,   481,  ...,     0,     0,  

### Model creation

In [9]:
# Show the kernel's current working directory. An explicit call is used
# instead of the bare `pwd` automagic so that the cell is plain Python.
os.getcwd()

'/home/jovyan/shares/SR003.nfs2/volodkevich/smiles_25/semantic_seqrec/notebooks'

In [10]:
from runs.train import create_model

In [11]:
# Build the model from the config; item_count is the max_item_id returned by prepare_data.
model = create_model(config, item_count=max_item_id)

In [12]:
# Display the repr of the model built by create_model.
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(12102, 64)
    (wpe): Embedding(128, 64)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-1): 2 x GPT2Block(
        (ln_1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=64, out_features=12102, bias=False)
)

In [13]:
# Build the checkpoint directory for this run:
#   <parent of cwd>/models/<split_type>/<split_subtype>/<dataset>/<q-tag>/<model_class>/[retrain_with_val]
retrain = False
split_subtype = config.split_subtype or ''
# Quantile tag (e.g. 0.9 -> 'q09'); only set for the global time split.
q = 'q0' + str(config.quantile)[2:] if config.split_type == 'global_timesplit' else ''
model_path = os.path.join(
    os.path.dirname(os.path.abspath('.')), 'models', config.split_type,
    split_subtype, config.dataset.name, q, config.model.model_class,
    'retrain_with_val' if retrain else '')

print(model_path)
# exist_ok=True replaces the separate exists() check: it is a no-op when the
# directory is already there and avoids the check-then-create race. If the path
# exists but is not a directory, this now raises instead of silently continuing.
os.makedirs(model_path, exist_ok=True)

/home/jovyan/shares/SR003.nfs2/volodkevich/smiles_25/semantic_seqrec/models/global_timesplit/val_by_time/Beauty/q09/GPT-2/


In [14]:
# Checkpoint file stem: built from the hyper-parameters read off the config,
# plus batch size and random seed.
model_class = config.model.model_class
if model_class == 'SASRec':
    file_name = (
        f"{config.model.model_params.hidden_units}_"
        f"{config.model.model_params.num_blocks}_"
        f"{config.model.model_params.num_heads}_"
        f"{config.model.model_params.dropout_rate}_"
        f"{config.model.model_params.maxlen}_"
        f"{config.dataloader.batch_size}_"
        f"{config.random_state}"
    )
elif model_class == 'GPT-2':
    file_name = (
        f"{config.model.model_params.n_embd}_"
        f"{config.model.model_params.n_layer}_"
        f"{config.model.model_params.n_head}_"
        f"{config.dataloader.batch_size}_"
        f"{config.random_state}"
    )
else:
    # Without this branch `file_name` would be unbound and the cell would fail
    # a few lines later with an uninformative NameError.
    raise ValueError(f"Unsupported model_class: {model_class!r}")

# NOTE(review): this path has no leading '_' while the ModelCheckpoint filename
# below does, so the two point at different files — confirm this is intended.
checkpoint_file = os.path.join(model_path, file_name + ".ckpt")

# Wrap the model in the Lightning module matching its class.
if model_class == 'GPT-2':
    seqrec_module = SeqRecHuggingface(model, **config['seqrec_module'])
else:
    seqrec_module = SeqRec(model, **config['seqrec_module'])

model_summary = ModelSummary(max_depth=1)
progress_bar = TQDMProgressBar(refresh_rate=20)

# Keep only the single best checkpoint (weights only), ranked by val_ndcg.
checkpoint = ModelCheckpoint(
    dirpath=model_path,
    filename='_' + file_name,
    save_top_k=1,
    monitor="val_ndcg",
    mode="max",
    save_weights_only=True
)
# Stop once val_ndcg has not improved for `config.patience` validation checks.
early_stopping = EarlyStopping(monitor="val_ndcg", mode="max",
                               patience=config.patience, verbose=False)
callbacks = [early_stopping, model_summary, checkpoint, progress_bar]

trainer = pl.Trainer(callbacks=callbacks, enable_checkpointing=True,
                     **config['trainer_params'])

# Timestamp taken once the trainer is configured.
start_time = time.time()


Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


### Training

In [15]:
# Train the Lightning module on train_loader, validating on eval_loader.
trainer.fit(model=seqrec_module,
                    train_dataloaders=train_loader,
                    val_dataloaders=eval_loader)

You are using a CUDA device ('NVIDIA A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/home/jovyan/shares/SR003.nfs2/volodkevich/seq_splits/av_env_seq_splits/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /home/jovyan/shares/SR003.nfs2/volodkevich/smiles_25/semantic_seqrec/models/global_timesplit/val_by_time/Beauty/q09/GPT-2 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type            | Params
------------------------------------------
0 | model | GPT2LMHeadModel | 882 K 
------------------------------------------
882 K     Trainable params
0         Non-trainable params
882 K     Total params
3.531     Total estimated model 

Epoch 9: 100%|██████████| 318/318 [00:03<00:00, 92.11it/s, v_num=2, val_ndcg=0.0251, val_hit_rate=0.0439, val_mrr=0.0194]    

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 318/318 [00:03<00:00, 91.46it/s, v_num=2, val_ndcg=0.0251, val_hit_rate=0.0439, val_mrr=0.0194]


### Predict

In [16]:
# Build the prediction dataset/loader for the GPT-2 model and switch the module
# to the matching predict mode (generation vs. a plain forward pass).
if config.model.model_class == 'GPT-2':
    use_generation = config.model.generation

    if use_generation:
        # Shorten the input by the largest evaluated k and pad on the left.
        predict_max_length = config.dataset_params.max_length - max(config.evaluator.top_k)
        predict_collate_fn = PaddingCollateFn(left_padding=True)
    else:
        predict_max_length = config.dataset_params.max_length
        predict_collate_fn = PaddingCollateFn()

    predict_dataset = CausalLMPredictionDataset(test, max_length=predict_max_length)

    predict_loader = DataLoader(
        predict_dataset,
        shuffle=False,
        collate_fn=predict_collate_fn,
        batch_size=config.dataloader.test_batch_size,
        num_workers=config.dataloader.num_workers,
    )

    if use_generation:
        seqrec_module.set_predict_mode(
            generate=True, mode=config.model.mode, **config.model.generation_params)
    else:
        seqrec_module.set_predict_mode(generate=False)

In [17]:
# Run prediction over predict_loader and convert the raw predictions into
# recommendations via preds2recs. Note that predict_loader is only defined
# when config.model.model_class == 'GPT-2'.
preds = trainer.predict(model=seqrec_module, dataloaders=predict_loader)
recs = preds2recs(preds, successive=False)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 96/96 [00:07<00:00, 12.44it/s]


In [18]:
# Display the recommendations produced by preds2recs.
recs

Unnamed: 0,user_id,item_id,prediction
0,0,3797,1.000000
1,0,5839,0.500000
2,0,2496,0.333333
3,0,7966,0.250000
4,0,180,0.200000
...,...,...,...
61095,22361,5730,0.166667
61096,22361,11689,0.142857
61097,22361,11178,0.125000
61098,22361,2876,0.111111


In [19]:
# Take the first batch from predict_loader (rebinds `batch`, which earlier held an eval_loader batch).
batch = next(iter(predict_loader))

In [20]:
# Override two generation settings in place on the module's generate_params
# (the kwargs forwarded to model.generate further below).
seqrec_module.generate_params['num_return_sequences'] = 1
seqrec_module.generate_params['no_repeat_ngram_size'] = 1
# Display the resulting parameter dict.
seqrec_module.generate_params

{'early_stopping': False,
 'num_return_sequences': 1,
 'no_repeat_ngram_size': 1,
 'do_sample': False}

### HuggingFace Generate

In [21]:
# Call the wrapped model's generate() directly on one batch.
gen_model = seqrec_module.model
n_new_tokens = seqrec_module.predict_top_k

# Keep only the trailing (n_positions - predict_top_k) input positions
# (presumably so that prompt plus generated tokens stay within n_positions).
prompt_ids = batch['input_ids'][:, -gen_model.config.n_positions + n_new_tokens:]

seq = gen_model.generate(
    prompt_ids.to(gen_model.device),
    pad_token_id=seqrec_module.padding_idx,
    max_new_tokens=n_new_tokens,
    **seqrec_module.generate_params,
)

# Keep only the last predict_top_k columns of the generated sequences.
preds = seq[:, -n_new_tokens:]

In [22]:
# Display the tensor returned by seqrec_module.model.generate.
seq

tensor([[    0,     0,     0,  ...,   633,  3696, 11229],
        [    0,     0,     0,  ...,  5883,  9882,  5403],
        [    0,     0,     0,  ..., 12062,   856,   574],
        ...,
        [ 7666,  7831,  4680,  ..., 11951,  2335,  3539],
        [    0,     0,     0,  ...,  9177,  1110,  6823],
        [    0,     0,     0,  ..., 10735,  5754,  1547]])

In [23]:
# Shape of the input ids in the batch drawn from predict_loader.
batch['input_ids'].shape

torch.Size([64, 28])

In [24]:
# Shape of the tensor returned by generate().
seq.shape

torch.Size([64, 38])

In [25]:
# Display preds: the last predict_top_k columns of seq.
preds

tensor([[ 8096,   922,  5655,  6829,  4142, 10104, 11905,   633,  3696, 11229],
        [ 7168,  5523,   332,  6993, 11678, 12070, 11460,  5883,  9882,  5403],
        [ 7177,  1962,  5374,  5663,  3569,  1899,  8457, 12062,   856,   574],
        [ 4565,  3575,  8645,   208,  9937,  7209,  2039,  4170,  8956, 10519],
        [ 3315,  3869, 11681,  7729,  8355,  2382,   705,  2525,  2876, 11178],
        [ 9806,  3468,  8160,  3669,  5265,  4737,  1022, 10786, 10762,   697],
        [ 8443,  1292,   137,  4442, 11073,  1547,  3141,  8294,  2516,  7352],
        [ 1682,  1566,  5940,  1547, 11073,  3141,   431,  9734,  7352,  5754],
        [ 1890,  7280,  1727,  8411,   663,   456,  7551,  8851,  3772,  9714],
        [ 7062,  3479, 10125, 11675, 11690,  2295,  5910,  8677, 11918,  2945],
        [11451,  2331,  3103,   477,  2843, 10823,  5839,   107, 11665,  4756],
        [ 6566,  4017,  3675,  5839,  2496, 12024, 10904,  9859,  7890, 11404],
        [ 5117,  1375, 10378,  7735,  62