In [1]:
%load_ext autoreload
%autoreload 2

import logging
import torch
import pytorch_lightning as pl
import warnings

# Notebook-wide noise reduction: ignore every Python warning and let only
# ERROR-and-above records through the "pytorch_lightning" logger.
warnings.filterwarnings(action='ignore')
lightning_log = logging.getLogger("pytorch_lightning")
lightning_log.setLevel(logging.ERROR)

In [2]:
import torch
import ptls
import pandas as pd
import numpy as np

import pytorch_lightning as pl

import catboost
from catboost import CatBoostClassifier
from catboost import CatBoostRegressor

from ptls.frames import PtlsDataModule, coles
from ptls.frames.coles import split_strategy
from ptls.data_load import datasets
from ptls.data_load.datasets import ParquetFiles
from ptls.data_load.datasets import MemoryMapDataset, parquet_file_scan, ParquetDataset
from ptls.data_load.iterable_processing import SeqLenFilter, FeatureFilter
from ptls.frames.coles import CoLESModule
from ptls.nn.trx_encoder import TrxEncoder
from ptls.nn.seq_encoder import RnnSeqEncoder
from ptls.nn.head import Head
from ptls.frames.coles.losses import ContrastiveLoss
from ptls.frames.coles import sampling_strategies

from pytorch_lightning.loggers import TensorBoardLogger

import sklearn
from sklearn.metrics import mean_absolute_error, accuracy_score

from functools import partial

# <font size="5">data module</font>

In [4]:
from ptls.frames import coles
from ptls.data_load import datasets
from ptls.frames import PtlsDataModule
from ptls.frames.coles import losses, sampling_strategies
from ptls.frames.coles import split_strategy

# Training data pipeline, assembled stage by stage:
#   parquet files -> record filters -> MemoryMapDataset
#   -> transaction-dropout augmentation -> ColesDataset with slice sampling.

# Sub-sequence sampler handed to ColesDataset (5 slices per sequence,
# each between 20 and 60 items long according to the argument names).
slice_splitter = split_strategy.SampleSlices(split_count=5, cnt_min=20, cnt_max=60)

# Per-record filters applied while reading the parquet files.
record_filters = [SeqLenFilter(min_seq_len=30), FeatureFilter()]

# NOTE(review): valid_rate=0 with return_part='train' presumably routes every
# scanned file to the training split -- confirm against parquet_file_scan.
train_files = parquet_file_scan(
    file_path='train_agg_without_last.parquet',
    valid_rate=0,
    return_part='train',
)

train_records = MemoryMapDataset(
    data=ParquetDataset(i_filters=record_filters, data_files=train_files),
)

trx_augmentations = [ptls.data_load.augmentations.DropoutTrx(trx_dropout=0.01)]
train_augmented = datasets.AugmentationDataset(
    data=train_records,
    f_augmentations=trx_augmentations,
)

# Only a training set is configured; no validation/test data is passed.
data_module = PtlsDataModule(
    train_data=coles.ColesDataset(splitter=slice_splitter, data=train_augmented),
    train_batch_size=256,
    train_num_workers=8,
)

# <font size="5">Load config & model</font>

In [5]:
from omegaconf import OmegaConf
# Experiment configuration; the relative path is resolved against the
# notebook's current working directory.
config_path = 'cpc.yaml'
conf = OmegaConf.load(config_path)

In [6]:
# Display the `data_module` section of the loaded config as YAML.
# NOTE(review): in the cells visible here this section is only printed; the
# `data_module` object passed to trainer.fit is the one built by hand above.
data_module_yaml = OmegaConf.to_yaml(conf.data_module)
print(data_module_yaml)

_target_: ptls.frames.PtlsDataModule
train_data:
  _target_: ptls.frames.cpc.CpcIterableDataset
  min_len: 600
  max_len: 800
  data: ${dataset_unsupervised.train}
valid_data:
  _target_: ptls.frames.cpc.CpcIterableDataset
  min_len: 600
  max_len: 800
  data: ${dataset_unsupervised.valid}
train_batch_size: 256
train_num_workers: 8
valid_batch_size: 256
valid_num_workers: 16



In [7]:
import hydra
# Build the model object described by the `pl_module` section of the config.
pl_module_conf = conf.pl_module
model = hydra.utils.instantiate(pl_module_conf)

# <font size="5">training</font>

In [8]:
# `trainer_params` is the `trainer` node of `conf` itself (not a copy), so the
# max_epochs override below is also visible through `conf.trainer` afterwards.
trainer_params = conf.trainer
trainer_params.max_epochs = 15

# Show the effective trainer settings.
trainer_params_yaml = OmegaConf.to_yaml(trainer_params)
print(trainer_params_yaml)

gpus: 1
auto_select_gpus: false
max_epochs: 15
deterministic: true



In [9]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

# Re-read the trainer section from the config; it carries any override that
# was written into `conf.trainer` earlier in the notebook.
trainer_params = conf.trainer

# TensorBoard run directory and run name for this experiment.
logger = TensorBoardLogger(
    save_dir='src/ptls-experiments/scenario_alpha_battle/lightning_logs',
    name='coles-baseline-alpha-battle',
)

# Config-driven Trainer arguments plus the explicit logger.
trainer = pl.Trainer(logger=logger, **trainer_params)

In [10]:
%%time
print(f'logger.version = {trainer.logger.version}')
trainer.fit(model, data_module)
print(trainer.logged_metrics)

logger.version = 17


Training: 0it [00:00, ?it/s]

{'loss': tensor(3.1582), 'seq_len': tensor(39.6306)}
CPU times: user 56min 19s, sys: 34min 11s, total: 1h 30min 31s
Wall time: 1h 25min 42s


# <font size="5">save model</font>

In [11]:
from pathlib import Path

# Persist only the model weights (state_dict), not the whole module object.
model_path = Path("models_alpha_battle/cpc_pretrain_without_last.pth")

# torch.save does not create missing directories; make sure the target folder
# exists so a long training run is not lost to a failed save.
model_path.parent.mkdir(parents=True, exist_ok=True)

torch.save(model.state_dict(), str(model_path))