## Setup

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

import logging
import torch
import pytorch_lightning as pl
import warnings

# Keep the notebook output readable: suppress every Python warning and let
# only ERROR-level (and above) records through from PyTorch Lightning's logger.
warnings.filterwarnings(action='ignore')
pl_log = logging.getLogger("pytorch_lightning")
pl_log.setLevel(logging.ERROR)

In [2]:
from glob import glob
import numpy as np
import logging
import pytorch_lightning as pl
import torch
from tqdm import tqdm
import os
import pandas as pd
import torch
from sklearn.model_selection import train_test_split


# Standard-library logger for messages emitted from this notebook.
logger = logging.getLogger(__name__)

## Data preprocessing

In [3]:
%%time
df = pd.read_parquet('data/train_transactions_clipped.parquet')

del df['app_id'], df['flag'], df['transaction_number']

for col in tqdm(df.columns):
    df[col] = df[col].apply(torch.tensor)

data = df.to_dict(orient='records')
train_data, valid_data = train_test_split(data, test_size = 0.1, random_state = 42)

100%|███████████████████████████████████████████| 19/19 [00:32<00:00,  1.69s/it]


CPU times: user 39.7 s, sys: 16.7 s, total: 56.4 s
Wall time: 41.1 s


In [4]:
from omegaconf import OmegaConf
# Load the experiment configuration from YAML; the cells below read its
# `data_module`, `pl_module`, `trainer`, `logger_name` and `model_path` entries.
conf = OmegaConf.load('config/cpc.yaml')

In [5]:
# Show the data-module section of the config for reference. Interpolations
# such as ${...} are printed as written, not resolved.
print(OmegaConf.to_yaml(conf.data_module))

_target_: ptls.frames.PtlsDataModule
train_data:
  _target_: ptls.frames.cpc.CpcIterableDataset
  min_len: 600
  max_len: 800
  data: ${dataset_unsupervised.train}
valid_data:
  _target_: ptls.frames.cpc.CpcIterableDataset
  min_len: 600
  max_len: 800
  data: ${dataset_unsupervised.valid}
train_batch_size: 256
train_num_workers: 8
valid_batch_size: 256
valid_num_workers: 16



In [6]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.frames.cpc import CpcIterableDataset
from ptls.data_load.iterable_processing.feature_filter import FeatureFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule


# Wrap the in-memory record lists produced by the preprocessing cell.
train_ds = MemoryMapDataset(data=train_data)
valid_ds = MemoryMapDataset(data=valid_data)

# Use dedicated names for the CPC datasets instead of rebinding
# `train_data` / `valid_data`: those still have to refer to the raw record
# lists, otherwise re-running this cell would feed a CpcIterableDataset back
# into MemoryMapDataset.
# The length bounds and loader settings mirror the `data_module` section of
# the config printed above.
train_cpc_ds = CpcIterableDataset(train_ds, min_len=600, max_len=800)
valid_cpc_ds = CpcIterableDataset(valid_ds, min_len=600, max_len=800)

dm = PtlsDataModule(
    train_data=train_cpc_ds, train_num_workers=8, train_batch_size=256,
    valid_data=valid_cpc_ds, valid_num_workers=16, valid_batch_size=256,
)

## Model

In [7]:
import hydra
# Build the model object described by the `pl_module` section of the config.
model = hydra.utils.instantiate(conf.pl_module)

## Training

In [8]:
# `trainer_params` refers to the `trainer` node of `conf` (not a copy), so the
# epoch override below is stored in `conf` itself.
trainer_params = conf.trainer
trainer_params.max_epochs = 15
print(OmegaConf.to_yaml(trainer_params))


gpus: 1
auto_select_gpus: false
max_epochs: 15
deterministic: true



In [9]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

# Trainer keyword arguments come straight from the `trainer` config section.
trainer_params = conf.trainer

callbacks = [
    # Write a checkpoint every 5 epochs; save_top_k=-1 keeps all of them.
    ModelCheckpoint(every_n_epochs=5, save_top_k=-1),
    # Record the learning rate at every optimisation step.
    LearningRateMonitor(logging_interval='step'),
]

# Named `tb_logger` so it does not overwrite the module-level `logger`
# (the standard `logging` logger created in the setup cell).
tb_logger = TensorBoardLogger(save_dir='lightning_logs', name=conf.get('logger_name'))
trainer = pl.Trainer(**trainer_params, callbacks=callbacks, logger=tb_logger)

In [None]:
%%time
# Print the logger's run version before training starts.
print(f'logger.version = {trainer.logger.version}')
# Train `model` on the data module built above.
trainer.fit(model, dm)
# Show the metrics logged by the trainer once fitting has finished.
print(trainer.logged_metrics)

logger.version = 6


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

## Save model

In [12]:
# Create the destination directory first: torch.save does not create missing
# parent folders, and failing here would discard the result of the training run.
model_dir = os.path.dirname(conf.model_path)
if model_dir:  # empty when the path has no directory component
    os.makedirs(model_dir, exist_ok=True)

# Persist only the weights (state dict), not the whole module object.
torch.save(model.state_dict(), conf.model_path)
print(f'Model weights saved to "{conf.model_path}"')

Model weights saved to "models/cpc.p"
