In [1]:
import os

import torch
import pandas as pd
import numpy as np


In [2]:
# from src.dataset import generate_splits, preprocess


In [3]:
train_merged = pd.read_csv('data/ieee-train-merged_imputed_cleaned_dropped_stuff.csv', index_col=None)

print(train_merged.columns)
print(train_merged.describe())

print(train_merged.shape)


Index(['Unnamed: 0', 'isFraud', 'TransactionDT', 'TransactionAmt', 'ProductCD',
       'card1', 'card2', 'card3', 'card4', 'card5',
       ...
       'V312', 'V313', 'V314', 'V315', 'V316', 'V317', 'V318', 'V319', 'V320',
       'V321'],
      dtype='object', length=182)
          Unnamed: 0        isFraud  TransactionDT  TransactionAmt  \
count  590540.000000  590540.000000   5.905400e+05   590540.000000   
mean   295269.500000       0.034990   7.372311e+06      135.027176   
std    170474.358321       0.183755   4.617224e+06      239.162522   
min         0.000000       0.000000   8.640000e+04        0.251000   
25%    147634.750000       0.000000   3.027058e+06       43.321000   
50%    295269.500000       0.000000   7.306528e+06       68.769000   
75%    442904.250000       0.000000   1.124662e+07      125.000000   
max    590539.000000       1.000000   1.581113e+07    31937.391000   

               card1          card2          card3          card5  \
count  590540.000000  590540

In [4]:
# Total rows: 590540
# Percent: 0.45 %


In [5]:
y = train_merged['isFraud'].copy()
print(y)
print("-------")
print(y.value_counts())


0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
590535    0.0
590536    0.0
590537    0.0
590538    0.0
590539    0.0
Name: isFraud, Length: 590540, dtype: float64
-------
isFraud
0.0    569877
1.0     20663
Name: count, dtype: int64


In [6]:

train_merged.drop(columns=['isFraud'], inplace=True)


In [7]:
train_merged.shape


(590540, 181)

In [13]:
from src.dataset import generate_splits, preprocess


In [14]:
# separate 200 samples for supervised training, all others are used in ssl training
num_supervised_train_data = 2657

# get split indices
sup_train_indices, val_indices, test_indices, ssl_train_indices = generate_splits(len(train_merged), 
                                                                            num_supervised_train_data,
                                                                            args['preproc']['validation_split'],
                                                                            args['preproc']['test_split'],
                                                                            args['seed'],)


In [15]:
df_proc, y_proc, no_num, no_cat, cats  = preprocess(train_merged, y, args['transformer']['cls_token_idx'])


In [16]:
cat_cols = train_merged.select_dtypes(include=['object','category']).columns
cat_cols
# cat_data = data[cat_cols].copy()


Index(['ProductCD', 'card4', 'card6', 'P_emaildomain'], dtype='object')

In [17]:
#This variables will need to be added to the config files in "configs/data/bank_*" before training

print('no of numerical columns: ', no_num)
print('no of categorical columns: ', no_cat)

print('list of categories in each categorical column: ', cats)


no of numerical columns:  177
no of categorical columns:  5
list of categories in each categorical column:  [1, 5, 4, 4, 59]


In [18]:
#### args.num_features = args.no_num + args.no_cat

# split data into train, val and test using generated indices

train_df, train_y   = df_proc.iloc[sup_train_indices], y_proc.iloc[sup_train_indices]
val_df, val_y       = df_proc.iloc[val_indices], y_proc.iloc[val_indices]
test_df, test_y     = df_proc.iloc[test_indices], y_proc.iloc[test_indices]


In [19]:
# Generate data for self-supervised training if specified

train_ssl, train_ssl_y = None, None

if num_supervised_train_data != 'all':
    train_ssl, train_ssl_y = df_proc.iloc[ssl_train_indices], y_proc.iloc[ssl_train_indices]


In [23]:
# save dataframes in the data directory

train_df.to_csv('./IEEE_Preprocess/train.csv' , index=False)
train_y.to_csv('./IEEE_Preprocess/train_y.csv' , index=False)
val_df.to_csv('./IEEE_Preprocess/val.csv' , index=False)
val_y.to_csv('./IEEE_Preprocess/val_y.csv' , index=False)
test_df.to_csv('./IEEE_Preprocess/test.csv' , index=False)
test_y.to_csv('./IEEE_Preprocess/test_y.csv' , index=False)

if train_ssl is not None:
    train_ssl.to_csv('./IEEE_Preprocess/train_ssl.csv' , index=False)

if train_ssl_y is not None:
    train_ssl_y.to_csv('./IEEE_Preprocess/train_ssl_y.csv' , index=False)


In [20]:
import pytorch_lightning as li


In [21]:
# import os
# os.environ['SETUPTOOLS_USE_DISTUTILS'] = 'stdlib'


In [24]:
import os
os.environ['HYDRA_FULL_ERROR'] = '1'


In [38]:
# --- Config Information ---#
try:
    from ruamel.yaml import YAML
except ModuleNotFoundError:
    from ruamel_yaml import YAML

config_path = './configs/config.yaml'

yaml = YAML(typ='safe')
with open(config_path) as f:
    args = yaml.load(f)

print(args)


{'defaults': ['_self_', {'experiment': 'supervised'}, {'data': 'bank_sup'}], 'seed': 1234, 'transformer': {'num_layers': 6, 'num_heads': 8, 'dropout': 0.1, 'dropout_ff': 0.1, 'embed_dim': 32, 'd_ff': 32, 'cls_token_idx': 0}, 'augmentation': {'prob_cutmix': 0.3, 'alpha': 0.2, 'lambda_pt': 10}, 'optimizer': {'temperature': 0.7, 'proj_head_dim': 128, 'beta_1': 0.9, 'beta_2': 0.99, 'lr': 0.0001, 'weight_decay': 0.01, 'optim': 'adamw', 'metric': 'auroc'}, 'preproc': {'data_folder': None, 'train_split': 0.65, 'validation_split': 0.15, 'test_split': 0.2, 'num_supervised_train_data': None}, 'callback': {'monitor': 'val_loss', 'mode': 'min', 'auto_insert_metric_name': False}, 'trainer': {'max_epochs': 1, 'accelerator': 'cpu', 'devices': 1, 'deterministic': True, 'default_root_dir': None}, 'dataloader': {'shuffle_val': False, 'train_bs': 256, 'val_bs': 32, 'test_bs': 32, 'num_workers': 2, 'pin_memory': False}, 'metric': '${optimizer.metric}', 'print_config': True}


In [39]:
# Train saint model in self-supervised settings. 
# To use gpus, add trainer.gpus=1 where "1" is the total no of gpus to the command

# python main.py experiment=self-supervised \
#     experiment.model=saint \
#     data.data_folder=/Users/annimukh/Documents/acode/iclr-/saint-unofficial/IEEE_Preprocess \
#     data=bank_ssl 

!python main.py experiment=self-supervised \
    experiment.model=saint \
    data.data_folder=/Users/annimukh/Documents/acode/iclr-/saint-unofficial/IEEE_Preprocess \
    data=bank_ssl \
    trainer.accelerator=mps


{'seed': 1234, 'transformer': {'num_layers': 6, 'num_heads': 8, 'dropout': 0.1, 'dropout_ff': 0.1, 'embed_dim': 32, 'd_ff': 32, 'cls_token_idx': 0}, 'augmentation': {'prob_cutmix': 0.3, 'alpha': 0.2, 'lambda_pt': 10}, 'optimizer': {'temperature': 0.7, 'proj_head_dim': 128, 'beta_1': 0.9, 'beta_2': 0.99, 'lr': 0.0001, 'weight_decay': 0.01, 'optim': 'adamw', 'metric': 'auroc'}, 'preproc': {'data_folder': None, 'train_split': 0.65, 'validation_split': 0.15, 'test_split': 0.2, 'num_supervised_train_data': None}, 'callback': {'monitor': 'val_loss', 'mode': 'min', 'auto_insert_metric_name': False}, 'trainer': {'max_epochs': 1, 'accelerator': 'mps', 'devices': 1, 'deterministic': True, 'default_root_dir': None}, 'dataloader': {'shuffle_val': False, 'train_bs': 256, 'val_bs': 32, 'test_bs': 32, 'num_workers': 2, 'pin_memory': False}, 'metric': '${optimizer.metric}', 'print_config': True, 'experiment': {'experiment': 'self-supervised', 'task': 'classification', 'model': 'saint', 'num_output': N

In [40]:
print("PyTorch Lightning Version:", li.__version__)


PyTorch Lightning Version: 2.5.0.post0


In [46]:
best_ssl_model_ckpt = "/Users/annimukh/Documents/acode/iclr-/saint-unofficial/lightning_logs/version_8/checkpoints/0-1490.ckpt"
data_path = '/Users/annimukh/Documents/acode/iclr-/saint-unofficial/IEEE_Preprocess'


In [61]:
!python main.py experiment=supervised \
    experiment.model=saint \
    data.data_folder=/Users/annimukh/Documents/acode/iclr-/saint-unofficial/IEEE_Preprocess \
    data=bank_sup \
    experiment.pretrained_checkpoint={best_ssl_model_ckpt}


{'seed': 1234, 'transformer': {'num_layers': 6, 'num_heads': 8, 'dropout': 0.1, 'dropout_ff': 0.1, 'embed_dim': 32, 'd_ff': 32, 'cls_token_idx': 0}, 'augmentation': {'prob_cutmix': 0.3, 'alpha': 0.2, 'lambda_pt': 10}, 'optimizer': {'temperature': 0.7, 'proj_head_dim': 128, 'beta_1': 0.9, 'beta_2': 0.99, 'lr': 0.0001, 'weight_decay': 0.01, 'optim': 'adamw', 'metric': 'auroc'}, 'preproc': {'data_folder': None, 'train_split': 0.65, 'validation_split': 0.15, 'test_split': 0.2, 'num_supervised_train_data': None}, 'callback': {'monitor': 'val_loss', 'mode': 'min', 'auto_insert_metric_name': False}, 'trainer': {'max_epochs': 1, 'accelerator': 'cpu', 'devices': 1, 'deterministic': True, 'default_root_dir': None}, 'dataloader': {'shuffle_val': False, 'train_bs': 256, 'val_bs': 32, 'test_bs': 32, 'num_workers': 2, 'pin_memory': False}, 'metric': '${optimizer.metric}', 'print_config': True, 'experiment': {'experiment': 'supervised', 'task': 'classification', 'model': 'saint', 'num_output': 1, 'fr

In [62]:
pretrained_checkpoint = "/Users/annimukh/Documents/acode/iclr-/saint-unofficial/lightning_logs/version_13/checkpoints/0-11.ckpt"

!python predict.py experiment=predict \
    experiment.model=saint \
    data=bank_sup \
    data.data_folder=/Users/annimukh/Documents/acode/iclr-/saint-unofficial/IEEE_Preprocess \
    experiment.pretrained_checkpoint={pretrained_checkpoint} \
    experiment.pred_sav_path=/Users/annimukh/Documents/acode/iclr-/saint-unofficial/outputs/predictions.csv


The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  @hydra.main(config_path="configs", config_name="config")
See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.
  ret = run_job(
{'seed': 1234, 'transformer': {'num_layers': 6, 'num_heads': 8, 'dropout': 0.1, 'dropout_ff': 0.1, 'embed_dim': 32, 'd_ff': 32, 'cls_token_idx': 0}, 'augmentation': {'prob_cutmix': 0.3, 'alpha': 0.2, 'lambda_pt': 10}, 'optimizer': {'temperature': 0.7, 'proj_head_dim': 128, 'beta_1': 0.9, 'beta_2': 0.99, 'lr': 0.0001, 'weight_decay': 0.01, 'optim': 'adamw', 'metric': 'auroc'}, 'preproc': {'data_folder': None, 'train_split': 0.65, 'validation_split': 0.15, 'test_split': 0.2, 'num_supervised_train_data': None}, 'callback': {'monitor': 'val_loss', 'mode': 'min', 'auto_insert_metric_name': False}, 'trainer': {'max_epochs': 1, 'accelerator': 'cpu', 'devices': 1, 'deterministic': True,