# Bank dataset example

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ogunlao/saint/blob/main/notebooks/Bank_Dataset.ipynb)

This example notebook is designed to run seamlessly on Google Colab. To run it locally, you may need to change the paths.

In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
# Clone the SAINT repository into ./saint. Later cells read its config file,
# its source package and its data folder from that directory.

!git clone https://github.com/ogunlao/saint.git

In [None]:
!pip install -r saint/requirements.txt

**Warning: on Colab, restart the runtime after installing the requirements and before running the cells below.**

## General setup: configuration parameters from the paper

In [None]:
import os

import torch
import pandas as pd
import numpy as np

In [None]:
# --- Config Information ---#
config_path = 'saint/configs/config.yaml'

# Import YAML from `ruamel.yaml`, falling back to the `ruamel_yaml` module
# name when the former cannot be found.
try:
    from ruamel.yaml import YAML
except ModuleNotFoundError:
    from ruamel_yaml import YAML

# Parse the configuration file with the "safe" loader, then show the result.
yaml = YAML(typ='safe')
with open(config_path) as config_file:
    args = yaml.load(config_file)

print(args)

## Dataset preprocessing

###### Bank dataset [Link](https://archive.ics.uci.edu/ml/datasets/bank+marketing)

In [None]:
data_folder = "saint/data"

# download the data into a data folder
URL_LINK = "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip"

!wget -P {data_folder} {URL_LINK}
!unzip {data_folder}/bank.zip -d {data_folder}

In [None]:
# Read the full Bank Marketing table; the file uses ';' as its field separator.
bank_csv_path = os.path.join(data_folder, 'bank-full.csv')
bank_df = pd.read_csv(bank_csv_path, sep=';')

# Keep the target as a one-column DataFrame, then remove it from the features.
y = bank_df[['y']].copy()
bank_df = bank_df.drop(columns=['y'])

bank_df.head()

In [None]:
y.shape[0]  # number of rows; the full dataset should contain 45211

In [None]:
y.value_counts() # The target classes are imbalanced, so training will require some form of balanced sampling

In [None]:
# Count the missing values in every feature column
bank_df.isnull().sum()

In [None]:
from saint.src.dataset import generate_splits, preprocess

In [None]:
# Only 200 samples are reserved for supervised training; all the others are
# used for self-supervised (ssl) training.
num_supervised_train_data = 200

# Compute the index sets of the four partitions from the config's split settings.
preproc_cfg = args['preproc']
split_indices = generate_splits(
    len(bank_df),
    num_supervised_train_data,
    preproc_cfg['validation_split'],
    preproc_cfg['test_split'],
    args['seed'],
)
sup_train_indices, val_indices, test_indices, ssl_train_indices = split_indices

In [None]:
# Preprocess features and target; the CLS token index comes from the transformer config.
cls_token_idx = args['transformer']['cls_token_idx']
df_proc, y_proc, no_num, no_cat, cats = preprocess(bank_df, y, cls_token_idx)

In [None]:
# These variables will need to be added to the config files in "configs/data/bank_*" before training

for label, value in (
    ('no of numerical columns: ', no_num),
    ('no of categorical columns: ', no_cat),
    ('list of categories in each categorical column: ', cats),
):
    print(label, value)

In [None]:
#### args.num_features = args.no_num + args.no_cat

def _select_rows(indices):
    """Return the (features, target) rows of the preprocessed data at `indices`."""
    return df_proc.iloc[indices], y_proc.iloc[indices]

# Split the data into train, val and test using the generated indices
train_df, train_y = _select_rows(sup_train_indices)
val_df, val_y = _select_rows(val_indices)
test_df, test_y = _select_rows(test_indices)

In [None]:
# Build the self-supervised training split, unless every sample was requested
# for supervised training ('all'), in which case there is no SSL data.

if num_supervised_train_data == 'all':
    train_ssl, train_ssl_y = None, None
else:
    train_ssl = df_proc.iloc[ssl_train_indices]
    train_ssl_y = y_proc.iloc[ssl_train_indices]

In [None]:
# Save every split as a CSV file (without the index column) inside `data_folder`.
# The paths are derived from `data_folder` rather than hard-coded, so changing
# the data directory in one place keeps downloads and saved splits together.

splits_to_save = {
    'train': train_df,
    'train_y': train_y,
    'val': val_df,
    'val_y': val_y,
    'test': test_df,
    'test_y': test_y,
    # The SSL entries are None when no self-supervised split was generated.
    'train_ssl': train_ssl,
    'train_ssl_y': train_ssl_y,
}

for split_name, split_frame in splits_to_save.items():
    if split_frame is not None:
        split_frame.to_csv(os.path.join(data_folder, split_name + '.csv'), index=False)

### Self-supervised training

In [None]:
# Number of GPUs available, if any.
# NOTE(review): this value is not interpolated into any command in this notebook;
# presumably it should be passed explicitly (e.g. trainer.gpus={num_gpus}) — TODO confirm
num_gpus = 1

In [None]:
# Train the saint model in the self-supervised setting on the bank_ssl data config.
# To use GPUs, add trainer.gpus=1 to the command, where "1" is the total number of GPUs.
# NOTE(review): data.data_folder is an absolute Colab path; adjust it when running locally.

!python saint/main.py experiment=self-supervised \
    experiment.model=saint \
    data.data_folder=/content/saint/data \
    data=bank_ssl

In [None]:
# Set this to the checkpoint of the self-supervised pretrained model; its
# location can be retrieved from the training logs.

# The path below is only an example from one run and must be replaced with your own.
best_ssl_model_ckpt = "/content/outputs/2021-11-01/10-09-16/lightning_logs/version_0/checkpoints/0-916.ckpt"

In [None]:
# Train a supervised model on the bank_sup data config, initialized from the
# self-supervised checkpoint stored in `best_ssl_model_ckpt`.
# NOTE(review): data.data_folder is an absolute Colab path; adjust it when running locally.

!python saint/main.py experiment=supervised \
    experiment.model=saint \
    data.data_folder=/content/saint/data \
    data=bank_sup \
    experiment.pretrained_checkpoint={best_ssl_model_ckpt}


In [None]:
# Alternatively, train a supervised model without a pretrained checkpoint,
# i.e. starting from randomly initialized weights.
# NOTE(review): data.data_folder is an absolute Colab path; adjust it when running locally.

!python saint/main.py experiment=supervised \
    experiment.model=saint \
    data.data_folder=/content/saint/data \
    data=bank_sup

In [None]:
# Predict on a test dataset with a trained checkpoint.
# Check saint/experiment/predict for the available params.

# Example checkpoint path from one run; replace it with the checkpoint you want to use.
pretrained_checkpoint = "/content/outputs/2021-11-01/13-30-49/lightning_logs/version_0/checkpoints/2-20.ckpt"

# NOTE(review): experiment.pred_sav_path is presumably where the predictions are written — confirm.
!python saint/predict.py experiment=predict \
    experiment.model=saint \
    data=bank_sup \
    data.data_folder=/content/saint/data \
    experiment.pretrained_checkpoint={pretrained_checkpoint} \
    experiment.pred_sav_path=/content/predict.csv

## View Tensorboard

In [None]:
# View plots and hyperparameters in TensorBoard.
# The --logdir value below is an example from one run; point it at the
# lightning_logs directory of the run you want to inspect.

%load_ext tensorboard
%tensorboard --logdir /content/outputs/2021-11-01/10-14-14/lightning_logs # change lightning log path

## The End