[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ogunlao/saint/blob/main/notebooks/Income_Dataset.ipynb)

# Cloning the repo & installing requirements

In [None]:
# Clone the SAINT repository into the current working directory (creates ./saint).
# Later cells refer to the clone as /content/saint.
!git clone https://github.com/ogunlao/saint.git

In [None]:
import pandas as pd
import numpy as np

In [None]:
!pip3 install -r '/content/saint/requirements.txt' 

# Reading the data

To follow this tutorial, download the "income" dataset from Kaggle and place its `train.csv` and `test.csv` files in `/content/data/`.

In [None]:
# Locations of the two CSV files of the "income" dataset.
train_csv_path = '/content/data/train.csv'
test_csv_path = '/content/data/test.csv'

# training rows (still containing the target column)
train = pd.read_csv(train_csv_path)

# test rows, read for predictions
test = pd.read_csv(test_csv_path)

In [None]:
# Display the column labels of the raw training frame.
train.columns

# Custom preprocessing

In [None]:
# Name of the label column in the training data.
target_column = 'income_>50K'

# Keep the labels as a one-column DataFrame, then remove them from the features.
train_y = train[[target_column]]
train = train.drop(columns=target_column)

In [None]:
# Concatenate train and test so both are preprocessed together.
# Row order is train first, then test; ignore_index=True gives the combined
# frame unique row labels instead of repeating each file's 0..n-1 labels.
df = pd.concat([train, test], ignore_index=True)


In [None]:
from sklearn import preprocessing
def _is_categorical_dtype(dtype):
    """Return True for the dtypes that `preprocess` label-encodes (object, category, string)."""
    return (
        pd.api.types.is_object_dtype(dtype)
        or isinstance(dtype, (pd.CategoricalDtype, pd.StringDtype))
    )


def preprocess(data, cls_token_idx):
    """
    Encode a raw feature table as integer category codes and z-scored numbers.

    A constant 'cls' column is inserted, every object/category/string column is
    label-encoded, and every other column is standardised.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw features. The frame is copied, so the caller's object is not modified.
    cls_token_idx : int
        Column position at which the constant 'cls' column is inserted.

    Returns
    -------
    new_data : pandas.DataFrame
        Categorical columns (int32 codes) followed by numerical columns
        (float32 z-scores); with ``cls_token_idx=0`` the order is
        [cls, categorical features, numerical features].
    int
        Number of numerical columns.
    int
        Number of categorical columns (the 'cls' column is counted).
    list of int
        Number of distinct codes in each categorical column, in column order.
    """
    # Work on a copy so the caller's frame is left untouched.
    data = data.copy()

    # Add the cls token column; being a string column it is treated as categorical.
    data.insert(loc=cls_token_idx, column='cls', value='cls')

    cat_cols = [col for col, dtype in data.dtypes.items() if _is_categorical_dtype(dtype)]
    num_cols = [col for col in data.columns if col not in cat_cols]

    # z-transform the numerical columns.
    num_data = data[num_cols]
    col_std = num_data.std()
    # A constant (or single-row) column has a zero/undefined spread; divide by 1
    # so it becomes 0 instead of NaN, which would be mistaken for missing data below.
    col_std = col_std.where(col_std > 0, 1.0)
    num_data = (num_data - num_data.mean()) / col_std
    # num_data = (num_data-num_data.min())/(num_data.max() - num_data.min()) min-max scaling

    # Missing numerical values get a sentinel far outside the z-score range.
    num_data = num_data.fillna(-99999)

    # Missing categories become their own 'NaN' label. Casting to object first
    # lets the label be added to category/string columns as well.
    cat_data = data[cat_cols].astype(object).fillna('NaN')

    # Label encoding: each column's sorted distinct values map to 0..k-1.
    cat_data = pd.DataFrame(
        {col: pd.factorize(cat_data[col], sort=True)[0] for col in cat_cols},
        index=cat_data.index,
        columns=cat_cols,
    )

    # cat columns come first
    new_data = pd.concat([cat_data.astype(np.int32), num_data.astype(np.float32)], axis=1)

    # Number of categories in each categorical column.
    cats = [int(cat_data[col].nunique()) for col in cat_cols]

    return new_data, len(num_cols), len(cat_cols), cats

In [None]:
# preprocess returns, in order: the encoded frame, the number of numerical
# columns, the number of categorical columns, and a list with the number of
# categories in each categorical column.
# Note: `cats` therefore holds the categorical-column COUNT, while the
# per-column list is unpacked into `_`.
processed_data, nums, cats, _= preprocess(df,cls_token_idx=0)

In [None]:
# check the numerical-column count, the categorical-column count and the
# per-column category counts (the list unpacked into `_` by the preprocess call)
nums,cats,_

# Splitting the dataset

In [None]:
def generate_splits(dataset_size,
                      validation_split,
                      random_seed, shuffle_dataset=True,):
    """Split the row positions ``0 .. dataset_size-1`` into train and validation lists.

    Parameters
    ----------
    dataset_size : int
        Number of rows to split.
    validation_split : float
        Fraction of rows (between 0 and 1) assigned to validation; the
        validation size is ``int(validation_split * dataset_size)``.
    random_seed : int or None
        Seed for the shuffle; the same seed always gives the same split.
    shuffle_dataset : bool, default True
        When False the first rows become the validation set, in order.

    Returns
    -------
    (train_indices, val_indices) : tuple of two lists of int

    Raises
    ------
    ValueError
        If ``validation_split`` is not within [0, 1].
    """
    if not 0 <= validation_split <= 1:
        raise ValueError(
            f"validation_split must be between 0 and 1, got {validation_split!r}"
        )

    # Creating data indices for training and validation splits:
    indices = list(range(dataset_size))
    split_val = int(validation_split * dataset_size)

    if shuffle_dataset:
        # A private generator yields the same permutation as seeding the global
        # NumPy RNG, without resetting the random state of the rest of the notebook.
        np.random.RandomState(random_seed).shuffle(indices)

    val_indices = indices[:split_val]
    train_indices = indices[split_val:]

    return train_indices, val_indices

In [None]:
# processed_data holds the training rows first, followed by the test rows,
# so the original training row count marks the boundary between them.
n_train_rows = len(train)
train = processed_data.iloc[:n_train_rows]
test = processed_data.iloc[n_train_rows:]

# Hold out 25% of the training rows for validation, with a fixed seed.
train_indices, val_indices = generate_splits(n_train_rows, 0.25, random_seed=1234)

# Features and labels are selected with the same row positions.
x_train = train.iloc[train_indices]
y_train = train_y.iloc[train_indices]
x_val = train.iloc[val_indices]
y_val = train_y.iloc[val_indices]


In [None]:
# Spot-check a single training label by position. 32967 is presumably the last
# row of the training split for the Kaggle income data — TODO confirm; the
# lookup raises IndexError when y_train has fewer than 32968 rows.
y_train.values[32967]

# Saving CSV files

In [None]:
# Write every split into the cloned repository's data folder, without the row index.
split_outputs = (
    (x_train, '/content/saint/data/train.csv'),
    (y_train, '/content/saint/data/train_y.csv'),
    (x_val, '/content/saint/data/val.csv'),
    (y_val, '/content/saint/data/val_y.csv'),
)
for split_frame, csv_path in split_outputs:
    split_frame.to_csv(csv_path, index=False)


In [None]:
# Make the cloned repository the working directory so main.py can be run by relative path.
%cd '/content/saint/'

In [None]:
# run this cell to train saint model using config

!python main.py --experiment sup --no_cat 9 --no_num 6 

In [None]:
# List the checkpoint files written under lightning_logs/version_0 to confirm that training saved a model.
!ls /content/saint/checkpoints/lightning_logs/version_0/checkpoints/