# Imports and Configuration

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from tqdm import tqdm
import os
import random
import pickle
import warnings
import gc
import joblib

from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import (precision_recall_curve, roc_curve, make_scorer, log_loss,
                             f1_score, confusion_matrix, auc, roc_auc_score, accuracy_score)

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn.functional as F
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.pretraining import TabNetPretrainer
from torch.optim.lr_scheduler import ReduceLROnPlateau

import optuna
from optuna.samplers import TPESampler

# Run on the GPU whenever torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

RANDOM_STATE = 4158
CHUNKSIZE = 50000  # rows per chunk when the CSV files are read

# Unpickle the object stored in ./dtypes.pkl and bind it to `dtypes`.
# NOTE(review): pickle.load can execute arbitrary code -- only load a
# dtypes.pkl that was produced by a trusted source.
with open("./dtypes.pkl", 'rb') as f:
    dtypes = pickle.load(f)

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# NOTE(review): this silences *all* warnings, including pandas'
# SettingWithCopyWarning -- consider narrowing the filter.
warnings.filterwarnings('ignore')
print(f"Using PyTorch version: {torch.__version__}, Device: {DEVICE}")
%matplotlib inline

Using PyTorch version: 1.12.1, Device: cuda


# Helper Functions

In [2]:
def loaddata(fname:str, chunksize:int, dtype:dict=None, columns:list=None):
    """Read a CSV file chunk by chunk and return it as one DataFrame.

    Parameters
    ----------
    fname : str
        Path of the CSV file to read.
    chunksize : int
        Number of rows parsed per chunk (forwarded to ``pd.read_csv``).
    dtype : dict, optional
        Column-name -> dtype mapping forwarded to ``pd.read_csv``.
    columns : list, optional
        Subset of columns to load, forwarded as ``usecols``. ``None``
        (the default) loads every column.

    Returns
    -------
    pd.DataFrame
        All chunks stacked row-wise in file order. An empty DataFrame is
        returned when the reader yields no chunks.
    """
    reader = pd.read_csv(fname, engine='python', low_memory=True,
                         chunksize=chunksize, dtype=dtype, usecols=columns)

    # Collect the chunks first and concatenate once: re-concatenating the
    # growing frame inside the loop copies every previously read row again
    # on each iteration.
    chunks = list(tqdm(reader))
    if not chunks:
        # pd.concat raises on an empty list, so handle "no chunks" explicitly.
        return pd.DataFrame()

    return pd.concat(chunks, axis=0)

# Data Load

In [3]:
# The private split is parsed with the pickled dtype mapping; no dtype= is
# passed for the test split, so pandas infers its column types.
# NOTE(review): confirm the missing dtype= on the test split is intentional.
private = loaddata(
    "../Data/master_private_data.csv",
    chunksize=CHUNKSIZE,
    dtype=dtypes,
)
test = loaddata(
    "../Data/master_test_data.csv",
    chunksize=CHUNKSIZE,
)

44it [01:33,  2.13s/it]
66it [02:24,  2.19s/it]


In [4]:
# The public split is parsed with the same dtype mapping as the private split.
public = loaddata(
    "../Data/master_public_data.csv",
    chunksize=CHUNKSIZE,
    dtype=dtypes,
)

19it [00:39,  2.09s/it]


In [5]:
# Names of the columns handled as categorical features.
cat_feats = [
    'income_type',
    'employment_type',
    'houseown_type',
    'purpose',
    'personal_rehabilitation_yn',
    'personal_rehabilitation_complete_yn',
    'gender',
    'bank_diversity',
    'product_diversity',
    'holiday',
    'weekday',
    'foreign',
    'segment',
    'bank_label',
    'product_label',
]

# Positional index, within `private`, of every column listed in cat_feats.
cat_idxs = [
    position
    for position, name in enumerate(private.columns.values)
    if name in cat_feats
]

In [6]:
# Step 1: cast boolean columns to uint8 in all three splits. The column list
# is taken from `private`, so every one of those columns must also exist in
# `test` and `public`.
for frame in (private, test, public):
    for col in private.columns.values:
        if frame[col].dtype == 'bool':
            frame[col] = frame[col].astype('uint8')

# Step 2: shift each categorical column so that its smallest value becomes 0.
# NOTE(review): the minimum is taken per split, so a split that lacks the
# lowest level gets a different offset than the others -- verify that the
# codes still line up across private/test/public.
for frame in (private, test, public):
    for col in cat_feats:
        lowest = frame[col].min()
        if lowest != 0:
            frame[col] -= lowest

# cat_dims = [[col].nunique() for col in cat_feats]

In [7]:
# Replace the product_diversity code 79 with 9 in the test split.
# NOTE(review): the reason for the 79 -> 9 mapping is not visible here --
# presumably 79 is a level the model never saw; confirm with the data owner.
#
# A single boolean-mask .loc assignment is used instead of looping over every
# row with test['product_diversity'].iloc[i] = 9: that chained form may write
# to a temporary copy rather than to `test`, and the row-by-row loop is
# needlessly slow.
test.loc[test['product_diversity'] == 79, 'product_diversity'] = 9

In [8]:
# Print the sorted distinct values of every categorical column, private and
# test side by side on one line, to spot levels present in only one split.
print("Unique categories")
for col in cat_feats:
    private_levels = np.sort(private[col].unique())
    test_levels = np.sort(test[col].unique())
    print(f"\tprivate {col} -> {private_levels}\ttest {col} -> {test_levels}")

Unique categories
	private income_type -> [0 1 2 3 4 5]	test income_type -> [0 1 2 3 4 5]
	private employment_type -> [0 1 2 3]	test employment_type -> [0 1 2 3]
	private houseown_type -> [0 1 2 3]	test houseown_type -> [0 1 2 3]
	private purpose -> [0 1 2 3 4 5 6 7]	test purpose -> [0 1 2 3 4 5 6 7]
	private personal_rehabilitation_yn -> [0 1]	test personal_rehabilitation_yn -> [0. 1.]
	private personal_rehabilitation_complete_yn -> [0 1]	test personal_rehabilitation_complete_yn -> [0. 1.]
	private gender -> [0 1]	test gender -> [0. 1.]
	private bank_diversity -> [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55]	test bank_diversity -> [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54]
	private product_diversity -> [ 0  1  2  3  4  5  6  7  8  9 10 1

In [9]:
# Separate the 'is_applied' column (target) from the remaining columns
# (features) of the public split.
y_public = public['is_applied']
X_public = public.drop(columns='is_applied')

## Evaluation Metric

In [10]:
class F1Score(Metric):
    """F1 metric named "f1", flagged as a score to maximize."""

    def __init__(self):
        # Attributes read by the Metric machinery: higher is better, and the
        # metric is reported under the name "f1".
        self._maximize = True
        self._name = "f1"

    def __call__(self, y_true, y_score):
        """Return the F1 score of `y_score` thresholded at 0.5.

        Column 1 of `y_score` is compared against 0.5 (strictly greater) to
        obtain 0/1 predictions, which are then scored against `y_true`.
        """
        # assumes column 1 holds the positive-class probability -- TODO confirm
        positive_proba = y_score[:, 1]
        y_pred = (positive_proba > 0.5).astype(np.uint8)
        return f1_score(y_true, y_pred)

In [11]:
# Load the previously saved classifier from disk.
# NOTE(review): joblib.load unpickles the file, so only load model files from
# a trusted source.
clf = joblib.load("../Model/clf_no_sampling.pkl")

In [12]:
# Class probabilities for the public split (features only, target removed).
public_features = X_public.to_numpy()
public_pred_proba = clf.predict_proba(public_features)

In [13]:
# Class probabilities for the private split. Every column of `private` is
# passed to the model as a positional array.
# NOTE(review): assumes the column order matches X_public -- TODO confirm.
private_features = private.to_numpy()
private_pred_proba = clf.predict_proba(private_features)

In [14]:
# Class probabilities for the test split. Every column of `test` is passed to
# the model as a positional array.
# NOTE(review): assumes the column order matches X_public -- TODO confirm.
test_features = test.to_numpy()
test_pred_proba = clf.predict_proba(test_features)

In [20]:
# Turn column 1 of each probability array into 0/1 labels: a value of 0.5 or
# more becomes 1, anything below becomes 0.
public_pred, private_pred, test_pred = (
    (proba[:, 1] >= 0.5).astype(np.uint8)
    for proba in (public_pred_proba, private_pred_proba, test_pred_proba)
)

# Deployment CSV

In [21]:
# Wrap each label vector in a one-column frame named 'predict'.
public_pred, private_pred, test_pred = (
    pd.DataFrame({'predict': labels})
    for labels in (public_pred, private_pred, test_pred)
)

# Write one CSV per split, without the index column.
for frame, csv_path in (
    (public_pred, '../Data/tabnet_public_predict.csv'),
    (private_pred, '../Data/tabnet_private_predict.csv'),
    (test_pred, '../Data/tabnet_test_predict.csv'),
):
    frame.to_csv(csv_path, index=False)