### **Library Imports**

In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import random as r
import seaborn as sns
import matplotlib.pyplot as plt

from time import time
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, classification_report

### **Utilities and Constants**

In [2]:
SEED = 42
N_SPLITS = 5
sc_X = StandardScaler()
BASE_PATH = "../input/santander-customer-transaction-prediction"

LOG_PATH = "../logs"
MODEL_PATH = "../models"

if not os.path.exists(LOG_PATH): os.makedirs(LOG_PATH)
if not os.path.exists(MODEL_PATH): os.makedirs(MODEL_PATH)

LOG_FILE = LOG_PATH + "/training_logs.txt"
with open(LOG_FILE, "w+") as f: f.write("Training Logs\n\n\n")

In [3]:
def breaker(num: int=50, char: str="*") -> None:
    print("\n" + num*char + "\n")


def get_scores(y_true: np.ndarray, y_pred: np.ndarray) -> tuple:
    accuracy = accuracy_score(y_pred, y_true)
    try:
        auc = roc_auc_score(y_pred, y_true, average="weighted")
    except:
        auc = 0
    precision, recall, f_score, _ = precision_recall_fscore_support(y_pred, y_true, average="weighted")

    return accuracy, auc, precision, recall, f_score


def print_scores(accuracy: float, auc: float, precision: np.ndarray, recall: np.ndarray, f_score: np.ndarray) -> None:
    print(f"Accuracy  : {accuracy:.5f}")
    print(f"ROC-AUC   : {auc:.5f}")
    print(f"Precision : {precision:.5f}")
    print(f"Recall    : {recall:.5f}")
    print(f"F-Score   : {f_score:.5f}")

    
def train(model, model_path: str, X: np.ndarray, y: np.ndarray, n_splits: int, log_file: str, seed: int) -> tuple:
    accuracies, aucs, precisions, recalls, f_scores = [], [], [], [], []

    with open(log_file, "a") as f:
        f.write("XGB CLASSIFIER\n\n")

    fold = 1
    start_time = time()
    for tr_idx, va_idx in StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed).split(X, y):
        print(f"Fold {fold}\n")
            
        X_train, X_valid, y_train, y_valid = X[tr_idx], X[va_idx], y[tr_idx], y[va_idx]

        X_train = sc_X.fit_transform(X_train)
        X_valid = sc_X.transform(X_valid)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_valid)

        accuracy, auc, precision, recall, f_score = get_scores(y_valid, y_pred)
        print_scores(accuracy, auc, precision, recall, f_score)

        accuracies.append(accuracy)
        aucs.append(auc)
        precisions.append(precision)
        recalls.append(recall)
        f_scores.append(f_score)

        with open(log_file, "a") as f:
            f.write(f"Fold {fold}\n")
            f.write(f"Accuracy  : {accuracy}\n")
            f.write(f"ROC AUC   : {auc}\n")
            f.write(f"Precision : {precision}\n")
            f.write(f"Recall    : {recall}\n")
            f.write(f"F_Score   : {f_score}\n\n")

        with open(model_path + f"/xgbc_fold_{fold}.pkl", "wb") as fp: pickle.dump(model, fp)

        breaker()
        fold += 1
    
    with open(log_file, "a") as f: 
        f.write(f"\nTime Taken for {n_splits}-Fold CV : {(time()-start_time)/60:.2f} minutes")

    print(f"Time Taken for {n_splits}-Fold CV : {(time()-start_time)/60:.2f} minutes")
    breaker()

    return model, accuracies, aucs, precisions, recalls, f_scores

### **Load Data**

In [4]:
train_df = pd.read_csv(os.path.join(BASE_PATH, "train.csv"))
test_df  = pd.read_csv(os.path.join(BASE_PATH, "test.csv"))
ss_df    = pd.read_csv(os.path.join(BASE_PATH, "sample_submission.csv"))

train_df = train_df.drop(columns=["ID_code"])
test_df  = test_df.drop(columns=["ID_code"])

train_df.head(5)

Unnamed: 0,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,-5.8609,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [5]:
train_df.target.value_counts()

0    179902
1     20098
Name: target, dtype: int64

### **Train**

In [6]:
X, y = train_df.iloc[:, 1:].copy().values, train_df.iloc[:, 0].copy().values

breaker()
model, accuracies, aucs, precisions, recalls, f_scores = train(LogisticRegression(random_state=SEED),
                                                               model_path=MODEL_PATH, 
                                                               X=X, 
                                                               y=y, 
                                                               n_splits=N_SPLITS, 
                                                               log_file=LOG_FILE, 
                                                               seed=SEED)


**************************************************

Fold 1

Accuracy  : 0.91378
ROC-AUC   : 0.80303
Precision : 0.95817
Recall    : 0.91378
F-Score   : 0.93137

**************************************************

Fold 2

Accuracy  : 0.91492
ROC-AUC   : 0.81025
Precision : 0.95879
Recall    : 0.91492
F-Score   : 0.93221

**************************************************

Fold 3

Accuracy  : 0.91410
ROC-AUC   : 0.80524
Precision : 0.95837
Recall    : 0.91410
F-Score   : 0.93162

**************************************************

Fold 4

Accuracy  : 0.91543
ROC-AUC   : 0.81226
Precision : 0.95852
Recall    : 0.91543
F-Score   : 0.93235

**************************************************

Fold 5

Accuracy  : 0.91282
ROC-AUC   : 0.79377
Precision : 0.95577
Recall    : 0.91282
F-Score   : 0.92993

**************************************************

Time Taken for 5-Fold CV : 0.11 minutes

**************************************************



### **Submission**

In [7]:
ss_df.target = model.predict(test_df.copy().values)
ss_df.to_csv("submission.csv", index=False)