In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats

##### Progress bar
from tqdm import tqdm

##### KFOLD
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

##### preprocessing
from sklearn.preprocessing import QuantileTransformer

##### metrics
from sklearn.metrics import confusion_matrix, roc_auc_score

# Quantile transformation

from sklearn.preprocessing import QuantileTransformer

`QuantileTransformer` maps every feature onto the same target distribution (a normal distribution in the cell below), so all features end up with the same shape.

Use `from tqdm import tqdm` to display a progress bar while looping over the features.

In [None]:
##### Every column of test_df is treated as a feature; train_df must contain
##### the same columns (it may hold extra ones, which are left untouched).
feats = list(test_df.columns)

##### Fit one transformer per feature on the training column only, then apply
##### that same fitted mapping to both the train and the test column.
for col in tqdm(feats):
    transformer = QuantileTransformer(n_quantiles=5000,
                                      random_state=42,
                                      output_distribution="normal")

    ##### scikit-learn transformers expect 2-D input, i.e. shape (n_samples, 1)
    raw_vec = train_df[col].values.reshape(-1, 1)
    test_vec = test_df[col].values.reshape(-1, 1)
    transformer.fit(raw_vec)

    ##### flatten back to 1-D and overwrite the column in place
    train_df[col] = transformer.transform(raw_vec).ravel()
    test_df[col] = transformer.transform(test_vec).ravel()

print(f"train_df: {train_df.shape} \ntest_df: {test_df.shape}")

# KFOLD

Use `from sklearn.model_selection import KFold` for K-fold cross-validation.
For a single hold-out split you can also use `from sklearn.model_selection import train_test_split`.

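If your results are sensitive to the class distribution of a categorical target (for example, imbalanced classes),
use `from sklearn.model_selection import StratifiedKFold`, which keeps the class proportions the same in every fold.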

In [None]:
##### Set your configuration here!!
##### NOTE(review): Xtrain, Ytrain, Xtest, params and XGBClassifier are not defined
##### in the cells shown here; they are assumed to come from earlier cells.

FOLD = 5
SEEDS = [24, 42]

fet_imp = 0 # feature importance (not necessary)
counter = 0  # total number of fitted folds across all seeds
oof_score = 0

##### here I use XGBoost to implement KFold, you may choose your model by your self
##### column vectors accumulating test predictions and out-of-fold predictions
y_pred_final_xgb = np.zeros((Xtest.shape[0], 1))
y_pred_meta_xgb = np.zeros((Xtrain.shape[0], 1))

##### your chosen seeds
for sidx, seed in enumerate(SEEDS):
    seed_score = 0

    ##### start KFold
    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain.values, Ytrain.values)):
        counter += 1

        ##### setting sub training data and validation data
        train_x, train_y = Xtrain.iloc[train], Ytrain.iloc[train]
        val_x, val_y = Xtrain.iloc[val], Ytrain.iloc[val]
        eval_sets = [(train_x, train_y), (val_x, val_y)]

        ##### This is nothing to do with KFold:
        ##### stage 1 trains with the larger learning rate ...
        params['learning_rate']=0.02
        init_model = XGBClassifier(**params)

        ##### You may fit your model here!
        init_model.fit(train_x, train_y, eval_set=eval_sets,
                       early_stopping_rounds=200, verbose=500)

        ##### ... stage 2 continues boosting from init_model with a smaller one
        params['learning_rate']=0.01
        model = XGBClassifier(**params)

        model.fit(train_x, train_y, eval_set=eval_sets,
                  early_stopping_rounds=100, verbose=300, xgb_model=init_model)

        ##### predict validation set and testing set
        ##### iteration_range is half-open, so the end must be best_iteration + 1 to
        ##### include the best tree; (0, 0) would silently mean "use every tree".
        ##### NOTE(review): confirm how best_iteration is counted when training is
        ##### continued from xgb_model in the installed xgboost version.
        best_range = (0, model.best_iteration + 1)
        y_pred = model.predict_proba(val_x, iteration_range=best_range)[:, -1]
        y_pred_meta_xgb[val] += y_pred.reshape(-1, 1)
        test_pred = model.predict_proba(Xtest, iteration_range=best_range)[:, -1]
        y_pred_final_xgb += test_pred.reshape(-1, 1)

        ##### calculate your metrics here
        fet_imp += model.feature_importances_
        score = roc_auc_score(val_y, y_pred)
        oof_score += score
        seed_score += score

        ##### metric logger
        print("\nSeed-{} | Fold-{} | OOF Score: {}\n".format(seed, idx, score))

    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


##### Each training row is validated once per seed, so OOF predictions are averaged
##### over seeds; test predictions and scores are averaged over every fitted fold.
fet_imp = fet_imp / float(counter)
y_pred_meta_xgb = y_pred_meta_xgb / float(len(SEEDS))
y_pred_final_xgb = y_pred_final_xgb / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

# Plot confusion matrix

A confusion matrix is a great visualization tool for evaluating your model's performance.

from sklearn.metrics import confusion_matrix

In [None]:
def plot_confusion_matrix(cm, classes):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : numpy.ndarray
        2-D matrix of integer counts. Row ``i`` is drawn at y position ``i``
        (the "True label" axis) and column ``j`` at x position ``j`` (the
        "Predicted label" axis).
    classes : sequence
        Tick labels for both axes, one per class.
    """
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion matrix', fontweight='bold', pad=15)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells darker than the midpoint get white text so the count stays readable.
    thresh = cm.max() / 2.
    # np.ndindex walks every (row, col) pair in row-major order; it needs no
    # extra import, unlike itertools.product.
    for i, j in np.ndindex(*cm.shape):
        plt.text(j, i, format(cm[i, j], 'd'),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label', fontweight='bold')
    plt.xlabel('Predicted label', fontweight='bold')
    plt.tight_layout()

In [None]:
##### `y_pred` only holds the last fold's validation probabilities, so it neither
##### matches Ytrain in length nor contains class labels. The out-of-fold
##### predictions cover every training row; threshold them to get 0/1 labels.
THRESHOLD = 0.5
oof_labels = (np.asarray(y_pred_meta_xgb).ravel() > THRESHOLD).astype(int)

cnf_matrix = confusion_matrix(Ytrain, oof_labels, labels=[0, 1])
np.set_printoptions(precision=2)
plt.figure(figsize=(12, 5))
plot_confusion_matrix(cnf_matrix, classes=[0, 1])