# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:black; font-size:120%; text-align:left;padding:3.0px; background: #cceeff; border-bottom: 8px solid #004466" > TABLE OF CONTENTS<br><div>  
* [IMPORTS](#1)
* [INTRODUCTION](#2)
    * [CONFIGURATION](#2.1)
    * [CONFIGURATION PARAMETERS](#2.2)    
    * [DATASET AND COMPETITION DETAILS](#2.3)
* [PREPROCESSING](#3)
* [ADVERSARIAL CV](#4)
* [EDA AND VISUALS](#5) 
* [DATA TRANSFORMS](#6)
* [MODEL TRAINING](#7)    
* [ENSEMBLE AND SUBMISSION](#8)  
* [PLANNED WAY FORWARD](#9)     

<a id="1"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:black; font-size:120%; text-align:left;padding:3.0px; background: #cceeff; border-bottom: 8px solid #004466" > IMPORTS<br> <div> 

In [None]:
%%time 

# Installing select libraries:-
from gc import collect;
from warnings import filterwarnings;
filterwarnings('ignore');
from IPython.display import clear_output;

!pip install -q --upgrade scipy;
!pip install -q category_encoders;

clear_output();
print();
collect();

In [None]:
%%time

# General library imports:-
from copy import deepcopy;
import pandas as pd;
import numpy as np;
from scipy.stats import mode, kstest, normaltest, shapiro, anderson, jarque_bera;
from collections import Counter;
from itertools import product;
from colorama import Fore, Style, init;
from warnings import filterwarnings;
filterwarnings('ignore');

from tqdm.notebook import tqdm;
import seaborn as sns;
import matplotlib.pyplot as plt;
%matplotlib inline

from pprint import pprint;

print();
collect();
clear_output();

In [None]:
%%time 

# Importing model and pipeline specifics:-
from category_encoders import OrdinalEncoder, OneHotEncoder;

# Pipeline specifics:-
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler;
from sklearn.impute import SimpleImputer as SI;
from sklearn.model_selection import (RepeatedStratifiedKFold as RSKF, 
                                     StratifiedKFold as SKF,
                                     KFold, 
                                     RepeatedKFold as RKF, 
                                     cross_val_score);
from sklearn.inspection import permutation_importance;
from sklearn.feature_selection import mutual_info_classif, RFE;
from sklearn.pipeline import Pipeline, make_pipeline;
from sklearn.base import BaseEstimator, TransformerMixin;
from sklearn.compose import ColumnTransformer;

# ML Model training:-
from sklearn.metrics import f1_score, confusion_matrix;
from xgboost import DMatrix, XGBClassifier;
from lightgbm import LGBMClassifier, log_evaluation, early_stopping;
from catboost import CatBoostClassifier, Pool;

# Ensemble and tuning:-
import optuna;
from optuna import Trial, trial, create_study;
from optuna.samplers import TPESampler, CmaEsSampler;
# Call the setter - assigning to `set_verbosity` only rebinds the name and leaves the log level unchanged
optuna.logging.set_verbosity(optuna.logging.ERROR);

clear_output();
print();
collect();

In [None]:
%%time 

# Setting rc parameters in seaborn for plots and graphs- 
# Reference - https://matplotlib.org/stable/tutorials/introductory/customizing.html:-
# To alter this, refer to matplotlib.rcParams.keys()

# Global plot styling for the notebook, expressed as matplotlib rcParams keys.
# NOTE(review): the dictionary is passed positionally, so it binds to the first parameter of 
# sns.set and not to the `rc` keyword - confirm against the seaborn documentation that these 
# keys are not overridden by the default style that sns.set applies as well.
sns.set({"axes.facecolor"       : "#ffffff",
         "figure.facecolor"     : "#ffffff",
         "axes.edgecolor"       : "#000000",
         "grid.color"           : "#ffffff",
         "font.family"          : ['Cambria'],
         "axes.labelcolor"      : "#000000",
         "xtick.color"          : "#000000",
         "ytick.color"          : "#000000",
         "grid.linewidth"       : 0.75,  
         "grid.linestyle"       : "--",
         "axes.titlecolor"      : '#0099e6',
         'axes.titlesize'       : 8.5,
         'axes.labelweight'     : "bold",
         'legend.fontsize'      : 7.0,
         'legend.title_fontsize': 7.0,
         'font.size'            : 7.5,
         'xtick.labelsize'      : 7.5,
         'ytick.labelsize'      : 7.5,        
        });

# Color printing    
def PrintColor(text:str, color = Fore.BLUE, style = Style.BRIGHT):
    """
    Prints the text in the requested colorama color and style.
    text  - message to be printed
    color - colorama foreground code placed before the message
    style - colorama style code placed before the color code
    The style is reset after the message so that later prints are unaffected.
    """;
    prefix  = style + color;
    message = prefix + text + Style.RESET_ALL;
    print(message);

# Making sklearn pipeline outputs as dataframe (instead of arrays) so that column names are retained:-
from sklearn import set_config; 
set_config(transform_output = "pandas");
# Limiting the displayed dataframe size to 50 columns and 50 rows:-
pd.set_option('display.max_columns', 50);
pd.set_option('display.max_rows', 50);

print();
collect();


<a id="2"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:black; font-size:120%; text-align:left;padding:3.0px; background: #cceeff; border-bottom: 8px solid #004466" > INTRODUCTION<br><div> 

| Version<br>Number | Version Details | Best CV score| Single/ Ensemble|
| :-: | --- | :-: | :-: |
| **V1** |* EDA, plots and secondary features and encoding<br>* No scaling<br> * Used original data<br>* Tree based ML models and basic ensemble|0.71275|Simple blend |

<a id="2.1"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: #0059b3; border-bottom: 8px solid #e6e6e6" > CONFIGURATION<br><div> 

In [None]:
%%time

# Configuration class:-
class CFG:
    """
    Configuration class for parameters and CV strategy for tuning and training.
    The switches are plain strings ("Y"/"N", "ON"/"OFF") that the later cells compare against.
    """;
    
    # Data preparation:-   
    version_nb         = 1;          # Version number
    test_req           = "N";        # "Y" reduces ntrials to 10 (refer the ensemble section below)
    gpu_switch         = "OFF";      # GPU switch - ON/OFF
    state              = 42;         # Random state for most purposes
    target             = 'outcome';  # Target column name
    episode            = 22;         # Episode number, used to build the input path
    path               = f"/kaggle/input/playground-series-s3e{episode}/";      # Path for input data files
    orig_path          = f"/kaggle/input/horse-survival-dataset/horse.csv";     # Path for the original data file
    
    dtl_preproc_req    = "Y";        # Detailed preprocessing output required - Y/N
    adv_cv_req         = "N";        # Adversarial CV required - Y/N
    ftre_plots_req     = "Y";        # Feature plots required - Y/N
    ftre_imp_req       = "Y";        # Feature importance required - Y/N
    
    # Data transforms and scaling:-    
    conjoin_orig_data  = "Y";        # Conjoin original data to the competition train data - Y/N
    sec_ftre_req       = "Y";        # Secondary features required - Y/N
    scale_req          = "N";        # Scaling required - Y/N
    # NOTE---Keep a value here even if scale_req = N, this is used for linear models:-
    scl_method         = "Z";        # Scaling method - Z/ Robust/ MinMax
    enc_method         = 'Label';    # Encoding method
    lesion_OH_req      = "N";        # Encoding method for the lesion columns - Y/N
    tgt_mapper         = {"lived": 2, "euthanized": 1, "died": 0};   # Target label to integer mapper
    
    # Model Training:- 
    baseline_req       = "N";        # Baseline model required - Y/N
    pstprcs_oof        = "Y";        # Post-process OOF after model training - Y/N
    pstprcs_train      = "Y";        # Post-process OOF during model training for dev-set - Y/N
    ML                 = "Y";        # Machine learning models required - Y/N
    use_orig_allfolds  = "N";        # Use original data across all folds - Y/N
    n_splits           = 5 ;         # Number of CV splits
    n_repeats          = 5 ;         # Number of CV repeats
    nbrnd_erly_stp     = 50 ;        # Number of early stopping rounds
    mdlcv_mthd         = 'RSKF';     # Model CV method name - a key of the all_cv dictionary
    
    # Ensemble:-    
    ensemble_req       = "Y";        # Ensemble required - Y/N
    enscv_mthd         = "RSKF";     # Ensemble CV method name - presumably a key of all_cv, verify at the usage
    metric_obj         = 'maximize'; # Metric objective - presumably the tuning direction, verify at the usage
    ntrials            = 10 if test_req == "Y" else 200;   # Number of trials - 10 in test mode, else 200
    
    # Global variables for plotting:-
    # Keyword arguments unpacked into ax.grid calls
    grid_specs = {'visible': True, 'which': 'both', 'linestyle': '--', 
                           'color': 'lightgrey', 'linewidth': 0.75};
    # Keyword arguments unpacked into plot title calls
    title_specs = {'fontsize': 9, 'fontweight': 'bold', 'color': 'tab:blue'};

print();
PrintColor(f"--> Configuration done!\n");
collect();

In [None]:
%%time 

# Defining functions to be used throughout the code for common tasks:-

# Scaler registry keyed by CFG.scl_method - the selected scaler is None if the key is unknown:- 
all_scalers = dict(Robust = RobustScaler(), 
                   Z      = StandardScaler(), 
                   MinMax = MinMaxScaler()
                  );
scaler      = all_scalers.get(CFG.scl_method);

# CV strategy registry keyed by the method names used in CFG, all sharing the CFG splits and state:-
all_cv = dict(
    KF   = KFold(n_splits = CFG.n_splits, shuffle = True, random_state = CFG.state),
    RKF  = RKF(n_splits = CFG.n_splits, n_repeats = CFG.n_repeats, random_state = CFG.state),
    RSKF = RSKF(n_splits = CFG.n_splits, n_repeats = CFG.n_repeats, random_state = CFG.state),
    SKF  = SKF(n_splits = CFG.n_splits, shuffle = True, random_state = CFG.state),
);

# Defining the competition metric:-
def ScoreMetric(ytrue, ypred)-> float:
    """
    Competition metric - micro-averaged F1 score.
    ytrue   - ground truth labels
    ypred   - predicted labels
    returns - metric value (float)
    """;
    score = f1_score(ytrue, ypred, average = "micro");
    return score;

def PostProcessPred(preds, post_process = "N"):
    """
    Optional post-processing hook for predictions.
    preds        - predictions to be post-processed
    post_process - switch for the post-processing, accepted but not used here
    returns      - the predictions as received, no adjustment is required in this challenge
    """;
    processed = preds;
    return processed;

# Defining the scoring for LightGBM and XGBoost:-
def ScoreLGBM(ytrue: np.ndarray, ypred: np.ndarray) -> tuple:
    """
    Defines the custom metric (micro-averaged F1) for the light GBM classifier.
    ytrue   - ground truth labels
    ypred   - predictions with one column per class, the predicted label is the arg-max across axis 1
    returns - tuple of (metric name, metric value, True), the last element flags that higher is better
    """;
    labels = np.argmax(ypred, axis=1);
    return ('MicroF1', f1_score(ytrue, labels, average = "micro"), True);

def ScoreXGB(ypred: np.ndarray, dtrain: DMatrix) -> tuple:
    """
    Defines the custom metric (micro-averaged F1) according to the XGBoost requirements.
    ypred   - predictions with one column per class, the predicted label is the arg-max across axis 1
    dtrain  - DMatrix holding the ground truth, read through get_label()
    returns - tuple of (metric name, metric value)
    """;
    labels = np.argmax(ypred, axis=1);
    return ("MicroF1", f1_score(dtrain.get_label(), labels, average = "micro"));

collect();
print();


<a id="2.2"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: #0059b3; border-bottom: 8px solid #e6e6e6" > CONFIGURATION PARAMETERS<br><div> 


| Parameter         | Description                                             | Possible value choices|
| ---               | ---                                                     | :-:                   |
|  version_nb       | Version Number                                          | integer               |
|  gpu_switch       | GPU switch                                              | ON/OFF                |
|  state            | Random state for most purposes                          | integer               |
|  target           | Target column name                                      | outcome               |
|  episode          | Episode Number                                          | integer               |
|  path             | Path for input data files                               |                       |
|  orig_path        | Path for input original data files                      |                       |
|  dtl_preproc_req  | Preprocessing required                                  | Y/N                   |    
|  adv_cv_req       | Adversarial CV required                                 | Y/N                   |
|  ftre_plots_req   | Feature plots required                                  | Y/N                   |
|  ftre_imp_req     | Feature importance required                             | Y/N                   |
|  conjoin_orig_data| Conjoin original data                                   | Y/N                   |
|  sec_ftre_req     | Secondary features required                             | Y/N                   |
|  scale_req        | Scaling required                                        | Y/N                   |
|  scl_method       | Scaling method                                          | Z/ Robust/ MinMax     |
|  enc_method       | Encoding method                                         |-                      |
|  lesion_OH_req    | Encoding method- lesion columns                         | Y/N                   |
|  tgt_mapper       | Target mapper                                           | dict                  |
|  baseline_req     | Baseline model required                                 | Y/N                   |
|  pstprcs_oof      | Post-process OOF after model training                   | Y/N                   |
|  pstprcs_train    | Post-process OOF during model training for dev-set      | Y/N                   |
|  ML               | Machine Learning Models                                 | Y/N                   |
|  use_orig_allfolds| Use original data across all folds                      | Y/N                   |
|  n_splits         | Number of CV splits                                     | integer               |
|  n_repeats        | Number of CV repeats                                    | integer               |
|  nbrnd_erly_stp   | Number of early stopping rounds                         | integer               |
|  mdlcv_mthd       | Model CV method name                                    | KF/ RKF/ RSKF/ SKF    |

<a id="2.3"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: #0059b3; border-bottom: 8px solid #e6e6e6" > DATASET AND COMPETITION DETAILS<br><div>
    
**Data columns**<br>
This is available in my discussion post as below<br>
https://www.kaggle.com/competitions/playground-series-s3e22/discussion/438603<br>
<br>
**Competition details and notebook objectives**<br>
1. This is a multi-class classification challenge to predict horse survival using the provided features. **F1-micro** is the metric for the challenge<br>
2. In this starter notebook, we start the assignment with a detailed EDA, feature plots, interaction effects, adversarial CV analysis and develop starter models to initiate the challenge. We will also incorporate other opinions and approaches as we move along the challenge.<br>

<a id="3"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:black; font-size:120%; text-align:left;padding:3.0px; background: #cceeff; border-bottom: 8px solid #004466" > PREPROCESSING<br><div> 

In [None]:
%time 

class Preprocessor():
    """
    This class aims to do the below-
    1. Read the train, test, original and sample submission datasets from the CFG paths
    2. In this case, process the original data (re-index after the test ids, align columns to train)
    3. Check information and description
    4. Check unique values and nulls
    5. Collate starting features 
    6. Conjoin train-original data if requested based on Adversarial CV results
    
    Steps 3 and 4 are displayed only if CFG.dtl_preproc_req is "Y".
    """;
    
    def __init__(self):
        "Reads all datasets, displays their shapes and heads and aligns the original data to the train set";
        self.train    = pd.read_csv(CFG.path + f"train.csv", index_col = 'id');
        self.test     = pd.read_csv(CFG.path + f"test.csv", index_col = 'id');
        self.target   = CFG.target ;
        self.original = pd.read_csv(CFG.orig_path);
        self.conjoin_orig_data = CFG.conjoin_orig_data;
        self.dtl_preproc_req = CFG.dtl_preproc_req;
        
        self.sub_fl   = pd.read_csv(CFG.path + f"sample_submission.csv");
        
        PrintColor(f"Data shapes - train-test-original = {self.train.shape} {self.test.shape} {self.original.shape}");
        
        PrintColor(f"\nTrain set head", color = Fore.GREEN);
        display(self.train.head(5).style.format(precision = 3));
        PrintColor(f"\nTest set head", color = Fore.GREEN);
        display(self.test.head(5).style.format(precision = 3));
        PrintColor(f"\nOriginal set head", color = Fore.GREEN);
        display(self.original.head(5).style.format(precision = 3));
                 
        # Resetting original data index - ids continue right after the largest test id:-
        self.original.index = range(len(self.original));
        self.original.index+= max(self.test.index) + 1;
        self.original.index.name = 'id';
        
        # Changing original data column order to match the competition column structure:-
        # Columns absent in the original data are created as nulls by the re-index
        self.original = self.original.reindex(self.train.columns, axis=1);
  
    def _AddSourceCol(self):
        "Adds the Source column to all datasets and stores the test columns as the starting features";
        self.train['Source'] = "Competition";
        self.test['Source']  = "Competition";
        self.original['Source'] = 'Original';
        
        self.strt_ftre = self.test.columns;
        return self;
    
    def _CollateInfoDesc(self):
        "Displays the description and information for all datasets if detailed preprocessing is requested";
        if self.dtl_preproc_req == "Y":
            PrintColor(f"\n{'-'*20} Information and description {'-'*20}\n", color = Fore.MAGENTA);

            # Creating dataset information and description:
            for lbl, df in {'Train': self.train, 'Test': self.test, 'Original': self.original}.items():
                PrintColor(f"\n{lbl} description\n");
                display(df.describe(percentiles= [0.05, 0.25, 0.50, 0.75, 0.9, 0.95, 0.99]).\
                        transpose().\
                        drop(columns = ['count'], errors = 'ignore').\
                        drop([CFG.target], axis=0, errors = 'ignore').\
                        style.format(formatter = '{:,.2f}').\
                        background_gradient(cmap = 'Blues')
                       );

                PrintColor(f"\n{lbl} information\n");
                # info() prints the summary itself and returns None, so it is not wrapped in display
                df.info();
                collect();
        return self;
    
    def _CollateUnqNull(self):
        "Displays unique value and null counts of the starting features if detailed preprocessing is requested";
        
        if self.dtl_preproc_req == "Y":
            # Displaying the unique values and nulls across train-test-original:-
            PrintColor(f"\nUnique and null values\n");
            _ = pd.concat([self.train[self.strt_ftre].nunique(), 
                           self.test[self.strt_ftre].nunique(), 
                           self.original[self.strt_ftre].nunique(),
                           self.train[self.strt_ftre].isna().sum(axis=0),
                           self.test[self.strt_ftre].isna().sum(axis=0),
                           self.original[self.strt_ftre].isna().sum(axis=0)
                          ], 
                          axis=1);
            _.columns = ['Train_Nunq', 'Test_Nunq', 'Original_Nunq', 
                         'Train_Nulls', 'Test_Nulls', 'Original_Nulls'
                        ];

            display(_.T.style.background_gradient(cmap = 'Blues', axis=1).\
                    format(formatter = '{:,.0f}')
                   );
            
        return self;
       
    def DoPreprocessing(self):
        "Runs the source column addition followed by the optional summaries, returns the instance";
        self._AddSourceCol();
        self._CollateInfoDesc();
        self._CollateUnqNull();
        
        return self; 
        
    def ConjoinTrainOrig(self):
        """
        Returns the training table for the model.
        If conjoin_orig_data is "Y", the train and original data are stacked, fully duplicated rows 
        (across all columns present at the call) are dropped and the index is reset to a fresh 'id' range.
        Otherwise the competition train data is returned as is (same object, not a copy).
        """;
        if self.conjoin_orig_data == "Y":
            PrintColor(f"Train shape before conjoining with original = {self.train.shape}");
            train = pd.concat([self.train, self.original], axis=0, ignore_index = True);
            PrintColor(f"Train shape after conjoining with original= {train.shape}");
            
            train = train.drop_duplicates();
            PrintColor(f"Train shape after de-duping = {train.shape}");
            
            train.index = range(len(train));
            train.index.name = 'id';
        
        else:
            PrintColor(f"We are using the competition training data only");
            train = self.train;
        return train;
          
collect();
print();

In [None]:
%%time 

# Reading the datasets and running the preprocessing summaries (displayed as per CFG.dtl_preproc_req):-
pp = Preprocessor();
pp.DoPreprocessing();

print();
collect();


## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: #0059b3; border-bottom: 8px solid #e6e6e6" > INFERENCES<br> <div>

<div style= "font-family: Cambria; letter-spacing: 0px; color:#000000; font-size:110%; text-align:left;padding:3.0px; background: #f2f2f2" >
1. We have numerical, categorical and object columns<br>
2. We may need null imputation in this challenge<br>
3. The dataset is very small, which risks a shakeup<br>
</div>

<a id="4"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:black; font-size:120%; text-align:left;padding:3.0px; background: #cceeff; border-bottom: 8px solid #004466" > ADVERSARIAL CV<br><div>

In [None]:
%%time

# Performing adversarial CV between the 2 specified datasets:-
def Do_AdvCV(df1:pd.DataFrame, df2:pd.DataFrame, source1:str, source2:str):
    """
    This function performs an adversarial CV between the 2 provided datasets if needed by the user.
    A separate classifier is cross-validated per numeric column to tell the 2 sources apart, 
    the mean ROC-AUC per column is plotted as a bar chart with a reference line at 0.60.
    
    df1, df2         - datasets to be compared
    source1, source2 - labels for the datasets, rows from source2 form the positive class
    returns          - None, the results are plotted
    """;
    
    # Adversarial CV per column - numeric columns available in both datasets, excluding the target:-
    df2_num = set(df2.select_dtypes(include = np.number).columns);
    ftre = [col for col in 
            df1.select_dtypes(include = np.number).\
            drop(columns = ['id', "Source", CFG.target], errors = 'ignore').columns
            if col in df2_num
           ];
    adv_cv = {};

    for col in ftre:
        # Shuffling with the configured state so that the scores are reproducible across runs:-
        full_df = \
        pd.concat([df1[[col]].assign(Source = source1), df2[[col]].assign(Source = source2)], 
                  axis=0, ignore_index = True).\
        sample(frac = 1.00, random_state = CFG.state);

        full_df = full_df.assign(Source_Nb = full_df['Source'].eq(source2).astype(np.int8));

        # Checking for adversarial CV:-
        model = LGBMClassifier(random_state = CFG.state, max_depth = 6, learning_rate = 0.05);
        cv    = all_cv['SKF'];
        score = np.mean(cross_val_score(model, 
                                        full_df[[col]], 
                                        full_df.Source_Nb, 
                                        scoring= 'roc_auc', 
                                        cv     = cv)
                       );
        adv_cv.update({col: round(score, 4)});
        collect();
    
    del ftre;
    collect();
    
    fig, ax = plt.subplots(1,1,figsize = (12, 5));
    pd.Series(adv_cv).plot.bar(color = 'tab:blue', ax = ax);
    ax.axhline(y = 0.60, color = 'red', linewidth = 2.75);
    ax.set_title(f"Adversarial CV - {source1} vs {source2}", **CFG.title_specs);
    ax.grid(**CFG.grid_specs); 
    plt.yticks(np.arange(0.0, 0.81, 0.05));
    plt.show();
    
# Implementing the adversarial CV for every dataset pair, in the order Train-Original, Train-Test, Original-Test:-
if CFG.adv_cv_req == "Y":
    for _src1, _df1, _src2, _df2 in [('Train', pp.train, 'Original', pp.original),
                                     ('Train', pp.train, 'Test', pp.test),
                                     ('Original', pp.original, 'Test', pp.test)
                                    ]:
        PrintColor(f"\n---------- Adversarial CV - {_src1} vs {_src2} ----------\n", 
                   color = Fore.MAGENTA);
        Do_AdvCV(df1 = _df1, df2 = _df2, source1 = _src1, source2 = _src2);
    
# Kept as a separate check - nothing is printed if the switch is neither Y nor N:-
if CFG.adv_cv_req == "N":
    PrintColor(f"\nAdversarial CV is not needed\n", color = Fore.RED);
    
collect();
print();

In [None]:
%%time 

print();
# Model tables - train (conjoined with original if requested), a deep copy of test and the starting features:-
train, test, strt_ftre = pp.ConjoinTrainOrig(), pp.test.copy(deep = True), deepcopy(pp.strt_ftre);

# Category columns - all object columns except the Source marker:-
# The marker is dropped by name so that the list does not depend on the column position
cat_cols  = test.select_dtypes(include = 'object').columns.drop('Source', errors = 'ignore');

# Continuous columns - non-object columns except the lesion and hospital number identifiers:-
cont_cols = \
test.drop(columns = ['lesion_1', 'lesion_2', 'lesion_3', 'hospital_number'], 
          errors = 'ignore'
         ).\
select_dtypes(exclude = 'object').columns;

PrintColor(f"\nCategory columns\n");
display(cat_cols);
PrintColor(f"\nContinuous columns\n");
display(np.array(cont_cols));
PrintColor(f"\nAll columns\n");
display(strt_ftre);

print();
collect();

## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: #0059b3; border-bottom: 8px solid #e6e6e6" > INFERENCES<br><div>

<div style= "font-family: Cambria; letter-spacing: 0px; color:#000000; font-size:110%; text-align:left;padding:3.0px; background: #f2f2f2" >
1. Train-test belong to the same distribution, we can perhaps rely on the CV score<br>
2. We need to check the train-original distribution further; adversarial validation results indicate that we cannot use the original dataset based on a couple of features<br>
</div>

<a id="5"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:black; font-size:120%; text-align:left;padding:3.0px; background: #cceeff; border-bottom: 8px solid #004466" > VISUALS AND EDA <br><div> 
 

<a id="5.2"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: #0059b3; border-bottom: 8px solid #e6e6e6" > TARGET PLOT<br><div>

In [None]:
%%time 

if CFG.ftre_plots_req == "Y":
    
    # One pie of the target class shares per data source, side by side:-
    fig, axes = plt.subplots(nrows = 1, ncols = 2, 
                             figsize = (12, 5), 
                             sharey = True, 
                             gridspec_kw = {'wspace': 0.35}
                            );

    for i, (df_name, df) in tqdm(enumerate([('Train', pp.train), ("Original", pp.original)]), 
                                 "Target balance ---> "):
        ax = axes[i];
        a  = df[CFG.target].value_counts(normalize = True);
        
        # Explode and colors are sized for 3 target classes:-
        _ = ax.pie(x            = a, 
                   labels       = a.index.values, 
                   explode      = [0.0, 0.2, 0.2], 
                   startangle   = 40, 
                   shadow       = True, 
                   colors       = ['#3377ff', '#66ffff','#809fff'], 
                   textprops    = {'fontsize': 7, 'fontweight': 'bold', 'color': 'black'},
                   pctdistance  = 0.60, 
                   autopct      = '%1.1f%%'
                  );
        _ = ax.set_title(f"\n{df_name} data\n", **CFG.title_specs);

    plt.tight_layout();
    plt.show();
        
        
    
collect();
print();

In [None]:
%%time 

# Assessing target interactions:-
if CFG.ftre_plots_req == "Y":
    # One bar chart of the row counts per target class for each data source, side by side:-
    fig, axes = plt.subplots(1,2, figsize = (12, 4), gridspec_kw = {'wspace': 0.2});
    
    for i, (lbl, df) in enumerate({"Train": pp.train, "Original": pp.original}.items()):
        ax = axes[i];
        # Bar colors - first for train, second for original:-
        c = ['#3377ff', '#6699cc'];
        df.groupby(CFG.target).size().plot.bar(ax = ax, color = c[i]);
        ax.set_title(f"Target interaction - {lbl} set", **CFG.title_specs);
        ax.set(xlabel = "");
        
    plt.tight_layout();
    plt.show()
        

<a id="5.4"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: #0059b3; border-bottom: 8px solid #e6e6e6" > CATEGORY COLUMN PLOTS<br><div>

In [None]:
%%time

if CFG.ftre_plots_req == "Y":
    fig, axes = plt.subplots(len(cat_cols), 3, figsize = (20, len(cat_cols)* 4.5), 
                             gridspec_kw = {'wspace': 0.25, 'hspace': 0.3});

    # Plot columns in the order Train - Test - Original, each with its own bar color:-
    _panels = [('Train', pp.train, '#007399'), 
               ('Test', pp.test, '#0088cc'), 
               ('Original', pp.original, '#0047b3')
              ];

    for i, col in enumerate(cat_cols):
        for _j, (_src_lbl, _src_df, _bar_color) in enumerate(_panels):
            ax = axes[i, _j];
            
            # Normalized level shares, sorted by the level name:-
            a = _src_df[col].value_counts(normalize = True);
            a.sort_index().plot.barh(ax = ax, color = _bar_color);
            ax.set_title(f"{col}_{_src_lbl}", **CFG.title_specs);
            ax.set_xticks(np.arange(0.0, 0.9, 0.05), 
                          labels = np.round(np.arange(0.0, 0.9, 0.05),2), 
                          rotation = 90
                         );
            ax.set(xlabel = '', ylabel = '');
            del a;
    
    plt.suptitle(f"Category column plots", **CFG.title_specs, y= 0.90);
    plt.tight_layout();
    plt.show();
    
print();
collect();

<a id="5.5"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: #0059b3; border-bottom: 8px solid #e6e6e6" > CONTINUOUS COLUMN PLOTS<br><div>

In [None]:
%%time 

if CFG.ftre_plots_req == "Y":
    # Stacking the continuous columns of all sources with a Source marker:-
    df = pd.concat([_frame[cont_cols].assign(Source = _name) 
                    for _name, _frame in [('Train', pp.train), ('Test', pp.test), ("Original", pp.original)]
                   ], 
                   axis=0, ignore_index = True
                  );
    
    fig, axes = plt.subplots(len(cont_cols), 4 ,figsize = (16, len(cont_cols) * 4.2), 
                             gridspec_kw = {'hspace': 0.35, 'wspace': 0.3, 'width_ratios': [0.80, 0.20, 0.20, 0.20]});
    
    # Box plot settings per source - (label, color, saturation) for plot columns 1 to 3:-
    _box_specs = [('Train', '#33ccff', 0.90), ('Test', '#80ffff', 0.6), ('Original', '#99ddff', 0.6)];
    
    for i,col in enumerate(cont_cols):
        # Density comparison across the sources:-
        ax = axes[i,0];
        sns.kdeplot(data = df[[col, 'Source']], x = col, hue = 'Source', 
                    palette = ['#0039e6', '#ff5500', '#00b300'], 
                    ax = ax, linewidth = 2.1
                   );
        ax.set_title(f"\n{col}", **CFG.title_specs);
        ax.grid(**CFG.grid_specs);
        ax.set(xlabel = '', ylabel = '');
        
        # One box plot per source:-
        for _j, (_src_lbl, _box_color, _box_sat) in enumerate(_box_specs, start = 1):
            ax = axes[i,_j];
            sns.boxplot(data = df.loc[df.Source == _src_lbl, [col]], y = col, 
                        width = 0.25, fliersize= 2.25,
                        color = _box_color, saturation = _box_sat, linewidth = 0.90, 
                        ax = ax);
            ax.set(xlabel = '', ylabel = '');
            ax.set_title(_src_lbl, **CFG.title_specs);
              
    plt.suptitle(f"\nDistribution analysis- continuous columns\n", **CFG.title_specs, 
                 y = 0.905, x = 0.50
                );
    plt.tight_layout();
    plt.show();
    
print();
collect();

<a id="5.7"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: #0059b3; border-bottom: 8px solid #e6e6e6" > FEATURE INTERACTION AND UNIVARIATE RELATIONS<br><div>
    
We aim to start off with a simple correlation analysis to determine feature relations

In [None]:
%%time 

def MakeCorrPlot(df: pd.DataFrame, data_label:str, figsize = (30, 9)):
    """
    This function develops the Pearson and Spearman correlation heatmaps for the given dataset.
    df         - dataset with the columns to be correlated, id and Source are dropped if present
    data_label - dataset name used in the plot titles
    figsize    - size of the figure holding the 2 heatmaps
    Only the lower triangle is shown, the upper triangle and the diagonal are masked.
    """;
    
    fig, axes = plt.subplots(nrows = 1, ncols = 2, 
                             figsize = figsize, 
                             sharey = True,
                             gridspec_kw = {'hspace': 0.2, 'wspace': 0.1}
                            );
    features = df.drop(columns = ['id', 'Source'], errors = 'ignore');
    
    for panel, method in zip(axes, ['pearson', 'spearman']):
        corr_matrix = features.corr(method = method);
        hidden      = np.triu(np.ones_like(corr_matrix));
        
        sns.heatmap(data       = corr_matrix,  
                    mask       = hidden,
                    annot      = True,
                    fmt        = '.2f', 
                    annot_kws  = {'fontweight': 'bold','fontsize': 6.75}, 
                    cmap       = 'Blues',
                    cbar       = False, 
                    linewidths = 1.5, 
                    linecolor  = 'white', 
                    ax         = panel
                   );
        panel.set_title(f"\n{method.capitalize()} correlation- {data_label}\n", **CFG.title_specs);
        
    collect();
    print();

# Implementing correlation analysis:-
# One pair of correlation heatmaps per data source, restricted to the continuous columns:-
for lbl, df in {"Train": pp.train[cont_cols], "Test": pp.test[cont_cols], "Original": pp.original[cont_cols]}.items():
    MakeCorrPlot(df = df, data_label = lbl, figsize = (16,5));

print();
collect();

<a id="5.9"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: #0059b3; border-bottom: 8px solid #e6e6e6" > SURVIVAL ANALYSIS WITH CATEGORY COLUMNS<br> <div>

In [None]:
%%time

# Analyzing survival chances across category columns:-
# Grid layout - one row per category column, plot columns 0-1 for Train and 2-3 for Original:-
fig, axes = plt.subplots(len(cat_cols), 4, 
                         figsize = (20, 4 * len(cat_cols)), 
                         sharex = True,
                         gridspec_kw = {'hspace': 0.35, 'wspace': 0.35}
                        );

# NOTE(review): the Train panel uses the model table `train` and not pp.train - 
# confirm this is intended when the original data is conjoined into it
for lbl, mdl_df in tqdm({'Train': train, "Original": pp.original}.items()):
    j = 0 if lbl == "Train" else 2;
    
    for i, col in tqdm(enumerate(cat_cols)):
        # Counts per level and target class, and their shares within each level:-
        df  = pd.crosstab(mdl_df[col], mdl_df[CFG.target]);
        df1 = df.div(df.sum(axis=1), axis=0);
            
        ax = axes[i,j];
        sns.heatmap(df, cmap = 'winter', fmt= ',.0f', annot = True, 
                    cbar = False, linewidths= 1.5, linecolor='white',
                    annot_kws= {'fontweight': 'bold','fontsize': 6.75},
                    ax = ax
                   );
        ax.set(xlabel = '', ylabel = '');
        ax.set_title(f"{col} {lbl}", **CFG.title_specs);

        ax = axes[i,j+1];
        sns.heatmap(df1, cmap = 'icefire', fmt= ',.2%', annot = True, 
                    cbar = False, linewidths= 1.5, linecolor='white',
                    annot_kws= {'fontweight': 'bold','fontsize': 6.75},
                    ax = ax
                   );
        ax.set(xlabel = '', ylabel = '');
        ax.set_title(f"{col}_pct {lbl}", **CFG.title_specs);

        del df, df1;

plt.suptitle(f"Survival analysis with category columns", **CFG.title_specs, y = 0.90);
plt.tight_layout();   
plt.show();

print();
collect();

<a id="5.10"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: #0059b3; border-bottom: 8px solid #e6e6e6" > INFERENCES<br> <div>

<div style= "font-family: Cambria; letter-spacing: 0px; color:#000000; font-size:110%; text-align:left;padding:3.0px; background: #f2f2f2" >
1. Feature selection is a very important part of the assignment. We have lots of features and feature selection will be a differentiator<br>
2. Quite a few features have outliers. Outlier handling may be another differentiator in this challenge. Certain categorical features have label outliers that are handled while encoding<br>
3. Columns are not highly correlated. Dimensionality reduction may help here<br>
4. We will need to encode lots of object columns with different encoder types<br>
5. Certain key inferences regarding columns (survival) are as below-<br>
    - Adults<br>
    - Non-surgical treatments<br>
    - Normal values in health risk indicators<br>
6. Lesion 3 is a quasi-constant feature and is superfluous as it is 0 in the test set<br>
7. Hospital number may provide insights for feature creation perhaps<br>
8. Lesions are categorical columns as encoded in the subsequent section<br>
</div>

<a id="6"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:black; font-size:120%; text-align:left;padding:3.0px; background: #cceeff; border-bottom: 8px solid #004466" > DATA TRANSFORMS <br><div> 
    
This section aims at creating secondary features, scaling and if necessary, conjoining the competition training and original data tables<br>


In [None]:
%%time 

# Data transforms:-
class Xformer(TransformerMixin, BaseEstimator):
    """
    Creates secondary (derived) features from the existing data.

    When ``CFG.sec_ftre_req == "Y"`` four binary int8 flags are appended, each set to 1
    when the source column is at or above a hard-coded threshold:

    * ``rectal_temp_risk`` - ``rectal_temp >= 37.8``
    * ``pulse_risk``       - ``pulse >= 40``
    * ``cell_vol_risk``    - ``packed_cell_volume >= 50``
    * ``protein_risk``     - ``total_protein >= 7.5``

    For any other setting the data is returned as an unchanged copy and a notice is printed.
    """

    def __init__(self): pass

    def fit(self, X, y= None, **params):
        """Records the input column names; nothing is learned from the data."""
        self.ip_cols = X.columns
        return self

    def transform(self, X, y= None, **params):
        """Returns a copy of X, with the secondary features appended when they are requested."""
        df = X.copy()

        if CFG.sec_ftre_req == "Y":
            df['rectal_temp_risk'] = np.where(df.rectal_temp >= 37.8, 1, 0).astype(np.int8)
            df['pulse_risk']       = np.where(df.pulse >= 40, 1, 0).astype(np.int8)
            df['cell_vol_risk']    = np.where(df.packed_cell_volume >= 50, 1, 0).astype(np.int8)
            df['protein_risk']     = np.where(df.total_protein >= 7.5, 1, 0).astype(np.int8)
        else:
            PrintColor(f"Secondary features are not required", color = Fore.RED)

        # Remember the output layout so it can be reported by get_feature_names_out
        self.op_cols = df.columns
        return df

    def get_feature_names_in(self, X, y=None, **params):
        """Returns the column names seen by the last call to fit."""
        return self.ip_cols

    def get_feature_names_out(self, X, y=None, **params):
        """Returns the column names produced by the last call to transform."""
        return self.op_cols
    
collect();
print();

In [None]:
%%time 
    
# Encoding the categorical columns with domain based encoding:-
class Encoder(TransformerMixin, BaseEstimator):
    """
    Encodes the object (string) columns as int8 codes and encodes the lesion columns.

    The label -> code tables are hand-written (see ``_LABEL_MAPS``); the original notebook
    states they follow the original data's metadata page.

    The lesion columns are handled according to ``CFG.lesion_OH_req`` (read once in ``__init__``):

    * ``"Y"``  - ``lesion_1`` / ``lesion_2`` are replaced by one-hot columns of their first
      character (the first level is dropped) and ``lesion_3`` is dropped.
    * otherwise - ``lesion_1`` / ``lesion_2`` become a 0/1 flag (first digit clipped to [0, 1])
      and ``lesion_3`` is dropped.
    """

    # Column -> {raw label: integer code}. A label missing from its table maps to NaN,
    # which makes the int8 cast fail, so unexpected labels surface as an error.
    _LABEL_MAPS = {
        'surgery'               : {'no': 1, 'yes': 0},
        'age'                   : {'adult': 0, 'young': 1},
        'temp_of_extremities'   : {'None': 0, "normal": 1, "warm": 2, "cool": 3, "cold": 4},
        'peripheral_pulse'      : {'NA': 0, "None": 0,
                                   "normal": 1, "increased": 2, "reduced": 3, "absent": 4},
        'mucous_membrane'       : {'NA': 0, "None": 0, "normal": 1, "normal_pink": 1, "pink": 2, "bright": 3,
                                   "bright_pink": 3, "pale_pink": 4, "pale_cyanotic": 5,
                                   "bright_red": 6, "injected": 6, "dark_cyanotic": 7},
        'capillary_refill_time' : {'NA': 0, "None": 0, "less_3_sec": 1, "3": 2, "more_3_sec": 2},
        'pain'                  : {"NA": 0, "None": 0, "alert": 1, "no_pain": 2, "depressed": 3,
                                   "mild_pain": 4, 'slight': 3, "moderate": 4,
                                   "severe_pain": 5, "extreme_pain": 6},
        'peristalsis'           : {"NA": 0, "None": 0, "hypermotile": 1, 'distend_small': 1,
                                   "normal": 2, "hypomotile": 3, "absent": 4},
        'abdominal_distention'  : {"NA": 0, "none": 1, "slight": 2, "moderate": 3, "severe": 4},
        'nasogastric_tube'      : {"NA": 0, "none": 1, "slight": 2, "significant": 3},
        'nasogastric_reflux'    : {"NA": 0, "none": 1, 'slight': 2, "less_1_liter": 2, "more_1_liter": 3},
        'rectal_exam_feces'     : {"NA": 0, "None": 0,
                                   "normal": 1, "increased": 3, "decreased": 4, "absent": 5,
                                   'serosanguious': 6},
        'abdomen'               : {"NA": 0, "None": 0,
                                   "normal": 1, "other": 2, "firm": 3,
                                   "distend_small": 4, "distend_large": 5},
        'abdomo_appearance'     : {"NA": 0, "None": 0, "clear": 1, "cloudy": 2, "serosanguious": 3},
        'surgical_lesion'       : {"no": 1, "yes": 0},
        'cp_data'               : {"no": 1, "yes": 0},
    }

    # Lesion column -> prefix of the one-hot columns created from it
    _LESION_PREFIX = (('lesion_1', 'lesion1_'), ('lesion_2', 'lesion2_'))

    def __init__(self):
        self.lesion_OH_req = CFG.lesion_OH_req

    @staticmethod
    def _lesion_site(col):
        """Returns the first character of every lesion code, as strings."""
        return col.astype(str).apply(lambda x: x[0])

    def fit(self, X, y= None, **params):
        """
        Records the input column names and, for one-hot lesion encoding, the lesion levels
        present in X so that transform emits the same dummy columns for every data set.
        """
        self.ip_cols = X.columns

        self.lesion_levels_ = {}
        if self.lesion_OH_req == "Y":
            for col, _ in self._LESION_PREFIX:
                if col in X.columns:
                    # Sorted to reproduce the column order pd.get_dummies gives on this data
                    self.lesion_levels_[col] = sorted(self._lesion_site(X[col]).unique())
        return self

    def transform(self, X, y= None, **params):
        """
        Performs manual label encoding for the category columns and encodes the lesion columns.
        This is done as per the original data instructions- refer the original metadata page for details
        """
        df = X.copy()

        # Assigning to existing columns keeps the original column order
        for col, mapping in self._LABEL_MAPS.items():
            df[col] = df[col].map(mapping).astype(np.int8)

        # Encoding the lesions- lesion3 is 0 in the test set so no need to encode it:-
        if self.lesion_OH_req == "Y":
            learned = getattr(self, 'lesion_levels_', {})
            parts   = [df.drop(columns = ['lesion_1', 'lesion_2', 'lesion_3'], errors = 'ignore')]

            for col, prefix in self._LESION_PREFIX:
                dummies = pd.get_dummies(self._lesion_site(df[col]))
                if col in learned:
                    # Align to the levels seen in fit: levels absent here become all-zero columns
                    # and levels unseen in fit are dropped, so train and test share one layout
                    dummies = dummies.reindex(columns = learned[col], fill_value = 0)
                # The first level is dropped as the reference category
                parts.append(dummies.astype(np.int8).iloc[:, 1:].add_prefix(prefix))

            df = pd.concat(parts, axis = 1)

        else:
            for col, _ in self._LESION_PREFIX:
                df[col] = self._lesion_site(df[col]).astype(np.int8).clip(0, 1).astype(np.int8)
            df = df.drop(columns = 'lesion_3', errors = 'ignore')

        self.op_cols = df.columns
        return df

    def get_feature_names_in(self, X, y=None, **params):
        """Returns the column names seen by the last call to fit."""
        return self.ip_cols

    def get_feature_names_out(self, X, y=None, **params):
        """Returns the column names produced by the last call to transform."""
        return self.op_cols
    

collect();
print();

In [None]:
%%time 

# Scaling:-
class Scaler(TransformerMixin, BaseEstimator):
    """
    Scales the selected columns using statistics learned from the data passed to ``fit``.

    Parameters
    ----------
    scl_method : str
        "Z" (mean / standard deviation), "Robust" (median / inter-quartile range) or
        "MinMax" (minimum / range). Any other value leaves the data unscaled.
    scale_req : str
        Scaling is applied only when this equals "Y".
    scl_cols : list-like
        Names of the columns to scale.
    """

    # scl_method -> (centre statistic, spread statistic), both looked up in ``self.params``
    _SCALE_STATS = {"Z": ('mean', 'std'), "Robust": ('50%', 'iqr'), "MinMax": ('min', 'range')}

    def __init__(self, scl_method: str, scale_req: str, scl_cols):
        self.scl_method = scl_method
        self.scale_req  = scale_req
        self.scl_cols   = scl_cols

    def fit(self, X, y=None, **params):
        """Calculates the scaling statistics (one row per scaled column) from X."""
        stats          = X[self.scl_cols].describe(percentiles = [0.25, 0.50, 0.75]).drop(['count'], axis=0).T
        stats['iqr']   = stats['75%'] - stats['25%']
        stats['range'] = stats['max'] - stats['min']

        self.params = stats
        return self

    def transform(self, X, y=None, **params):
        """Returns a copy of X with the scaling columns transformed by the configured method."""
        df = X.copy()

        if self.scale_req != "Y":
            PrintColor(f"Scaling is not needed", color = Fore.RED)
            return df

        # Use the method given to the constructor rather than the global configuration,
        # so that the scl_method argument is actually honoured
        stat_names = self._SCALE_STATS.get(self.scl_method)
        if stat_names is None:
            # Unrecognised method: the data is returned unscaled, as before
            return df

        centre = self.params[stat_names[0]].values
        spread = self.params[stat_names[1]].values
        # A constant column has zero spread; divide by 1 instead to avoid inf / NaN values
        spread = np.where(spread == 0, 1.0, spread)

        df[self.scl_cols] = (df[self.scl_cols].values - centre) / spread
        return df
    

In [None]:
%%time

PrintColor(f"\n{'='* 20} Data transformation {'='* 20} \n")

# Features are every training column except the target;
# the target labels are turned into int8 codes through CFG.tgt_mapper
Xtrain = train.drop(CFG.target, axis=1, errors = 'ignore')
ytrain = train[CFG.target].map(CFG.tgt_mapper).astype(np.int8)

# Imputation step: most frequent value for the category columns and Source, mean for the rest
imp_step = ColumnTransformer(
    [("CImp", SI(strategy = "most_frequent"), cat_cols.to_list() + ['Source'])],
    remainder = SI(strategy = 'mean'),
    verbose_feature_names_out = False,
)

# Full pipeline: imputation -> secondary features -> encoding
xform = Pipeline(
    steps   = [("Imp", imp_step), ('Xform', Xformer()), ('Enc', Encoder())],
    verbose = False,
)

PrintColor(f"\n---> Data pipeline structure\n")
display(xform)

# The pipeline is fitted on the training data only and then applied to the test data
PrintColor(f"\n---> Post pipeline datasets\n")
Xtrain = xform.fit_transform(Xtrain, ytrain)
Xtest  = xform.transform(test)

# Preview of both transformed tables
for caption, frame in (("\n---> Train data\n", Xtrain), ("\n---> Test data\n", Xtest)):
    PrintColor(f"{caption}")
    display(frame.head(5).style.format(precision = 2))

PrintColor(f"\n---> Train data columns after data pipeline\n")
pprint(Xtrain.columns)

PrintColor(f"\n---> Test data columns after data pipeline\n")
pprint(Xtest.columns)

print()
# The trailing semicolon keeps the notebook from echoing the value returned by collect
collect();


<a id="7"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:black; font-size:120%; text-align:left;padding:3.0px; background: #cceeff; border-bottom: 8px solid #004466" > MODEL TRAINING <br><div> 
   

In [None]:
%%time 

# Initializing model I-O:-

# One switch selects the GPU or CPU back-end for all three libraries
use_gpu = CFG.gpu_switch == "ON"

# CatBoost settings
cbc_params = dict(
    task_type           = "GPU" if use_gpu else "CPU",
    objective           = 'MultiClass',
    eval_metric         = 'TotalF1',
    bagging_temperature = 0.45,
    colsample_bylevel   = 0.6,
    iterations          = 1_000,
    learning_rate       = 0.056,
    od_wait             = 14,
    max_depth           = 6,
    l2_leaf_reg         = 0.45,
    min_data_in_leaf    = 5,
    random_strength     = 0.15,
    max_bin             = 200,
    verbose             = 0,
)

# LightGBM settings
# NOTE(review): subsample is set without subsample_freq; per the LightGBM parameter docs
# row bagging is only active when the bagging frequency is non-zero - confirm the intent
lgbm_params = dict(
    device           = "gpu" if use_gpu else "cpu",
    objective        = 'multiclass',
    metric           = 'auc_mu',
    boosting_type    = 'gbdt',
    random_state     = CFG.state,
    colsample_bytree = 0.50,
    subsample        = 0.65,
    learning_rate    = 0.08,
    max_depth        = 5,
    n_estimators     = 1000,
    num_leaves       = 45,
    reg_alpha        = 0.0001,
    reg_lambda       = 3.5,
    verbose          = -1,
)

# XGBoost settings
xgb_params = dict(
    tree_method      = "gpu_hist" if use_gpu else "hist",
    objective        = 'multi:softprob',
    random_state     = CFG.state,
    colsample_bytree = 0.60,
    learning_rate    = 0.08,
    max_depth        = 4,
    n_estimators     = 1000,
    reg_alpha        = 0.01,
    reg_lambda       = 2.25,
    min_child_weight = 10,
)

# Model registry keyed by a short model label; the key order (CBC, LGBMC, XGBC) is kept
# because the list of selected models is later built by iterating over these keys
Mdl_Master = {
    'CBC'  : CatBoostClassifier(**cbc_params),
    'LGBMC': LGBMClassifier(**lgbm_params),
    'XGBC' : XGBClassifier(**xgb_params),
}

print();
collect();

In [None]:
%%time

# Selecting relevant columns for the train and test sets:-
PrintColor(f"\n{'='* 20} Model I-O initialization {'='* 20} \n")

drop_cols = ['hospital_number']
print()

try:
    # errors = 'ignore' makes the drop a no-op when the column is already gone (safe on re-run)
    Xtrain, Xtest = Xtrain.drop(drop_cols, axis=1,errors = 'ignore'), Xtest.drop(drop_cols, axis=1, errors = 'ignore')
    pprint(Xtest.columns, depth = 1, width = 10, indent = 5)
except Exception:
    PrintColor(f"\n---> Check the columns selected\n---> Selected columns-", color = Fore.RED)
    pprint(Xtest.columns, depth = 1, width = 10, indent = 5)

# Initializing output tables for the models:-
# One probability column per target class. The prediction tables are sized by the number
# of classes; sizing them by the number of models only works while the two counts coincide
class_cols = [f"Class{i}" for i in range(3)]
methods    = [col for col in Mdl_Master.keys() if col.endswith("C")]

# Out-of-fold predictions start empty and are appended to fold by fold
OOF_Preds = pd.DataFrame(columns = class_cols)

# Test-set class probabilities, one row per submission id
Mdl_Preds = pd.DataFrame(index   = pp.sub_fl['id'],
                         columns = class_cols,
                         data    = np.zeros((len(Xtest), len(class_cols)))
                        )

# Feature importances: one row per model feature, one column per model
ftre_names = Xtrain.drop(columns = ['Source'], errors = 'ignore').columns
FtreImp    = pd.DataFrame(index   = ftre_names,
                          columns = methods,
                          data    = np.zeros((len(ftre_names), len(methods)))
                         )

PrintColor(f"\n---> Selected model options- ")
pprint(methods, depth = 1, width = 100, indent = 5)

print()
# The trailing semicolon keeps the notebook from echoing the value returned by collect
collect();

In [None]:
%%time 

if CFG.ML == "Y":
    PrintColor(f"\n{'='* 20} Model Training and CV {'='* 20} \n")

    # Cross-validated training of every selected model.
    # Per fold: fit each model, sum the class probabilities of all models on the
    # development rows (competition data only) and on the test set, then score the fold.
    # After all folds: OOF_Preds holds the averaged out-of-fold probabilities and
    # Mdl_Preds the averaged test-set probabilities.
    cols_drop  = ['id', 'Source', 'Label']
    class_cols = [f"Class{i}" for i in range(3)]
    cv         = all_cv.get(CFG.mdlcv_mthd)
    Xt         = Xtest.copy(deep = True)
    X,y        = Xtrain.copy(deep = True), ytrain.copy(deep = True)
    scores     = []
    non_cat    = set(cont_cols.to_list() + ['Source'])
    cat_ftre   = [c for c in Xtest.columns if c not in non_cat]

    # The test features do not change between folds or models, so they are prepared once
    Xt_ftre    = Xt.drop(columns = cols_drop, errors = 'ignore')

    # Accumulators are local to this cell so that re-running it does not add to
    # the results of a previous run
    oof_frames = []
    test_total = np.zeros((len(Xt), len(class_cols)))
    FtreImp[methods] = 0.0

    # Initializing CV splitting:-
    for fold_nb, (train_idx, dev_idx) in tqdm(enumerate(cv.split(X, y))):
        Xtr  = X.iloc[train_idx].drop(columns = cols_drop, errors = 'ignore')
        Xdev = X.iloc[dev_idx].loc[X.Source == "Competition"].drop(columns = cols_drop, errors = 'ignore')
        # Targets are selected by the feature rows' own index so that rows and labels stay
        # aligned even if the splitter returns its indices in a shuffled order
        ytr  = y.loc[Xtr.index]
        ydev = y.loc[Xdev.index]

        # One column per class (not per model): predict_proba returns class probabilities
        oof_preds  = np.zeros((len(Xdev), len(class_cols)))
        mdl_preds  = np.zeros((len(Xt), len(class_cols)))

        # Fitting the models:-
        for method in methods:
            model = Mdl_Master[method]
            if method in ['CBR', 'CBC']:
                model.fit(Xtr, ytr,
                          eval_set = [(Xdev, ydev)],
                          verbose = 0,
                          early_stopping_rounds = CFG.nbrnd_erly_stp,
                          cat_features = cat_ftre,
                         )

            elif method in ['LGBMR', 'LGBMC']:
                model.fit(Xtr, ytr, eval_set = [(Xdev, ydev)],
                          verbose = 0,
                          callbacks = [log_evaluation(0),
                                       early_stopping(CFG.nbrnd_erly_stp, verbose = False)
                                      ],
                          categorical_feature = cat_ftre,
                         )

            elif method in ['XGBR', 'XGBC']:
                model.fit(Xtr, ytr, eval_set = [(Xdev, ydev)],
                          verbose = 0,
                          early_stopping_rounds = CFG.nbrnd_erly_stp,
                         )

            else:
                model.fit(Xtr, ytr)

            # Collecting predictions and scores and post-processing OOF based on model method:-
            oof_preds = oof_preds + model.predict_proba(Xdev)
            mdl_preds = mdl_preds + model.predict_proba(Xt_ftre)

            # Feature importance is best-effort: a failure is reported but does not stop training
            try:
                FtreImp[method] = FtreImp[method] + model.feature_importances_
            except Exception as err:
                PrintColor(f"---> Feature importance skipped for {method}: {err}", color = Fore.RED)

        oof_frames.append(pd.DataFrame(oof_preds, index = Xdev.index, columns = class_cols))
        test_total = test_total + mdl_preds

        # Calculating the fold-level score metric:-
        score = ScoreMetric(ydev, np.argmax(oof_preds, axis=1))
        scores.append(score)

        num_space = 3 if fold_nb <= 8 else 2
        PrintColor(f"---> Fold{fold_nb + 1}. {' '* num_space} OOF = {score:.5f}",
                   color = Fore.MAGENTA
                  )
        collect()
    PrintColor(f"\n---> Mean Std CV score = {np.mean(scores):.5f} +- {np.std(scores):.5f}\n")

    # A row can be predicted in several folds (repeated CV): average its predictions over
    # those folds, then over the models whose probabilities were summed
    OOF_Preds = pd.concat(oof_frames, axis = 0, ignore_index = False).groupby(level= 0).mean()/ len(methods)

    # Average the test predictions over the fits that were actually run, so the result is
    # correct for any splitter and not only for n_splits * n_repeats folds
    n_fits    = max(len(scores), 1) * len(methods)
    Mdl_Preds = pd.DataFrame(test_total / n_fits, index = Mdl_Preds.index, columns = class_cols)


collect();
print();


In [None]:
%%time 

if CFG.ML == "Y":

    # Analyzing the Ml model results:-
    # squeeze = False always returns a 2-D array of axes, so the loop below also works
    # when a single model is selected
    fig, axes = plt.subplots(len(methods), 1, figsize = (25, len(methods)* 5),
                             sharex = True, gridspec_kw= {'hspace': 0.3},
                             squeeze = False
                            )
    for ax, method in zip(axes.ravel(), methods):
        FtreImp[method].plot.bar(ax = ax, color = 'tab:blue')
        ax.set_title(f"Feature Importance - {method}", **CFG.title_specs)
        ax.set(xlabel = '', ylabel = '')

    plt.xticks(rotation = 45)
    plt.tight_layout()
    plt.show()

    # Plotting the confusion matrix with the results:-
    # The predicted class is the digit at the end of the highest-probability column name
    oof_class = OOF_Preds.idxmax(axis=1).apply(lambda x: x[-1]).astype(np.int8)
    # The true labels are looked up by the out-of-fold index, so every prediction is paired
    # with its own row regardless of how the training rows are ordered
    oof_truth = ytrain.loc[oof_class.index]

    fig, ax = plt.subplots(1,1, figsize = (3,3))
    sns.heatmap(confusion_matrix(oof_truth.values, oof_class.values),
                cbar = None, annot= True, fmt = '.0f',
                annot_kws= {'fontweight': 'bold','fontsize': 6.75},
                cmap = 'Pastel1', linewidths = 2, ax = ax
               )
    ax.set_title(f"\nConfusion matrix\n", **CFG.title_specs)
    plt.show()

collect();
print();

<a id="8"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:black; font-size:120%; text-align:left;padding:3.0px; background: #cceeff; border-bottom: 8px solid #004466" > SUBMISSION<br> <div> 

In [None]:
%%time 

if CFG.ML == "Y":
    # CFG.tgt_mapper goes label -> code; the reverse lookup turns predicted codes into labels
    code_to_label = {code: label for label, code in CFG.tgt_mapper.items()}

    # The predicted class is the digit at the end of the highest-probability column name
    best_col   = Mdl_Preds.idxmax(axis=1)
    pred_codes = best_col.apply(lambda name: name[-1]).astype(np.int8)
    pp.sub_fl[CFG.target] = pred_codes.map(code_to_label).values

    PrintColor(f"\nTest set predictions\n")
    display(pp.sub_fl.head(5))

    # The submission file is written without the index column
    pp.sub_fl.to_csv(f"Submission_{CFG.version_nb}.csv", index = None)

    PrintColor(f"\nTest set prediction counts\n")
    pprint(Counter(pp.sub_fl[CFG.target]))

print()
# The trailing semicolon keeps the notebook from echoing the value returned by collect
collect();

<a id="9"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: #0052cc; border-bottom: 8px solid #cc9966" > NEXT STEPS<br> <div> 

<div style= "font-family: Cambria; letter-spacing: 0px; color:#000000; font-size:110%; text-align:left;padding:3.0px; background: #f2f2f2" >
1. Better feature engineering<br>
2. Incorporating the custom metric in GBM<br>
3. Try better CV oriented tuning<br>
4. Try some other methods<br>
</div>

# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:blue; font-size:80%; text-align:left;padding:3.0px; background: lightgrey" > If you find my work useful, please upvote and share the notebook.<br> Best regards!!<br> <div>
  