#### <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color: black; font-size:180%; text-align:left;padding:3.0px; background: #ffebcc; border-bottom: 8px solid black" > TABLE OF CONTENTS<br><div>  
* [IMPORTS AND INSTALLATIONS](#1)
* [INTRODUCTION](#2)
    * [UTILITIES](#2.1)
    * [DATASET DETAILS](#2.2)    
    * [CONFIGURATION](#2.3)
    * [VERSION DETAILS](#2.4)
* [PREPROCESSING](#3)
* [MODEL INFERENCING](#4) 
    * [MY LGBM-XGB MODELS](#4.1)
    * [LAMA DENSELIGHT MODEL](#4.2)
* [PLANNED NEXT STEPS](#5)  

<a id="1"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:white; font-size:120%; text-align:left;padding:3.0px; background: #b32d00; border-bottom: 8px solid black" > PACKAGE IMPORTS AND INSTALLATIONS<br> <div>

In [1]:
%%time

from IPython.display import clear_output;
from gc import collect;
!pip install -q "/kaggle/input/pythonlibrarieswheelfiles/scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl";
!pip install -q "/kaggle/input/pythonlibrarieswheelfiles/lightgbm-4.3.0-py3-none-manylinux_2_28_x86_64.whl";

print();
collect();

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spopt 0.6.0 requires shapely>=2.0.1, but you have shapely 1.8.5.post1 which is incompatible.[0m[31m
[0m
CPU times: user 929 ms, sys: 204 ms, total: 1.13 s
Wall time: 1min 11s


In [2]:
%%time

from gc import collect;
from warnings import filterwarnings;
filterwarnings('ignore');
from IPython.display import display_html, clear_output;
import ctypes;
libc = ctypes.CDLL("libc.so.6");

from pprint import pprint;
from functools import partial;

from copy import deepcopy;
import pandas as pd, numpy as np, polars as pl, os, joblib;
import polars.selectors as cs;

from os import path, walk, getpid;
from psutil import Process;
import re;
from collections import Counter;
from itertools import product;
from glob import glob;

from colorama import Fore, Style, init;
from warnings import filterwarnings;
filterwarnings('ignore');
from tqdm.notebook import tqdm;

print();
collect();


CPU times: user 563 ms, sys: 88.2 ms, total: 651 ms
Wall time: 1.04 s


In [3]:
%%time

# Pipeline specifics:-
from sklearn.model_selection import (StratifiedGroupKFold as SGKF, cross_val_score, cross_val_predict);
from sklearn.pipeline import Pipeline;
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin;

# ML Model training:-
from sklearn.metrics import roc_auc_score, make_scorer;
from xgboost import DMatrix, XGBClassifier as XGBC;
from lightgbm import log_evaluation, early_stopping, LGBMClassifier as LGBMC;
from catboost import CatBoostClassifier as CBC, Pool;
from sklearn.ensemble import VotingClassifier as VC;
clear_output();

import lightgbm as lgb, xgboost as xgb;
print(f"\nLightGBM = {lgb.__version__}| XGBoost = {xgb.__version__}\n\n");
collect();


LightGBM = 4.3.0| XGBoost = 2.0.3


CPU times: user 3.22 s, sys: 320 ms, total: 3.54 s
Wall time: 4.79 s


In [4]:
%%time

# Making sklearn pipeline outputs as dataframe:-
from sklearn import set_config;
set_config(transform_output = "pandas");
pd.set_option('display.max_columns', 50);
pd.set_option('display.max_rows', 50);
pd.set_option('display.precision', 3);

# Setting global configurations for polars:-
pl.Config.activate_decimals(True).set_tbl_hide_column_data_types(True);
pl.Config(**dict(tbl_formatting = 'ASCII_FULL_CONDENSED',
                 tbl_hide_column_data_types = True,
                 tbl_hide_dataframe_shape = True,
                 fmt_float = "mixed",
                 tbl_cell_alignment = 'CENTER',
                 tbl_hide_dtype_separator = True,
                 tbl_cols = 100,
                 tbl_rows = 50,
                 fmt_str_lengths = 100,
                )
         );

CPU times: user 367 µs, sys: 60 µs, total: 427 µs
Wall time: 1.58 ms


<a id="2"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:white; font-size:120%; text-align:left;padding:3.0px; background: #b32d00; border-bottom: 8px solid black" > INTRODUCTION<br> <div>

<a id="2.1"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:black; font-size:120%; text-align:left;padding:3.0px; background:  #e0ebeb; border-bottom: 8px solid #b32d00" > UTILITIES<br><div>

In [5]:
%%time

class Utility:
    """
    This class serves to do the below-
    1. Define method to print in color
    2. Define the classifier metric, custom scorer callable and competition metrics
    3. Define the garbage cleaning process
    4. Define the predict-in-batch method to prevent OOM issues
    """;

    def PrintColor(self,text:str, color = Fore.BLUE, style = Style.BRIGHT):
        "Prints color outputs using colorama using a text F-string";
        print(style + color + text + Style.RESET_ALL);

    def ScoreMetric(self, ytrue:np.array, ypred: np.array)-> float:
        """
        This method calculates the classifier metric to evaluate the base-model
        Inputs- ytrue, ypred:- np.array - input true and predictions arrays
        Output- float:- base classifier metric, here- GINI score
        """;
        return roc_auc_score(ytrue, ypred);

    def StabilityMetric(self, base, w_fallingrate=88.0, w_resstd=-0.5, week_lbl = "WEEK_NUM"):
        """
        This method defines the GINI-stability metric as required for the competition
        Source:- https://www.kaggle.com/code/darynarr/home-credit-drop-date-features
        """;

        grp_gini = \
        base.loc[:, [week_lbl, "target", "score"]].\
        sort_values(week_lbl).\
        groupby(week_lbl)[["target", "score"]].\
        apply(lambda x: 2*roc_auc_score(x["target"], x["score"])-1).tolist();

        x         = np.arange(len(grp_gini));
        y         = grp_gini;
        a, b      = np.polyfit(x, y, 1);
        y_hat     = a*x + b;
        residuals = y - y_hat;
        res_std   = np.std(residuals);
        avg_gini  = np.mean(gini_in_time);

        return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std;

    def CleanMemory(self):
        "This method cleans the memory off unused objects and displays the cleaned state RAM usage";

        collect();
        libc.malloc_trim(0);
        pid        = getpid();
        py         = Process(pid);
        memory_use = py.memory_info()[0] / 2. ** 30;
        return f"\nRAM usage = {memory_use :.4} GB";
    
    def PredictBatch(self, model, X: pd.DataFrame, prd_proba_req: bool = True, batch_size: int = 1000)-> np.array:
        """
        This method predicts from the model in batches instead of the complete test set to avoid OOM issues in the test set
        Inputs:-
        1. X:- Train/ Test set
        2. prd_proba_req:- need predict proba/ predictions- True [predict_proba], False[predict]
        3. batch_size:- batch size to consider in one attempt

        Returns:-
        preds:- array of predicted probabilities
        """;

        num_samples = len(X);
        num_batches = int(np.ceil(num_samples / batch_size));
        preds       = np.zeros((num_samples,));

        for batch_idx in range(num_batches):
            self.PrintColor(f"---> Processing batch: {batch_idx+1}/{num_batches}", color = Fore.CYAN);

            start_idx = batch_idx * batch_size;
            end_idx   = min((batch_idx + 1) * batch_size, num_samples);
            X_batch   = X.iloc[start_idx : end_idx];

            if prd_proba_req == True:
                batch_probs = model.predict_proba(X_batch)[:, 1];
            else:
                batch_probs = model.predict(X_batch).data.squeeze();

            preds[start_idx: end_idx] = batch_probs;
            _ = self.CleanMemory();

        return preds;
    
Utils = Utility();
print();



CPU times: user 71 µs, sys: 12 µs, total: 83 µs
Wall time: 86.8 µs


<a id="2.2"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:black; font-size:120%; text-align:left;padding:3.0px; background:  #e0ebeb; border-bottom: 8px solid #b32d00" > FOREWORD<br><div>

**Data columns**<br>
This is available in the original data description as below<br>
https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/data <br>
<br>**Competition details and notebook objectives**<br>
1. This is a binary classification challenge to predict home loan credit defaulters. **GINI** is the metric for the base classifier in this challenge<br>
2. We also have to additionally assess the stability of GINI measure across time in the evaluation period. We need to score the classifier on a weekly basis and then assess the stability of the weekly GINI score using a regression model against time. Stability measure penalizes models that wane off in prediction capabilities across time. <br>
2. In this starter notebook, we start the assignment with a simple preprocessing, understanding the data structure of the competition data, basic feature emgineering and develop starter models to initiate the challenge. We will also incorporate other opinions and approaches as we move along the challenge.<br>
<br>
**Model strategy** <br>
We start off with simple tree based ML models and a soft-voting ensemble with appropriate inference in the test set submission. 

<br>**Kernel and method strategy**<br>
1. We start off with a simple data transformer class that prepares the secondary features for the challenge. Considering the large size of the data, managing these tables within the confines of a Kaggle notebook environment will be a challenge, especially for model training <br> 
2. We verify the correctness of the data processor class on the train and test datasets, keeping track of execution time and memory usage too <br>
3. We then train ML models here using similar but slightly varying parameters and then blend them using heuristic weights. <br>
4. To prevent a deluge of tables and model objects in the inference pipeline, we synthesize an inherited voting classifier from each model, intaking the fold level model objects as inputs. These objects will be fed into the inference kernel and a combined prediction will be created <br>
5. Training kernel is placed [here](https://www.kaggle.com/code/ravi20076/homecredit-starter-training-v1) for perusal and possible replication <br>

<a id="2.3"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:black; font-size:120%; text-align:left;padding:3.0px; background:  #e0ebeb; border-bottom: 8px solid #b32d00" > CONFIGURATION<br><div>

In [6]:
%%time

# Configuration class:-
class CFG:
    """
    Configuration class for parameters and CV strategy for tuning and training
    """;
    
    exp_nb             = 1;
    version_nb         = 1;
    test_req           = "N";
    test_sample_frac   = 0.025;
    state              = 42;
    target             = 'target';
    train_path         = "/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train";
    test_path          = "/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test";
    path               = "/kaggle/input/home-credit-credit-risk-model-stability";
    model_path         = "/kaggle/input/homecreditquality2024ancillary";
    null_cutoff        = 0.95;
    cat_cutoff         = 200;
    n_splits           = 3 if test_req == "Y" else 5;
    n_repeats          = 1 ;
    nbrnd_erly_stp     = 100;
    all_exp_nb         = ["E2V1", "E2V2", "E2V3", "E2V5"];
    myml_inner_wgt     = [0.80, 0.10, 0.10];
    mymdl_wgt          = [0.10, 0.15, 0.20, 0.50];
    blend_wgt          = [0.70, 0.15, 0.15];

    # Global variables for plotting:-
    grid_specs = {'visible': True, 'which': 'both', 'linestyle': '--',
                  'color': 'lightgrey', 'linewidth': 0.75
                  };
    title_specs = {'fontsize': 9, 'fontweight': 'bold', 'color': 'tab:blue'};

print();
Utils.PrintColor(f"--> Configuration done!");
Utils.PrintColor(f"--> Sum of blend weights = {sum(CFG.blend_wgt):.2f}");

_ = Utils.PrintColor(Utils.CleanMemory());


[1m[34m--> Configuration done![0m
[1m[34m--> Sum of blend weights = 1.00[0m
[1m[34m
RAM usage = 0.3168 GB[0m
CPU times: user 118 ms, sys: 764 µs, total: 119 ms
Wall time: 118 ms


<a id="2.4"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:black; font-size:120%; text-align:left;padding:3.0px; background:  #e0ebeb; border-bottom: 8px solid #b32d00" > SUBMISSION HISTORY<br><div>

|Experiment <br> Number|Version|Details|Features| Models|CV score|Stability score| Public LB score|
|:-:|:-:|---|:-:|:-:|:-:|:-:|:-:|
|1|1|* Used my previous work including LGBM and LAMA models <br> * Aggregated experiments 2.1-2.6 |325-512|LGBM x 5||||

<a id="3"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:white; font-size:120%; text-align:left;padding:3.0px; background: #b32d00; border-bottom: 8px solid black" > PREPROCESSING<br> <div>

In [7]:
%%time

class DataXformer:
    """
    This is a comprehensive preprocessing and data transformer class that does the below-
    1. consumes the input data tables
    2. creates secondary features
    3. ensues memory efficient outputs for the model
    """;

    def __init__(self, null_cutoff: float, cat_cutoff: int,
                 TrainTest: str      = "test",
                 sel_cols: list      = [],
                 cat_cols: list      = [],
                 **kwarg
                 ):

        self.TrainTest   = TrainTest;
        self.null_cutoff = null_cutoff;
        self.cat_cutoff  = cat_cutoff;
        self.target      = CFG.target;
        self.sel_cols    = sel_cols;
        self.cat_cols    = cat_cols;

        if self.TrainTest == "train":
            self.path = CFG.train_path;
        else:
            self.path = CFG.test_path;

        Utils.PrintColor(f"\n{'='*10} {self.TrainTest.upper()} MODE {'='*10}\n", color = Fore.RED);

    def _TypeCastCols(self, df: pl.DataFrame):
        """
        This method casts the columns into the desired dtypes with basic date handling too
        Input- df- pl.DataFrame:- input data table
        Output- df:- pl.DataFrame:- dataframe with type-casting
        """;

        for col in df.columns:
            if col in ["case_id", "WEEK_NUM", "num_group1", "num_group2"]:
                df = df.with_columns(pl.col(col).cast(pl.Int64));
            elif col in ["date_decision"] or col[-1] in ("D",):
                df = df.with_columns(pl.col(col).cast(pl.Date));
            elif col[-1] in ("P", "A"):
                df = df.with_columns(pl.col(col).cast(pl.Float64));
            elif col[-1] in ("M",):
                df = df.with_columns(pl.col(col).cast(pl.String));

        return df;

    def _MakeDtFtre(self, df: pl.DataFrame):
        """
        This method creates date features from the provided dataframe
        Input- df- pl.DataFrame:- input data table
        Output- df- pl.DataFrame:- dataframe with date column FE
        """;

        for col in df.columns:
            if col.endswith("D"):
                df = df.with_columns(pl.col(col) - pl.col("date_decision"));
                df = df.with_columns(pl.col(col).dt.total_days());
                df = df.with_columns([pl.col("date_decision").dt.month().alias("month_nb").cast(pl.Int8),
                                      pl.col("date_decision").dt.weekday().alias("weekday_nb").cast(pl.Int8),
                                     ]
                                    );
        return df.drop("date_decision", "MONTH");

    def _MakeAgg(self, df: pl.DataFrame):
        """
        This method makes a set of aggregate expressions for group by on case id for depth > 0 tables

        Note:-
        1. We make [max, min, first, last] aggregations for all columns
        2. We make mean aggregation for columns ending with [P, A, D]
        3. We make mode aggregations for columns ending with [M]

        Input - df- pl.DataFrame:- input data table
        Output- all_agg:- list of aggregate expressions to be used with group_by case_id
        """;

        all_agg = [];
        df_cols = df.columns;

        all_agg.extend([method(col).alias(f"{method.__name__}_{col}") \
                        for method in [pl.max, pl.min, pl.first, pl.last] \
                        for col in df_cols if col[-1] in ("P", "A", "D", "M", "T", "L") or "num_group" in col
                        ]
                       );
        all_agg.extend([pl.col(col).mean().alias(f"mean_{col}") for col in df_cols if col.endswith(("P", "A", "D"))]);
        all_agg.extend([pl.col(col).drop_nulls().mode().first().alias(f"mode_{col}") for col in df_cols if col.endswith("M")]);
        return df.group_by("case_id").agg(all_agg);

    def _PreProcessIpTbl(self, path:str, depth: int, isSingle: bool, **kwarg):
        """
        This method does the below-
        1. Creates chunks for file loads if we have multiple files (isSingle = False)
        2. Concatenates the chunks to a single file with typecasting
        3. Aggregating on case id for depth > 0 tables
        """;

        if isSingle == False:
            components = [];
            for path in glob(str(path)):
                components.append(pl.scan_parquet(path).pipe(self._TypeCastCols));
            df = pl.concat(components, how = "vertical_relaxed");
        else:
            df = pl.scan_parquet(path).pipe(self._TypeCastCols);

        if depth > 0:
            return df.pipe(self._MakeAgg);
        else:
            return df;

    @staticmethod
    def _ReduceMem(df: pd.DataFrame):
        "This method reduces memory for numeric columns in the dataframe";

        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', "uint16", "uint32", "uint64"];
        start_mem = df.memory_usage().sum() / 1024**2;

        for col in df.columns:
            col_type = df[col].dtypes

            if col_type in numerics:
                c_min = df[col].min();
                c_max = df[col].max();

                if "int" in str(col_type):
                    if c_min >= np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min >= np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min >= np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min >= np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)
                else:
                    if c_min >= np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    if c_min >= np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)

        end_mem = df.memory_usage().sum() / 1024**2;

        Utils.PrintColor(f"Start - end memory:- {start_mem:5.2f} - {end_mem:5.2f} Mb");
        return df;

    def _MakeModelData(self, df_base, depth_0, depth_1, depth_2, **kwarg):
        """
        This method aggregates the input tables and joins them to make a single model table for the next steps
        It converts the final table to a pandas dataframe for the next steps, reduces the memory consumption and selects relevant columns
        """;

        for i, df in enumerate(depth_0 + depth_1 + depth_2):
            df_base = df_base.join(df, how = "left", on = "case_id", suffix = f"_{i}");
        df = df_base.pipe(self._MakeDtFtre);
        
        if self.TrainTest.lower() == "train":
            df = df.collect().to_pandas();
            df = self._ReduceMem(df.replace([np.inf, -1*np.inf], np.NaN));
            Utils.PrintColor(f"---> Selecting training columns by nulls and category unique values",
                             color = Fore.CYAN
                             );

            drop_cols = [];
            null_cols = df.drop(columns = [self.target], errors = "ignore").isna().mean();
            drop_cols.extend(null_cols.loc[null_cols >= self.null_cutoff].index.to_list());

            obj_cols = df.select_dtypes(include = "object").columns;
            for col in obj_cols:
                if df[col].nunique() >= self.cat_cutoff or df[col].nunique() == 1:
                    drop_cols.append(col);
            cat_cols  = [c for c in obj_cols if c not in drop_cols];

            df = df.drop(columns = drop_cols, errors = "ignore");
            df[cat_cols] = df[cat_cols].astype("category");

        else:
            Utils.PrintColor(f"---> Selecting all the test set columns to filter later");
        return df;

    def XformData(self, display_store: bool = False):
        """
        This is the cynosure method that aggregates all the inputs and prepares the FE dataset for the model training/ submission
        """;

        data_store = \
         {"df_base" : self._PreProcessIpTbl(os.path.join(self.path, f"{self.TrainTest}_base.parquet"), depth = 0, isSingle = True),

          "depth_0" : [self._PreProcessIpTbl(os.path.join(self.path, f"{self.TrainTest}_static_cb_0.parquet"), depth = 0, isSingle = True),
                       self._PreProcessIpTbl(os.path.join(self.path, f"{self.TrainTest}_static_0_*.parquet"), depth = 0, isSingle = False)
                      ],

          "depth_1": [self._PreProcessIpTbl(os.path.join(self.path, f"{self.TrainTest}_applprev_1_*.parquet"), depth = 1, isSingle = False),
                      self._PreProcessIpTbl(os.path.join(self.path, f"{self.TrainTest}_tax_registry_a_1.parquet"), depth = 1, isSingle = True),
                      self._PreProcessIpTbl(os.path.join(self.path, f"{self.TrainTest}_tax_registry_b_1.parquet"), depth = 1, isSingle = True),
                      self._PreProcessIpTbl(os.path.join(self.path, f"{self.TrainTest}_tax_registry_c_1.parquet"), depth = 1, isSingle = True),
                      self._PreProcessIpTbl(os.path.join(self.path, f"{self.TrainTest}_credit_bureau_b_1.parquet"), depth = 1, isSingle = True),
                      self._PreProcessIpTbl(os.path.join(self.path, f"{self.TrainTest}_other_1.parquet"), depth = 1, isSingle = True),
                      self._PreProcessIpTbl(os.path.join(self.path, f"{self.TrainTest}_person_1.parquet"), depth = 1, isSingle = True),
                      self._PreProcessIpTbl(os.path.join(self.path, f"{self.TrainTest}_deposit_1.parquet"), depth = 1, isSingle = True),
                      self._PreProcessIpTbl(os.path.join(self.path, f"{self.TrainTest}_debitcard_1.parquet"), depth = 1, isSingle = True)
                      ],

          "depth_2": [self._PreProcessIpTbl(os.path.join(self.path, f"{self.TrainTest}_credit_bureau_b_2.parquet"), depth = 2, isSingle = True)]
          };

        if display_store:
            Utils.PrintColor("\n---> Data store\n", color = Fore.CYAN);
            pprint(data_store, width = 200, depth = 3, indent = 5);

        df = self._MakeModelData(**data_store);
        del data_store;

        if self.TrainTest.lower() == "train":
            Utils.PrintColor(f"\n---> {self.TrainTest.capitalize()} set details = {df.shape} | {df.memory_usage().sum()/ 10**6 :,.2f} Mb\n",
                         color = Fore.CYAN
                         );
            Utils.PrintColor("\n---> Train set columns\n");
            with np.printoptions(linewidth = 160):
                pprint(np.array(df.drop(columns = [self.target], errors = "ignore").columns));

                Utils.PrintColor("\n---> Train set category columns\n", color = Fore.CYAN);
                pprint(np.array(df.select_dtypes(include = "category").columns));
        else:
            pass;
        return df;

Utils.PrintColor(Utils.CleanMemory());

[1m[34m
RAM usage = 0.3173 GB[0m
CPU times: user 119 ms, sys: 659 µs, total: 119 ms
Wall time: 119 ms


<a id="4"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:white; font-size:120%; text-align:left;padding:3.0px; background: #b32d00; border-bottom: 8px solid black" > MODEL INFERENCING<br> <div>

<a id="4.1"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:black; font-size:120%; text-align:left;padding:3.0px; background:  #e0ebeb; border-bottom: 8px solid #b32d00" > ML MODELS<br><div>

In [8]:
%%time

class VotingModelMaker(BaseEstimator, ClassifierMixin):
    """
    This class prepares a voting model from the individual fold level contributions
    Source - https://www.kaggle.com/code/greysky/home-credit-baseline
    """;

    def __init__(self, estimators: list):
        super().__init__()
        self.estimators = estimators;

    def fit(self, X, y=None):
        return self;

    def predict(self, X):
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0);

    def predict_proba(self, X):
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0);

CPU times: user 223 µs, sys: 35 µs, total: 258 µs
Wall time: 262 µs


In [9]:
%%time 

# Creating the test set features:-
sub_fl = pd.read_csv(os.path.join(CFG.path, "sample_submission.csv"));

# Creating output dataframe for model predictions:-
Mdl_Preds = pd.DataFrame(index = range(len(sub_fl)));

pp = DataXformer(TrainTest   = "test",
                 null_cutoff = CFG.null_cutoff,
                 cat_cutoff  = CFG.cat_cutoff,
                 sel_cols    = [],
                 cat_cols    = [],
                );
Xtest = pp.XformData(display_store = False);
_ = Utils.CleanMemory();

with np.printoptions(linewidth = 160):
    Utils.PrintColor(f"\n\n---> All files in the input path\n");
    for _, _, files in os.walk(CFG.model_path):
        pprint(np.array(files));
        
# Creating output structure for my models:-     
MyMdl_Preds = pd.DataFrame(index = sub_fl.index);

for exp_nb in CFG.all_exp_nb:
    Utils.PrintColor(f"---> Current experiment = {exp_nb}");
    cat_cols = joblib.load(os.path.join(CFG.model_path, f"SelCatCols_{exp_nb}.pkl")).to_list();
    sel_cols = joblib.load(os.path.join(CFG.model_path, f"SelCols_{exp_nb}.pkl")).to_list();
    
    Xt = pp._ReduceMem(Xtest.select(sel_cols).\
                       collect().\
                       to_pandas().\
                       drop(columns = ["WEEK_NUM", "case_id"], errors = "ignore")
                      );
    Xt[cat_cols] = Xt[cat_cols].astype("category");
    
    model_files = \
    sorted([f for f in files if f"{exp_nb}" in f and ("LGBM" in f or "XGB" in f) and f.endswith('.model')]);
    Utils.PrintColor(f"\n---> {model_files} | Data shape = {Xt.shape}", color = Fore.MAGENTA);
    preds = pd.DataFrame(index = Mdl_Preds.index, columns = model_files);
    
    print();
    for f in model_files:
        print(f);
        model    = joblib.load(os.path.join(CFG.model_path, f));
        preds[f] = Utils.PredictBatch(model, Xt);
    del Xt;
    _ = Utils.CleanMemory();
    
    try:
        MyMdl_Preds[exp_nb] = np.average(preds.values, axis=1, weights = CFG.myml_inner_wgt);
    except:
        MyMdl_Preds[exp_nb] = np.mean(preds.values, axis=1);
 
Mdl_Preds["MyLGBM"] = np.average(MyMdl_Preds.values, axis=1, weights = CFG.mymdl_wgt);
del MyMdl_Preds;

Utils.CleanMemory();

[1m[31m
[0m
[1m[34m---> Selecting all the test set columns to filter later[0m
[1m[34m

---> All files in the input path
[0m
array(['VC_E2V3_LGBM1C.model', 'VC_V1_LGBM1C.model', 'VC_E2V3_LGBM2C.model', 'SelCatCols_E2V5.pkl', 'SelCols_E2V1.pkl', 'VC_E2V4_LGBM2C.model',
       'VC_E2V1_LGBM1C.model', 'VC_E1V2_LGBM4C.model', 'E2V6_DENSELIGHT.model', 'SelCatCols_E2V3.pkl', 'VC_E1V2_LGBM5C.model', 'VC_E1V2_LGBM3C.model',
       'VC_E2V3_XGB1C.model', 'VC_E1V2_LGBM2C.model', 'VC_E1V4_XGB2C.model', 'VC_E2V4_XGB1C.model', 'VC_E2V5_LGBM1C.model', 'VC_E1V4_XGB5C.model',
       'VC_E2V5_XGB1C.model', 'VC_E2V5_LGBM4C.model', 'SelCols_E2V2.pkl', 'SelCatCols_E2V4.pkl', 'VC_V1_LGBM3C.model', 'VC_E1V3_LGBM4C.model',
       'VC_E1V3_LGBM1C.model', 'VC_V1_LGBM4C.model', 'E2V5_DENSELIGHT.model', 'VC_E1V2_LGBM1C.model', 'SelCatCols_E2V1.pkl', 'VC_E2V1_XGB1C.model',
       'VC_E2V1_LGBM2C.model', 'SelCols_E2V5.pkl', 'VC_E2V2_XGB1C.model', 'VC_E2V5_LGBM2C.model', 'SelCols_E2V3.pkl', 'SelCatCols_E2V6

<a id="4.2"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:black; font-size:120%; text-align:left;padding:3.0px; background:  #e0ebeb; border-bottom: 8px solid #b32d00" > LAMA DENSELIGHT MODEL<br><div>

In [10]:
%%time 

!pip install --no-index -Uq --find-links=/kaggle/input/lightautoml-038-dependencies lightautoml==0.3.8 -q;
from lightautoml.automl.presets.tabular_presets import TabularAutoML;
from lightautoml.tasks import Task;
clear_output();

model    = joblib.load("/kaggle/input/homecreditquality2024ancillary/E2V5_DENSELIGHT.model");
sel_cols = joblib.load("/kaggle/input/homecreditquality2024ancillary/SelCols_E2V5.pkl");
cat_cols = joblib.load("/kaggle/input/homecreditquality2024ancillary/SelCatCols_E2V5.pkl");
Xtest    = pp.XformData(display_store = False);
Xtest    = pp._ReduceMem(Xtest.select(sel_cols).collect().to_pandas());
Xtest[cat_cols] = Xtest[cat_cols].astype("category");
Mdl_Preds["LAMA1"] = Utils.PredictBatch(model, Xtest, prd_proba_req = False);
del Xtest;
_ = Utils.CleanMemory();

print("\n\n\n");
model    = joblib.load("/kaggle/input/homecreditquality2024ancillary/E2V6_DENSELIGHT.model");
sel_cols = joblib.load("/kaggle/input/homecreditquality2024ancillary/SelCols_E2V6.pkl");
cat_cols = joblib.load("/kaggle/input/homecreditquality2024ancillary/SelCatCols_E2V6.pkl");
Xtest    = pp.XformData(display_store = False);
Xtest    = pp._ReduceMem(Xtest.select(sel_cols).collect().to_pandas());
Xtest[cat_cols] = Xtest[cat_cols].astype("category");
Mdl_Preds["LAMA2"] = Utils.PredictBatch(model, Xtest, prd_proba_req = False);
del Xtest;
_ = Utils.CleanMemory(); 

[1m[34m---> Selecting all the test set columns to filter later[0m
[1m[34mStart - end memory:-  0.04 -  0.03 Mb[0m
[1m[36m---> Processing batch: 1/1[0m




[1m[34m---> Selecting all the test set columns to filter later[0m
[1m[34mStart - end memory:-  0.03 -  0.02 Mb[0m
[1m[36m---> Processing batch: 1/1[0m
CPU times: user 36.5 s, sys: 1.94 s, total: 38.4 s
Wall time: 3min 12s


<a id="4.6"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:black; font-size:120%; text-align:left;padding:3.0px; background:  #e0ebeb; border-bottom: 8px solid #b32d00" > FINAL PREDICTIONS<br><div>

In [11]:
%%time 

import pandas as pd;
print(f"Pandas = {pd.__version__}");

Utils.PrintColor(f"\n---> Model predictions across all options\n");
display(Mdl_Preds.head(10).style.format(precision = 4));

sub_fl["score"] = np.average(Mdl_Preds.values, axis=1, weights = CFG.blend_wgt);

del Mdl_Preds;
_ = Utils.CleanMemory();

sub_fl = pl.DataFrame(sub_fl.reset_index());  
Utils.PrintColor(f"\n\n---> Final submission file\n");
display(sub_fl.head(10));

# Saving the submission file for leaderboard:-
sub_fl.select(["case_id", "score"]).write_csv(f"submission.csv",);     
_ = Utils.CleanMemory(); 

Pandas = 2.2.0
[1m[34m
---> Model predictions across all options
[0m


Unnamed: 0,MyLGBM,LAMA1,LAMA2
0,0.0068,0.0047,0.0112
1,0.0155,0.0445,0.038
2,0.0064,0.0037,0.0014
3,0.0135,0.0063,0.0073
4,0.0946,0.0577,0.0518
5,0.0053,0.0023,0.0025
6,0.0392,0.001,0.0021
7,0.0163,0.0004,0.0003
8,0.1737,0.0243,0.0177
9,0.036,0.005,0.0054


[1m[34m

---> Final submission file
[0m


index,case_id,score
0,57543,0.007135
1,57549,0.023253
2,57551,0.005239
3,57552,0.011468
4,57569,0.082646
5,57630,0.004405
6,57631,0.027925
7,57632,0.011522
8,57633,0.127886
9,57634,0.026719


CPU times: user 572 ms, sys: 5.85 ms, total: 578 ms
Wall time: 584 ms


<a id="5"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:white; font-size:120%; text-align:left;padding:3.0px; background: #b32d00; border-bottom: 8px solid black" > PLANNED NEXT STEPS<br> <div>

<div style= "font-family: Cambria; letter-spacing: 0px; color:#000000; font-size:110%; text-align:left;padding:3.0px; background: #f2f2f2" >
1. We need to understand the data structure first. Exploring through all the files and understanding the columns is key<br>
2. The importance of a good EDA cannot be described enough in words in such a challenge <br>
3. Developing a better set of models with better feature choices is key <br>
4. Understanding the stability metric and incorporating it in the training and inferencing pipeline is also key <br>
</div>