# **LIBRARIES**

In [1]:
import os
import gc
import time
import numpy as np
import pandas as pd
from contextlib import contextmanager
import multiprocessing as mp
from functools import partial
from scipy.stats import kurtosis, iqr, skew
import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
from glob import glob
from pathlib import Path
from datetime import datetime
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import roc_curve, auc
from sklearn.feature_selection import SelectKBest, f_classif
from tqdm.notebook import tqdm
import joblib
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)


# **CONFIGURATION**
<a id='configuration'></a>

[CONFIGURATION](#configuration) 

[MAIN FUNCTION](#main_function)

[MODEL](#model)

[EXECUTION](#execution)

In [2]:
# GENERAL CONFIGURATION
NUM_THREADS = 4
DATA_DIRECTORY = "/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/"
SUBMISSION_SUFIX = "_model_2.1_31"
ROOT = Path("/kaggle/input/home-credit-credit-risk-model-stability")

# RUN-MODE SWITCHES
BALANCE_COLUMNS = False  # set True for submission, set False for debug
SHOW_REPORT = False  # when True, gini_stability() also draws its diagnostic plot
SELECTKBEST = False  # when True, main() runs selectkbestX() and returns before training
EXPORT_DATAFRAME = False  # when True, main() writes the processed frame to parquet and returns
IMPORT_DATAFRAME = False  # when True, main() loads a previously exported frame instead of rebuilding it
GENERATE_SUBMISSION_FILES = True
EVALUATE_VALIDATION_SET = True

# CROSS-VALIDATION / TRAINING CONTROL
STRATIFIED_KFOLD = True
RANDOM_SEED = 2424
NUM_FOLDS = 5
EARLY_STOPPING = 100

# LIGHTGBM HYPER-PARAMETERS
# The original literal listed "verbose" twice with the same value; a duplicate
# key in a dict literal is silently collapsed, so only one entry is kept here.
LIGHTGBM_PARAMS = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 10,
    "learning_rate": 0.05,
    "n_estimators": 2000,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "verbose": -1,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees": True,
    "num_leaves": 64,
    "device": "cpu",
    "deterministic": True,
}

### Set aggregations

In [3]:
# Aggregation specs: column name -> list of aggregation function names.
# Key order is kept as-is because it determines the order of generated columns.

# Previous applications, depth 1.
APPLPREV1_AGG = {
    "num_group1": ["count"],
    "actualdpd_943P": ["mean", "last"],
    "annuity_853A": ["max", "mean", "last"],
    "approvaldate_319D": ["max", "mean", "last"],
    "byoccupationinc_3656910L": ["max"],
    "cancelreason_3545846M": ["max", "last"],
    "childnum_21L": ["max"],
    "creationdate_885D": ["mean", "last"],
    # disabled: 'credacc_actualbalance_314A': ['min','max','mean','sum'],
    "credacc_credlmt_575A": ["max", "mean", "last"],
    # disabled: 'credacc_maxhisbal_375A': ['min','max','mean','sum'],
    # disabled: 'credacc_minhisbal_90A': ['min','max','mean','sum'],
    "credacc_status_367L": ["max"],
    # disabled: 'credacc_transactions_402L': ['min','max','mean','sum'],
    "credamount_590A": ["max", "mean", "last"],
    "credtype_587L": ["max", "last"],
    "currdebt_94A": ["max", "mean", "last"],
    "dateactivated_425D": ["max", "mean", "last"],
    "district_544M": ["max"],
    "downpmt_134A": ["max", "mean", "last"],
    "dtlastpmt_581D": ["max", "mean"],
    "dtlastpmtallstes_3545839D": ["max", "mean", "last"],
    "education_1138M": ["max", "last"],
    "employedfrom_700D": ["max", "last"],
    "familystate_726L": ["max", "last"],
    "firstnonzeroinstldate_307D": ["max", "mean", "last"],
    "inittransactioncode_279L": ["max", "last"],
    "isbidproduct_390L": ["max", "last"],
    # disabled: 'isdebitcard_527L': ['min','max','mean','sum'],
    "mainoccupationinc_437A": ["max", "mean", "last"],
    "maxdpdtolerance_577P": ["mean", "last"],
    "outstandingdebt_522A": ["max", "mean", "last"],
    "pmtnum_8L": ["max", "last"],
    "postype_4733339M": ["max", "last"],
    # disabled: 'profession_152M':['max'],
    "rejectreason_755M": ["max", "last"],
    "rejectreasonclient_4145042M": ["max", "last"],
    # disabled: 'revolvingaccount_394A': ['min','max','mean','sum'],
    "status_219L": ["max", "last"],
    # disabled: 'tenor_203L': ['min','max','mean','sum'],
}

# Previous applications, depth 2.
APPLPREV2_AGG = {
    "num_group1": ["count"],
    "num_group2": ["count"],
    "conts_type_509L": ["max", "last"],
    "cacccardblochreas_147M": ["max", "last"],
    "credacc_cards_status_52L": ["max"],
}
# Person table, depth 1: column name -> list of aggregation function names.
PERSON1_AGG = {
    "num_group1": ["count"],
    "birth_259D": ["max", "last"],
    # disabled: 'childnum_185L':['max','mean','min'],
    "contaddr_district_15M": ["max"],
    "contaddr_matchlist_1032L": ["max", "last"],
    "contaddr_smempladdr_334L": ["max", "last"],
    "contaddr_zipcode_807M": ["max"],
    "education_927M": ["max", "last"],
    "empl_employedfrom_271D": ["max", "mean", "min"],
    "empl_employedtotal_800L": ["max"],
    "empl_industry_691L": ["max"],
    # NOTE(review): 'mean' is requested for two M-suffixed columns, which
    # Pipeline.set_table_dtypes casts to String - confirm this is intended.
    "empladdr_district_926M": ["max", "min", "mean", "last"],
    "empladdr_zipcode_114M": ["max", "min", "mean", "last"],
    "familystate_447L": ["max", "count"],
    # disabled: 'gender_992L'
    "housetype_905L": ["max"],
    # disabled: 'housingtype_772L'
    "incometype_1044T": ["max", "last"],
    # disabled: 'isreference_387L'
    "language1_981M": ["max", "last"],
    "mainoccupationinc_384A": ["max", "mean", "min", "count", "last"],
    # disabled: 'maritalst_703L'
    "personindex_1023L": ["max", "mean", "min", "count", "sum", "last"],
    "persontype_1072L": ["max", "mean", "min", "count", "sum", "last"],
    "persontype_792L": ["max", "mean", "min", "count", "sum"],
    # disabled: 'registaddr_district_1083M'
    # disabled: 'registaddr_zipcode_184M'
    "relationshiptoclient_415T": ["max", "count"],
    "relationshiptoclient_642T": ["max", "count", "last"],
    "remitter_829L": ["max"],
    "role_1084L": ["max", "count", "last"],
    # disabled: 'role_993L'
    "safeguarantyflag_411L": ["max", "last"],
    "sex_738L": ["max", "last"],
    "type_25L": ["max", "last"],
}

# Person table, depth 2.
PERSON2_AGG = {
    "num_group1": ["count"],
    "num_group2": ["count"],
    # disabled: 'addres_district_368M'
    # disabled: 'addres_role_871L'
    # disabled: 'addres_zip_823M'
    "conts_role_79M": ["max", "last"],
    "empls_economicalst_849M": ["max", "last"],
    # disabled: 'empls_employedfrom_796D'
    "empls_employer_name_740M": ["max", "last"],
    # disabled: 'relatedpersons_role_762T'
}
# "Other" table: column name -> list of aggregation function names.
OTHER_AGG = {
    "num_group1": ["count"],
    "amtdebitincoming_4809443A": ["max", "mean", "min", "count", "sum"],
    "amtdebitoutgoing_4809440A": ["max", "mean", "min", "count", "sum"],
    # disabled: 'amtdepositbalance_4809441A'
    # disabled: 'amtdepositincoming_4809444A'
    # disabled: 'amtdepositoutgoing_4809442A'
}

# Debit card table.
DEBITCARD_AGG = {
    "num_group1": ["count"],
    # disabled: 'last180dayaveragebalance_704A'
    # disabled: 'last180dayturnover_1134A'
    # disabled: 'last30dayturnover_651A'
    "openingdate_857D": ["min", "max", "mean"],
}

# Tax registry tables (providers a, b and c).
TAX_REGISTRY_A_AGG = {
    "num_group1": ["count"],
    "amount_4527230A": ["max", "mean", "min", "sum"],
    "name_4527232M": ["max"],
    "recorddate_4527225D": ["max", "mean", "min"],
}

TAX_REGISTRY_B_AGG = {
    "num_group1": ["count"],
    "amount_4917619A": ["min", "mean", "max", "sum"],
    "deductiondate_4917603D": ["max", "mean", "min"],
    "name_4917606M": ["max"],
}

TAX_REGISTRY_C_AGG = {
    "num_group1": ["count"],
    "employername_160M": ["max"],
    "pmtamount_36A": ["min", "mean", "max", "sum", "last"],
    "processingdate_168D": ["mean", "min", "max", "last"],
}
# Credit bureau provider "a", depth 1: column name -> list of aggregation function names.
CREDIT_BUREAU_A_1_AGG = {
    "num_group1": ["count"],
    # disabled: 'annualeffectiverate_199L'
    # disabled: 'annualeffectiverate_63L'
    "classificationofcontr_13M": ["max", "last"],
    "classificationofcontr_400M": ["max", "last"],
    "contractst_545M": ["max", "last"],
    "contractst_964M": ["max", "last"],
    # disabled: 'contractsum_5085717L'
    "credlmt_230A": ["mean"],
    "credlmt_935A": ["mean", "min", "max"],
    "dateofcredend_289D": ["mean"],
    "dateofcredend_353D": ["mean", "max"],
    "dateofcredstart_181D": ["max"],
    "dateofcredstart_739D": ["mean"],
    "dateofrealrepmt_138D": ["mean", "max"],
    "debtoutstand_525A": ["max"],
    "debtoverdue_47A": ["max"],
    "description_351M": ["max", "last"],
    "dpdmax_757P": ["mean"],
    "dpdmaxdatemonth_442T": ["max"],
    "dpdmaxdatemonth_89T": ["max"],
    "dpdmaxdateyear_596T": ["max"],
    "dpdmaxdateyear_896T": ["max"],
    "financialinstitution_382M": ["max", "last"],
    "financialinstitution_591M": ["max", "last"],
    "instlamount_768A": ["mean"],
    # disabled: 'instlamount_852A'
    # disabled: 'interestrate_508L'
    "lastupdate_1112D": ["mean", "max"],
    "lastupdate_388D": ["mean", "max"],
    "monthlyinstlamount_332A": ["mean"],
    "monthlyinstlamount_674A": ["mean", "max"],
    "nominalrate_281L": ["max"],
    "nominalrate_498L": ["max"],
    "numberofcontrsvalue_258L": ["max"],
    "numberofcontrsvalue_358L": ["max"],
    "numberofinstls_229L": ["max"],
    "numberofinstls_320L": ["max"],
    "numberofoverdueinstls_834L": ["max"],
    "numberofoutstandinstls_520L": ["max"],
    "numberofoutstandinstls_59L": ["max"],
    "numberofoverdueinstlmax_1039L": ["max"],
    "numberofoverdueinstlmax_1151L": ["max"],
    "numberofoverdueinstlmaxdat_148D": ["max"],
    "numberofoverdueinstlmaxdat_641D": ["mean"],
    "numberofoverdueinstls_725L": ["max"],
    "outstandingamount_354A": ["mean"],
    "outstandingamount_362A": ["mean"],
    "overdueamountmaxdatemonth_284T": ["max"],
    "overdueamountmaxdatemonth_365T": ["max"],
    "overdueamountmaxdateyear_2T": ["max"],
    "overdueamountmaxdateyear_994T": ["max"],
    "overdueamount_31A": ["mean"],
    "overdueamountmax_35A": ["mean"],
    "overdueamountmax2_398A": ["mean"],
    "overdueamountmax2date_1002D": ["mean"],
    "overdueamountmax2date_1142D": ["max"],
    "overdueamount_659A": ["mean"],
    "overdueamountmax2_14A": ["mean"],
    "periodicityofpmts_1102L": ["max"],
    "periodicityofpmts_837L": ["max"],
    "prolongationcount_1120L": ["max"],
    "prolongationcount_599L": ["max"],
    "purposeofcred_426M": ["max", "last"],
    "purposeofcred_874M": ["max", "last"],
    "refreshdate_3813885D": ["mean", "max", "last"],
    "residualamount_488A": ["max"],
    "residualamount_856A": ["mean"],
    "subjectrole_182M": ["max", "last"],
    "subjectrole_93M": ["max", "last"],
    "totalamount_6A": ["mean", "max"],
    "totalamount_996A": ["mean"],
    "totaldebtoverduevalue_718A": ["mean"],
    "totaloutstanddebtvalue_39A": ["mean"],
    "totaloutstanddebtvalue_668A": ["mean"],
}

# Credit bureau provider "b", depth 1: only the row count is aggregated.
CREDIT_BUREAU_B_1_AGG = {
    "num_group1": ["count"],
}
# Credit bureau provider "a", depth 2: column name -> list of aggregation function names.
# The two num_group counters are deliberately kept last to preserve key order.
CREDIT_BUREAU_A_2_AGG = {
    "collater_typofvalofguarant_298M": ["max", "last"],
    "collater_typofvalofguarant_407M": ["max", "last"],
    "collaterals_typeofguarante_359M": ["max", "last"],
    "collaterals_typeofguarante_669M": ["max", "last"],
    "collater_valueofguarantee_1124L": ["max"],
    "collater_valueofguarantee_876L": ["max"],
    "pmts_dpd_1073P": ["mean"],
    "pmts_dpd_303P": ["mean", "max"],
    "pmts_month_158T": ["max", "last"],
    "pmts_month_706T": ["max", "last"],
    "pmts_overdue_1140A": ["mean"],
    "pmts_overdue_1152A": ["mean"],
    "pmts_year_1139T": ["max", "last"],
    "pmts_year_507T": ["max", "last"],
    "subjectroles_name_541M": ["max", "last"],
    # NOTE(review): 'mean' is requested for an M-suffixed column, which
    # Pipeline.set_table_dtypes casts to String - confirm this is intended.
    "subjectroles_name_838M": ["min", "mean", "max", "count", "last"],
    "num_group1": ["count"],
    "num_group2": ["count"],
}

# Credit bureau provider "b", depth 2.
CREDIT_BUREAU_B_2_AGG = {
    "num_group1": ["count"],
    "num_group2": ["count"],
    "pmts_date_1107D": ["min", "mean", "max"],
    "pmts_dpdvalue_108P": ["min", "mean", "max"],
    "pmts_pmtsoverdue_635A": ["min", "mean", "max"],
}

# **MAIN FUNCTION**
<a id='main_function'></a>

[CONFIGURATION](#configuration) 

[MAIN FUNCTION](#main_function)

[MODEL](#model)

[EXECUTION](#execution)

In [4]:
def _join_source(df, case_ids, loader, label, suffix, num_rows):
    """Load one source table, keep only the base case_ids and left-join it onto df.

    Parameters:
    df: the accumulated polars frame built so far.
    case_ids: the unique case_id values of the base frame.
    loader: callable invoked as loader(DATA_DIRECTORY, num_rows=num_rows).
    label (str): text placed before " dataframe shape:" in the progress message.
    suffix (str): suffix polars appends to clashing column names in the join.
    num_rows: row limit forwarded to the loader (None means no limit).

    Returns:
    The joined frame.
    """
    df_source = loader(DATA_DIRECTORY, num_rows=num_rows)
    df_source = df_source.filter(pl.col('case_id').is_in(case_ids))
    df = df.join(df_source, on='case_id', how='left', suffix=suffix)
    print(f"{label} dataframe shape:", df_source.shape)
    print("DATAFRAME shape:", df.shape)
    # Release the source table before the next one is loaded.
    del df_source
    gc.collect()
    return df


def main(debug= False):
    """Run the whole notebook pipeline: build (or import) the dataset, train, report, submit.

    Parameters:
    debug (bool): when True every loader is called with num_rows=50111,
        otherwise with num_rows=None.

    Returns:
    (df, model) after a full run. Returns None when EXPORT_DATAFRAME or
    SELECTKBEST is set, because those modes stop before model training.
    """
    num_rows = 50111 if debug else None
    print("Notebook started:")
    if not IMPORT_DATAFRAME:

        with timer("base"):
            df = get_base(DATA_DIRECTORY, num_rows=num_rows)
            print("base dataframe shape:", df.shape)

        # Every join below is a left join on case_id, so the set of case_ids
        # in df never changes; compute it once instead of once per table.
        case_ids = df['case_id'].unique()

        # (timer name, label used in the shape message, loader, join suffix)
        sources = (
            ("static", "static", get_static, '_static'),
            ("static_cb", "static cb", get_static_cb, '_static_cb'),
            ("Previous applications depth 1 test", "Previous applications depth 1 test", get_applprev1, '_applprev1'),
            ("Previous applications depth 2 test", "Previous applications depth 2 test", get_applprev2, '_applprev2'),
            ("Person depth 1 test", "Person depth 1 test", get_person1, '_person1'),
            ("Person depth 2 test", "Person depth 2 test", get_person2, '_person2'),
            ("Other test", "Other test", get_other, '_other'),
            ("Debit card test", "Debit card test", get_debitcard, '_debitcard'),
            ("Tax registry a test", "Tax registry a test", get_tax_registry_a, '_tax_registry_a'),
            ("Tax registry b test", "Tax registry b test", get_tax_registry_b, '_tax_registry_b'),
            ("Tax registry c test", "Tax registry c test", get_tax_registry_c, '_tax_registry_c'),
            ("Credit bureau a 1 test", "Credit bureau a 1 test", get_credit_bureau_a_1, '_cb_a_1'),
            ("Credit bureau b 1 test", "Credit bureau b 1 test", get_credit_bureau_b_1, '_cb_b_1'),
            ("Credit bureau a 2 test", "Credit bureau a 2 test", get_credit_bureau_a_2, '_cb_a_2'),
            ("Credit bureau b 2 test", "Credit bureau b 2 test", get_credit_bureau_b_2, '_cb_b_2'),
        )
        for timer_name, label, loader, suffix in sources:
            with timer(timer_name):
                df = _join_source(df, case_ids, loader, label, suffix, num_rows)

        with timer("Feature engineering / preprocessing"):
            df = feature_engineering(df)
            get_info(df)
            df_pandas, _ = to_pandas(df)
            # Drop the polars frame before continuing with the pandas copy.
            del df
            gc.collect()
            df = df_pandas
            #df=reduce_mem_usage(df)
            print("DATAFRAME shape:", df.shape)
    else:
        with timer("Importing processed dataframe"):
            df = pd.read_parquet("/kaggle/input/home-credit-2024-additional-dataset/processed_debug.parquet")

            # Every non-numeric column is turned into a pandas categorical.
            for col in df.select_dtypes(exclude=['number']).columns:
                df[col] = df[col].astype('category')
            print(df.dtypes.value_counts())
            #df=reduce_mem_usage(df)

            print("DATAFRAME shape:", df.shape)

    if EXPORT_DATAFRAME:
        with timer("Export dataframe"):
            df.to_parquet("/kaggle/working/processed_debug.parquet", index=False)
            print("NOTEBOOK HAS BEEN SUCCESSFULLY EXECUTED !!!")
            return

    if SELECTKBEST:
        with timer("SelectKBest feature research"):
            selectkbestX(df)
            print("NOTEBOOK HAS BEEN SUCCESSFULLY EXECUTED !!!")
            return

    with timer("Model training"):
        del_features = ['target', 'case_id','WEEK_NUM']
        predictors = [name for name in df.columns if name not in del_features]
        # Both branches above have already cast their object columns to
        # "category", so selecting "object" here (as the code used to) always
        # produced an empty list. Select the categorical columns instead.
        cat_cols = list(df.select_dtypes("category").columns)
        model = kfold_lightgbm_sklearn(df, cat_cols)

    with timer("Feature importance assesment"):
        get_features_importances(predictors, model)

    with timer("Submission"):
        if GENERATE_SUBMISSION_FILES:
            if generate_submission_file(df, model):
                print("Submission file has been created.")

    print("NOTEBOOK HAS BEEN SUCCESSFULLY EXECUTED !!!")

    return df, model
    
    
    

# **UTILITY FUNCTIONS**

### Pipeline

In [5]:
class Pipeline:
    """Stateless preprocessing steps for polars frames; every step returns a new frame."""

    @staticmethod
    def set_table_dtypes(df):
        """Cast columns to a dtype chosen from the column name.

        case_id, WEEK_NUM, num_group1 and num_group2 become Int64, date_decision
        becomes Date, and otherwise the last character decides: P/A -> Float64,
        M -> String, D -> Date. All other columns are left untouched.
        """
        casts = []
        for col in df.columns:
            if col in ("case_id", "WEEK_NUM", "num_group1", "num_group2"):
                dtype = pl.Int64
            elif col == "date_decision":
                dtype = pl.Date
            elif col.endswith(("P", "A")):
                dtype = pl.Float64
            elif col.endswith("M"):
                dtype = pl.String
            elif col.endswith("D"):
                dtype = pl.Date
            else:
                continue
            casts.append(pl.col(col).cast(dtype))

        # One with_columns call for all casts instead of one new frame per column.
        return df.with_columns(casts) if casts else df

    @staticmethod
    def handle_dates(df):
        """Replace every D-suffixed column by its distance in days from date_decision.

        Also adds month_decision and weekday_decision (taken from date_decision)
        and then drops the date_decision and MONTH columns.
        """
        date_cols = [col for col in df.columns if col.endswith("D")]
        if date_cols:
            df = df.with_columns(
                [(pl.col(col) - pl.col("date_decision")).dt.total_days() for col in date_cols]
            )

        df = df.with_columns(
            pl.col("date_decision").dt.month().alias("month_decision"),
            pl.col("date_decision").dt.weekday().alias("weekday_decision"),
        )
        return df.drop("date_decision", "MONTH")

    @staticmethod
    def filter_cols(df, null_threshold=0.95, max_unique=200):
        """Drop mostly-null columns and String columns with unusable cardinality.

        Parameters:
        df: polars DataFrame to filter.
        null_threshold (float): a column is dropped when its null fraction is
            strictly greater than this value.
        max_unique (int): a String column is dropped when it has exactly one
            distinct value or more than this many.

        target, case_id and WEEK_NUM are never dropped.
        """
        protected = ("target", "case_id", "WEEK_NUM")

        mostly_null = []
        for col in df.columns:
            if col in protected:
                continue
            null_fraction = df[col].is_null().mean()
            # mean() yields None for a zero-row frame; nothing to judge then.
            if null_fraction is not None and null_fraction > null_threshold:
                mostly_null.append(col)
        df = df.drop(mostly_null)

        bad_cardinality = []
        for col in df.columns:
            if col in protected or df[col].dtype != pl.String:
                continue
            freq = df[col].n_unique()
            if freq == 1 or freq > max_unique:
                bad_cardinality.append(col)

        return df.drop(bad_cardinality)

In [6]:
def get_info(dataframe):
    """
    Print the shape of a Polars DataFrame followed by one line per column
    with its data type and the percentage of null (missing) values.

    The percentage is based on null_count(), i.e. nulls; it is reported as
    0.00% for a frame without rows instead of dividing by zero.

    Parameters:
    dataframe (polars.DataFrame): The DataFrame to analyze.

    Returns:
    None
    """
    separator = "-" * 60

    print("DataFrame Shape:", dataframe.shape)
    print(separator)
    print("{:<50} {:<30} {:<20}".format("Column Name", "Data Type", "NaN Percentage"))
    print(separator)

    total_rows = len(dataframe)

    for column in dataframe.columns:
        series = dataframe[column]
        dtype = str(series.dtype)
        null_count = series.null_count()

        # Guard the division: an empty frame has no rows to take a share of.
        null_percentage = (null_count / total_rows) * 100 if total_rows else 0.0

        print("{:<50} {:<30} {:.2f}%".format(column, dtype, null_percentage))


In [7]:
def to_pandas(df_data, cat_cols=None):
    """Convert a frame to pandas and cast the categorical columns to "category".

    Parameters:
    df_data: object exposing to_pandas() (a polars DataFrame in this notebook).
    cat_cols (list | None): columns to cast; when None, every column whose
        pandas dtype is "object" after conversion is used.

    Returns:
    (pandas.DataFrame, list): the converted frame and the columns that were cast.
    """
    frame = df_data.to_pandas()

    categorical = cat_cols if cat_cols is not None else list(frame.select_dtypes("object").columns)
    frame[categorical] = frame[categorical].astype("category")

    return frame, categorical

In [8]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of a pandas DataFrame to the narrowest fitting dtype.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered. A target dtype is accepted when the column minimum and
    maximum lie strictly inside that dtype's range. The frame is modified in
    place and also returned.

    NOTE: floats may be narrowed to float16, which keeps the value range but
    loses precision.

    Parameters:
    df (pandas.DataFrame): frame to shrink (mutated in place).
    verbose (bool): when True, print the resulting memory usage and the saving.

    Returns:
    pandas.DataFrame: the same frame object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # First integer width whose open range contains the data; if none
            # does, the column keeps its current dtype.
            for target in int_targets:
                limits = np.iinfo(target)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            # Comparisons with NaN are False, so an all-NaN column ends up float64.
            chosen = np.float64
            for target in float_targets:
                limits = np.finfo(target)
                if c_min > limits.min and c_max < limits.max:
                    chosen = target
                    break
            df[col] = df[col].astype(chosen)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [9]:
@contextmanager
def timer(name):
    """Context manager that prints how long the enclosed block took.

    Prints "<name> - done in <N>s" (whole seconds) when the block finishes
    normally. If the block raises, the exception propagates and nothing is
    printed.

    Parameters:
    name (str): label shown in the message.
    """
    # perf_counter is monotonic, so the interval cannot be distorted by
    # system clock adjustments the way time.time() differences can.
    t0 = time.perf_counter()
    yield
    print("{} - done in {:.0f}s".format(name, time.perf_counter() - t0))


In [10]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    """Score predictions by their weekly gini, penalising decline and volatility.

    Weeks whose mean target is exactly 0 or 1 (only one class present) are
    left out. For each remaining week the gini (2*AUC - 1) is computed, a
    straight line is fitted through the weekly values, and the result is
    mean(gini) + w_fallingrate * min(0, slope) + w_resstd * std(residuals).

    Parameters:
    base (pandas.DataFrame): needs the columns WEEK_NUM, target and score.
    w_fallingrate (float): weight applied to a negative slope.
    w_resstd (float): weight applied to the residual standard deviation.

    Returns:
    float: the stability score. When SHOW_REPORT is set, the weekly values
    and the fitted line are plotted as well.
    """
    needed = ["WEEK_NUM", "target", "score"]

    # Weekly means: a mean target of 0 or 1 marks a single-class week.
    weekly_mean = base.loc[:, needed].sort_values("WEEK_NUM").groupby("WEEK_NUM").mean()
    single_class = (weekly_mean["target"] == 0) | (weekly_mean["target"] == 1)
    week_nums_to_drop = weekly_mean[single_class].index.tolist()

    usable = base[~base["WEEK_NUM"].isin(week_nums_to_drop)]

    def _weekly_gini(week_rows):
        return 2*roc_auc_score(week_rows["target"], week_rows["score"])-1

    per_week = usable.loc[:, needed].sort_values("WEEK_NUM").groupby("WEEK_NUM")[["target", "score"]]
    gini_in_time = per_week.apply(_weekly_gini).tolist()

    # Linear trend of the weekly gini values.
    x = np.arange(len(gini_in_time))
    y = gini_in_time
    slope, intercept = np.polyfit(x, y, 1)
    fitted = slope * x + intercept
    res_std = np.std(y - fitted)
    avg_gini = np.nanmean(gini_in_time)

    if SHOW_REPORT:
        plt.figure(figsize=(8, 6))
        plt.plot(x, y, 'o', label='Gini in Time')
        plt.plot(x, fitted, '-', label='Fitted line (slope={:.2f}, intercept={:.2f})'.format(slope, intercept))
        plt.xlabel('Week')
        plt.ylabel('Gini in Time')
        plt.title('Gini Stability Over Time')
        plt.legend()
        plt.grid(True)
        plt.show()

    return avg_gini + w_fallingrate * min(0, slope) + w_resstd * res_std

### Report function

In [11]:
'''
def make_report(num_rows, predictors, model):
    # 1. time
    current_time = datetime.now()
    # Print the current time
    print("Current Time:", current_time)
    
    # 2. specification
    if not num_rows:
        print("The notebook was run in full mode.")
    else:
        print("The notebook was run in debug mode. Number of rows: " + str(num_rows))
    
    # 3. features
    feat_importances_df = model.get_features_importances_df(predictors)
    feat_importances_df['gain'] = feat_importances_df['gain'].round(0)
    print(feat_importances_df.shape)
    
    predictions = pd.Series(model.get_predictions())
   
    numerical_columns = data.select_dtypes(include=['int', 'float']).columns

    # Compute correlations of each numerical column with 'PREDICTIONS'
    correlations = {}
    
    # Compute correlations of each numerical column with 'feat'
    for column in numerical_columns:
        correlations[column] = predictions.corr(data[column])

    # Create a new DataFrame with 'features' and 'correlation' columns
    correlation_df = pd.DataFrame(list(correlations.items()), columns=['features', 'correlation'])

    # Round the correlation numbers to three decimal places
    correlation_df['correlation'] = correlation_df['correlation'].round(3)

    # Merge feat_importances_df and correlation_df on 'feature'
    combined_df = pd.merge(feat_importances_df, correlation_df, left_on="feature", right_on='features', how='left')

    # Handle categorical features with no correlation
    combined_df['correlation'] = combined_df['correlation'].fillna(value=np.nan)
    

    # Compute and add valid percentage for each feature
    valid_percentage = (data[0:-10].count() / len(data[0:-10]))
    valid_percentage = valid_percentage.round(3)
    combined_df['valid_percentage'] = combined_df['feature'].map(valid_percentage)

    # Print the combined_df DataFrame
    print(combined_df.to_string(index=False))
    print()
    roc_score=roc_auc_score(data['target'][0:-10],predictions)
    print("ROC score: ",roc_score)

    # Compute false positive rate, true positive rate, and thresholds for ROC curve
    fpr, tpr, thresholds = roc_curve(data['target'][0:-10], predictions)

    # Plot ROC curve
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (AUC = %0.2f)' % roc_score)
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.show()
'''

'\ndef make_report(num_rows, predictors, model):\n    # 1. time\n    current_time = datetime.now()\n    # Print the current time\n    print("Current Time:", current_time)\n    \n    # 2. specification\n    if not num_rows:\n        print("The notebook was run in full mode.")\n    else:\n        print("The notebook was run in debug mode. Number of rows: " + str(num_rows))\n    \n    # 3. features\n    feat_importances_df = model.get_features_importances_df(predictors)\n    feat_importances_df[\'gain\'] = feat_importances_df[\'gain\'].round(0)\n    print(feat_importances_df.shape)\n    \n    predictions = pd.Series(model.get_predictions())\n   \n    numerical_columns = data.select_dtypes(include=[\'int\', \'float\']).columns\n\n    # Compute correlations of each numerical column with \'PREDICTIONS\'\n    correlations = {}\n    \n    # Compute correlations of each numerical column with \'feat\'\n    for column in numerical_columns:\n        correlations[column] = predictions.corr(data[col

In [12]:
def group(df_to_agg, prefix, aggregations, aggregate_by='case_id', datatype='polars'):
    """Aggregate columns of ``df_to_agg`` within each ``aggregate_by`` group.

    Parameters
    ----------
    df_to_agg : polars.DataFrame or pandas.DataFrame
        Frame to aggregate; its type must match ``datatype``.
    prefix : str
        Prepended to the output column names of the pandas branch.
        The polars branch does not use it.
    aggregations : dict
        Maps a source column name to an iterable of aggregation names taken
        from 'min', 'max', 'mean', 'sum', 'count', 'median', 'last'.
    aggregate_by : str or list of str, default 'case_id'
        Grouping key(s).
    datatype : {'polars', 'pandas'}, default 'polars'
        Selects which dataframe API is used.

    Returns
    -------
    DataFrame
        polars: one row per group, aggregates named ``"<func>_<col>"``.
        pandas: one row per group, aggregates named
        ``"<prefix><col>_<FUNC>"``, with the key restored as a column.

    Raises
    ------
    KeyError
        If an aggregation name is not supported by the chosen branch.
    ValueError
        If ``datatype`` is neither 'polars' nor 'pandas' (previously the
        function silently returned ``None``).
    """
    if datatype == 'polars':
        # Aggregation name -> polars expression factory.
        func_mapping = {
            'min': pl.min,
            'max': pl.max,
            'mean': pl.mean,
            'sum': pl.sum,
            'count': pl.count,
            'median': pl.median,
            'last': pl.last,
        }

        # Keyword names become the output column names: "<func>_<col>".
        return df_to_agg.group_by(aggregate_by).agg(**{
            f"{func}_{col}": func_mapping[func](col)
            for col, funcs in aggregations.items()
            for func in funcs
        })

    if datatype == 'pandas':
        # Aggregation name -> pandas groupby reducer name. Kept in step with
        # the polars branch so both accept the same aggregation vocabulary.
        func_mapping = {
            'min': 'min',
            'max': 'max',
            'mean': 'mean',
            'sum': 'sum',
            'count': 'count',
            'median': 'median',
            'last': 'last',
        }

        # Named aggregation: output name -> (source column, reducer).
        return df_to_agg.groupby(aggregate_by).agg(**{
            f"{prefix}{col}_{func.upper()}": (col, func_mapping[func])
            for col, funcs in aggregations.items()
            for func in funcs
        }).reset_index()

    raise ValueError(
        "Unsupported datatype {!r}; expected 'polars' or 'pandas'.".format(datatype)
    )

# **SELECTKBEST METHOD**

In [13]:
def selectkbestX(data):
    """Rank features by their ANOVA F-score (``f_classif``) against ``target``.

    The labelled rows are one-hot encoded, NaN-filled with 0, split into
    ``N_CHUNKS`` row chunks (by ``index % N_CHUNKS``), and every chunk is scored
    independently to keep peak memory low. The per-chunk scores are then
    averaged per feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``target`` and ``case_id``. Rows with a null ``target``
        are ignored. The index must support ``% N_CHUNKS`` (e.g. integers).

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature``, ``Score_0`` .. ``Score_{N_CHUNKS-1}`` and
        ``mean_score`` (unsorted).

    Side effects
    ------------
    Prints the ranking, sets the global pandas option ``display.max_rows`` to
    ``None`` and writes ``/kaggle/working/SelectKBest.csv``.
    """
    N_CHUNKS = 5

    def preprocessingX(frame):
        """One-hot encode every ``category`` column, then replace NaN with 0."""
        categories = [col for col in frame.columns if frame[col].dtype == 'category']
        encoded = pd.get_dummies(frame, columns=categories, dummy_na=True)
        del frame; gc.collect()

        # Column-by-column fill avoids materialising a second full-size frame.
        for column in encoded.columns:
            encoded[column] = encoded[column].fillna(0)
        return encoded

    def selectkbest_base(X_train, y_train):
        """Return a ``Feature``/``Score`` frame for one chunk."""
        # Only ``scores_`` is consumed, so nothing is actually selected:
        # k='all' yields the same scores as any other k and cannot fail when
        # the frame has fewer columns than a fixed k.
        selector = SelectKBest(score_func=f_classif, k='all')
        selector.fit(X_train, y_train)
        return pd.DataFrame({'Feature': X_train.columns, 'Score': selector.scores_})

    # Remove unlabelled rows BEFORE the NaN fill. The fill also touches
    # ``target``; filtering afterwards would be a no-op and would score every
    # unlabelled (test) row as class 0.
    labelled = data[data['target'].notnull()]
    del data; gc.collect()

    df = preprocessingX(labelled)
    del labelled; gc.collect()

    del_features = ['target', 'case_id']
    predictors = [col for col in df.columns if col not in del_features]

    results = []
    with tqdm(total=N_CHUNKS) as pbar:
        for i in range(N_CHUNKS):
            chunk_mask = df.index % N_CHUNKS == i
            sub_df = df[chunk_mask]
            # Release the rows already copied into ``sub_df``.
            df.drop(df.index[chunk_mask], inplace=True)

            results.append(selectkbest_base(sub_df[predictors], sub_df['target']))

            del sub_df
            gc.collect()
            pbar.update(1)

    del df; gc.collect()

    # Join the per-chunk scores side by side: Score_0, Score_1, ...
    merged_df = results[0].rename(columns={'Score': 'Score_0'})
    for chunk_no in range(1, len(results)):
        chunk_scores = results[chunk_no].rename(columns={'Score': 'Score_' + str(chunk_no)})
        merged_df = pd.merge(merged_df, chunk_scores, on='Feature')

    # True mean over the chunks (the value used to be a plain sum despite its
    # name). skipna=False keeps NaN for features that were unscorable in any
    # chunk, exactly as the former running addition did.
    score_columns = ['Score_' + str(i) for i in range(N_CHUNKS)]
    merged_df['mean_score'] = merged_df[score_columns].sum(axis=1, skipna=False) / N_CHUNKS

    final_df = merged_df[['Feature', 'mean_score']].sort_values(by='mean_score', ascending=False)

    # NOTE: this changes the display option for the whole notebook session.
    pd.set_option('display.max_rows', None)  # Show all rows
    print(final_df)

    final_df.to_csv("/kaggle/working/SelectKBest.csv")

    return merged_df

#  **MODEL** <a id='model'></a>

[CONFIGURATION](#configuration) 

[MAIN FUNCTION](#main_function)

[MODEL](#model)

[EXECUTION](#execution)

In [14]:
class VotingModel(BaseEstimator, RegressorMixin):
    """Ensemble that averages the outputs of already-fitted estimators.

    Parameters
    ----------
    estimators : list
        Fitted estimators. ``predict`` / ``predict_proba`` are forwarded to each
        of them; the importance helpers additionally read
        ``estimator.booster_.feature_importance(...)``.
    """

    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators
        # Optional predictions attached later through ``add_predictions``;
        # initialised so ``get_predictions`` never raises AttributeError.
        self.predictions = None

    def fit(self, X, y=None):
        """No-op: the wrapped estimators are expected to be fitted already."""
        return self

    def predict(self, X):
        """Return the element-wise mean of every estimator's ``predict(X)``."""
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

    def predict_proba(self, X):
        """Return the element-wise mean of every estimator's ``predict_proba(X)``."""
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

    def _aggregate_importances(self, importance_type, aggregation_method):
        """Combine one importance type across estimators with ``aggregation_method``."""
        importances = [
            estimator.booster_.feature_importance(importance_type=importance_type)
            for estimator in self.estimators
        ]
        if any(values is None for values in importances):
            return None
        return aggregation_method(importances, axis=0)

    def get_splits(self, aggregation_method=np.mean):
        """Aggregate 'split' importances; ``aggregation_method`` is called with ``axis=0``."""
        return self._aggregate_importances('split', aggregation_method)

    def get_gains(self, aggregation_method=np.mean):
        """Aggregate 'gain' importances; ``aggregation_method`` is called with ``axis=0``.

        The previous implementation looped with ``model`` but read the undefined
        name ``x``, so every call raised ``NameError``.
        """
        return self._aggregate_importances('gain', aggregation_method)

    def get_features_importances_df(self, predictors):
        """Return the mean 'gain' and 'split' importance per feature.

        Parameters
        ----------
        predictors : sequence of str
            Feature names. They are matched to the importances by position, so
            they must be in the column order the estimators were trained with.

        Returns
        -------
        pandas.DataFrame
            Columns ``feature``, ``gain``, ``split``; one row per feature.
        """
        fold_frames = [
            pd.DataFrame({
                "feature": predictors,
                "gain": model.booster_.feature_importance(importance_type='gain'),
                "split": model.booster_.feature_importance(importance_type='split'),
            })
            for model in self.estimators
        ]
        if not fold_frames:
            return pd.DataFrame(columns=["feature", "gain", "split"])

        # Average once over all folds. Averaging inside the loop (as before)
        # re-averaged the running mean with each new fold and therefore
        # weighted later folds more heavily than earlier ones.
        return pd.concat(fold_frames, axis=0).groupby('feature').mean().reset_index()

    def add_predictions(self, predictions):
        """Attach externally computed predictions to the model."""
        self.predictions = predictions

    def get_predictions(self):
        """Return the attached predictions, or ``None`` if none were added."""
        return self.predictions
        

In [15]:
def kfold_lightgbm_sklearn(data, categorical_feature = None):
    """Train one LightGBM classifier per CV fold and return them as a ``VotingModel``.

    Rows with a null ``target`` are ignored. Folds come from
    ``StratifiedGroupKFold`` (``NUM_FOLDS`` splits, grouped by ``WEEK_NUM``,
    no shuffling). ``STRATIFIED_KFOLD`` is not consulted: the KFold /
    StratifiedKFold splitter it used to select was never used for splitting.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``target``, ``case_id`` and ``WEEK_NUM``; every other
        column is used as a predictor.
    categorical_feature : optional
        Accepted for interface compatibility; currently unused.

    Returns
    -------
    VotingModel
        Wraps the fitted fold classifiers (trained on alphabetically sorted
        predictor columns).

    Notes
    -----
    * ``RANDOM_SEED`` and ``NUM_THREADS`` are now really handed to LightGBM
      (they used to be collected in an unused dict). Keys present in
      ``LIGHTGBM_PARAMS`` take precedence.
    * Out-of-fold AUC and the gini stability score are only reported when
      ``EVALUATE_VALIDATION_SET`` is true; otherwise no out-of-fold predictions
      exist and the metrics would be computed on zeros.
    """
    start_time = time.time()

    # Keep labelled rows only (dropped by index label, into a new frame).
    df = data.drop(index=data.index[data['target'].isnull()])
    del data; gc.collect()

    print("Train/valid shape: {}, ".format(df.shape))

    del_features = ['target', 'case_id', 'WEEK_NUM']
    predictors = [col for col in df.columns if col not in del_features]

    # Build the design matrix once, already in sorted column order, instead of
    # re-copying and in-place sorting a slice of ``df`` inside every fold.
    X = df[predictors].sort_index(axis=1)
    # Copies detach the small series from ``df`` so the frame can be released.
    y = df['target'].copy()
    weeks = df["WEEK_NUM"].copy()
    n_rows = len(df)
    del df; gc.collect()

    print("df train shape ", X.shape)
    print("y shape ", len(y))

    # Column order is identical for every fold, so it is printed only once.
    print()
    for col in X.columns:
        print(col)
    print()

    cv = StratifiedGroupKFold(n_splits=NUM_FOLDS, shuffle=False)

    # Out-of-fold predictions, filled fold by fold when evaluation is enabled.
    oof_preds = np.zeros(n_rows)
    fitted_models = []

    # Defaults that LIGHTGBM_PARAMS may override.
    base_params = {'random_state': RANDOM_SEED, 'nthread': NUM_THREADS}

    with tqdm(total=NUM_FOLDS) as pbar:
        for n_fold, (train_idx, valid_idx) in enumerate(cv.split(X, y, groups=weeks)):
            train_x, train_y = X.iloc[train_idx], y.iloc[train_idx]
            valid_x, valid_y = X.iloc[valid_idx], y.iloc[valid_idx]

            print("X_train shape ", train_x.shape)
            print("y_train shape ", len(train_y))
            print("X_valid shape ", valid_x.shape)
            print("y_valid shape ", len(valid_y))

            clf = lgb.LGBMClassifier(**{**base_params, **LIGHTGBM_PARAMS})
            clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
                    callbacks=[lgb.log_evaluation(200), lgb.early_stopping(EARLY_STOPPING)])
            fitted_models.append(clf)

            elapsed_time = time.time() - start_time
            remaining_time = elapsed_time * (NUM_FOLDS - n_fold - 1) / (n_fold + 1)

            if EVALUATE_VALIDATION_SET:
                oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
                print('Fold %2d AUC : %.6f. Elapsed time: %.2f seconds. Remaining time: %.2f seconds.'
                      % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx]), elapsed_time, remaining_time))
            else:
                print('Fold %2d done. Elapsed time: %.2f seconds. Remaining time: %.2f seconds.'
                      % (n_fold + 1, elapsed_time, remaining_time))

            del clf, train_x, train_y, valid_x, valid_y
            gc.collect()
            pbar.update(1)

    if EVALUATE_VALIDATION_SET:
        print('Full AUC score %.6f' % roc_auc_score(y, oof_preds))

        # NOTE(review): this assumes the labelled rows returned by get_base()
        # are the same rows, in the same order, as the labelled rows of
        # ``data`` - confirm against the caller. A length mismatch raises.
        base = get_base(DATA_DIRECTORY, n_rows)
        base, cat_cols = to_pandas(base)
        base = base[base['target'].notnull()].copy()
        base['score'] = oof_preds
        gini_score = gini_stability(base)
        print("Gini Score of the valid set:", gini_score)

    model = VotingModel(fitted_models)

    del X, y, weeks; gc.collect()
    return model

# **SUBMISSION**

In [16]:
def generate_submission_file(data, model):
    """Score the unlabelled rows of ``data`` and write ``submission.csv``.

    Parameters
    ----------
    data : pandas.DataFrame
        Contains ``target``, ``case_id`` and the predictor columns; rows whose
        ``target`` is null are the ones that get scored.
    model : object
        Must expose ``predict_proba``; column 1 of its output is the score.

    Returns
    -------
    bool
        Always ``True``.
    """
    # Labelled rows are removed by index label; the result is a new frame.
    labelled_index = data[data['target'].notnull()].index
    test = data.drop(index=labelled_index)
    del data;gc.collect()

    del_features = ['target', 'case_id','WEEK_NUM']

    # Keep the ids before the non-predictor columns disappear.
    case_id_series = test['case_id']

    # Strip the non-predictors and order the remaining columns alphabetically.
    non_predictors = [col for col in test.columns if col in del_features]
    test = test.drop(columns=non_predictors).sort_index(axis=1)

    # Echo the final column order.
    print()
    for col in test.columns:
        print(col)
    print()

    scores = model.predict_proba(test)[:, 1]
    y_pred = pd.Series(scores, index=case_id_series)

    # Scores are aligned to the sample submission through the case_id index.
    df_subm = pd.read_csv(ROOT / "sample_submission.csv").set_index("case_id")
    df_subm["score"] = y_pred
    df_subm.to_csv("submission.csv")

    return True



# **EVALUATE FEATURE IMPORTANCES**

In [17]:
def get_features_importances(predictors, model):
    """Export the per-feature mean importances of ``model`` to three CSV files.

    Files written (each name ends with ``SUBMISSION_SUFIX`` + ``.csv``):
    ``feature_importance`` (grouped order), ``feature_importance_gain``
    (descending gain) and ``feature_importance_split`` (descending split).

    Parameters
    ----------
    predictors : sequence of str
        Forwarded to ``model.get_features_importances_df``.
    model : object
        Must provide ``get_features_importances_df(predictors)``.

    Returns
    -------
    bool
        Always ``True``.
    """
    per_feature = (
        model.get_features_importances_df(predictors)
        .groupby('feature')
        .mean()
        .reset_index()
    )
    per_feature.to_csv('feature_importance{}.csv'.format(SUBMISSION_SUFIX), index=False)

    ranked_by_gain = per_feature.sort_values(by='gain', ascending=False)
    ranked_by_gain.to_csv('feature_importance_gain{}.csv'.format(SUBMISSION_SUFIX), index=False)

    # Sorted starting from the gain-ordered frame, as before.
    ranked_by_split = ranked_by_gain.sort_values(by='split', ascending=False)
    ranked_by_split.to_csv('feature_importance_split{}.csv'.format(SUBMISSION_SUFIX), index=False)

    return True

# **FEATURE ENGINEERING FUNCTION**

In [18]:
def feature_engineering(df):
    """Add ratio features, restrict the frame to a feature whitelist and
    optionally drop columns that are almost empty in the test rows.

    Steps
    -----
    1. ``Pipeline.handle_dates`` is applied (defined elsewhere in the notebook).
    2. Ratio columns are added one at a time (``ANNUITY_LENGTH_EMPLOYED_PERCENT``
       depends on the previously added ``CREDIT_TERM``); infinite results are
       replaced by null.
    3. Every column that is neither in ``features575`` nor one of
       ``target`` / ``case_id`` / ``WEEK_NUM`` is dropped.
    4. If ``BALANCE_COLUMNS`` is true and there are more than 10 test rows
       (rows with a null ``target``), columns whose non-null fraction in the
       test rows is below 5% are dropped as well.

    Parameters
    ----------
    df : polars.DataFrame
        Must contain ``target`` and the source columns of the ratios.

    Returns
    -------
    polars.DataFrame

    Notes
    -----
    NOTE(review): none of the engineered ratio columns is listed in
    ``features575``, so step 3 removes all of them again. Confirm whether that
    is intended before relying on these features.
    """
    # Columns failing this non-null fraction in the test rows are dropped
    # when BALANCE_COLUMNS is enabled.
    MIN_VALID_FRACTION_TEST = 0.05

    df = df.pipe(Pipeline.handle_dates)

    columns_to_add = [
        (pl.col("days30_165L") / pl.col("days360_512L")).alias("ratio_queries_30"),
        (pl.col("days90_310L") / pl.col("days360_512L")).alias("ratio_queries_90"),
        (pl.col("days120_123L") / pl.col("days360_512L")).alias("ratio_queries_120"),
        (pl.col("days180_256L") / pl.col("days360_512L")).alias("ratio_queries_180"),

        (pl.col("credamount_770A") / pl.col("max_mainoccupationinc_437A")).alias("CREDIT_INCOME_PERCENT"),
        (pl.col("annuity_780A") / pl.col("max_mainoccupationinc_437A")).alias("ANNUITY_INCOME_PERCENT"),
        (pl.col("credamount_770A") / pl.col("annuity_780A")).alias("CREDIT_ANNUITY_PERCENT"),

        (pl.col("annuity_780A") / pl.col("credamount_770A")).alias("CREDIT_TERM"),
        (pl.col("max_mainoccupationinc_437A") / pl.col("max_childnum_21L")).alias("CHILDREN_CNT_INCOME_PERCENT"),

        # Uses CREDIT_TERM, which only exists because columns are added one by one.
        (pl.col("CREDIT_TERM") / pl.col("max_empl_employedfrom_271D")).alias("ANNUITY_LENGTH_EMPLOYED_PERCENT"),
    ]

    # Sequential on purpose: a later expression may reference an earlier result.
    for column in columns_to_add:
        df = df.with_columns([column])

    new_cols = ["ratio_queries_30", "ratio_queries_90", "ratio_queries_120", "ratio_queries_180",
                "CREDIT_INCOME_PERCENT", "ANNUITY_INCOME_PERCENT", "CREDIT_ANNUITY_PERCENT",
                "CREDIT_TERM", "CHILDREN_CNT_INCOME_PERCENT", "ANNUITY_LENGTH_EMPLOYED_PERCENT"]

    # Division by zero yields +/-inf; store those as missing values instead.
    for column_name in new_cols:
        if column_name in df.columns:
            df = df.with_columns(
                pl.when(pl.col(column_name).is_infinite())
                .then(None)
                .otherwise(pl.col(column_name))
                .alias(column_name)
            )

    features575=["count_num_group1","count_num_group2","count_num_group1_applprev2","count_num_group1_person1",
                 "count_num_group1_person2","count_num_group2_person2", "count_num_group1_debitcard",
                 "count_num_group1_tax_registry_a","count_num_group1_tax_registry_b","count_num_group1_tax_registry_c",
                 "count_num_group1_cb_a_1", "count_num_group1_cb_a_2", "count_num_group2_cb_a_2",
        "month_decision", "weekday_decision", "credamount_770A", "applicationcnt_361L", "applications30d_658L", "applicationscnt_1086L", "applicationscnt_464L", "applicationscnt_867L", "clientscnt_1022L", "clientscnt_100L", "clientscnt_1071L", "clientscnt_1130L", "clientscnt_157L",
                 "clientscnt_257L", "clientscnt_304L", "clientscnt_360L", "clientscnt_493L", "clientscnt_533L", "clientscnt_887L", "clientscnt_946L", "deferredmnthsnum_166L", "disbursedcredamount_1113A", "downpmt_116A", "homephncnt_628L", "isbidproduct_1095L", "mobilephncnt_593L", "numactivecreds_622L", "numactivecredschannel_414L", "numactiverelcontr_750L", "numcontrs3months_479L", "numnotactivated_1143L", "numpmtchanneldd_318L", "numrejects9m_859L", "sellerplacecnt_915L", "max_mainoccupationinc_384A", "max_birth_259D", "max_num_group1_9", "birthdate_574D", "dateofbirth_337D", "days180_256L", "days30_165L", "days360_512L", "firstquarter_103L", "fourthquarter_440L", "secondquarter_766L", "thirdquarter_1082L", "max_debtoutstand_525A", "max_debtoverdue_47A", "max_refreshdate_3813885D", "mean_refreshdate_3813885D", "pmtscount_423L", "pmtssum_45A", "responsedate_1012D", "responsedate_4527233D", "actualdpdtolerance_344P", "amtinstpaidbefduel24m_4187115A", "numinstlswithdpd5_4187116L", "annuitynextmonth_57A", "currdebt_22A", "currdebtcredtyperange_828A", "numinstls_657L", "totalsettled_863A", "mindbddpdlast24m_3658935P", "avgdbddpdlast3m_4187120P", "mindbdtollast24m_4525191P", "avgdpdtolclosure24_3658938P", "avginstallast24m_3658937A", "maxinstallast24m_3658928A", "avgmaxdpdlast9m_3716943P", "avgoutstandbalancel6m_4187114A", "avgpmtlast12m_4525200A", "cntincpaycont9m_3716944L", "cntpmts24_3658933L", "commnoinclast6m_3546845L", "maxdpdfrom6mto36m_3546853P", "datefirstoffer_1144D", "datelastunpaid_3546854D", "daysoverduetolerancedd_3976961L", "numinsttopaygr_769L", "dtlastpmtallstes_4499206D", "eir_270L", "firstclxcampaign_1125D", "firstdatedue_489D", "lastactivateddate_801D", "lastapplicationdate_877D", "mean_creationdate_885D", "max_num_group1", "last_num_group1", "max_num_group2_14", "last_num_group2_14", "lastapprcredamount_781A", "lastapprdate_640D", "lastdelinqdate_224D", "lastrejectcredamount_222A", "lastrejectdate_50D", "maininc_215A", "mastercontrelectronic_519L", 
"mastercontrexist_109L", "maxannuity_159A", "maxdebt4_972A", "maxdpdlast24m_143P", "maxdpdlast3m_392P", "maxdpdtolerance_374P", "maxdbddpdlast1m_3658939P", "maxdbddpdtollast12m_3658940P", "maxdbddpdtollast6m_4187119P", "maxdpdinstldate_3546855D", "maxdpdinstlnum_3546846P", "maxlnamtstart6m_4525199A", "maxoutstandbalancel12m_4187113A", "numinstpaidearly_338L", "numinstpaidearly5d_1087L", "numinstpaidlate1d_3546852L", "numincomingpmts_3546848L", "numinstlsallpaid_934L", "numinstlswithdpd10_728L", "numinstlswithoutdpd_562L", "numinstpaid_4499208L", "numinstpaidearly3d_3546850L", "numinstregularpaidest_4493210L", "numinstpaidearly5dest_4493211L", "sumoutstandtotalest_4493215A", "numinstpaidlastcontr_4325080L", "numinstregularpaid_973L", "pctinstlsallpaidearl3d_427L", "pctinstlsallpaidlate1d_3546856L", "pctinstlsallpaidlat10d_839L", "pctinstlsallpaidlate4d_3546849L", "pctinstlsallpaidlate6d_3546844L", "pmtnum_254L", "posfpd10lastmonth_333P", "posfpd30lastmonth_3976960P", "posfstqpd30lastmonth_3976962P", "price_1097A", "sumoutstandtotal_3546847A", "totaldebt_9A", "mean_actualdpd_943P", "max_annuity_853A", "mean_annuity_853A", "max_credacc_credlmt_575A", "max_credamount_590A", "max_downpmt_134A", "mean_credacc_credlmt_575A", "mean_credamount_590A", "mean_downpmt_134A", "max_currdebt_94A", "mean_currdebt_94A", "max_mainoccupationinc_437A", "mean_mainoccupationinc_437A", "mean_maxdpdtolerance_577P", "max_outstandingdebt_522A", "mean_outstandingdebt_522A", "last_actualdpd_943P", "last_annuity_853A", "last_credacc_credlmt_575A", "last_credamount_590A", "last_downpmt_134A", "last_currdebt_94A", "last_mainoccupationinc_437A", "last_maxdpdtolerance_577P", "last_outstandingdebt_522A", "max_approvaldate_319D", "mean_approvaldate_319D", "max_dateactivated_425D", "mean_dateactivated_425D", "max_dtlastpmt_581D", "mean_dtlastpmt_581D", "max_dtlastpmtallstes_3545839D", "mean_dtlastpmtallstes_3545839D", "max_employedfrom_700D", "max_firstnonzeroinstldate_307D", 
"mean_firstnonzeroinstldate_307D", "last_approvaldate_319D", "last_creationdate_885D", "last_dateactivated_425D", "last_dtlastpmtallstes_3545839D", "last_employedfrom_700D", "last_firstnonzeroinstldate_307D", "max_byoccupationinc_3656910L", "max_childnum_21L", "max_pmtnum_8L", "last_pmtnum_8L", "max_pmtamount_36A", "last_pmtamount_36A", "max_processingdate_168D", "last_processingdate_168D", "max_num_group1_5", "mean_credlmt_230A", "mean_credlmt_935A", "mean_pmts_dpd_1073P", "max_dpdmaxdatemonth_89T", "max_dpdmaxdateyear_596T", "max_pmts_dpd_303P", "mean_dpdmax_757P", "max_dpdmaxdatemonth_442T", "max_dpdmaxdateyear_896T", "mean_pmts_dpd_303P", "mean_instlamount_768A", "mean_monthlyinstlamount_332A", "max_monthlyinstlamount_674A", "mean_monthlyinstlamount_674A", "mean_outstandingamount_354A", "mean_outstandingamount_362A", "mean_overdueamount_31A", "mean_overdueamount_659A", "max_numberofoverdueinstls_725L", "mean_overdueamountmax2_14A", "mean_totaloutstanddebtvalue_39A", "mean_dateofcredend_289D", "mean_dateofcredstart_739D", "max_lastupdate_1112D", "mean_lastupdate_1112D", "max_numberofcontrsvalue_258L", "max_numberofoverdueinstlmax_1039L", "max_overdueamountmaxdatemonth_365T", "max_overdueamountmaxdateyear_2T", "mean_pmts_overdue_1140A", "max_pmts_month_158T", "max_pmts_year_1139T", "mean_overdueamountmax2_398A", "max_dateofcredend_353D", "max_dateofcredstart_181D", "mean_dateofcredend_353D", "max_numberofoverdueinstlmax_1151L", "mean_overdueamountmax_35A", "max_overdueamountmaxdatemonth_284T", "max_overdueamountmaxdateyear_994T", "mean_pmts_overdue_1152A", "max_residualamount_488A", "mean_residualamount_856A", "max_totalamount_6A", "mean_totalamount_6A", "mean_totalamount_996A", "mean_totaldebtoverduevalue_718A", "mean_totaloutstanddebtvalue_668A", "max_numberofcontrsvalue_358L", "max_dateofrealrepmt_138D", "mean_dateofrealrepmt_138D", "max_lastupdate_388D", "mean_lastupdate_388D", "max_numberofoverdueinstlmaxdat_148D", "mean_numberofoverdueinstlmaxdat_641D", 
"mean_overdueamountmax2date_1002D", "max_overdueamountmax2date_1142D", "last_refreshdate_3813885D", "max_nominalrate_281L", "max_nominalrate_498L", "max_numberofinstls_229L", "max_numberofinstls_320L", "max_numberofoutstandinstls_520L", "max_numberofoutstandinstls_59L", "max_numberofoverdueinstls_834L", "max_periodicityofpmts_1102L", "max_periodicityofpmts_837L", "last_num_group1_6", "last_mainoccupationinc_384A", "last_birth_259D", "max_empl_employedfrom_271D", "last_personindex_1023L", "last_persontype_1072L", "max_collater_valueofguarantee_1124L", "max_collater_valueofguarantee_876L", "max_pmts_month_706T", "max_pmts_year_507T", "last_pmts_month_158T", "last_pmts_year_1139T", "last_pmts_month_706T", "last_pmts_year_507T", "max_num_group1_13", "max_num_group2_13", "last_num_group2_13", "max_num_group1_15", "max_num_group2_15", "description_5085714M", "education_1103M", "education_88M", "maritalst_385M", "maritalst_893M", "requesttype_4525192L", "credtype_322L", "disbursementtype_67L", "inittransactioncode_186L", "lastapprcommoditycat_1041M", "lastcancelreason_561M", "lastrejectcommoditycat_161M", "lastrejectcommodtypec_5251769M", "lastrejectreason_759M", "lastrejectreasonclient_4145040M", "lastst_736L", "opencred_647L", "paytype1st_925L", "paytype_783L", "twobodfilling_608L", "max_cancelreason_3545846M", "max_education_1138M", "max_postype_4733339M", "max_rejectreason_755M", "max_rejectreasonclient_4145042M", "last_cancelreason_3545846M", "last_education_1138M", "last_postype_4733339M", "last_rejectreason_755M", "last_rejectreasonclient_4145042M", "max_credtype_587L", "max_familystate_726L", "max_inittransactioncode_279L", "max_isbidproduct_390L", "max_status_219L", "last_credtype_587L", "last_familystate_726L", "last_inittransactioncode_279L", "last_isbidproduct_390L", "last_status_219L", "max_classificationofcontr_13M", "max_classificationofcontr_400M", "max_contractst_545M", "max_contractst_964M", "max_description_351M", "max_financialinstitution_382M", 
"max_financialinstitution_591M", "max_purposeofcred_426M", "max_purposeofcred_874M", "max_subjectrole_182M", "max_subjectrole_93M", "last_classificationofcontr_13M", "last_classificationofcontr_400M", "last_contractst_545M", "last_contractst_964M", "last_description_351M", "last_financialinstitution_382M", "last_financialinstitution_591M", "last_purposeofcred_426M", "last_purposeofcred_874M", "last_subjectrole_182M", "last_subjectrole_93M", "max_education_927M", "max_empladdr_district_926M", "max_empladdr_zipcode_114M", "max_language1_981M", "last_education_927M", "last_empladdr_district_926M", "last_empladdr_zipcode_114M", "last_language1_981M", "max_contaddr_matchlist_1032L", "max_contaddr_smempladdr_334L", "max_empl_employedtotal_800L", "max_empl_industry_691L", "max_familystate_447L", "max_incometype_1044T", "max_relationshiptoclient_415T", "max_relationshiptoclient_642T", "max_remitter_829L", "max_role_1084L", "max_safeguarantyflag_411L", "max_sex_738L", "max_type_25L", "last_contaddr_matchlist_1032L", "last_contaddr_smempladdr_334L", "last_incometype_1044T", "last_relationshiptoclient_642T", "last_role_1084L", "last_safeguarantyflag_411L", "last_sex_738L", "last_type_25L", "max_collater_typofvalofguarant_298M", "max_collater_typofvalofguarant_407M", "max_collaterals_typeofguarante_359M", "max_collaterals_typeofguarante_669M", "max_subjectroles_name_541M", "max_subjectroles_name_838M", "last_collater_typofvalofguarant_298M", "last_collater_typofvalofguarant_407M", "last_collaterals_typeofguarante_359M", "last_collaterals_typeofguarante_669M", "last_subjectroles_name_541M", "last_subjectroles_name_838M", "max_cacccardblochreas_147M", "last_cacccardblochreas_147M", "max_conts_type_509L", "last_conts_type_509L", "max_conts_role_79M", 
                 "max_empls_economicalst_849M", "max_empls_employer_name_740M", "last_conts_role_79M", "last_empls_economicalst_849M", "last_empls_employer_name_740M"]

    # Report names that occur more than once in the whitelist.
    seen = set()
    duplicates = []
    for item in features575:
        if item in seen:
            duplicates.append(item)
        else:
            seen.add(item)
    print("duplicates:")
    print(duplicates)
    print()

    print("Length of features575: ", len(features575))

    # Report whitelisted names that the frame does not provide.
    missing_columns = [col for col in features575 if col not in df.columns]
    if missing_columns:
        print("The following columns are missing in the DataFrame:")
        for col in missing_columns:
            print(col)
    else:
        print("All columns from features575 are present in the DataFrame.")

    # Columns to preserve regardless of the whitelist.
    preserved_columns = ['target', 'case_id', 'WEEK_NUM']

    # Drop everything that is neither whitelisted nor preserved. The list is
    # passed positionally, as elsewhere in this notebook.
    columns_to_drop = [col for col in df.columns if col not in preserved_columns + features575]
    df = df.drop(columns_to_drop)

    test = df.filter(df['target'].is_null())
    if BALANCE_COLUMNS and len(test) > 10:
        train = df.filter(df['target'].is_not_null())

        # Fraction of non-null values per column. The previous code stored raw
        # non-null COUNTS here and compared them with 0.05, which could only
        # ever match columns that were completely empty in the test rows.
        n_train = max(len(train), 1)
        n_test = len(test)
        valid_percentage_train = pd.Series(
            [1 - train[col].null_count() / n_train for col in df.columns])
        valid_percentage_test = pd.Series(
            [1 - test[col].null_count() / n_test for col in df.columns])

        print(train.count())
        print(test.count())
        print("length of train",len(valid_percentage_train))
        print("length of test",len(valid_percentage_test))
        print("df columns",len(df.columns))

        info_df = pd.DataFrame({'column': df.columns,
                                'valid_train': valid_percentage_train,
                                'valid_test': valid_percentage_test})
        irrelevant_columns = info_df[info_df['valid_test'] < MIN_VALID_FRACTION_TEST]['column'].to_list()
        # ``target`` is entirely null in the test rows, hence the explicit
        # protection of the preserved columns.
        columns_to_drop = [col for col in df.columns
                           if (col not in preserved_columns) and (col in irrelevant_columns)]
        df = df.drop(columns_to_drop)

    return df

In [19]:
# Disabled scratch cell: the code below sits inside a bare string literal, so
# none of it is executed; only the string itself is evaluated.
# NOTE(review): the snippet prints df['credit_income_percent'], while
# feature_engineering() aliases that ratio as "CREDIT_INCOME_PERCENT" - adjust
# the name before re-enabling this cell.
'''
df = get_base(DATA_DIRECTORY)
df_applprev1 = get_applprev1(DATA_DIRECTORY)
df_applprev1 = df_applprev1.filter(pl.col('case_id').is_in(df['case_id'].unique()))
df = df.join(df_applprev1, on='case_id', how='left', suffix='_applprev1')
print("Previous applications depth 1 test dataframe shape:", df_applprev1.shape)
print("DATAFRAME shape:", df.shape)
del df_applprev1
gc.collect()


df=feature_engineering(df)
print(df['credit_income_percent'])
'''

'\ndf = get_base(DATA_DIRECTORY)\ndf_applprev1 = get_applprev1(DATA_DIRECTORY)\ndf_applprev1 = df_applprev1.filter(pl.col(\'case_id\').is_in(df[\'case_id\'].unique()))\ndf = df.join(df_applprev1, on=\'case_id\', how=\'left\', suffix=\'_applprev1\')\nprint("Previous applications depth 1 test dataframe shape:", df_applprev1.shape)\nprint("DATAFRAME shape:", df.shape)\ndel df_applprev1\ngc.collect()\n\n\ndf=feature_engineering(df)\nprint(df[\'credit_income_percent\'])\n'

# **GET FUNCTIONS**

### get_base()

In [20]:
def get_base(path, num_rows = None):
    """Load the train and test base tables and stack them into one frame.

    The test table has no ``target`` column, so an all-null ``target`` is
    appended to it before stacking. Training rows come first in the result.

    Args:
        path: Directory containing the ``train/`` and ``test/`` parquet folders.
        num_rows: If given, keep only the first ``num_rows`` training rows.
            The test table is always read in full.

    Returns:
        pl.DataFrame: the stacked table with ``date_decision`` cast to
        ``pl.Date``.
    """
    train = pl.read_parquet(os.path.join(path, 'train/train_base.parquet'))
    if num_rows is not None:
        train = train.limit(num_rows)

    test = pl.read_parquet(os.path.join(path, 'test/test_base.parquet'))
    # Type the placeholder like the training target so the vertical concat
    # below never has to reconcile a Null-typed column with a numeric one.
    missing_target = pl.Series("target", [None] * len(test), dtype=train.schema["target"])
    test = test.select(pl.col("*"), missing_target)

    df = pl.concat([train, test])
    del test; del train; gc.collect()

    return df.with_columns(pl.col('date_decision').cast(pl.Date))

### get_static()

In [21]:
def get_static(path, num_rows = None):
    """Load every part of the ``static_0`` tables for train and test and stack them.

    Each part is typed with ``Pipeline.set_table_dtypes``. The stacked training
    table is then passed through ``Pipeline.filter_cols``; the test table is
    reduced to the columns that remain in train.

    Args:
        path: Directory containing the ``train/`` and ``test/`` parquet folders.
            (Previously this argument was ignored in favour of the global
            ``DATA_DIRECTORY``; callers that pass ``DATA_DIRECTORY`` see no change.)
        num_rows: If given, keep only the first ``num_rows`` rows of the stacked
            training table. The test table is always read in full.

    Returns:
        pl.DataFrame: training rows followed by test rows.
    """
    # glob returns files in arbitrary order; sorting keeps the row order (and
    # therefore the rows selected by num_rows) reproducible between runs.
    train_files = sorted(glob(os.path.join(path, 'train/train_static_0_*.parquet')))
    train_parts = [
        pl.read_parquet(part_file, low_memory=True).pipe(Pipeline.set_table_dtypes)
        for part_file in train_files
    ]
    train = pl.concat(train_parts, how="vertical_relaxed").pipe(Pipeline.filter_cols)
    del train_parts

    if num_rows is not None:
        train = train.slice(0, num_rows)
        gc.collect()

    test_files = sorted(glob(os.path.join(path, 'test/test_static_0_*.parquet')))
    test_parts = [
        pl.read_parquet(part_file, low_memory=True).pipe(Pipeline.set_table_dtypes)
        for part_file in test_files
    ]
    test = pl.concat(test_parts, how="vertical_relaxed")
    del test_parts

    # Drop columns from test that are not present in train.
    columns_to_keep = set(train.columns)
    columns_to_remove = [column for column in test.columns if column not in columns_to_keep]
    test = test.drop(columns_to_remove)

    df = pl.concat([train, test])
    del test; del train; gc.collect()
    return df

### get_static_cb()

In [22]:
def get_static_cb(path, num_rows = None):
    """Load the ``static_cb_0`` train and test tables and stack them.

    Both tables are typed with ``Pipeline.set_table_dtypes``. The training table
    is additionally passed through ``Pipeline.filter_cols``, and the test table
    is reduced to the columns that remain in train.

    Args:
        path: Directory containing the ``train/`` and ``test/`` parquet folders.
        num_rows: If given, keep only the first ``num_rows`` training rows
            (applied before dtype conversion). The test table is read in full.

    Returns:
        pl.DataFrame: training rows followed by test rows.
    """
    train = pl.read_parquet(os.path.join(path, 'train/train_static_cb_0.parquet'),low_memory=True)
    if num_rows is not None:
        train = train.limit(num_rows)
    train = train.pipe(Pipeline.set_table_dtypes)

    test = pl.read_parquet(os.path.join(path, 'test/test_static_cb_0.parquet'),low_memory=True).pipe(Pipeline.set_table_dtypes)

    train = train.pipe(Pipeline.filter_cols)

    # Drop columns from test that are not present in train.
    columns_to_keep = train.columns
    columns_to_remove = [column for column in test.columns if column not in columns_to_keep]
    test = test.drop(columns_to_remove)

    df = pl.concat([train, test])
    del test; del train; gc.collect()
    return df

### get_applprev1(DATA_DIRECTORY, num_rows=num_rows)

In [23]:
def get_applprev1(path, num_rows = None):
    """Load every part of the ``applprev_1`` tables, stack them and aggregate.

    Each part is typed with ``Pipeline.set_table_dtypes``. Unlike ``get_static``,
    ``Pipeline.filter_cols`` is not applied here. The stacked train + test table
    is handed to ``group`` with ``APPLPREV1_AGG``.

    Args:
        path: Directory containing the ``train/`` and ``test/`` parquet folders.
            (Previously this argument was ignored in favour of the global
            ``DATA_DIRECTORY``; callers that pass ``DATA_DIRECTORY`` see no change.)
        num_rows: If given, keep only the first ``num_rows`` rows of the stacked
            training table before aggregating. Note that this counts table
            rows, not distinct ``case_id`` values.

    Returns:
        The result of ``group(df, '', APPLPREV1_AGG)``.
    """
    # glob returns files in arbitrary order; sorting keeps the row order (and
    # therefore the rows selected by num_rows) reproducible between runs.
    train_files = sorted(glob(os.path.join(path, 'train/train_applprev_1_*.parquet')))
    train_parts = [
        pl.read_parquet(part_file, low_memory=True).pipe(Pipeline.set_table_dtypes)
        for part_file in train_files
    ]
    train = pl.concat(train_parts, how="vertical_relaxed")
    del train_parts

    if num_rows is not None:
        train = train.slice(0, num_rows)
        gc.collect()

    test_files = sorted(glob(os.path.join(path, 'test/test_applprev_1_*.parquet')))
    test_parts = [
        pl.read_parquet(part_file, low_memory=True).pipe(Pipeline.set_table_dtypes)
        for part_file in test_files
    ]
    test = pl.concat(test_parts, how="vertical_relaxed")
    del test_parts

    # Drop columns from test that are not present in train.
    columns_to_keep = set(train.columns)
    columns_to_remove = [column for column in test.columns if column not in columns_to_keep]
    test = test.drop(columns_to_remove)

    df = pl.concat([train, test])
    del test; del train; gc.collect()
    agg_df = group(df, '', APPLPREV1_AGG)
    del df; gc.collect()
    return agg_df

### get_applprev2(DATA_DIRECTORY, num_rows=num_rows)

In [24]:
def get_applprev2(path, num_rows = None):
    """Build aggregated features from the ``applprev_2`` train and test tables.

    Both parquet files are read in full, typed with
    ``Pipeline.set_table_dtypes``, stacked (train first) and passed to ``group``
    with ``APPLPREV2_AGG``.

    Args:
        path: Directory containing the ``train/`` and ``test/`` parquet folders.
        num_rows: Has no effect; the training file is read in full whether or
            not it is set.

    Returns:
        The result of ``group`` for the stacked table.
    """
    train_file = os.path.join(path, 'train/train_applprev_2.parquet')
    test_file = os.path.join(path, 'test/test_applprev_2.parquet')

    train = pl.read_parquet(train_file, low_memory=True).pipe(Pipeline.set_table_dtypes)
    test = pl.read_parquet(test_file, low_memory=True).pipe(Pipeline.set_table_dtypes)

    # Keep only the test columns that also exist in train.
    train_columns = train.columns
    test_only = [name for name in test.columns if name not in train_columns]
    test = test.drop(test_only)

    stacked = pl.concat([train, test])
    del train
    del test
    gc.collect()

    aggregated = group(stacked, '', APPLPREV2_AGG)
    del stacked
    gc.collect()
    return aggregated

### get_person1

In [25]:
def get_person1(path, num_rows = None):
    """Build aggregated features from the ``person_1`` train and test tables.

    Both parquet files are read in full, typed with
    ``Pipeline.set_table_dtypes``, stacked (train first) and passed to ``group``
    with ``PERSON1_AGG``.

    Args:
        path: Directory containing the ``train/`` and ``test/`` parquet folders.
        num_rows: Has no effect; the training file is read in full whether or
            not it is set.

    Returns:
        The result of ``group`` for the stacked table.
    """
    train_file = os.path.join(path, 'train/train_person_1.parquet')
    test_file = os.path.join(path, 'test/test_person_1.parquet')

    train = pl.read_parquet(train_file, low_memory=True).pipe(Pipeline.set_table_dtypes)
    test = pl.read_parquet(test_file, low_memory=True).pipe(Pipeline.set_table_dtypes)

    # Keep only the test columns that also exist in train.
    train_columns = train.columns
    test_only = [name for name in test.columns if name not in train_columns]
    test = test.drop(test_only)

    stacked = pl.concat([train, test])
    del train
    del test
    gc.collect()

    aggregated = group(stacked, '', PERSON1_AGG)
    del stacked
    gc.collect()
    return aggregated

### get_person2

In [26]:
def get_person2(path, num_rows = None):
    """Build aggregated features from the ``person_2`` train and test tables.

    Both parquet files are read in full, typed with
    ``Pipeline.set_table_dtypes``, stacked (train first) and passed to ``group``
    with ``PERSON2_AGG``.

    Args:
        path: Directory containing the ``train/`` and ``test/`` parquet folders.
        num_rows: Has no effect; the training file is read in full whether or
            not it is set.

    Returns:
        The result of ``group`` for the stacked table.
    """
    train_file = os.path.join(path, 'train/train_person_2.parquet')
    test_file = os.path.join(path, 'test/test_person_2.parquet')

    train = pl.read_parquet(train_file, low_memory=True).pipe(Pipeline.set_table_dtypes)
    test = pl.read_parquet(test_file, low_memory=True).pipe(Pipeline.set_table_dtypes)

    # Keep only the test columns that also exist in train.
    train_columns = train.columns
    test_only = [name for name in test.columns if name not in train_columns]
    test = test.drop(test_only)

    stacked = pl.concat([train, test])
    del train
    del test
    gc.collect()

    aggregated = group(stacked, '', PERSON2_AGG)
    del stacked
    gc.collect()
    return aggregated

### get_other

In [27]:
def get_other(path, num_rows = None):
    """Build aggregated features from the ``other_1`` train and test tables.

    Both parquet files are read in full, typed with
    ``Pipeline.set_table_dtypes``, stacked (train first) and passed to ``group``
    with ``OTHER_AGG``.

    Args:
        path: Directory containing the ``train/`` and ``test/`` parquet folders.
        num_rows: Has no effect; the training file is read in full whether or
            not it is set.

    Returns:
        The result of ``group`` for the stacked table.
    """
    train_file = os.path.join(path, 'train/train_other_1.parquet')
    test_file = os.path.join(path, 'test/test_other_1.parquet')

    train = pl.read_parquet(train_file, low_memory=True).pipe(Pipeline.set_table_dtypes)
    test = pl.read_parquet(test_file, low_memory=True).pipe(Pipeline.set_table_dtypes)

    # Keep only the test columns that also exist in train.
    train_columns = train.columns
    test_only = [name for name in test.columns if name not in train_columns]
    test = test.drop(test_only)

    stacked = pl.concat([train, test])
    del train
    del test
    gc.collect()

    aggregated = group(stacked, '', OTHER_AGG)
    del stacked
    gc.collect()
    return aggregated

### get_debitcard

In [28]:
def get_debitcard(path, num_rows = None):
    """Build aggregated features from the ``debitcard_1`` train and test tables.

    Both parquet files are read in full, typed with
    ``Pipeline.set_table_dtypes``, stacked (train first) and passed to ``group``
    with ``DEBITCARD_AGG``.

    Args:
        path: Directory containing the ``train/`` and ``test/`` parquet folders.
        num_rows: Has no effect; the training file is read in full whether or
            not it is set.

    Returns:
        The result of ``group`` for the stacked table.
    """
    train_file = os.path.join(path, 'train/train_debitcard_1.parquet')
    test_file = os.path.join(path, 'test/test_debitcard_1.parquet')

    train = pl.read_parquet(train_file, low_memory=True).pipe(Pipeline.set_table_dtypes)
    test = pl.read_parquet(test_file, low_memory=True).pipe(Pipeline.set_table_dtypes)

    # Keep only the test columns that also exist in train.
    train_columns = train.columns
    test_only = [name for name in test.columns if name not in train_columns]
    test = test.drop(test_only)

    stacked = pl.concat([train, test])
    del train
    del test
    gc.collect()

    aggregated = group(stacked, '', DEBITCARD_AGG)
    del stacked
    gc.collect()
    return aggregated

### get_tax_registry_a

In [29]:
def get_tax_registry_a(path, num_rows = None):
    """Build aggregated features from the ``tax_registry_a_1`` train and test tables.

    Both parquet files are read in full, typed with
    ``Pipeline.set_table_dtypes``, stacked (train first) and passed to ``group``
    with ``TAX_REGISTRY_A_AGG``.

    Args:
        path: Directory containing the ``train/`` and ``test/`` parquet folders.
        num_rows: Has no effect; the training file is read in full whether or
            not it is set.

    Returns:
        The result of ``group`` for the stacked table.
    """
    train_file = os.path.join(path, 'train/train_tax_registry_a_1.parquet')
    test_file = os.path.join(path, 'test/test_tax_registry_a_1.parquet')

    train = pl.read_parquet(train_file, low_memory=True).pipe(Pipeline.set_table_dtypes)
    test = pl.read_parquet(test_file, low_memory=True).pipe(Pipeline.set_table_dtypes)

    # Keep only the test columns that also exist in train.
    train_columns = train.columns
    test_only = [name for name in test.columns if name not in train_columns]
    test = test.drop(test_only)

    stacked = pl.concat([train, test])
    del train
    del test
    gc.collect()

    aggregated = group(stacked, '', TAX_REGISTRY_A_AGG)
    del stacked
    gc.collect()
    return aggregated

### get_tax_registry_b

In [30]:
def get_tax_registry_b(path, num_rows = None):
    """Build aggregated features from the ``tax_registry_b_1`` train and test tables.

    Both parquet files are read in full, typed with
    ``Pipeline.set_table_dtypes``, stacked (train first) and passed to ``group``
    with ``TAX_REGISTRY_B_AGG``.

    Args:
        path: Directory containing the ``train/`` and ``test/`` parquet folders.
        num_rows: Has no effect; the training file is read in full whether or
            not it is set.

    Returns:
        The result of ``group`` for the stacked table.
    """
    train_file = os.path.join(path, 'train/train_tax_registry_b_1.parquet')
    test_file = os.path.join(path, 'test/test_tax_registry_b_1.parquet')

    train = pl.read_parquet(train_file, low_memory=True).pipe(Pipeline.set_table_dtypes)
    test = pl.read_parquet(test_file, low_memory=True).pipe(Pipeline.set_table_dtypes)

    # Keep only the test columns that also exist in train.
    train_columns = train.columns
    test_only = [name for name in test.columns if name not in train_columns]
    test = test.drop(test_only)

    stacked = pl.concat([train, test])
    del train
    del test
    gc.collect()

    aggregated = group(stacked, '', TAX_REGISTRY_B_AGG)
    del stacked
    gc.collect()
    return aggregated

### get_tax_registry_c

In [31]:
def get_tax_registry_c(path, num_rows = None):
    """Build aggregated features from the ``tax_registry_c_1`` train and test tables.

    Both parquet files are read in full, typed with
    ``Pipeline.set_table_dtypes``, stacked (train first) and passed to ``group``
    with ``TAX_REGISTRY_C_AGG``.

    Args:
        path: Directory containing the ``train/`` and ``test/`` parquet folders.
        num_rows: Has no effect; the training file is read in full whether or
            not it is set.

    Returns:
        The result of ``group`` for the stacked table.
    """
    train_file = os.path.join(path, 'train/train_tax_registry_c_1.parquet')
    test_file = os.path.join(path, 'test/test_tax_registry_c_1.parquet')

    train = pl.read_parquet(train_file, low_memory=True).pipe(Pipeline.set_table_dtypes)
    test = pl.read_parquet(test_file, low_memory=True).pipe(Pipeline.set_table_dtypes)

    # Keep only the test columns that also exist in train.
    train_columns = train.columns
    test_only = [name for name in test.columns if name not in train_columns]
    test = test.drop(test_only)

    stacked = pl.concat([train, test])
    del train
    del test
    gc.collect()

    aggregated = group(stacked, '', TAX_REGISTRY_C_AGG)
    del stacked
    gc.collect()
    return aggregated

### get_credit_bureau_a_1

In [32]:
def get_credit_bureau_a_1(path, num_rows = None):
    """Aggregate every part of the ``credit_bureau_a_1`` tables for train and test.

    Each parquet part is read, typed with ``Pipeline.set_table_dtypes`` and
    aggregated on its own with ``group(..., CREDIT_BUREAU_A_1_AGG,
    datatype='polars')``; the per-part results are then stacked, train parts
    first. Because aggregation happens per file, the printed count of distinct
    ``case_id`` values can be compared with the row count to spot ids that
    appear in more than one part.

    Fixes over the previous version:
      * all test parts are now stacked (the loop used to assign to a
        misspelled variable, so only the first test part was kept);
      * the result is actually rechunked (``DataFrame.rechunk()`` returns a new
        frame and its result used to be discarded);
      * ``path`` is honoured instead of the global ``DATA_DIRECTORY``.

    Args:
        path: Directory containing the ``train/`` and ``test/`` parquet folders.
        num_rows: Has no effect; every part is always read in full.

    Returns:
        pl.DataFrame: the stacked per-part aggregates.

    Raises:
        FileNotFoundError: if no train or no test part matches the file pattern.
    """
    def _aggregate_parts(pattern):
        # Aggregate each file matching `pattern` separately to bound peak memory.
        part_files = sorted(glob(os.path.join(path, pattern)))  # glob order is arbitrary
        if not part_files:
            raise FileNotFoundError(f"No parquet files match {os.path.join(path, pattern)}")
        aggregated_parts = []
        for part_file in part_files:
            file_df = pl.read_parquet(part_file, low_memory=True).pipe(Pipeline.set_table_dtypes)
            aggregated_parts.append(group(file_df, '', CREDIT_BUREAU_A_1_AGG, datatype='polars'))
            del file_df; gc.collect()
        return aggregated_parts

    train_parts = _aggregate_parts('train/train_credit_bureau_a_1_*.parquet')
    test_parts = _aggregate_parts('test/test_credit_bureau_a_1_*.parquet')

    # One vertical concat over all parts; rechunk=True yields contiguous memory.
    agg_df = pl.concat(train_parts + test_parts, how="vertical", rechunk=True)
    del train_parts, test_parts; gc.collect()

    print("agg df ", agg_df.shape)

    unique_count = agg_df['case_id'].n_unique()

    print("Number of unique values in 'case_id' column:", unique_count)
    return agg_df

### get_credit_bureau_b_1

In [33]:
def get_credit_bureau_b_1(path, num_rows = None):
    """Build aggregated features from the ``credit_bureau_b_1`` train and test tables.

    Both parquet files are read in full, typed with
    ``Pipeline.set_table_dtypes``, stacked (train first) and passed to ``group``
    with ``CREDIT_BUREAU_B_1_AGG``.

    Args:
        path: Directory containing the ``train/`` and ``test/`` parquet folders.
        num_rows: Has no effect; the training file is read in full whether or
            not it is set.

    Returns:
        The result of ``group`` for the stacked table.
    """
    train_file = os.path.join(path, 'train/train_credit_bureau_b_1.parquet')
    test_file = os.path.join(path, 'test/test_credit_bureau_b_1.parquet')

    train = pl.read_parquet(train_file).pipe(Pipeline.set_table_dtypes)
    test = pl.read_parquet(test_file).pipe(Pipeline.set_table_dtypes)

    # Keep only the test columns that also exist in train.
    train_columns = train.columns
    test_only = [name for name in test.columns if name not in train_columns]
    test = test.drop(test_only)

    stacked = pl.concat([train, test])
    del train
    del test
    gc.collect()

    aggregated = group(stacked, '', CREDIT_BUREAU_B_1_AGG)
    del stacked
    gc.collect()
    return aggregated

### get_credit_bureau_a_2

In [34]:
def get_credit_bureau_a_2(path, num_rows = None):
    """Aggregate every part of the ``credit_bureau_a_2`` tables for train and test.

    Each parquet part is read, typed with ``Pipeline.set_table_dtypes`` and
    aggregated on its own with ``group(..., CREDIT_BUREAU_A_2_AGG,
    datatype='polars')``; the per-part results are then stacked, train parts
    first. Because aggregation happens per file, the printed count of distinct
    ``case_id`` values can be compared with the row count to spot ids that
    appear in more than one part.

    Fixes over the previous version:
      * the result is actually rechunked (``DataFrame.rechunk()`` returns a new
        frame and its result used to be discarded);
      * ``path`` is honoured instead of the global ``DATA_DIRECTORY``;
      * parts are processed in sorted order so the row order is reproducible.

    Args:
        path: Directory containing the ``train/`` and ``test/`` parquet folders.
        num_rows: Has no effect; every part is always read in full.

    Returns:
        pl.DataFrame: the stacked per-part aggregates.

    Raises:
        FileNotFoundError: if no train or no test part matches the file pattern.
    """
    def _aggregate_parts(pattern):
        # Aggregate each file matching `pattern` separately to bound peak memory.
        part_files = sorted(glob(os.path.join(path, pattern)))  # glob order is arbitrary
        if not part_files:
            raise FileNotFoundError(f"No parquet files match {os.path.join(path, pattern)}")
        aggregated_parts = []
        for part_file in part_files:
            file_df = pl.read_parquet(part_file, low_memory=True).pipe(Pipeline.set_table_dtypes)
            aggregated_parts.append(group(file_df, '', CREDIT_BUREAU_A_2_AGG, datatype='polars'))
            del file_df; gc.collect()
        return aggregated_parts

    train_parts = _aggregate_parts('train/train_credit_bureau_a_2_*.parquet')
    test_parts = _aggregate_parts('test/test_credit_bureau_a_2_*.parquet')

    # One vertical concat over all parts; rechunk=True yields contiguous memory.
    agg_df = pl.concat(train_parts + test_parts, how="vertical", rechunk=True)
    del train_parts, test_parts; gc.collect()

    print("agg df ", agg_df.shape)

    unique_count = agg_df['case_id'].n_unique()

    print("Number of unique values in 'case_id' column:", unique_count)
    return agg_df

### get_credit_bureau_b_2

In [35]:
def get_credit_bureau_b_2(path, num_rows = None):
    """Build aggregated features from the ``credit_bureau_b_2`` train and test tables.

    Both parquet files are read in full, typed with
    ``Pipeline.set_table_dtypes``, stacked (train first) and passed to ``group``
    with ``CREDIT_BUREAU_B_2_AGG``.

    Args:
        path: Directory containing the ``train/`` and ``test/`` parquet folders.
        num_rows: Has no effect; the training file is read in full whether or
            not it is set.

    Returns:
        The result of ``group`` for the stacked table.
    """
    train_file = os.path.join(path, 'train/train_credit_bureau_b_2.parquet')
    test_file = os.path.join(path, 'test/test_credit_bureau_b_2.parquet')

    train = pl.read_parquet(train_file).pipe(Pipeline.set_table_dtypes)
    test = pl.read_parquet(test_file).pipe(Pipeline.set_table_dtypes)

    # Keep only the test columns that also exist in train.
    train_columns = train.columns
    test_only = [name for name in test.columns if name not in train_columns]
    test = test.drop(test_only)

    stacked = pl.concat([train, test])
    del train
    del test
    gc.collect()

    aggregated = group(stacked, '', CREDIT_BUREAU_B_2_AGG)
    del stacked
    gc.collect()
    return aggregated

# **EXECUTION** <a id='execution'></a>

[CONFIGURATION](#configuration) 

[MAIN FUNCTION](#main_function)

[MODEL](#model)

[EXECUTION](#execution)

In [36]:

if __name__ == "__main__":
    # Limit how many rows/columns pandas renders in cell output.
    for option_name, option_value in (('display.max_rows', 60), ('display.max_columns', 100)):
        pd.set_option(option_name, option_value)
    # Run the whole pipeline with debug mode off, wrapped in the timer context.
    with timer("Pipeline total time"):
        main(debug= False)

Notebook started:
base dataframe shape: (1526669, 5)
base - done in 1s
static dataframe shape: (1526669, 156)
DATAFRAME shape: (1526669, 160)
static - done in 15s
static cb dataframe shape: (1500486, 31)
DATAFRAME shape: (1526669, 190)
static_cb - done in 5s
Previous applications depth 1 test dataframe shape: (1221524, 71)
DATAFRAME shape: (1526669, 260)
Previous applications depth 1 test - done in 30s
Previous applications depth 2 test dataframe shape: (1221523, 8)
DATAFRAME shape: (1526669, 267)
Previous applications depth 2 test - done in 6s
Person depth 1 test dataframe shape: (1526665, 69)
DATAFRAME shape: (1526669, 335)
Person depth 1 test - done in 13s
Person depth 2 test dataframe shape: (1435108, 9)
DATAFRAME shape: (1526669, 343)
Person depth 2 test - done in 3s
Other test dataframe shape: (51111, 12)
DATAFRAME shape: (1526669, 354)
Other test - done in 1s
Debit card test dataframe shape: (111772, 5)
DATAFRAME shape: (1526669, 358)
Debit card test - done in 1s
Tax registry a 

  .keep_name()


duplicates:
[]

Length of features575:  399
The following columns are missing in the DataFrame:
max_num_group1_9
max_num_group1
last_num_group1
max_num_group2_14
last_num_group2_14
max_num_group1_5
last_num_group1_6
max_num_group1_13
max_num_group2_13
last_num_group2_13
max_num_group1_15
max_num_group2_15


  df=df.drop(columns=columns_to_drop)


DataFrame Shape: (1526669, 390)
------------------------------------------------------------
Column Name                                        Data Type                      NaN Percentage      
------------------------------------------------------------
case_id                                            Int64                          0.00%
WEEK_NUM                                           Int64                          0.00%
target                                             Int64                          0.00%
actualdpdtolerance_344P                            Float64                        27.39%
amtinstpaidbefduel24m_4187115A                     Float64                        36.75%
annuitynextmonth_57A                               Float64                        0.00%
applicationcnt_361L                                Float64                        0.00%
applications30d_658L                               Float64                        0.00%
applicationscnt_1086L                

  0%|          | 0/5 [00:00<?, ?it/s]


actualdpdtolerance_344P
amtinstpaidbefduel24m_4187115A
annuitynextmonth_57A
applicationcnt_361L
applications30d_658L
applicationscnt_1086L
applicationscnt_464L
applicationscnt_867L
avgdbddpdlast3m_4187120P
avgdpdtolclosure24_3658938P
avginstallast24m_3658937A
avgmaxdpdlast9m_3716943P
avgoutstandbalancel6m_4187114A
avgpmtlast12m_4525200A
birthdate_574D
clientscnt_100L
clientscnt_1022L
clientscnt_1071L
clientscnt_1130L
clientscnt_157L
clientscnt_257L
clientscnt_304L
clientscnt_360L
clientscnt_493L
clientscnt_533L
clientscnt_887L
clientscnt_946L
cntincpaycont9m_3716944L
cntpmts24_3658933L
commnoinclast6m_3546845L
count_num_group1
count_num_group1_applprev2
count_num_group1_cb_a_1
count_num_group1_cb_a_2
count_num_group1_debitcard
count_num_group1_person1
count_num_group1_person2
count_num_group1_tax_registry_a
count_num_group1_tax_registry_b
count_num_group1_tax_registry_c
count_num_group2
count_num_group2_cb_a_2
count_num_group2_person2
credamount_770A
credtype_322L
currdebt_22A
currdeb