In [1]:
from pathlib import Path
import subprocess
import os
import gc
from glob import glob
import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import joblib
import warnings
from sklearn.base import BaseEstimator, RegressorMixin
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
from sklearn.ensemble import HistGradientBoostingClassifier
from tqdm.auto import tqdm

warnings.filterwarnings('ignore')
ROOT = '/kaggle/input/home-credit-credit-risk-model-stability'
ROOT

'/kaggle/input/home-credit-credit-risk-model-stability'

In [2]:
from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer

class Pipeline:
    def set_table_dtypes(df):
        for col in df.columns:
            if col in ["case_id", "WEEK_NUM", "num_group1", "num_group2"]:
                df = df.with_columns(pl.col(col).cast(pl.Int64))
            elif col in ["date_decision"]:
                df = df.with_columns(pl.col(col).cast(pl.Date))
            elif col[-1] in ("P", "A"):
                df = df.with_columns(pl.col(col).cast(pl.Float64))
            elif col[-1] in ("M",):
                df = df.with_columns(pl.col(col).cast(pl.Utf8))
            elif col[-1] in ("D",):
                df = df.with_columns(pl.col(col).cast(pl.Date))
        return df

    def handle_dates(df):
        for col in df.columns:
            if col[-1] in ("D",):
                df = df.with_columns(pl.col(col) - pl.col("date_decision"))
                df = df.with_columns(pl.col(col).dt.total_days())
        df = df.drop("date_decision", "MONTH")
        return df

    def filter_cols(df):
        for col in df.columns:
            if (col not in ["target", "case_id", "WEEK_NUM"]) & (df[col].dtype == pl.Utf8):
                freq = df[col].n_unique()
                if (freq == 1) | (freq > 200):
                    df = df.drop(col)
        return df

In [3]:
import polars as pl

class Aggregator:
    @staticmethod
    def demographic_expr(df):
        cols = [
            'birth_259D', 'birthdate_87D', 'gender_992L', 'sex_738L', 'childnum_185L',
            'education_927M', 'familystate_447L', 'maritalst_703L'
        ]
        exprs = []
        for col in cols:
            if col in ['birth_259D', 'birthdate_87D']:
                exprs.append(pl.col(col).max().alias(f"max_{col}"))
                exprs.append(pl.col(col).min().alias(f"min_{col}"))
                exprs.append((pl.col(col).max() - pl.col(col).min()).alias(f"range_{col}"))
            else:
                exprs.append(pl.col(col).mode().alias(f"mode_{col}"))
                exprs.append(pl.col(col).value_counts().alias(f"freq_counts_{col}"))
        return exprs

    @staticmethod
    def employment_expr(df):
        cols = [
            'empl_employedfrom_271D', 'empl_employedtotal_800L', 'empl_industry_691L',
            'mainoccupationinc_384A'
        ]
        exprs = []
        for col in cols:
            exprs.append(pl.col(col).mean().alias(f"mean_{col}"))
            exprs.append(pl.col(col).median().alias(f"median_{col}"))
            exprs.append(pl.col(col).max().alias(f"max_{col}"))
            exprs.append(pl.col(col).min().alias(f"min_{col}"))
        return exprs

    @staticmethod
    def address_expr(df):
        cols = [
            'contaddr_district_15M', 'contaddr_zipcode_807M', 'empladdr_district_926M',
            'empladdr_zipcode_114M', 'registaddr_district_1083M', 'registaddr_zipcode_184M'
        ]
        exprs = []
        for col in cols:
            exprs.append(pl.col(col).mode().alias(f"mode_{col}"))
            exprs.append(pl.col(col).value_counts().alias(f"freq_counts_{col}"))
        return exprs

    @staticmethod
    def relationship_expr(df):
        cols = [
            'isreference_387L', 'relationshiptoclient_415T', 'relationshiptoclient_642T',
            'type_25L', 'role_1084L', 'role_993L'
        ]
        exprs = []
        for col in cols:
            exprs.append(pl.col(col).mode().alias(f"mode_{col}"))
            exprs.append(pl.col(col).value_counts().alias(f"freq_counts_{col}"))
        return exprs

    @staticmethod
    def housing_expr(df):
        cols = ['housetype_905L', 'housingtype_772L']
        exprs = []
        for col in cols:
            exprs.append(pl.col(col).mode().alias(f"mode_{col}"))
            exprs.append(pl.col(col).value_counts().alias(f"freq_counts_{col}"))
        return exprs

    @staticmethod
    def income_remittance_expr(df):
        cols = ['incometype_1044T', 'remitter_829L']
        exprs = []
        for col in cols:
            exprs.append(pl.col(col).mode().alias(f"mode_{col}"))
            exprs.append(pl.col(col).value_counts().alias(f"freq_counts_{col}"))
        return exprs

    @staticmethod
    def miscellaneous_expr(df):
        cols = [
            'contaddr_matchlist_1032L', 'contaddr_smempladdr_334L', 'persontype_1072L', 
            'persontype_792L', 'language1_981M', 'personindex_1023L', 
            'safeguarantyflag_411L'
        ]
        exprs = []
        for col in cols:
            exprs.append(pl.col(col).mode().alias(f"mode_{col}"))
            exprs.append(pl.col(col).value_counts().alias(f"freq_counts_{col}"))
        return exprs
    
    @staticmethod
    def demographic_expr(df):
        cols = [
            'birth_259D', 'birthdate_87D', 'gender_992L', 'sex_738L', 'childnum_185L',
            'education_927M', 'familystate_447L', 'maritalst_703L'
        ]
        exprs = []
        for col in cols:
            if col in ['birth_259D', 'birthdate_87D']:
                exprs.append(pl.col(col).max().alias(f"max_{col}"))
                exprs.append(pl.col(col).min().alias(f"min_{col}"))
                exprs.append((pl.col(col).max() - pl.col(col).min()).alias(f"range_{col}"))
            else:
                exprs.append(pl.col(col).mode().alias(f"mode_{col}"))
                exprs.append(pl.col(col).value_counts().alias(f"freq_counts_{col}"))
        return exprs

    @staticmethod
    def employment_expr(df):
        cols = [
            'empl_employedfrom_271D', 'empl_employedtotal_800L', 'empl_industry_691L',
            'mainoccupationinc_384A'
        ]
        exprs = []
        for col in cols:
            exprs.append(pl.col(col).mean().alias(f"mean_{col}"))
            exprs.append(pl.col(col).median().alias(f"median_{col}"))
            exprs.append(pl.col(col).max().alias(f"max_{col}"))
            exprs.append(pl.col(col).min().alias(f"min_{col}"))
        return exprs

    @staticmethod
    def address_expr(df):
        cols = [
            'contaddr_district_15M', 'contaddr_zipcode_807M', 'empladdr_district_926M',
            'empladdr_zipcode_114M', 'registaddr_district_1083M', 'registaddr_zipcode_184M'
        ]
        exprs = []
        for col in cols:
            exprs.append(pl.col(col).mode().alias(f"mode_{col}"))
            exprs.append(pl.col(col).value_counts().alias(f"freq_counts_{col}"))
        return exprs

    @staticmethod
    def relationship_expr(df):
        cols = [
            'isreference_387L', 'relationshiptoclient_415T', 'relationshiptoclient_642T',
            'type_25L', 'role_1084L', 'role_993L'
        ]
        exprs = []
        for col in cols:
            exprs.append(pl.col(col).mode().alias(f"mode_{col}"))
            exprs.append(pl.col(col).value_counts().alias(f"freq_counts_{col}"))
        return exprs

    @staticmethod
    def housing_expr(df):
        cols = ['housetype_905L', 'housingtype_772L']
        exprs = []
        for col in cols:
            exprs.append(pl.col(col).mode().alias(f"mode_{col}"))
            exprs.append(pl.col(col).value_counts().alias(f"freq_counts_{col}"))
        return exprs

    @staticmethod
    def income_remittance_expr(df):
        cols = ['incometype_1044T', 'remitter_829L']
        exprs = []
        for col in cols:
            exprs.append(pl.col(col).mode().alias(f"mode_{col}"))
            exprs.append(pl.col(col).value_counts().alias(f"freq_counts_{col}"))
        return exprs

    @staticmethod
    def miscellaneous_expr(df):
        cols = [
            'contaddr_matchlist_1032L', 'contaddr_smempladdr_334L', 'persontype_1072L', 
            'persontype_792L', 'language1_981M', 'personindex_1023L', 
            'safeguarantyflag_411L'
        ]
        exprs = []
        for col in cols:
            exprs.append(pl.col(col).mode().alias(f"mode_{col}"))
            exprs.append(pl.col(col).value_counts().alias(f"freq_counts_{col}"))
        return exprs

    @staticmethod
    def get_exprs(df):
        exprs = Aggregator.demographic_expr(df) + \
                Aggregator.employment_expr(df) + \
                Aggregator.address_expr(df) + \
                Aggregator.relationship_expr(df) + \
                Aggregator.housing_expr(df) + \
                Aggregator.income_remittance_expr(df) + \
                Aggregator.miscellaneous_expr(df) + \
                Aggregator.demographic_expr(df) + \
                Aggregator.employment_expr(df) + \
                Aggregator.address_expr(df) + \
                Aggregator.relationship_expr(df) + \
                Aggregator.housing_expr(df) + \
                Aggregator.income_remittance_expr(df) + \
                Aggregator.miscellaneous_expr(df)
        return exprs

In [4]:
def read_file(path, depth=None):
    df = pl.read_parquet(path)
    df = df.pipe(Pipeline.set_table_dtypes)
    if depth in [1,2]:
        df = df.group_by("case_id").agg(Aggregator.get_exprs(df))
    return df

def read_files(regex_path, depth=None):
    chunks = []
    for path in glob(str(regex_path)):
        df = pl.read_parquet(path)
        df = df.pipe(Pipeline.set_table_dtypes)
        if depth in [1, 2]:
            df = df.group_by("case_id").agg(Aggregator.get_exprs(df))
        chunks.append(df)
    df = pl.concat(chunks, how="vertical_relaxed")
    df = df.unique(subset=["case_id"])
    return df

def feature_eng(df_base, depth_0, depth_1, depth_2):
    df_base = (
        df_base
        .with_columns(
            month_decision = pl.col("date_decision").dt.month(),
            weekday_decision = pl.col("date_decision").dt.weekday(),
        )
    )
    for i, df in enumerate(depth_0 + depth_1 + depth_2):
        df_base = df_base.join(df, how="left", on="case_id", suffix=f"_{i}")
    df_base = df_base.pipe(Pipeline.handle_dates)
    return df_base

def to_pandas(df_data, cat_cols=None):
    df_data = df_data.to_pandas()
    if cat_cols is None:
        cat_cols = list(df_data.select_dtypes("object").columns)
    df_data[cat_cols] = df_data[cat_cols].astype("category")
    return df_data, cat_cols

def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if str(col_type)=="category":
            continue
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            continue
    end_mem = df.memory_usage().sum() / 1024**2
    return df

In [6]:
ROOT_Test = Path("/kaggle/input/home-credit-credit-risk-modeling")
TEST_DIR = ROOT_Test / "test_dataset" / "transformed"

data_store = {
    "df_base": read_files(Path("/kaggle/input/home-credit-credit-risk-modeling/test.parquet")),
    "depth_0": [read_files(TEST_DIR / "test_static_cb_0.parquet"),
        read_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
        read_files(TEST_DIR / "test_tax_registry_a_1.parquet", 1),
        read_files(TEST_DIR / "test_tax_registry_b_1.parquet", 1),
        read_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1),
        read_files(TEST_DIR / "test_credit_bureau_b_1.parquet", 1),
        read_files(TEST_DIR / "test_other_1.parquet", 1),
        read_files(TEST_DIR / "test_person_1.parquet", 1),
        read_files(TEST_DIR / "test_deposit_1.parquet", 1),
        read_files(TEST_DIR / "test_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        read_files(TEST_DIR / "test_credit_bureau_b_2.parquet", 2)
    ]
}

ColumnNotFoundError: birth_259D

Error originated just after this operation:
DF ["case_id", "actualdpd_943P", "annuity_853A", "approvaldate_319D"]; PROJECT */41 COLUMNS; SELECTION: "None"

In [None]:
df_train = feature_eng(**data_store)
del data_store
gc.collect()
df_train = df_train.pipe(Pipeline.filter_cols)
df_train, cat_cols = to_pandas(df_train)
df_train = reduce_mem_usage(df_train)
df_train

In [None]:
df_train["debt-to-income"] = df_train["totaldebt_9A"] / df_train["maininc_215A"]
df_train

In [None]:
from itertools import combinations, permutations

nums = df_train.select_dtypes(exclude='category').columns
nans_df = df_train[nums].isna()
nans_groups = {}

for col in nums:
    cur_group = nans_df[col].sum()
    nans_groups.setdefault(cur_group, []).append(col)

encoder = OrdinalEncoder()
df_train[cat_cols] = encoder.fit_transform(df_train[cat_cols])
df_train

In [None]:
df_majority = df_train[df_train['target'] == 0]
df_minority = df_train[df_train['target'] == 1]
n_minority = len(df_minority) + 20000
df_majority_undersampled = df_majority.sample(n=n_minority, random_state=42)
df_train_balanced = pd.concat([df_majority_undersampled, df_minority])
df_train_balanced = df_train_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
df_train_balanced

In [None]:
TEST_DIR = ROOT / "parquet_files" / "test"

data_store = {
    "df_base": read_file(TEST_DIR / "test_base.parquet"),
    "depth_0": [
        read_file(TEST_DIR / "test_static_cb_0.parquet"),
        read_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_a_1.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_b_1.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_c_1.parquet", 1),
        read_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1),
        read_file(TEST_DIR / "test_credit_bureau_b_1.parquet", 1),
        read_file(TEST_DIR / "test_other_1.parquet", 1),
        read_file(TEST_DIR / "test_person_1.parquet", 1),
        read_file(TEST_DIR / "test_deposit_1.parquet", 1),
        read_file(TEST_DIR / "test_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        read_file(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
    ]
}

In [None]:
df_test = feature_eng(**data_store)
del data_store
gc.collect()
df_test = df_test.pipe(Pipeline.filter_cols)
df_test, _ = to_pandas(df_test, cat_cols)
df_test = reduce_mem_usage(df_test)
df_test

In [None]:
indexx = df_test['case_id']
indexx

In [None]:
for col in df_test.columns:
    if df_test[col].dtype == 'object':
        df_test[col] = df_test[col].astype('str').fillna('-1')

encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoder.fit(df_train[cat_cols])
df_test[cat_cols] = encoder.transform(df_test[cat_cols])
df_test

In [None]:
for col in df_test.columns:
    if df_test[col].dtype == 'object':
        df_test[col] = df_test[col].astype('category').cat.codes

df_test = df_test.drop(columns=['case_id'])
df_test = reduce_mem_usage(df_test)
df_test

In [None]:
df_test["debt-to-income"] = df_test["totaldebt_9A"] / df_test["maininc_215A"]
df_test

In [None]:
y = df_train_balanced["target"]
df_train_balanced = df_train_balanced.drop(columns=["target", "case_id", "WEEK_NUM"])
df_train_balanced = reduce_mem_usage(df_train_balanced)
df_train_balanced

In [None]:
joblib.dump((df_train_balanced, y, df_test), 'data.pkl')

In [None]:
df_train, y, df_test = joblib.load('/kaggle/working/data.pkl')
df_train.shape, df_test.shape

In [None]:
missing_cols = set(df_test.columns) - set(df_train.columns)
missing_cols

In [None]:
df_test = df_test.drop(columns=['last_housingtype_772L' , 'last_profession_152M' , 'lastapprcommoditytypec_5251766M', 'max_profession_152M'])
df_test

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_validation, y_train, y_validation = train_test_split(df_train, y, test_size=0.2, random_state=42, stratify=y)

print("X_train shape:", X_train.shape)
print("X_validation shape:", X_validation.shape)
print("y_train shape:", y_train.shape)
print("y_validation shape:", y_validation.shape)

In [None]:
import xgboost as xgb

fitted_models_lgb = []
device: str = "gpu"

params1 = {
    "boosting_type": "gbdt",
    "colsample_bynode": 0.8,
    "colsample_bytree": 0.8,
    "device": device,
    "extra_trees": True,
    "learning_rate": 0.05,
    "l1_regularization": 0.1,
    "l2_regularization": 10,
    "max_depth": 20,
    "metric": "auc",
    "n_estimators": 2000,
    "num_leaves": 64,
    "objective": "binary",
    "random_state": 42,
    "verbose": -1,
}
model_1 = lgb.LGBMClassifier(**params1)
model_1.fit(df_train, y)
fitted_models_lgb.append(model_1)
print("Model_1 Success")

params2 = {
    "boosting_type": "gbdt",
    "colsample_bynode": 0.8,
    "colsample_bytree": 0.8,
    "extra_trees": True,
    "device": device,
    "learning_rate": 0.03,
    "l1_regularization": 0.1,
    "l2_regularization": 10,
    "max_depth": 16,
    "metric": "auc",
    "n_estimators": 2000,
    "num_leaves": 54,
    "objective": "binary",
    "random_state": 42,
    "verbose": -1,
}
model_2 = lgb.LGBMClassifier(**params2)
model_2.fit(df_train, y)
fitted_models_lgb.append(model_2)
print("Model_2 Success")

train_pool = Pool(X_train, y_train)
val_pool = Pool(X_validation, y_validation)
model_3 = CatBoostClassifier(
    best_model_min_trees = 1000,
    boosting_type = "Plain",
    eval_metric = "AUC",
    learning_rate = 0.05,
    l2_leaf_reg = 10,
    max_leaves = 64,
    random_seed = 42,
    iterations = 6000,
    task_type = "GPU",
    use_best_model = True
)
model_3.fit(train_pool, eval_set=val_pool, verbose=False)
fitted_models_lgb.append(model_3)
print("Model_3 Success")

params4 = {
    "learning_rate": 0.05,
    "max_depth": 20,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "reg_alpha": 0.1,
    "device": device,
    "reg_lambda": 10,
    "n_estimators": 2000,
    "random_state": 42,
    "eval_metric": "auc",
}
model_4 = xgb.XGBClassifier(**params4)
model_4.fit(df_train, y)
fitted_models_lgb.append(model_4)
print("Model_4 Success")

In [None]:
from scipy.stats import mode

class VotingModel(BaseEstimator, RegressorMixin):
    def __init__(self, estimators, weights=None):
        super().__init__()
        self.estimators = estimators
        self.weights = weights
        
    def fit(self, X, y=None):
        return self
    
    def predict(self, X):
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        if self.weights is None:
            return mode(y_preds, axis=0)[0]
        else:
            weighted_sum = np.sum(np.array(y_preds) * self.weights.reshape(-1, 1), axis=0)
            return np.round(weighted_sum).astype(int)
    
    def predict_proba(self, X):
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators]
        mean_proba = np.mean(y_preds, axis=0)
#         threshold = 0.45
#         predicted_proba = np.where(mean_proba[:, 1] > threshold, 1, 0)
#         return predicted_proba
        return mean_proba

model = VotingModel(fitted_models_lgb)
# model = VotingModel(fitted_models_lgb, weights=np.array([0.34, 0.33, 0.33]))
model

In [None]:
y_pred = pd.Series(model.predict_proba(df_test)[:, 1], index=df_test.index)
y_pred

In [None]:
sub = pd.DataFrame({
    "case_id": indexx, "target": y_pred
})
sub

In [None]:
df_subm = pd.read_csv("/kaggle/input/home-credit-credit-risk-model-stability/sample_submission.csv")
df_subm

In [None]:
df_subm = df_subm.drop(columns=['score'])
merged_df = df_subm.merge(sub, on="case_id", how="left")
merged_df

In [None]:
merged_df.to_csv("submission.csv", index=False)