In [1]:
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import warnings, os, gc, joblib
from pprint import pprint
import lightgbm as lgb
from sklearn import metrics
from functools import reduce
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    classification_report,
)
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    GridSearchCV,
    StratifiedGroupKFold,
)
from contextlib import suppress



In [2]:
pathway = "/kaggle/input/home-credit-credit-risk-model-stability/"


def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    for col in df.columns:
        # Cast Transform DPD (Days past due, P) and Transform Amount (A) as Float64
        if col[-1] in ("P", "A"):
            df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))
        # Cast Transform date (D) as Date, causes issues with other columns ending in D
        # if col[-1] in ("D"):
        # df = df.with_columns(pl.col(col).cast(pl.Date).alias(col))
        # Cast aggregated columns as Float64, tried combining sum and max, but did not work correctly
        if col[-4:-1] in ("_sum"):
            df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))
        if col[-4:-1] in ("_max"):
            df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))
    return df


def convert_strings(df: pl.DataFrame) -> pl.DataFrame:
    for col in df.columns:
        if df[col].dtype == pl.Utf8:
            df = df.with_columns(pl.col(col).cast(pl.Categorical))
    return df


# Changed this function to work for Pandas
def missing_values(df, threshold=0.0):
    for col in df.columns:
        decimal = (pd.isnull(test[col]).sum()) / (len(test[col]))
        if decimal > threshold:
            print(f"{col}: {decimal}")


# Impute numeric columns with the median and cat with mode
def imputer(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns:
        if df[col].dtype in ["float64"]:
            df[col] = df[col].fillna(df[col].median())
        if df[col].dtype.name in ["category", "object"] and df[col].isnull().any():
            mode_without_nan = df[col].dropna().mode().values[0]
            df[col] = df[col].fillna(mode_without_nan)
    return df

In [3]:
train = (
    pl.read_csv("Step_3_models_final/train_final_final.csv")
    .pipe(set_table_dtypes)
    .pipe(convert_strings)
)
train.head()

maxoutstandbalancel12m_4187113A,maxdebt4_972A,lastapplicationdate_877D_diff,disbursementtype_67L,numinstregularpaid_973L,currdebtcredtyperange_828A,case_id,pmts_overdue_1140A_sum_right,opencred_647L,numinstpaidearly3dest_4493216L,numberofoutstandinstls_520L_sum,credtype_587L,numinstpaid_4499208L,credtype_322L,empls_economicalst_849M,safeguarantyflag_411L,byoccupationinc_3656910L_max,credacc_credlmt_575A_max,dtlastpmtallstes_4499206D_diff,numpmtchanneldd_318L,conts_type_509L,clientscnt_360L,tenor_203L_sum,dpdmax_139P_max,numinstpaidearly_338L,pmts_dpd_303P_sum,applicationscnt_629L,maxinstallast24m_3658928A,monthsannuity_845L,lastapprcredamount_781A,maxlnamtstart6m_4525199A,numactivecredschannel_414L,downpmt_116A,applicationscnt_867L,posfpd30lastmonth_3976960P,mobilephncnt_593L,mindbdtollast24m_4525191P,…,pctinstlsallpaidlate4d_3546849L,isbidproduct_390L,addres_zip_823M,totinstallast1m_4525188A,cntincpaycont9m_3716944L,numinstregularpaidest_4493210L,sex_738L,numberofoverdueinstlmax_1151L_sum,avginstallast24m_3658937A,dateofbirth_337D_diff,isbidproduct_1095L,maxdpdlast24m_143P,maritalst_385M,clientscnt_1130L,equalitydataagreement_891L,datelastinstal40dpd_247D,numinstlswithdpd10_728L,avgdbddpdlast3m_4187120P,overdueamountmax2_398A_max,sellerplacecnt_915L,familystate_447L,clientscnt3m_3712950L,posfstqpd30lastmonth_3976962P,status_219L,days360_512L,maxdpdinstlnum_3546846P,pmts_dpd_1073P_sum,birth_259D_diff,clientscnt_304L,lastactivateddate_801D_diff,isdebitcard_729L,dpdmax_757P_max,maxdpdlast12m_727P,type_25L,overdueamountmax_35A_max,numinstmatpaidtearly2d_4499204L,target
f64,f64,f64,cat,f64,f64,i64,f64,i64,f64,f64,cat,f64,cat,cat,i64,f64,f64,f64,f64,cat,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,i64,cat,f64,f64,f64,cat,f64,f64,f64,i64,f64,cat,f64,i64,cat,f64,f64,f64,f64,cat,f64,f64,cat,f64,f64,f64,f64,f64,f64,i64,f64,f64,cat,f64,f64,i64
42520.402,0.0,217.0,"""GBA""",18.0,0.0,0,0.0,0,12.0,0.0,"""COL""",22.0,"""CAL""","""a55475b1""",1,15000.0,0.0,-13.0,0.0,"""PRIMARY_MOBILE…",0.0,48.0,0.0,9.0,0.0,0.0,6648.6,18.0,24774.0,32934.6,0.0,0.0,0.0,0.0,1.0,-17.0,…,0.01695,0,"""a55475b1""",6221.2,5.0,22.0,"""F""",1.0,4068.6,12099.0,0,0.0,"""a55475b1""",0.0,1,"""2020-06-15""",0.0,-4.0,3292.6,0.0,"""MARRIED""",0.0,0.0,"""D""",3.0,7.0,0.0,11874.0,0.0,306.0,0,10.0,0.0,"""PRIMARY_MOBILE…",3016.39415,13.0,0
42520.402,0.0,217.0,"""GBA""",18.0,0.0,1,0.0,0,12.0,0.0,"""COL""",22.0,"""CAL""","""a55475b1""",1,15000.0,0.0,-13.0,0.0,"""PRIMARY_MOBILE…",0.0,48.0,0.0,9.0,0.0,0.0,6648.6,18.0,24774.0,32934.6,0.0,0.0,0.0,0.0,1.0,-17.0,…,0.01695,0,"""a55475b1""",6221.2,5.0,22.0,"""M""",1.0,4068.6,12099.0,0,0.0,"""a55475b1""",0.0,1,"""2020-06-15""",0.0,-4.0,3292.6,0.0,"""DIVORCED""",0.0,0.0,"""D""",3.0,7.0,0.0,22435.0,0.0,306.0,0,10.0,0.0,"""PRIMARY_MOBILE…",3016.39415,13.0,0
42520.402,0.0,2102.0,"""GBA""",18.0,0.0,2,0.0,0,12.0,0.0,"""CAL""",22.0,"""CAL""","""a55475b1""",1,15000.0,0.0,-13.0,0.0,"""PRIMARY_MOBILE…",0.0,36.0,0.0,9.0,0.0,0.0,6648.6,18.0,24774.0,32934.6,0.0,0.0,0.0,0.0,2.0,-17.0,…,0.01695,0,"""a55475b1""",6221.2,5.0,22.0,"""F""",1.0,4068.6,12099.0,0,0.0,"""a55475b1""",0.0,1,"""2020-06-15""",0.0,-4.0,3292.6,0.0,"""MARRIED""",0.0,0.0,"""D""",3.0,7.0,0.0,16105.0,0.0,306.0,0,10.0,0.0,"""PRIMARY_MOBILE…",3016.39415,13.0,0
42520.402,0.0,-4.0,"""GBA""",18.0,0.0,3,0.0,0,12.0,0.0,"""CAL""",22.0,"""CAL""","""a55475b1""",1,15000.0,0.0,-13.0,0.0,"""PHONE""",0.0,12.0,0.0,9.0,0.0,0.0,6648.6,18.0,24774.0,32934.6,0.0,0.0,1.0,0.0,1.0,-17.0,…,0.01695,0,"""a55475b1""",6221.2,5.0,22.0,"""F""",1.0,4068.6,12099.0,0,0.0,"""a55475b1""",1.0,1,"""2020-06-15""",0.0,-4.0,3292.6,1.0,"""MARRIED""",0.0,0.0,"""D""",3.0,7.0,0.0,9286.0,0.0,306.0,0,10.0,0.0,"""PRIMARY_MOBILE…",3016.39415,13.0,0
42520.402,0.0,-4.0,"""GBA""",18.0,0.0,4,0.0,0,12.0,0.0,"""CAL""",22.0,"""CAL""","""a55475b1""",1,15000.0,0.0,-13.0,0.0,"""PRIMARY_MOBILE…",0.0,24.0,0.0,9.0,0.0,0.0,6648.6,18.0,24774.0,32934.6,0.0,0.0,1.0,0.0,1.0,-17.0,…,0.01695,0,"""a55475b1""",6221.2,5.0,22.0,"""F""",1.0,4068.6,12099.0,0,0.0,"""a55475b1""",0.0,1,"""2020-06-15""",0.0,-4.0,3292.6,0.0,"""MARRIED""",0.0,0.0,"""T""",3.0,7.0,0.0,9134.0,0.0,306.0,0,10.0,0.0,"""PRIMARY_MOBILE…",3016.39415,13.0,1


In [4]:
test = (
    pl.read_csv("Step_3_models_final/test_final_final.csv")
    .pipe(set_table_dtypes)
    .pipe(convert_strings)
)
test.head()

maxoutstandbalancel12m_4187113A,maxdebt4_972A,lastapplicationdate_877D_diff,disbursementtype_67L,numinstregularpaid_973L,currdebtcredtyperange_828A,case_id,pmts_overdue_1140A_sum_right,opencred_647L,numinstpaidearly3dest_4493216L,numberofoutstandinstls_520L_sum,credtype_587L,numinstpaid_4499208L,credtype_322L,empls_economicalst_849M,safeguarantyflag_411L,byoccupationinc_3656910L_max,credacc_credlmt_575A_max,dtlastpmtallstes_4499206D_diff,numpmtchanneldd_318L,conts_type_509L,clientscnt_360L,tenor_203L_sum,dpdmax_139P_max,numinstpaidearly_338L,pmts_dpd_303P_sum,applicationscnt_629L,maxinstallast24m_3658928A,monthsannuity_845L,lastapprcredamount_781A,maxlnamtstart6m_4525199A,numactivecredschannel_414L,downpmt_116A,applicationscnt_867L,posfpd30lastmonth_3976960P,mobilephncnt_593L,mindbdtollast24m_4525191P,…,maxdpdtolerance_577P_max,pctinstlsallpaidlate4d_3546849L,isbidproduct_390L,addres_zip_823M,totinstallast1m_4525188A,cntincpaycont9m_3716944L,numinstregularpaidest_4493210L,sex_738L,numberofoverdueinstlmax_1151L_sum,avginstallast24m_3658937A,dateofbirth_337D_diff,isbidproduct_1095L,maxdpdlast24m_143P,maritalst_385M,clientscnt_1130L,equalitydataagreement_891L,datelastinstal40dpd_247D,numinstlswithdpd10_728L,avgdbddpdlast3m_4187120P,overdueamountmax2_398A_max,sellerplacecnt_915L,familystate_447L,clientscnt3m_3712950L,posfstqpd30lastmonth_3976962P,status_219L,days360_512L,maxdpdinstlnum_3546846P,pmts_dpd_1073P_sum,birth_259D_diff,clientscnt_304L,lastactivateddate_801D_diff,isdebitcard_729L,dpdmax_757P_max,maxdpdlast12m_727P,type_25L,overdueamountmax_35A_max,numinstmatpaidtearly2d_4499204L
f64,f64,f64,cat,f64,f64,i64,f64,i64,f64,f64,cat,f64,cat,cat,i64,f64,f64,f64,f64,cat,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,i64,cat,f64,f64,f64,cat,f64,f64,f64,i64,f64,cat,f64,i64,cat,f64,f64,f64,f64,cat,f64,f64,cat,f64,f64,f64,f64,f64,f64,i64,f64,f64,cat,f64,f64
157731.78,231440.03,41.0,"""GBA""",96.0,0.0,57543,0.0,0,34.0,0.0,"""CAL""",96.0,"""CAL""","""a55475b1""",0,15000.0,0.0,-1.0,0.0,"""PRIMARY_MOBILE…",0.0,164.0,0.0,25.0,0.0,0.0,131700.8,66.0,14000.0,16672.6,0.0,0.0,9.0,0.0,2.0,-7.0,…,50.0,0.07292,0,"""a55475b1""",17859.6,5.0,96.0,"""F""",34.0,16049.4,14804.0,1,7.0,"""38c061ee""",0.0,1,"""2020-05-28""",0.0,2.0,3271.6882,0.0,"""SINGLE""",0.0,0.0,"""K""",8.0,14.0,0.0,14804.0,0.0,18.0,0,20.0,3.0,"""PRIMARY_MOBILE…",3271.6882,37.0
21278.0,34066.0,-10.0,"""GBA""",44.0,10638.2,57549,0.0,0,15.0,0.0,"""CAL""",44.0,"""CAL""","""a55475b1""",1,50000.0,34066.0,161.0,0.0,"""PRIMARY_MOBILE…",0.0,91.0,0.0,15.0,0.0,0.0,122511.4,41.0,94000.0,31820.6,0.0,0.0,10.0,0.0,3.0,-2.0,…,1.0,0.18605,1,"""a55475b1""",126058.0,3.0,44.0,"""F""",310.0,32426.201,22723.0,1,0.0,"""a7fcb6e5""",0.0,1,"""2020-05-28""",15.0,0.0,48690.402,2.0,"""SINGLE""",0.0,0.0,"""K""",12.0,16.0,0.0,22723.0,0.0,-10.0,0,108.0,0.0,"""PRIMARY_MOBILE…",48690.402,15.0
62619.0,54000.0,14.0,"""SBA""",9.0,0.0,57551,0.0,0,3.0,0.0,"""CAL""",9.0,"""COL""","""a55475b1""",0,32500.0,17033.0,161.0,0.0,"""PRIMARY_MOBILE…",0.0,127.5,0.0,0.0,0.0,0.0,41783.402,9.0,200000.0,54000.0,0.0,0.0,2.0,0.0,1.0,-4.0,…,25.5,0.11111,0,"""a55475b1""",18374.3,4.0,9.0,"""F""",4.0,8357.2,14090.0,0,4.0,"""3439d993""",0.0,1,"""2020-05-28""",0.0,-3.0,10429.616,1.0,"""SINGLE""",0.0,0.0,"""K""",4.0,6.0,0.0,14090.0,0.0,405.0,0,3.0,4.0,"""PRIMARY_MOBILE…",10429.616,5.0
288642.6,188126.14,80.0,"""GBA""",32.0,191269.61,57552,0.0,0,26.0,0.0,"""CAL""",32.0,"""CAL""","""a55475b1""",1,32500.0,17033.0,161.0,0.0,"""PRIMARY_MOBILE…",0.0,127.5,0.0,24.0,0.0,0.0,12155.4,23.0,0.0,104473.6,0.0,0.0,9.0,0.0,1.0,-13.0,…,25.5,0.0,0,"""a55475b1""",18889.0,18.0,32.0,"""M""",19.0,7440.4,23768.0,1,0.0,"""a55475b1""",0.0,1,"""2020-05-28""",0.0,-7.0,6850.6521,0.0,"""SINGLE""",0.0,0.0,"""K""",5.0,7.0,0.0,23768.0,0.0,234.0,0,11.5,0.0,"""PRIMARY_MOBILE…",6850.6521,27.0
0.0,64555.668,-14.0,"""GBA""",15.0,0.0,57569,0.0,1,5.0,0.0,"""CAL""",15.0,"""CAL""","""P148_57_109""",0,32500.0,17033.0,209.0,0.0,"""PRIMARY_MOBILE…",0.0,127.5,0.0,5.0,16300.0,0.0,26969.401,11.0,20000.0,42910.3,0.0,0.0,6.0,0.0,2.0,2783.0,…,25.5,0.66667,0,"""P96_113_139""",18374.3,1.0,15.0,"""F""",19.0,7898.8,26408.0,1,2865.0,"""3439d993""",0.0,1,"""2018-09-18""",36.0,-3.0,6850.6521,2.0,"""SINGLE""",0.0,0.0,"""K""",4.0,7.0,0.0,26408.0,0.0,3440.0,0,11.5,2865.0,"""PRIMARY_MOBILE…",6850.6521,5.0


In [5]:
common_columns = list(set(train.columns) & set(test.columns))

test = test[common_columns]
# Subset train with only columns seen in test + target
train = train[common_columns + ["target"]]
train.shape, test.shape

((1526659, 228), (10, 227))

In [6]:
test = test.to_pandas()

In [7]:
train = train.to_pandas()

In [8]:
ids = test["case_id"].tolist()

In [45]:
# remove possible mismatch columns
missmatch = [
    "addres_district_368M",
    "addres_zip_823M",
    "conts_role_79M",
    "conts_type_509L",
    "credtype_322L",
    "credtype_587L",
    "datelastinstal40dpd_247D",
    "description_5085714M",
    "disbursementtype_67L",
    "education_1103M",
    "education_88M",
    "empls_economicalst_849M",
    "empls_employer_name_740M",
    "familystate_447L",
    "familystate_726L",
    "incometype_1044T",
    "inittransactioncode_186L",
    "inittransactioncode_279L",
    "lastst_736L",
    "maritalst_385M",
    "maritalst_893M",
    "role_1084L",
    "status_219L",
    "type_25L",
]

train = train.drop(missmatch, axis=1)
test = test.drop(missmatch, axis=1)

In [49]:
y = train.loc[:, "target"].to_frame("target")
X = train.drop(
    [
        "target",
    ],
    axis=1,
)

# Do not include case_id, or week_num as numeric
numeric_cols = test.select_dtypes(include=["number"]).columns.tolist()
numeric_cols.remove("case_id")
numeric_cols.remove("WEEK_NUM")


# scale values before passing on to model
warnings.filterwarnings("ignore")
scaler = MinMaxScaler(copy=False)
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])
test[numeric_cols] = scaler.transform(test[numeric_cols])

# Drop case_id and week_num from features
weeks = X["WEEK_NUM"]
X_feats = X.drop(["case_id", "WEEK_NUM"], axis=1)

# Sort columns in alphabetical order for training so columns match test submission
X_feats = X_feats.reindex(sorted(X_feats.columns), axis=1)

In [50]:
# using optimal parameters for lgbm

warnings.filterwarnings("ignore")
cv = StratifiedGroupKFold(n_splits=2, shuffle=True)

fitted_models = []
cv_scores = []

# Note: uncomment device when running with GPU P100 accelerator
grid_params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 30,
    "learning_rate": 0.03,
    "n_estimators": 800,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "random_state": 42,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees": True,
    "num_leaves": 95,
    "verbose": -1,
    "max_bin": 250,
    #'device':'gpu',
}

for idx_train, idx_valid in cv.split(X_feats, y, groups=weeks):
    X_train, y_train = X_feats.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X_feats.iloc[idx_valid], y.iloc[idx_valid]

    X_train_subset = X_train.iloc[:1000]
    y_train_subset = y_train.iloc[:1000]

    X_valid_subset = X_valid.iloc[:1000]
    y_valid_subset = y_valid.iloc[:1000]

    clf = lgb.LGBMClassifier(**grid_params)
    clf.fit(
        X_train_subset,
        y_train_subset,
        eval_set=[(X_valid_subset, y_valid_subset)],
        callbacks=[lgb.log_evaluation(200), lgb.early_stopping(100)],
    )
    fitted_models.append(clf)

    y_pred_valid = clf.predict_proba(X_valid_subset)[:, 1]
    auc_score = roc_auc_score(y_valid_subset, y_pred_valid)
    cv_scores.append(auc_score)

print("CV AUC scores: ", cv_scores)
print("Maximum CV AUC score: ", max(cv_scores))

warnings.filterwarnings("default")

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[50]	valid_0's auc: 0.666873
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[17]	valid_0's auc: 0.644845
CV AUC scores:  [0.6668732230079463, 0.6448454120687941]
Maximum CV AUC score:  0.6668732230079463


In [51]:
class VotingModel(BaseEstimator, RegressorMixin):
    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators

    def fit(self, X, y=None):
        return self

    def predict(self, X):
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

    def predict_proba(self, X):
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)


lgb_model = VotingModel(fitted_models)

In [52]:
test = test.reindex(sorted(test.columns), axis=1)
testt = test.drop(["case_id", "WEEK_NUM"], axis=1)

test_subset = testt.iloc[:10]

In [57]:
test.shape

(10, 203)

In [53]:
# run a loop and go over columns of X_feats and test_subset and check if they are equal

# a = 0
# for i in range(len(X_feats.columns)):
#     a = a+1
#     if X_feats.columns[i] == test_subset.columns[i]:
#         print (a)
#         #print(X_feats.columns[i], test_subset.columns[i])


# also check if the columns are the same category or not

mismatched_columns = []

for col in X_feats.select_dtypes(["category"]).columns:
    if (
        X_feats[col].cat.categories.sort_values().tolist()
        != test_subset[col].cat.categories.sort_values().tolist()
    ):
        mismatched_columns.append(col)

print(mismatched_columns)

[]


In [58]:
for col in X_feats.select_dtypes(["category"]).columns:
    if (
        X_feats[col].cat.categories.sort_values().tolist()
        != test_subset[col].cat.categories.sort_values().tolist()
    ):
        print(
            f"Column: {col}, X_feats categories: {X_feats[col].cat.categories.sort_values().tolist()}, test_subset categories: {test_subset[col].cat.categories.sort_values().tolist()}"
        )

In [55]:
test_subset

Unnamed: 0,actualdpd_943P_sum,actualdpdtolerance_344P,amount_4917619A_sum,amtinstpaidbefduel24m_4187115A,annuity_780A,annuity_853A_sum,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,...,sumoutstandtotal_3546847A,sumoutstandtotalest_4493215A,tenor_203L_sum,thirdquarter_1082L,totalamount_6A_sum,totaldebt_9A,totalsettled_863A,totinstallast1m_4525188A,twobodfilling_608L,typesuite_864L
0,0.0,0.0,-7.586495e-08,6.870009e-14,-0.000763,2.709796e-13,1.818421e-12,0.0,0.0,0.0,...,0.020268,0.02256,9.490741e-08,1.3e-05,1.707613e-23,6.850152e-15,4.1145290000000004e-18,-2.692169e-07,FO,AL
1,0.0,0.0,-7.586495e-08,4.646622e-14,-0.000763,1.65871e-13,5.29405e-12,0.0,9.1e-05,0.0,...,0.020268,0.02256,5.266204e-08,8e-06,5.323975000000001e-23,5.99563e-15,3.371887e-18,-2.692167e-07,FO,AL
2,0.0,0.0,-7.586496e-08,2.544858e-14,-0.000763,2.184253e-13,0.0,0.0,4.6e-05,0.0,...,0.020268,0.02256,7.378472e-08,2.1e-05,9.780111000000001e-23,0.0,6.786614999999999e-19,-2.692169e-07,FO,AL
3,0.0,0.0,-7.586495e-08,6.591459e-14,-0.000763,2.184253e-13,1.81445e-11,0.0,0.0,0.0,...,0.020268,0.02256,7.378472e-08,4e-06,7.552043000000001e-23,1.077985e-13,2.564304e-18,-2.692169e-07,BO,AL
4,0.0,0.0,-7.586495e-08,0.0,-0.000763,2.184253e-13,0.0,0.0,4.6e-05,0.0,...,0.020268,0.02256,7.378472e-08,1.3e-05,7.552043000000001e-23,0.0,8.602785999999999e-19,-2.692169e-07,FO,AL
5,0.0,0.0,-7.586495e-08,0.0,-0.000763,2.184253e-13,0.0,0.0,0.0,0.0,...,0.020268,0.02256,7.378472e-08,4e-06,7.552043000000001e-23,0.0,8.73159e-20,-2.692169e-07,FO,AL
6,0.0,0.0,-7.586495e-08,2.412504e-14,-0.000763,2.184253e-13,0.0,0.0,0.0,0.0,...,0.020268,0.02256,7.378472e-08,2.1e-05,7.552043000000001e-23,0.0,0.0,-2.692169e-07,FO,AL
7,0.0,0.0,-7.586495e-08,2.280149e-14,-0.000763,2.184253e-13,0.0,0.0,0.0,0.0,...,0.020268,0.02256,7.378472e-08,4e-06,7.552043000000001e-23,0.0,5.742985e-19,-2.692169e-07,FO,AL
8,0.0,0.0,-7.586495e-08,2.412504e-14,-0.000763,2.184253e-13,0.0,0.0,0.0,0.0,...,0.020268,0.02256,7.378472e-08,4e-06,3.635885e-22,0.0,0.0,-2.692169e-07,FO,AL
9,0.0,0.0,-7.586495e-08,1.431154e-14,-0.000763,2.184253e-13,0.0,0.0,0.0,0.0,...,0.020268,0.02256,7.378472e-08,4e-06,7.552043000000001e-23,0.0,3.6045509999999997e-19,-2.692169e-07,FO,AL


In [59]:
X_train_subset = X_feats.iloc[:1000000, :100]
print(X_train_subset.columns.equals(test_subset.columns))

False


In [30]:
test_subset

Unnamed: 0,actualdpd_943P_sum,actualdpdtolerance_344P,addres_district_368M,addres_zip_823M,amount_4917619A_sum,amtinstpaidbefduel24m_4187115A,annuity_780A,annuity_853A_sum,annuitynextmonth_57A,applicationcnt_361L,...,sumoutstandtotalest_4493215A,tenor_203L_sum,thirdquarter_1082L,totalamount_6A_sum,totaldebt_9A,totalsettled_863A,totinstallast1m_4525188A,twobodfilling_608L,type_25L,typesuite_864L
0,0.0,0.0,a55475b1,a55475b1,0.057571,0.136197,0.033927,0.063491,0.013922,0.0,...,0.033509,0.136667,0.048387,7.5e-05,0.01004,0.009494,0.022467,FO,PRIMARY_MOBILE,AL
1,0.0,0.0,a55475b1,a55475b1,0.035458,0.092119,0.05345,0.038864,0.040533,0.0,...,0.032143,0.075833,0.032258,0.000234,0.008787,0.00778,0.158583,FO,PRIMARY_MOBILE,AL
2,0.0,0.0,a55475b1,a55475b1,0.013345,0.050452,0.026092,0.051177,0.0,0.0,...,0.02256,0.10625,0.080645,0.000431,0.0,0.001566,0.023115,FO,PRIMARY_MOBILE,AL
3,0.0,0.0,a55475b1,a55475b1,0.035458,0.130675,0.058701,0.051177,0.138919,0.0,...,0.194861,0.10625,0.016129,0.000333,0.157992,0.005917,0.023762,BO,PRIMARY_MOBILE,AL
4,0.0,0.0,P121_131_159,P96_113_139,0.035458,0.0,0.043443,0.051177,0.0,0.0,...,0.02256,0.10625,0.048387,0.000333,0.0,0.001985,0.023115,FO,PRIMARY_MOBILE,AL
5,0.0,0.0,a55475b1,a55475b1,0.035458,0.0,0.083305,0.051177,0.0,0.0,...,0.02256,0.10625,0.016129,0.000333,0.0,0.000201,0.023115,FO,PRIMARY_MOBILE,AL
6,0.0,0.0,a55475b1,a55475b1,0.035458,0.047828,0.023222,0.051177,0.0,0.0,...,0.02256,0.10625,0.080645,0.000333,0.0,0.0,0.023115,FO,PRIMARY_MOBILE,AL
7,0.0,0.0,a55475b1,a55475b1,0.035458,0.045204,0.04391,0.051177,0.0,0.0,...,0.02256,0.10625,0.016129,0.000333,0.0,0.001325,0.008896,FO,PRIMARY_MOBILE,AL
8,0.0,0.0,a55475b1,a55475b1,0.035458,0.047828,0.077339,0.051177,0.0,0.0,...,0.02256,0.10625,0.016129,0.001601,0.0,0.0,0.023115,FO,PRIMARY_MOBILE,AL
9,0.0,0.0,a55475b1,a55475b1,0.035458,0.028373,0.010243,0.051177,0.0,0.0,...,0.02256,0.10625,0.016129,0.000333,0.0,0.000832,0.023115,FO,PRIMARY_MOBILE,AL


In [32]:
# make sure test subset is the same as train subset

y_pred = lgb_model.predict_proba(test_subset)[:, 1]

In [33]:
# before finding predictions, make sure the test subset is the same as the train subset like the features are exactly the same

array([0.04840184, 0.04620828, 0.05312928, 0.03682798, 0.03731271,
       0.03273324, 0.02801258, 0.02764776, 0.03813599, 0.03013773])

In [31]:
predictions = lgb_model.predict_proba(test_subset)
print(predictions)

[[0.95159816 0.04840184]
 [0.95379172 0.04620828]
 [0.94687072 0.05312928]
 [0.96317202 0.03682798]
 [0.96268729 0.03731271]
 [0.96726676 0.03273324]
 [0.97198742 0.02801258]
 [0.97235224 0.02764776]
 [0.96186401 0.03813599]
 [0.96986227 0.03013773]]
