In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from datetime import datetime
import re
from sklearn.preprocessing import FunctionTransformer

#Mixed Variables
class TermTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_transformed = X.copy()
        X_transformed['term'] = X_transformed['term'].str.replace(' months', '', regex=False).astype(float)
        return X_transformed

class IssueDTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_transformed = X.copy()
        X_transformed['issue_d'] = pd.to_datetime(X_transformed['issue_d'], format='%b-%Y')
        X_transformed['issue_d'] = X_transformed['issue_d'].dt.strftime('%b')
        return X_transformed

class EmpLengthTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_transformed = X.copy()
        X_transformed['emp_length'] = X_transformed['emp_length'].replace('10+ years', '10')
        X_transformed['emp_length'] = X_transformed['emp_length'].replace('< 1 year', '0')
        X_transformed['emp_length'] = X_transformed['emp_length'].astype(str).apply(lambda x: float(re.findall(r'\d+', x)[0]) if re.findall(r'\d+', x) else (x if x == 'nan' else None))
        X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
        return X_transformed

class EarliestCrLineTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_transformed = X.copy()
        X_transformed['earliest_cr_line'] = pd.to_datetime(
            X_transformed['earliest_cr_line'], format='%b-%Y', errors='coerce'
        ).dt.year
        return X_transformed

#New Feature Extraction
class NewFeatureGenerator(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # X is expected to have 'earliest_cr_line' (numeric year), 'installment', 'annual_inc', 'loan_amnt'
        current_year = 2015

        cr_history = pd.Series(np.nan, index=X.index)
        if 'earliest_cr_line' in X.columns and pd.api.types.is_numeric_dtype(X['earliest_cr_line']):
            cr_history = current_year - X['earliest_cr_line']

        installment_to_income_ratio = (X['installment'] / X['annual_inc'])
        installment_to_income_ratio = installment_to_income_ratio.replace([np.inf, -np.inf], np.nan).fillna(0)

        loan_to_inc_ratio = (X['loan_amnt'] / X['annual_inc'])
        loan_to_inc_ratio = loan_to_inc_ratio.replace([np.inf, -np.inf], np.nan).fillna(0)

        # Return  DF with only the new features
        return pd.DataFrame({
            'cr_history': cr_history,
            'installment_to_income_ratio': installment_to_income_ratio,
            'loan_to_inc_ratio': loan_to_inc_ratio
        }, index=X.index)

class NewFeatureAddingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.new_feature_generator = NewFeatureGenerator()
        self.column_name_map = {
            'earliest_cr_line': 'earliest_cr_line_pipeline__earliest_cr_line',
            'installment': 'installment_passthrough__installment',
            'annual_inc': 'annual_inc_pipeline__annual_inc',
            'loan_amnt': 'loan_amnt_pipeline__loan_amnt'
        }

    def fit(self, X, y=None):
        X_for_generation_fit = pd.DataFrame(index=X.index)
        for gen_col, pipeline_col in self.column_name_map.items():
            if pipeline_col in X.columns:
                X_for_generation_fit[gen_col] = X[pipeline_col]
            else:
                raise ValueError(f"NewFeatureAddingTransformer.fit: Expected column '{pipeline_col}' not found in input X.")

        self.new_feature_generator.fit(X_for_generation_fit, y)
        return self

    def transform(self, X, y=None):
        X_for_generation_transform = pd.DataFrame(index=X.index)
        for gen_col, pipeline_col in self.column_name_map.items():
            if pipeline_col in X.columns:
                X_for_generation_transform[gen_col] = X[pipeline_col]
            else:
                raise ValueError(f"NewFeatureAddingTransformer.transform: Expected column '{pipeline_col}' not found in input X.")

        new_features_df = self.new_feature_generator.transform(X_for_generation_transform)

        # Concatenate new features with the existing DataFrame X (which contains all processed original columns)
        Xt_with_new_features = pd.concat([X, new_features_df], axis=1)
        return Xt_with_new_features

#Outlier
class OutlierCapper(BaseEstimator, TransformerMixin):
    def __init__(self, capping_thresholds=None):
        if capping_thresholds is None:
            self.capping_thresholds = {
                'annual_inc': 260000.0,
                'dti': 100.0,
                'bc_util': 100.0,
                'revol_util': 100.0
            }
        else:
            self.capping_thresholds = capping_thresholds

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_transformed = X.copy()
        for col, threshold in self.capping_thresholds.items():
            if col in X_transformed.columns:
                #winsorization (capping) for upper outliers
                X_transformed[col] = np.where(X_transformed[col] > threshold, threshold, X_transformed[col])
        return X_transformed

#Binning
def bin_pub_rec(X_df):
    series = X_df.iloc[:, 0]
    bins = [-0.1, 0.9, 2.9, 5.9, np.inf] # Bins for 0, 1-2, 3-5, >5
    labels = [0,1,2,3]
    return pd.cut(series, bins=bins, labels=False, right=True, include_lowest=True).to_frame(name=X_df.columns[0])

def bin_emp_length(X_df):
    series = X_df.iloc[:, 0]
    bins = [-0.1, 3, 9, np.inf]
    labels = [0,1,2]
    return pd.cut(series, bins=bins, labels=False, right=True, include_lowest=True).to_frame(name=X_df.columns[0])

def bin_delinq_2yrs(X_df):
    series = X_df.iloc[:, 0]
    bins = [-0.1, 3, 11, 19, np.inf] # Bins for 0-3, 4-11, 12-19, >19
    labels = [0,1,2,3]
    return pd.cut(series, bins=bins, labels=False, right=True, include_lowest=True).to_frame(name=X_df.columns[0])

def bin_fico_range_low(X_df):
    series = X_df.iloc[:, 0]
    bins = [-1,649, 699,749, np.inf]
    labels = [0,1,2,3]
    return pd.cut(series, bins=bins, labels=False, right=True, include_lowest=True).to_frame(name=X_df.columns[0])

#Binarizer
def binarize_revol_util(X_df):
    series = X_df.iloc[:, 0]
    return (series > 75).astype(int).to_frame(name=X_df.columns[0])

def binarize_bc_util(X_df):
    series = X_df.iloc[:, 0]
    return (series > 80).astype(int).to_frame(name=X_df.columns[0])

#Mathematical transformations
def apply_log1p_df(X_df):
    return pd.DataFrame(np.log1p(X_df.iloc[:,0]), index=X_df.index, columns=X_df.columns)

#Categorical Column transformations
def transform_home_ownership(X_df):
    series = X_df.iloc[:, 0]
    valid_ownership = ['MORTGAGE', 'RENT', 'OWN']
    return series.apply(lambda x: x if x in valid_ownership else 'Other').to_frame(name=X_df.columns[0])

def transform_purpose(X_df):
    series = X_df.iloc[:, 0]
    valid_purpose = ['debt_consolidation', 'credit_card']
    return series.apply(lambda x: x if x in valid_purpose else 'Other').to_frame(name=X_df.columns[0])

#cols to drop
class ColumnDroppingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        cols_to_actually_drop = [col for col in self.columns_to_drop if col in X.columns]
        return X.drop(columns=cols_to_actually_drop)

#1.Read Dataset and Load Preprocessors

In [25]:
import pandas as pd
import numpy as np
from sklearn import set_config
set_config(transform_output='pandas')

In [26]:
df=pd.read_csv('df_classification.csv')
df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,...,loan_status,purpose,dti,delinq_2yrs,earliest_cr_line,fico_range_low,pub_rec,revol_util,application_type,bc_util
0,3600.0,36 months,13.99,123.03,C,C4,10+ years,MORTGAGE,55000.0,Not Verified,...,Fully Paid,debt_consolidation,5.91,0.0,Aug-2003,675.0,0.0,29.7,Individual,37.2
1,24700.0,36 months,11.99,820.28,C,C1,10+ years,MORTGAGE,65000.0,Not Verified,...,Fully Paid,small_business,16.06,1.0,Dec-1999,715.0,0.0,19.2,Individual,27.1
2,20000.0,60 months,10.78,432.66,B,B4,10+ years,MORTGAGE,63000.0,Not Verified,...,Fully Paid,home_improvement,10.78,0.0,Aug-2000,695.0,0.0,56.2,Joint App,55.9
3,10400.0,60 months,22.45,289.91,F,F1,3 years,MORTGAGE,104433.0,Source Verified,...,Fully Paid,major_purchase,25.37,1.0,Jun-1998,695.0,0.0,64.5,Individual,77.5
4,11950.0,36 months,13.44,405.18,C,C3,4 years,RENT,34000.0,Source Verified,...,Fully Paid,debt_consolidation,10.2,0.0,Oct-1987,690.0,0.0,68.4,Individual,91.0


In [27]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['loan_status'] = le.fit_transform(df['loan_status'])

In [28]:
X=df.drop(['loan_status','application_type','int_rate','grade','sub_grade'],axis=1)
y=df['loan_status']

In [29]:
import joblib
linear_preprocessor=joblib.load('linear_based_preprocessor.pkl')
tree_preprocessor=joblib.load('tree_based_preprocessor.pkl')

In [30]:
df['loan_status'].value_counts()

Unnamed: 0_level_0,count
loan_status,Unnamed: 1_level_1
1,214873
0,54156


In [31]:
214873/4

53718.25

In [32]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)

In [33]:
class_counts = y_train.value_counts()
print("Class counts in y_train:\n", class_counts)

majority_class = class_counts.idxmax()
minority_class = class_counts.idxmin()

print(f"\nMajority class: {majority_class}")
print(f"Minority class: {minority_class}")

Class counts in y_train:
 loan_status
1    171898
0     43325
Name: count, dtype: int64

Majority class: 1
Minority class: 0


In [34]:
X_train_majority = X_train[y_train == majority_class]
y_train_majority = y_train[y_train == majority_class]

X_train_minority = X_train[y_train == minority_class]
y_train_minority = y_train[y_train == minority_class]

print(f"Shape of X_train_majority: {X_train_majority.shape}")
print(f"Shape of y_train_majority: {y_train_majority.shape}")
print(f"Shape of X_train_minority: {X_train_minority.shape}")
print(f"Shape of y_train_minority: {y_train_minority.shape}")

Shape of X_train_majority: (171898, 16)
Shape of y_train_majority: (171898,)
Shape of X_train_minority: (43325, 16)
Shape of y_train_minority: (43325,)


In [35]:
minority_count = len(y_train_minority)

##1.RandomForest Dataset

In [36]:
# RandomForest Dataset
X_rf_majority_undersampled = X_train_majority.sample(n=minority_count, random_state=42)
y_rf_majority_undersampled = y_train_majority.sample(n=minority_count, random_state=42)

X_rf = pd.concat([X_rf_majority_undersampled, X_train_minority], axis=0)
y_rf = pd.concat([y_rf_majority_undersampled, y_train_minority], axis=0)

print("RandomForest Dataset Class Counts:")
print(y_rf.value_counts())

RandomForest Dataset Class Counts:
loan_status
1    43325
0    43325
Name: count, dtype: int64


##2.XGBoost Dataset

In [37]:
X_xg_majority_undersampled = X_train_majority.sample(n=minority_count, random_state=1)
y_xg_majority_undersampled = y_train_majority.sample(n=minority_count, random_state=1)

X_xg = pd.concat([X_xg_majority_undersampled, X_train_minority], axis=0)
y_xg = pd.concat([y_xg_majority_undersampled, y_train_minority], axis=0)

print("XGBoost Dataset Class Counts:")
print(y_xg.value_counts())

XGBoost Dataset Class Counts:
loan_status
1    43325
0    43325
Name: count, dtype: int64


##3.LightGBM Dataset

In [38]:
X_lgbm_majority_undersampled = X_train_majority.sample(n=minority_count, random_state=2)
y_lgbm_majority_undersampled = y_train_majority.sample(n=minority_count, random_state=2)

X_lgbm = pd.concat([X_lgbm_majority_undersampled, X_train_minority], axis=0)
y_lgbm = pd.concat([y_lgbm_majority_undersampled, y_train_minority], axis=0)

print("LightGBM Dataset Class Counts:")
print(y_lgbm.value_counts())

LightGBM Dataset Class Counts:
loan_status
1    43325
0    43325
Name: count, dtype: int64


##4.GradientBoosting Dataset


In [39]:
X_gb_majority_undersampled = X_train_majority.sample(n=minority_count, random_state=3)
y_gb_majority_undersampled = y_train_majority.sample(n=minority_count, random_state=3)

X_gb = pd.concat([X_gb_majority_undersampled, X_train_minority], axis=0)
y_gb = pd.concat([y_gb_majority_undersampled, y_train_minority], axis=0)

print("GradientBoosting Dataset Class Counts:")
print(y_gb.value_counts())

GradientBoosting Dataset Class Counts:
loan_status
1    43325
0    43325
Name: count, dtype: int64


In [40]:
df_for_meta_test=pd.DataFrame()

#2.Baseline Model Building

##**1.RandomForest**

In [41]:
from sklearn.pipeline import Pipeline

In [42]:
from sklearn.ensemble import RandomForestClassifier
rf_base=RandomForestClassifier()
rf_base

In [43]:
rfb_pipe = Pipeline([
    ('preprocessor', tree_preprocessor),
    ('model', rf_base)]
)

In [44]:
rfb_pipe.fit(X_rf,y_rf)

  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.8s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


In [45]:
y_pred_rfb_proba = rfb_pipe.predict_proba(X_test)


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


In [46]:
y_pred_rfb = (y_pred_rfb_proba[:, 1] > 0.5).astype(int)

In [47]:
from sklearn.metrics import classification_report,roc_auc_score
print(classification_report(y_test, y_pred_rfb))

              precision    recall  f1-score   support

           0       0.31      0.64      0.42     10831
           1       0.88      0.65      0.74     42975

    accuracy                           0.65     53806
   macro avg       0.60      0.65      0.58     53806
weighted avg       0.76      0.65      0.68     53806



In [48]:
roc_auc_score(y_test, y_pred_rfb)

np.float64(0.645364234444589)

In [49]:
df_for_meta_test['RandomForest']=y_pred_rfb_proba[:, 1]

##**2.xgboost**

In [50]:
import xgboost as xgb
xgb_base=xgb.XGBClassifier()
xgb_base

In [51]:
xgb_pipe = Pipeline(
    [('preprocessor', tree_preprocessor),
     ('model', xgb_base)]
)

In [52]:
xgb_pipe.fit(X_xg,y_xg)

  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.8s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


In [53]:
y_pred_xgb_proba=xgb_pipe.predict_proba(X_test)

  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


In [54]:
y_pred_xgb=(y_pred_xgb_proba[:, 1] > 0.5).astype(int)

In [55]:
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.33      0.65      0.44     10831
           1       0.88      0.67      0.76     42975

    accuracy                           0.66     53806
   macro avg       0.61      0.66      0.60     53806
weighted avg       0.77      0.66      0.69     53806



In [56]:
roc_auc_score(y_test, y_pred_xgb)

np.float64(0.6574361625156585)

In [57]:
df_for_meta_test['XGBoost']=y_pred_xgb_proba[:, 1]

##**3.LightGBM**

In [58]:
import lightgbm as lgb
lgb_base = lgb.LGBMClassifier()
lgb_base

In [59]:
lgb_pipe = Pipeline(
    [('preprocessor', tree_preprocessor),
     ('model', lgb_base)]
)

In [60]:
lgb_pipe.fit(X_lgbm, y_lgbm)

  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.8s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s
[LightGBM] [Info] Number of positive: 43325, number of negative: 43325
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006743 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1372
[LightGBM] [Info] Number of data points in the train set: 86650, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [61]:
y_pred_lgb_proba = lgb_pipe.predict_proba(X_test)

  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


In [62]:
y_pred_lgb=(y_pred_lgb_proba[:, 1] > 0.5).astype(int)

In [63]:
print(classification_report(y_test, y_pred_lgb))

              precision    recall  f1-score   support

           0       0.33      0.64      0.44     10831
           1       0.88      0.68      0.77     42975

    accuracy                           0.67     53806
   macro avg       0.61      0.66      0.60     53806
weighted avg       0.77      0.67      0.70     53806



In [64]:
roc_auc_score(y_test, y_pred_lgb)

np.float64(0.6578463236194947)

In [65]:
df_for_meta_test['LightGBM']=y_pred_lgb_proba[:, 1]

##**4.GradientBoosting**

In [66]:
from sklearn.ensemble import GradientBoostingClassifier
gb_base=GradientBoostingClassifier()
gb_base

In [67]:
gb_pipe=Pipeline([
    ('preprocessor',tree_preprocessor),
    ('model',gb_base)
])

In [68]:
gb_pipe.fit(X_gb,y_gb)

  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   0.8s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


In [69]:
y_pred_gb_proba=gb_pipe.predict_proba(X_test)
y_pred_gb=(y_pred_gb_proba[:, 1] > 0.5).astype(int)

  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


In [70]:
print(classification_report(y_test, y_pred_gb))

              precision    recall  f1-score   support

           0       0.33      0.62      0.43     10831
           1       0.88      0.68      0.77     42975

    accuracy                           0.67     53806
   macro avg       0.60      0.65      0.60     53806
weighted avg       0.77      0.67      0.70     53806



In [71]:
roc_auc_score(y_test, y_pred_gb)

np.float64(0.6513345921895166)

In [72]:
df_for_meta_test['GradBoost']=y_pred_gb_proba[:, 1]

#4.OOF Predictions

In [73]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from copy import deepcopy
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

models = {
    'RandomForest': rfb_pipe,
    'XGBoost': xgb_pipe,
    'LightGBM': lgb_pipe,
    'GradBoost': gb_pipe
}

oof_preds = {}
oof_labels = {}

for model_name, pipeline in models.items():
    print(f"Generating OOF predictions for {model_name}...")
    model_oof_preds = []
    model_oof_labels = []

    for fold, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
        #Pipeline Copy to prevent data leakage
        pipeline_clone = deepcopy(pipeline)

        pipeline_clone.fit(X_train_fold, y_train_fold)
        y_pred_proba_fold = pipeline_clone.predict_proba(X_val_fold)[:, 1]

        model_oof_preds.append(y_pred_proba_fold)
        model_oof_labels.append(y_val_fold)

    oof_preds[model_name] = np.concatenate(model_oof_preds)
    oof_labels[model_name] = np.concatenate(model_oof_labels)

print("OOF prediction generation complete.")

Generating OOF predictions for RandomForest...


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   1.5s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   1.5s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   1.4s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   1.6s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   1.4s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


Generating OOF predictions for XGBoost...


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   2.4s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   1.4s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   1.5s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   1.8s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   1.5s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


Generating OOF predictions for LightGBM...


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   1.4s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s
[LightGBM] [Info] Number of positive: 137518, number of negative: 34660
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015224 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1374
[LightGBM] [Info] Number of data points in the train set: 172178, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.798697 -> initscore=1.378169
[LightGBM] [Info] Start training from score 1.378169


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   1.6s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s
[LightGBM] [Info] Number of positive: 137518, number of negative: 34660
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025300 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1371
[LightGBM] [Info] Number of data points in the train set: 172178, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.798697 -> initscore=1.378169
[LightGBM] [Info] Start training from score 1.378169


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   1.4s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s
[LightGBM] [Info] Number of positive: 137518, number of negative: 34660
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015330 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1373
[LightGBM] [Info] Number of data points in the train set: 172178, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.798697 -> initscore=1.378169
[LightGBM] [Info] Start training from score 1.378169


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   1.4s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s
[LightGBM] [Info] Number of positive: 137519, number of negative: 34660
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015127 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1372
[LightGBM] [Info] Number of data points in the train set: 172179, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.798698 -> initscore=1.378176
[LightGBM] [Info] Start training from score 1.378176


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   2.1s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s
[LightGBM] [Info] Number of positive: 137519, number of negative: 34660
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022397 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1374
[LightGBM] [Info] Number of data points in the train set: 172179, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.798698 -> initscore=1.378176
[LightGBM] [Info] Start training from score 1.378176


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


Generating OOF predictions for GradBoost...


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   1.4s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   1.4s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   1.6s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   1.4s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


[Pipeline]  (step 1 of 3) Processing initial_feature_processing, total=   1.4s
[Pipeline]  (step 2 of 3) Processing new_feature_generation, total=   0.0s
[Pipeline] ... (step 3 of 3) Processing column_dropping, total=   0.0s


  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


OOF prediction generation complete.




In [74]:
from sklearn.metrics import roc_auc_score
for model_name in models.keys():
    roc_auc = roc_auc_score(oof_labels[model_name], oof_preds[model_name])
    print(f"OOF ROC AUC for {model_name}: {roc_auc:.4f}")

OOF ROC AUC for RandomForest: 0.6965
OOF ROC AUC for XGBoost: 0.7225
OOF ROC AUC for LightGBM: 0.7242
OOF ROC AUC for GradBoost: 0.7150


In [75]:
oof_preds_df = pd.DataFrame(oof_preds)
oof_preds_df['target']=oof_labels['RandomForest']
oof_preds_df.sample(5)

Unnamed: 0,RandomForest,XGBoost,LightGBM,GradBoost,target
113338,0.79,0.65678,0.698523,0.715689,1
167084,0.76,0.832427,0.7969,0.782593,1
59175,0.73,0.764875,0.679672,0.670816,1
8431,0.75,0.957152,0.948309,0.919487,1
51525,0.34,0.519741,0.473868,0.414409,0


#5.Meta Model

In [76]:
from sklearn.linear_model import LogisticRegression
meta_model=LogisticRegression()
meta_model

In [77]:
meta_model.fit(oof_preds_df.drop('target',axis=1),oof_preds_df['target'])

In [78]:
y_pred_meta_proba = meta_model.predict_proba(df_for_meta_test)

In [79]:
y_pred_meta=(y_pred_meta_proba[:, 1] > 0.435).astype(int)
print(classification_report(y_test, y_pred_meta))

              precision    recall  f1-score   support

           0       0.35      0.61      0.44     10831
           1       0.88      0.71      0.79     42975

    accuracy                           0.69     53806
   macro avg       0.61      0.66      0.62     53806
weighted avg       0.77      0.69      0.72     53806



In [80]:
roc_auc_score(y_test, y_pred_meta)

np.float64(0.661748321252063)

In [81]:
def calculate_ks(y_true, y_proba):
    df = pd.DataFrame({'y_true': y_true, 'y_proba': y_proba})
    df = df.sort_values('y_proba').reset_index(drop=True)
    total_pos = df['y_true'].sum()
    total_neg = len(df) - total_pos
    df['cumulative_pos'] = df['y_true'].cumsum() / total_pos
    df['cumulative_neg'] = (1 - df['y_true']).cumsum() / total_neg
    ks_statistic = np.max(np.abs(df['cumulative_pos'] - df['cumulative_neg']))
    return ks_statistic

ks_value = calculate_ks(y_test, y_pred_meta_proba[:, 1])
print(f"KS Statistic for Meta-Model: {ks_value:.4f}")

KS Statistic for Meta-Model: 0.3255


#6.Inference Pipeline

In [82]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [83]:
class PredictProbaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        proba = self.estimator.predict_proba(X)
        return proba[:, 1].reshape(-1, 1)

print("PredictProbaTransformer class defined.")

PredictProbaTransformer class defined.


In [84]:
from sklearn.pipeline import FeatureUnion
rf_proba_transformer = PredictProbaTransformer(estimator=rfb_pipe)
xgb_proba_transformer = PredictProbaTransformer(estimator=xgb_pipe)
lgb_proba_transformer = PredictProbaTransformer(estimator=lgb_pipe)
gb_proba_transformer = PredictProbaTransformer(estimator=gb_pipe)
feature_union = FeatureUnion([
    ('rf_proba', rf_proba_transformer),
    ('xgb_proba', xgb_proba_transformer),
    ('lgb_proba', lgb_proba_transformer),
    ('gb_proba', gb_proba_transformer)
])
final_inference_pipeline = Pipeline([
    ('base_model_predictions', feature_union),
    ('meta_model', meta_model)
])

print("Final inference pipeline created.")

Final inference pipeline created.


In [85]:
y_pred_final=final_inference_pipeline.predict_proba(X_test)

  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)
  X_transformed['emp_length'] = X_transformed['emp_length'].replace('nan', np.nan).astype(float)


In [86]:
print(f"ROC AUC Score for Final Stacking Model: {roc_auc_score(y_test, y_pred_final[:, 1]):.4f}")

ROC AUC Score for Final Stacking Model: 0.7245


In [111]:
y_pred_final_check=(y_pred_final[:, 1] > 0.45).astype(int)

In [112]:
print("Classification Report for Final Stacking Model:")
print(classification_report(y_test, y_pred_final_check))

Classification Report for Final Stacking Model:
              precision    recall  f1-score   support

           0       0.34      0.63      0.44     10831
           1       0.88      0.70      0.78     42975

    accuracy                           0.68     53806
   macro avg       0.61      0.66      0.61     53806
weighted avg       0.77      0.68      0.71     53806



#7.Saving the pipeline

In [113]:
import joblib
joblib.dump(final_inference_pipeline, 'final_inference_pipeline.pkl')

['final_inference_pipeline.pkl']

In [114]:
from sklearn import set_config

# Set the display configuration to 'diagram'
set_config(display='diagram')

# Display the pipeline diagram
final_inference_pipeline