### Load packages

In [1]:
import warnings
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt

# Sklearn
from sklearn.model_selection import (
    train_test_split,
    RandomizedSearchCV,
    GridSearchCV,
    RepeatedStratifiedKFold,
    cross_validate
)

from sklearn.feature_selection import (
    VarianceThreshold,
    SelectPercentile
)

# Assemble pipeline(s)
from sklearn import set_config
from sklearn.pipeline import make_pipeline, Pipeline
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.compose import (
    ColumnTransformer,
    make_column_selector
)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler,
    StandardScaler
)

# Handle constant/duplicates and missing features/columns
from feature_engine.selection import (
    DropFeatures,
    DropConstantFeatures,
    DropDuplicateFeatures
)

# Sampling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Models
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier,
    VotingClassifier
)
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score
)
from sklearn.inspection import permutation_importance
from scipy.stats import loguniform

set_config(display="diagram")  # make pipeline visible

warnings.filterwarnings("ignore")

### Load data

In [2]:
df_loan_approval = pd.read_csv(
    "https://raw.githubusercontent.com/prasertcbs/basic-dataset/master/Loan-Approval-Prediction.csv"
)

df_loan_approval.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
df_loan_approval.info()  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


### Split train and test sets

In [4]:
X = df_loan_approval.drop(columns=["Loan_Status"])
y = df_loan_approval[["Loan_Status"]]


X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42
)

### Fit model pipeline

In [5]:
# Base ensembled classifier
ensembled_classifier = VotingClassifier(
    estimators=[
        ("LR", LogisticRegression()),
        ("RFC", RandomForestClassifier()),
        ("XGB", XGBClassifier())
    ],
    voting="soft"
)

# Hyperparameter Tuning
params = {
    'LR__solver': ['newton-cg', 'lbfgs', 'liblinear', 'saga'],
    'LR__penalty': ['l1', 'l2', 'elasticnet'],
    'LR__C': np.logspace(-4, 4, 20),
    'LR__max_iter': [100, 1000, 2500, 5000, 10000]
}


# Build final pipeline
build_pipeline = imbPipeline(
    steps=[
        # STEP 1: Drop irrelevant columns/features
        # -->1.1: Drop primary key identifier columns
        (
            "Drop Columns",
            DropFeatures(
                features_to_drop=[
                    "Loan_ID"
                ]
            )
        ),

        # -->1.2: Drop constant features
        (
            "Drop Constant Values",
            DropConstantFeatures(missing_values="ignore")
        ),

        # -->1.3: Drop duplicate features if they show the same values for every observation
        (
            "Drop Duplicates",
            DropDuplicateFeatures(missing_values="ignore")
        ),

        # STEP 2: Imputation & Scaling
        (
            "Data Cleaning",
            ColumnTransformer(
                [
                    # -->2.1: For numerical features
                    (
                        "Numerical Features",
                        make_pipeline(
                            SimpleImputer(strategy="median"),
                            MinMaxScaler()
                        ),
                        make_column_selector(
                            dtype_include=["int64", "float64"]
                        )
                    ),
                    # -->2.2: For categorical features
                    (
                        "Categorical Features",
                        make_pipeline(
                            SimpleImputer(strategy="most_frequent"),
                            OneHotEncoder(
                                drop="first"
                            )
                        ),
                        make_column_selector(dtype_include=["object"])
                    )
                ]
            )
        ),

        # STEP 3: Handling class imbalance using SMOTE
        (
            "Re-sampling",
            SMOTE()
        ),

        ("Variance Threshold", VarianceThreshold()),

        ("Select Percentile", SelectPercentile()),

        # STEP 6: Logistic Regresion model
        (
            "LR",
            LogisticRegression()
        )
    ]
)


rsf = RepeatedStratifiedKFold()

build_pipeline = RandomizedSearchCV(
    estimator=build_pipeline,
    param_distributions=params,
    scoring='roc_auc',
    verbose=1,
    cv=rsf
)

build_pipeline.fit(X_train, y_train)

Fitting 50 folds for each of 10 candidates, totalling 500 fits


### Metrics evaluation

In [6]:
actual_values = [
    1 if i[0]=="Y" else 0 for i in y_test.values
]

pred_values = [
    1 if i[0]=="Y" else 0 for i in build_pipeline.predict(X_test)
]

metrics = pd.DataFrame(
    [
        accuracy_score(actual_values, pred_values),
        precision_score(actual_values, pred_values),
        recall_score(actual_values, pred_values),
        f1_score(actual_values, pred_values),
        roc_auc_score(actual_values, pred_values)
    ], 
    index=["Accuracy", "Precision", "Recall", "F1", "ROC_AUC"]
).T

metrics

Unnamed: 0,Accuracy,Precision,Recall,F1,ROC_AUC
0,0.788618,0.759615,0.9875,0.858696,0.703052


### Pickling the model file for deployment

In [7]:
pickle.dump(
    build_pipeline,
    open(
        "build_pipeline.pkl",
        "wb"
    )
)

# pickled_model = pickle.load(
#     open(
#         "build_pipeline.pkl",
#         "rb"
#     )
# )

# pickled_model.predict(X_test)