In [171]:
# Enable autoreload when running under IPython/Jupyter so that edits to
# imported project modules are picked up without restarting the kernel.
#
# IPython publishes `__IPYTHON__` through `builtins`, not the user namespace,
# so a `'__IPYTHON__' in globals()` test never fires; ask IPython for the
# active shell instead.
try:
    from IPython import get_ipython
except ImportError:  # plain Python interpreter without IPython installed
    ipython = None
else:
    ipython = get_ipython()  # None when not running inside an IPython session

if ipython is not None:
    # `InteractiveShell.magic()` is deprecated; `run_line_magic()` is the
    # supported way to invoke a line magic programmatically.
    ipython.run_line_magic("load_ext", "autoreload")
    ipython.run_line_magic("autoreload", "2")

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from sklearn import set_config
# Render scikit-learn estimators as diagrams when they are displayed.
set_config(display="diagram")

In [172]:
# Notebook-wide settings, kept in one place so they are easy to tune.
target = "Loan_Status"  # label column, split off from the features below
random_state = 42       # seed passed to the train/test split and the model
test_size = 0.3         # `test_size` argument of the train/test split

### 1. Import data

In [173]:
# Load the raw training data (path is relative to the working directory).
df = pd.read_csv("data/train.csv")

# Encode the target column: "Y" -> 1, "N" -> 0.
encoded_target = df[target].map({"Y": 1, "N": 0})

# `.map` silently turns every value outside the mapping into NaN; fail here
# with a clear message instead of much later inside the split or the model.
if encoded_target.isna().any():
    raise ValueError(
        f"Column {target!r} contains missing values or labels other than 'Y'/'N'."
    )
df[target] = encoded_target

# Features: every column except the target.
# `y` is kept as a one-column DataFrame (double brackets), not a Series.
X, y = (
    df.drop(target, axis=1),
    df[[target]],
)


In [174]:
# Inspect the feature columns that remain after dropping the target.
X.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')

In [175]:
# Hold-out split; `stratify=y` asks scikit-learn to stratify the split on the
# target, and `random_state` makes it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
    stratify=y,
)

### 2. Prepare data

In [176]:
def discretize_feature_bins(X, bins=[0,1], labels=None):
    """Bin a single feature with ``pandas.cut`` and return it as a column.

    Parameters
    ----------
    X : array-like
        Values of one feature. Any shape is accepted (e.g. ``(n,)`` or
        ``(n, 1)``) and any array-like type: NumPy array, pandas
        DataFrame/Series, or a plain list. The input is flattened before
        binning.
    bins : int, sequence of scalars, or IntervalIndex
        Forwarded unchanged to ``pandas.cut``. The default list is never
        mutated by this function.
    labels : array-like, False, or None
        Forwarded unchanged to ``pandas.cut``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_values, 1)`` holding the bin assigned to each
        value.
    """
    # np.asarray(...).ravel() instead of X.flatten(): `.flatten()` only exists
    # on NumPy arrays, so DataFrame/Series/list inputs used to raise
    # AttributeError.
    values = np.asarray(X).ravel()
    binned = pd.cut(values, bins, labels=labels)
    return np.array(binned).reshape(-1, 1)

### 3. Create the Pipeline

In [178]:
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name. Pick whichever keyword the
# installed version understands so the encoder keeps returning dense arrays.
_onehot_dense_kwarg = (
    "sparse_output" if "sparse_output" in OneHotEncoder().get_params() else "sparse"
)

# Column-wise preprocessing: fill in missing values and encode categoricals.
# Columns not listed below are dropped (the column transformer's default
# `remainder="drop"`).
imputer = make_column_transformer(
    (
        # Fill gaps with the most frequent value, then one-hot encode.
        # Categories unseen during fit are encoded as all zeros.
        make_pipeline(
            SimpleImputer(strategy="most_frequent"),
            OneHotEncoder(handle_unknown="ignore", **{_onehot_dense_kwarg: False}),
        ),
        ["Gender", "Married", "Self_Employed", "Credit_History"],
    ),
    (
        # Fill gaps with the most frequent value, then ordinal-encode.
        make_pipeline(
            SimpleImputer(strategy="most_frequent"),
            OrdinalEncoder(),
        ),
        ["Dependents"],
    ),
    (
        # Fill gaps with the column mean.
        SimpleImputer(strategy="mean"),
        ["LoanAmount"],
    ),
    (
        # Fill gaps with the column median.
        SimpleImputer(strategy="median"),
        ["Loan_Amount_Term"],
    ),
)

### 4. Create and evaluate a model

In [190]:
# Metrics reported by the cross-validation below.
scoring = ("accuracy", "roc_auc", "f1", "neg_brier_score")

# Classifier: a small random forest, seeded for reproducibility.
model = RandomForestClassifier(
    n_estimators=20,
    min_samples_split=20,
    random_state=random_state,
)

# Full pipeline: preprocessing followed by the classifier, so the
# preprocessing is re-fitted on the training part of every CV fold.
pipeline = make_pipeline(imputer, model)

# 10-fold cross-validation on the training split. `y_train` is a one-column
# DataFrame, so it is flattened to the 1-D array scikit-learn expects.
scores = cross_validate(
    pipeline,
    X_train,
    np.asarray(y_train).ravel(),
    scoring=scoring,
    cv=10,
    n_jobs=-1,
)

# Report the mean of each timing and metric across the folds.
for metric_name, fold_values in scores.items():
    print(f"{metric_name}: {fold_values.mean()}")

fit_time: 0.03976154327392578
score_time: 0.023997020721435548
test_accuracy: 0.7900332225913622
test_roc_auc: 0.7149336870026526
test_f1: 0.8627916643643158
test_neg_brier_score: -0.16948974082655457
