In [361]:
if '__IPYTHON__' in globals():
    from IPython import get_ipython
    ipython = get_ipython()
    ipython.magic('load_ext autoreload')
    ipython.magic('autoreload 2')

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_validate, train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score, f1_score

from sklearn import set_config
set_config(display="diagram")

In [362]:
target = "Loan_Status"
random_state= 42
val_size = 0.3

### 1. Import data

In [363]:
df = pd.read_csv("data/train.csv")
df["Loan_Status"] = df["Loan_Status"].map({"Y":1, "N":0})

X_train, y_train = (
    df.drop(target, axis=1),
    df.pop(target),
)

X_test = pd.read_csv("data/test.csv")

### 2. Prepare data

In [364]:
def discretize_feature_bins(X, bins=[0,1], labels=None):
    X = pd.cut(X.flatten(), bins, labels=labels)
    return np.array(X).reshape(-1, 1)

def log(X):
    return np.log(X)

### 3. Create the Pipeline

In [365]:
features = make_column_transformer(
    (
        make_pipeline(
            SimpleImputer(strategy="most_frequent"),
            OneHotEncoder(handle_unknown="ignore", sparse=False),
        ),
        ["Gender", "Married", "Self_Employed", "Credit_History"],
    ),
    (
        make_pipeline(
            SimpleImputer(strategy="most_frequent"),
            OrdinalEncoder(),
        ),
        ["Dependents"],
    ),
    (
        make_pipeline(SimpleImputer(strategy="median"), FunctionTransformer(log)),
        ["LoanAmount"],
    ),
    (
        SimpleImputer(strategy="most_frequent"),
        ["Loan_Amount_Term"],
    ),
)


### 4. Create model

In [366]:
# The model
model = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=1, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

### 5. Validation

In [367]:
# Transform X_train and X_val
X_train_features = features.fit_transform(X_train)

# Metrics to evaluate the model
scoring = ("accuracy", "roc_auc", "f1")

# Cross validation
scores = cross_validate(
    model, X_train_features, y_train, cv=15, scoring=scoring, n_jobs=-1
)

# Mean value of the metrics
for k, v in scores.items():
    print(f"{k}: {v.mean():0.4f}")

fit_time: 0.0018
score_time: 0.0022
test_accuracy: 0.8095
test_roc_auc: 0.7189
test_f1: 0.8769


### 6. Make predictions and submit

In [368]:
X_test_features = features.transform(X_test)

model.fit(X_train_features, y_train)
y_test_pred = model.predict(X_test_features)

submission = pd.read_csv("data/sample_submission.csv")
submission['Loan_Status']= y_test_pred
submission['Loan_Status'] = submission['Loan_Status'].map({1:"Y", 0:"N"})
submission.to_csv(f"{model.__class__.__name__}.csv", index=False)