### Here we use the same data as in modelling.ipynb so we skip over a lot of the EDA and preprocessing.

In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Modelling tools
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, Normalizer, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, log_loss

# Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC

In [55]:
# Load the train and test CSV files from the current working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [56]:
# Derived features, written onto df_train in place:
#   TotIncome        applicant income plus co-applicant income
#   DebtToTotIncome  loan amount relative to the combined income
#   DebtToIncome     loan amount relative to the applicant's own income
# NOTE(review): a zero income in a denominator would yield inf/NaN here —
# confirm the data contains none before relying on these ratios.
df_train["TotIncome"] = df_train["ApplicantIncome"].add(df_train["CoapplicantIncome"])
for ratio_name, income_col in (
    ("DebtToTotIncome", "TotIncome"),
    ("DebtToIncome", "ApplicantIncome"),
):
    df_train[ratio_name] = df_train["LoanAmount"].div(df_train[income_col])

In [57]:
# Two alternative feature sets derived from df_train:
#   "grouped" keeps the combined-income columns and drops the per-applicant ones;
#   "org" keeps the per-applicant columns and drops the combined-income ones.
df_train_grouped = df_train.drop(
    columns=["ApplicantIncome", "CoapplicantIncome", "DebtToIncome"]
)
df_train_org = df_train.drop(columns=["TotIncome", "DebtToTotIncome"])

# Modelling frames: the same two sets with the Loan_ID column removed.
df_train_g = df_train_grouped.drop(columns=["Loan_ID"])
df_train_o = df_train_org.drop(columns=["Loan_ID"])

In [58]:
# Separate the target column from the predictors of the grouped-income frame.
y = df_train_g["Loan_Status"]
X = df_train_g.drop(columns=["Loan_Status"])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Simple Logistic Regression

In [59]:
# Column labels of X grouped by dtype: int64/float64 columns are treated as
# numerical, object columns as categorical. These lists route each column to
# its preprocessing branch.
numerical_features, categorical_features = (
    X.select_dtypes(include=dtype_names).columns
    for dtype_names in (["int64", "float64"], ["object"])
)

In [60]:
# Numerical branch: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
    ])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' makes a category that was not seen
# while fitting encode as all zeros instead of raising an error when the
# pipeline is applied to held-out data.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot',  OneHotEncoder(handle_unknown='ignore'))
    ])

# Route each column group to its branch; columns in neither list are dropped
# (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
])

# Baseline model: preprocessing followed by logistic regression with default settings.
model = Pipeline(steps = [('preprocessor', preprocessor),
                    ('classifier',LogisticRegression())
                  ])

model.fit(X_train, y_train)

# Probability of the second class in model.classes_ for every held-out row,
# plus the corresponding hard class predictions.
y_pred_proba = model.predict_proba(X_test)[:,1]
y_pred = model.predict(X_test)

Model Selection

In [88]:
# Seed shared by every estimator that uses randomness, so the comparison is
# reproducible after a kernel restart. Same value as the train/test split seed.
RANDOM_STATE = 42

# Candidate classifiers compared on the same preprocessing and the same split.
# KNeighborsClassifier has no random component and therefore takes no seed.
classifiers = [
    KNeighborsClassifier(n_neighbors=5),
    SVC(kernel="rbf", C=0.025, probability=True, random_state=RANDOM_STATE),
    NuSVC(probability=True, random_state=RANDOM_STATE),
    DecisionTreeClassifier(random_state=RANDOM_STATE),
    RandomForestClassifier(random_state=RANDOM_STATE),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    GradientBoostingClassifier(random_state=RANDOM_STATE),
    LogisticRegression(random_state=RANDOM_STATE)
    ]

In [89]:
# Fit every candidate classifier behind the shared preprocessing step and
# record its score on the held-out split.
model_results = []

for classifier in classifiers:
    # NOTE: the same `preprocessor` object is passed to every pipeline, so each
    # fit call refits it in place on X_train.
    current_model = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', classifier)])
    current_model.fit(X_train, y_train)
    current_model_score = current_model.score(X_test, y_test)
    current_model_results = {
        # Use the class name directly instead of parsing the estimator's repr.
        "Model": type(classifier).__name__,
        "Score": np.round(current_model_score, 8),
    }
    model_results.append(current_model_results)

# Explicit columns keep the frame well-formed (and sortable by "Score") even
# when the classifier list is empty.
df_models = pd.DataFrame(model_results, columns=["Model", "Score"])

In [90]:
# Show the candidates ranked from highest to lowest score.
df_models.sort_values("Score", ascending=False)

Unnamed: 0,Model,Score
4,RandomForestClassifier,0.804878
5,AdaBoostClassifier,0.804878
6,GradientBoostingClassifier,0.804878
2,NuSVC,0.796748
7,LogisticRegression,0.788618
0,KNeighborsClassifier,0.747967
3,DecisionTreeClassifier,0.691057
1,SVC,0.650406
