##### About Dataset

The loan approval dataset is a collection of financial records and associated information used to determine whether individuals or organizations are eligible for a loan from a lending institution. It includes factors such as CIBIL score, income, employment status, loan term, loan amount, asset values, and loan status. The dataset is commonly used in machine learning and data analysis to develop models that predict the likelihood of loan approval from these features.


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the loan approval records from the local CSV file.
df = pd.read_csv("./archive/loan_approval_dataset.csv")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Remove the `loan_id` column in place so it is not carried into the features.
df.drop(columns="loan_id", inplace=True)

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Inspect the column dtypes before encoding.
df.dtypes

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encoder instance used to turn text labels into integer codes.
enc = LabelEncoder()

In [None]:
# Replace every text column of `df` with integer codes.
# Besides `object`, also accept pandas' dedicated string dtype: a text column
# stored that way would otherwise be skipped and stay non-numeric.
for label, content in df.items():
    is_text = (
        pd.api.types.is_object_dtype(content)
        or pd.api.types.is_string_dtype(content)
    )
    if is_text:
        # `enc` is re-fitted for each column, so it only remembers the
        # mapping of the last column it encoded.
        df[label] = enc.fit_transform(content)

In [None]:
# Re-check the column dtypes after encoding.
df.dtypes

In [None]:
# Separate the target column from the remaining feature columns.
# The leading space is part of the column name as it is referenced here.
y = df[" loan_status"]
X = df.drop(columns=" loan_status")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so scores and feature importances are repeatable (the
# train/test split is already seeded), and build the 10,000 trees on all
# available cores instead of one.
model = RandomForestClassifier(n_estimators=10000, random_state=42, n_jobs=-1)

In [None]:
# Train the random forest on the training split.
model.fit(X_train, y_train)

In [None]:
# Mean accuracy of the random forest on the held-out test split.
model.score(X_test, y_test)

In [None]:
# Pair each importance value with the feature name recorded by the fitted model.
feature_importances = pd.Series(
    data=model.feature_importances_,
    index=model.feature_names_in_,
)


In [None]:
import seaborn as sn

In [None]:
# Bar chart with importance on the x axis and feature name on the y axis.
# The trailing semicolon suppresses the notebook's echo of the return value.
sn.barplot(
    y=feature_importances.index,
    x=feature_importances.values,
    width=1,
);

In [None]:
# Predicted labels for the test split, used for the confusion matrix below.
y_preds = model.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of the test-set predictions, annotated with integer counts.
# The trailing semicolon suppresses the notebook's echo of the return value.
sn.heatmap(
    confusion_matrix(y_true=y_test, y_pred=y_preds),
    annot=True,
    fmt="d",
    linewidths=1,
);

In [None]:
# Display the importance values by feature name.
feature_importances

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# k-NN classifies by distance between rows, so features on larger numeric
# scales would dominate the neighbour search. Standardize inside a pipeline so
# the scaler is fitted only on whatever data is passed to `fit`; the pipeline
# exposes the same `fit` / `score` / `predict` methods as the bare classifier.
model = make_pipeline(StandardScaler(), KNeighborsClassifier())

In [None]:
# Train the k-NN model on the training split.
model.fit(X_train, y_train)

In [None]:
# Mean accuracy of the k-NN model on the held-out test split.
model.score(X_test, y_test)

In [None]:
from xgboost import XGBClassifier

# XGBoost classifier with default hyperparameters.
model = XGBClassifier()

In [None]:
# Train the XGBoost classifier on the training split.
model.fit(X_train, y_train)

In [None]:
# Mean accuracy of the XGBoost classifier on the held-out test split.
model.score(X_test, y_test)

In [None]:
import optuna

In [None]:
def objective(trial):
    """Optuna objective: cross-validated accuracy of an ``XGBClassifier``.

    Hyperparameters are sampled from ``trial`` and scored with 5-fold
    cross-validation on the training split only. The held-out test split is
    deliberately not used here, so it stays an unbiased check for the final
    model instead of being tuned against.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Mean accuracy across the cross-validation folds, as a float.
    """
    # Local import keeps this cell runnable without touching other cells.
    from sklearn.model_selection import cross_val_score

    params = {
        # Strictly positive lower bound: a learning rate of 0 would make every
        # boosting round contribute nothing.
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1.0),
        "max_depth": trial.suggest_int("max_depth", 8, 12),
        "max_leaves": trial.suggest_int("max_leaves", 2, 8),
        "num_parallel_tree": trial.suggest_int("num_parallel_tree", 2, 4),
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
    }
    model = XGBClassifier(**params)
    fold_scores = cross_val_score(
        model, X_train, y_train, cv=5, scoring="accuracy"
    )
    return float(fold_scores.mean())


In [None]:
# Study that searches for the hyperparameters maximizing the objective's return value.
study = optuna.create_study(direction="maximize")

In [None]:
# Run 100 trials; n_jobs=-1 lets Optuna run trials in parallel across the available CPUs.
study.optimize(objective, n_trials=100, n_jobs=-1, show_progress_bar=True)

In [None]:
# Show the hyperparameters of the best trial.
study.best_params

In [None]:
# Keep the best trial's hyperparameters for re-training below.
new_params = study.best_params

In [None]:
# Rebuild the XGBoost classifier with the tuned hyperparameters.
model = XGBClassifier(**new_params)

In [None]:
# Train the tuned classifier on the training split.
model.fit(X_train, y_train)

In [None]:
# Mean accuracy of the tuned classifier on the test split.
model.score(X_test, y_test)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Plot how the objective value evolved over the study's trials.
optuna.visualization.plot_optimization_history(study)