# Perform exploratory data analysis (EDA)


In [None]:
import pandas as pd
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier  # K-Nearest Neighbors
from sklearn.tree import DecisionTreeClassifier  # Decision Trees
from sklearn.ensemble import RandomForestClassifier  # Random Forests
from sklearn.linear_model import LogisticRegression  # Logistic Regression

# Lift the column display limit so wide DataFrames are shown without truncation.
pd.set_option('display.max_columns', None)

In [None]:
# Load the raw dataset from a CSV file in the current working directory.
df_initial = pd.read_csv('./Loan_Default.csv')

# Preview the first rows of the loaded frame.
df_initial.head()

In [None]:
# Summary statistics of the raw data.
df_initial.describe()

In [None]:
# Column dtypes and non-null counts of the raw data.
df_initial.info()

# Handle missing values and perform any necessary data preprocessing.

## Handle null values

In [None]:
# Number of missing values in each column.
df_initial.isna().sum()

## Visualize Null data

In [None]:
# Bar chart of the missing-value count of every column.
null_counts = df_initial.isna().sum()
null_counts.plot(kind="bar")
plt.show()

## Remove the ID and year columns

In [None]:
# Discard the ID and year columns, then preview what is left.
df_initial = df_initial.drop(columns=["ID", "year"])
df_initial.head()

In [None]:
# Total number of distinct values summed over all columns.
# `sum` has to be called; without the parentheses the cell only shows the
# bound method object instead of the number.
df_initial.nunique().sum()

In [None]:
# Partition the column names by dtype: "object" columns are treated as
# categorical, every other column as numerical.
cate_vars = [col for col in df_initial.columns if df_initial[col].dtype == "object"]
num_vars = [col for col in df_initial.columns if df_initial[col].dtype != "object"]

print("Categorical Variables:\n",cate_vars,"\n")
print("Numerical Variables:\n",num_vars,"\n")

## Imputing numerical vars

In [None]:
# Fill missing numerical values with a KNN imputer using 3 neighbours.
# NOTE(review): num_vars holds every non-object column, so a numeric target
# column would be imputed together with the features - confirm this is intended.
df_num = df_initial[num_vars]
knn = KNNImputer(n_neighbors=3)
# fit_transform() fits the imputer itself, so no separate fit() call is needed.
X = knn.fit_transform(df_num)

In [None]:
# The imputer returns a bare array, so rebuild the DataFrame with the numerical
# column names.  The row index of df_initial is carried over explicitly so the
# frame stays aligned row-for-row with the categorical columns when both are
# concatenated on axis=1, even if df_initial does not have a default RangeIndex.
df_num = pd.DataFrame(X, columns=num_vars, index=df_initial.index)
df_num.isna().sum()

## Imputing categorical vars

In [None]:
# Take an explicit copy of the categorical columns of df_initial (the frame
# loaded above) so the fills below modify df_cat itself rather than a temporary
# slice of the source frame.
df_cat = df_initial[cate_vars].copy()
for i in cate_vars:
    # mode() returns a Series (several values can tie); use the first one.
    mode = df_cat[i].mode()
    if mode.empty:
        # Column holds no non-null value at all, so there is nothing to fill with.
        continue
    mode = mode[0]
    # Assign the result back instead of fillna(inplace=True) on a column
    # selection, which is chained assignment and may not update df_cat.
    df_cat[i] = df_cat[i].fillna(mode)

df_cat.isna().sum()

In [None]:
# Put the imputed numerical and categorical columns side by side, keeping only
# the rows present in both frames, then re-check for remaining nulls.
imputed_parts = [df_num, df_cat]
df_full = pd.concat(imputed_parts, axis="columns", join="inner")
df_full.isna().sum()

In [None]:
# Box plots drawn as separate subplots on a 7x2 grid, used to eyeball outliers.
# The trailing semicolon suppresses the returned axes repr in the notebook.
df_full.plot(kind="box",subplots=True,layout=(7,2),figsize=(20,20));

## Drop the outliers

In [None]:
def drop_outliers(df, field_name):
    """Drop, in place, the rows of ``df`` whose ``field_name`` is an IQR outlier.

    A value counts as an outlier when it lies more than 1.5 * IQR above the
    75th percentile or more than 1.5 * IQR below the 25th percentile of the
    column.

    Args:
        df: DataFrame to filter. It is modified in place.
        field_name: Name of the numeric column to screen.

    Returns:
        None. Prints "No outliers to drop." when no row is removed.
    """
    values = df[field_name]
    if values.empty:
        # No rows at all: percentiles are undefined and nothing can be dropped.
        print("No outliers to drop.")
        return

    # Compute both quartiles in a single call instead of once per use.
    q1, q3 = np.percentile(values, [25, 75])
    margin = 1.5 * (q3 - q1)
    lower_bound = q1 - margin
    upper_bound = q3 + margin

    outlier_mask = (values > upper_bound) | (values < lower_bound)
    # `.any()` answers "is at least one row flagged?"; `.empty` on the mask
    # would only be true for a frame without rows.
    if not outlier_mask.any():
        print("No outliers to drop.")
        return

    df.drop(values[outlier_mask].index, inplace=True)

# Columns screened with the 1.5 * IQR rule, in the order they are filtered
# (each pass changes the percentiles seen by the next one).
#
# "Status" is deliberately not in this list: it is the target column used as y
# in the train/test split and is presumably binary (binary-average precision,
# recall and F1 are computed on it later).  On a binary column where one class
# covers at least 75% of the rows the IQR is 0, so the rule would delete every
# row of the other class.
#
# NOTE(review): the same zero-IQR effect applies to any column dominated by a
# single value (possibly "term") - confirm that filtering it is intended.
outlier_columns = [
    "loan_amount",
    "rate_of_interest",
    "Interest_rate_spread",
    "Upfront_charges",
    "property_value",
    "income",
    "LTV",
    "term",
    "dtir1",
]
for column in outlier_columns:
    drop_outliers(df_full, column)

## Encoding Categorical Variables

In [None]:
# Total number of distinct categories over all categorical columns.
# The column list is named `cate_vars` (there is no `cat_vars`), and `sum`
# must be called to get the number rather than the method object.
print(df_full[cate_vars].nunique().sum())

In [None]:
# Replace every categorical column with integer codes.
# The list of categorical columns is `cate_vars`; `cat_vars` is not defined.
# The encoder is re-fitted on each column, so only the mapping of the last
# column remains available on `label` afterwards.
label = LabelEncoder()
for i in cate_vars:
    df_full[i] = label.fit_transform(df_full[i])

In [None]:
# Column dtypes and non-null counts after imputation, outlier removal and encoding.
df_full.info()

# Train test split

In [None]:
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
train_set, test_set = train_test_split(df_full, test_size=0.2, random_state=42)

# Separate the 'Status' target from the predictors in each partition.
X_train, y_train = train_set.drop(columns=['Status']), train_set['Status']
X_test, y_test = test_set.drop(columns=['Status']), test_set['Status']

## Build Classification Model using Random Forest, KNN, Decision Tree, Logistic Regression

In [None]:
# One estimator per algorithm being compared.  The randomised estimators are
# seeded with the same value as the train/test split so repeated runs of the
# notebook give the same scores.
# NOTE(review): the features are passed unscaled; KNN and logistic regression
# are scale-sensitive, so consider standardising them first - confirm.
clf = RandomForestClassifier(random_state=42)  # Random Forest
knn = KNeighborsClassifier()  # K-Nearest Neighbors
dct = DecisionTreeClassifier(random_state=42)  # Decision Trees
lr = LogisticRegression()  # Logistic Regression

In [None]:
# Random Forest: train, predict on the hold-out set and report the scores.
y_pred_clf = clf.fit(X_train, y_train).predict(X_test)
accuracy_clf = accuracy_score(y_test, y_pred_clf)
print('Accuracy :',accuracy_clf)
print("\n")
report_clf = classification_report(y_test, y_pred_clf)
print(report_clf)

In [None]:
# K-Nearest Neighbors: train, predict on the hold-out set and report the scores.
y_pred_knn = knn.fit(X_train, y_train).predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print('Accuracy :',accuracy_knn)
print("\n")
report_knn = classification_report(y_test, y_pred_knn)
print(report_knn)

In [None]:
# Decision Tree: train, predict on the hold-out set and report the scores.
y_pred_dct = dct.fit(X_train, y_train).predict(X_test)
accuracy_dct = accuracy_score(y_test, y_pred_dct)
print('Accuracy :',accuracy_dct)
print("\n")
report_dct = classification_report(y_test, y_pred_dct)
print(report_dct)

In [None]:
# Logistic Regression: train, predict on the hold-out set and report the scores.
y_pred_lr = lr.fit(X_train, y_train).predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print('Accuracy :',accuracy_lr)
print("\n")
report_lr = classification_report(y_test, y_pred_lr)
print(report_lr)

## Scores Compilation and Summary result

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fitted estimators paired with the label shown for each in the summary table.
model_names = ["RandomForest","K-Nearest Neighbors","Decision Trees","Logistic Regression"]
trained_models = [clf, knn, dct, lr]
model_list = list(zip(model_names, trained_models))

# One prediction pass over the test set per model, shared by all four metrics.
test_predictions = [model.predict(X_test) for _, model in model_list]

accuracy_scores = [accuracy_score(y_test, y_pred) for y_pred in test_predictions]
precision_scores = [precision_score(y_test, y_pred) for y_pred in test_predictions]
recall_scores = [recall_score(y_test, y_pred) for y_pred in test_predictions]
f1_scores = [f1_score(y_test, y_pred) for y_pred in test_predictions]

# One row per model, one column per metric.
summary_columns = {
    'Model': model_names,
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
    'Recall': recall_scores,
    'F1-Score': f1_scores,
}
metrics_df = pd.DataFrame(summary_columns)

metrics_df