### Import Module and Dataframe

In [193]:
#Import Modules that will be used

import numpy as np
import pandas as pd

import seaborn as sns
import random

In [151]:
#Load the dataset
og_train_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Train.csv")
og_test_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Test.csv")

In [6]:
from pandas_profiling import ProfileReport
profile = ProfileReport(og_train_df, title='Pandas Profiling Report', explorative=True)

  from pandas_profiling import ProfileReport


In [32]:
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [237]:
train_df = og_train_df.drop("Loan_ID", axis = 1)
test_df = og_test_df.drop("Loan_ID", axis = 1)

In [238]:
#drop the column
train_df = train_df.drop("Months_Since_Deliquency", axis=1)
test_df = test_df.drop("Months_Since_Deliquency", axis=1)

train_df = train_df.drop("Income_Verified", axis=1)
test_df = test_df.drop("Income_Verified", axis=1)

train_df = train_df.drop("Number_Open_Accounts", axis=1)
test_df = test_df.drop("Number_Open_Accounts", axis=1)

### Handling Missing Values, Outlier, and Categorical Data Types

In [239]:
train_df["Loan_Amount_Requested"] = pd.to_numeric(train_df["Loan_Amount_Requested"].str.replace(",", ""))
test_df["Loan_Amount_Requested"] = pd.to_numeric(test_df["Loan_Amount_Requested"].str.replace(",", ""))

In [240]:
train_df.dropna(subset=["Length_Employed"], inplace=True)
test_df.dropna(subset=["Length_Employed"], inplace=True)

length_employed_mapping = {
    '< 1 year': 0, '1 year': 1, '2 years': 2, '3 years': 3, 
    '4 years': 4, '5 years': 5, '6 years': 6, '7 years': 7, 
    '8 years': 8, '9 years': 9, '10+ years': 10
}

train_df["Length_Employed"] = train_df["Length_Employed"].map(length_employed_mapping)
test_df["Length_Employed"] = test_df["Length_Employed"].map(length_employed_mapping)

In [241]:
train_df.dropna(subset=["Annual_Income"], inplace=True)
test_df.dropna(subset=["Annual_Income"], inplace=True)

In [242]:
#find frequency percentage of each category in home owner variable
ho_value_freq = train_df["Home_Owner"].value_counts(normalize=True)

#calculate expected frequency of each category among null value
ho_null_count = train_df["Home_Owner"].isnull().sum()
ho_expected_freq = ho_value_freq * ho_null_count

#fill null value based on expected frequency
ho_fill_values = list(ho_expected_freq.index)
ho_fill_weights = list(ho_expected_freq.values)
ho_fill_choice = random.choices(ho_fill_values, weights=ho_fill_weights, k= ho_null_count)

#fill all the null value with the weighted value
train_df.loc[train_df["Home_Owner"].isnull(), "Home_Owner"] = ho_fill_choice

In [243]:
#find frequency percentage of each category in home owner variable
ho_value_freq = test_df["Home_Owner"].value_counts(normalize=True)

#calculate expected frequency of each category among null value
ho_null_count = test_df["Home_Owner"].isnull().sum()
ho_expected_freq = ho_value_freq * ho_null_count

#fill null value based on expected frequency
ho_fill_values = list(ho_expected_freq.index)
ho_fill_weights = list(ho_expected_freq.values)
ho_fill_choice = random.choices(ho_fill_values, weights=ho_fill_weights, k= ho_null_count)

#fill all the null value with the weighted value
test_df.loc[test_df["Home_Owner"].isnull(), "Home_Owner"] = ho_fill_choice

In [244]:
#join all small frequency category into "other" category
pol_value_freq = train_df["Purpose_Of_Loan"].value_counts(normalize=True)
pol_other_category = pol_value_freq[(pol_value_freq < 0.05)].index.tolist()

train_df["Purpose_Of_Loan"] = train_df["Purpose_Of_Loan"].replace(pol_other_category)

In [245]:
#join all small frequency category into "other" category
pol_value_freq = test_df["Purpose_Of_Loan"].value_counts(normalize=True)
pol_other_category = pol_value_freq[(pol_value_freq < 0.05)].index.tolist()

test_df["Purpose_Of_Loan"] = test_df["Purpose_Of_Loan"].replace(pol_other_category)

In [246]:
#create category mapping
gender_mapping = {"Male": 0, "Female": 1}
train_df["Gender"] = train_df["Gender"].map(gender_mapping)
test_df["Gender"] = test_df["Gender"].map(gender_mapping)

In [247]:
#one hot encoding all categorical variable in dataframe
cat_columns = train_df.select_dtypes(include = "object").columns.tolist()

for col in cat_columns:
    one_hot = pd.get_dummies(train_df[col], prefix=col)
    train_df = train_df.drop(col, axis = 1)
    train_df = train_df.join(one_hot)

    one_hot = pd.get_dummies(test_df[col], prefix=col)
    test_df = test_df.drop(col, axis = 1)
    test_df = test_df.join(one_hot)

### Data Balancing and Scaling

In [248]:
from sklearn.preprocessing import RobustScaler, StandardScaler
from imblearn.over_sampling import SMOTE

In [249]:
#create X and target dataframe
X_train = train_df.drop("Interest_Rate", axis = 1)
y_train = train_df["Interest_Rate"]

X_test = test_df

In [250]:
# Perform oversampling using SMOTE
oversampler = SMOTE()
X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

In [251]:
#Scale the X dataframe

columns_to_scale = ['Loan_Amount_Requested', 'Length_Employed', 'Annual_Income', 'Debt_To_Income', 'Inquiries_Last_6Mo', 'Total_Accounts']
scaler = StandardScaler()

# Fit and transform the selected columns
X_resampled[columns_to_scale] = scaler.fit_transform(X_resampled[columns_to_scale])
X_scaled = X_resampled.copy()
# X_test[columns_to_scale] = scaler.fit_transform(X_test[columns_to_scale])

### Classification Model

In [252]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict

In [253]:
X = X_scaled
y = y_resampled

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=0.2, 
                                                    random_state=42)

In [254]:
classifiers = [
    LogisticRegression(max_iter = 1000),
    KNeighborsClassifier(),
    DecisionTreeClassifier()
]

for classifier in classifiers:
    classifier_name = classifier.__class__.__name__
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    report = classification_report(y_test, y_pred)
    print(f"{classifier_name} Classification Report:")
    print(report)
    print()

LogisticRegression Classification Report:
              precision    recall  f1-score   support

           1       0.71      0.48      0.57     11603
           2       0.43      0.63      0.51     11242
           3       0.51      0.44      0.47     11462

    accuracy                           0.52     34307
   macro avg       0.55      0.52      0.52     34307
weighted avg       0.55      0.52      0.52     34307


KNeighborsClassifier Classification Report:
              precision    recall  f1-score   support

           1       0.54      0.62      0.58     11603
           2       0.42      0.46      0.44     11242
           3       0.48      0.35      0.41     11462

    accuracy                           0.48     34307
   macro avg       0.48      0.48      0.47     34307
weighted avg       0.48      0.48      0.47     34307


DecisionTreeClassifier Classification Report:
              precision    recall  f1-score   support

           1       0.55      0.56      0.55     1