In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import numpy as np


# Load the raw loan records from the CSV sitting next to the notebook.
loan_data = pd.read_csv('Loan_default.csv')

# Text-valued columns that have to be turned into integer codes before modelling.
categorical_cols = [
    'Education',
    'EmploymentType',
    'MaritalStatus',
    'HasMortgage',
    'HasDependents',
    'LoanPurpose',
    'HasCoSigner',
]

# One encoder per column, kept by column name so the same mapping can be reused
# later (the interactive prediction helper looks encoders up in this dict).
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col in categorical_cols:
    loan_data[col] = label_encoders[col].fit_transform(loan_data[col])

# Target is the 'Default' column; the features are every remaining column except
# the 'LoanID' column.
y = loan_data['Default']
X = loan_data.drop(columns=['LoanID', 'Default'])

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the random forest on the training portion (fit returns the estimator itself).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)



In [None]:

def predict_loan_default():
    """Prompt for one applicant's loan details and print the model's default prediction.

    Numeric answers are parsed with ``int``/``float``; categorical answers are
    encoded with the fitted encoders stored in the module-level
    ``label_encoders`` dict. The answers are collected in the order they are
    asked and passed positionally to the module-level ``model``, so that order
    must match the column order the model was trained on.

    An invalid answer (text that is not a number, or a category the encoder was
    not fitted on) does not abort the form: the problem is reported and the
    same question is asked again.

    Returns:
        None. The verdict is printed to stdout.
    """

    def _ask_number(prompt, cast):
        # Re-ask until the answer parses with `cast` (int or float).
        while True:
            answer = input(prompt).strip()
            try:
                return cast(answer)
            except ValueError:
                print(f"'{answer}' is not a valid number, please try again.")

    def _ask_category(column, prompt):
        # Re-ask until the answer is a label the fitted encoder recognises.
        # LabelEncoder.transform raises ValueError for labels it was not fitted on.
        encoder = label_encoders[column]
        while True:
            answer = input(prompt).strip()
            try:
                return encoder.transform([answer])[0]
            except ValueError:
                valid = ", ".join(map(str, encoder.classes_))
                print(f"'{answer}' is not a recognised value. Valid options: {valid}")

    print("Enter your loan details:")

    # List elements are evaluated left to right, so the questions are asked in
    # exactly this order and the answers end up in feature order.
    answers = [
        _ask_number("Age: ", int),
        _ask_number("Annual Income: ", int),
        _ask_number("Loan Amount: ", int),
        _ask_number("Credit Score: ", int),
        _ask_number("Months Employed: ", int),
        _ask_number("Number of Credit Lines: ", int),
        _ask_number("Interest Rate: ", float),
        _ask_number("Loan Term (in months): ", int),
        _ask_number("Debt-to-Income Ratio: ", float),
        _ask_category('Education', "Education (Bachelor's/Master's/High School): "),
        _ask_category('EmploymentType', "Employment Type (Full-time/Unemployed): "),
        _ask_category('MaritalStatus', "Marital Status (Married/Divorced): "),
        _ask_category('HasMortgage', "Has Mortgage (Yes/No): "),
        _ask_category('HasDependents', "Has Dependents (Yes/No): "),
        _ask_category('LoanPurpose', "Loan Purpose (Auto/Business/Other): "),
        _ask_category('HasCoSigner', "Has Co-Signer (Yes/No): "),
    ]

    # A model fitted on a DataFrame remembers its column names; predicting on a
    # bare array then triggers a feature-name warning. Reuse the stored names
    # when they are available and line up with the answers, otherwise fall back
    # to the plain array the model would have received before.
    feature_names = getattr(model, "feature_names_in_", None)
    if feature_names is not None and len(feature_names) == len(answers):
        user_input = pd.DataFrame([answers], columns=list(feature_names))
    else:
        user_input = np.array([answers])

    prediction = model.predict(user_input)
    if prediction[0] == 1:
        print("Prediction: The applicant is likely to default.")
    else:
        print("Prediction: The applicant is not likely to default.")


# Run the interactive form once; the call blocks on input() until every field is entered.
predict_loan_default()


Enter your loan details:
Age: 56
Annual Income: 85994
Loan Amount: 50587
Credit Score: 520
Months Employed: 80
Number of Credit Lines: 4
Interest Rate: 15.23
Loan Term (in months): 36
Debt-to-Income Ratio: 0.44
Education (Bachelor's/Master's/High School): Bachelor's
Employment Type (Full-time/Unemployed): Full-time
Marital Status (Married/Divorced): Divorced
Has Mortgage (Yes/No): Yes
Has Dependents (Yes/No): Yes
Loan Purpose (Auto/Business/Other): Other
Has Co-Signer (Yes/No): Yes
Prediction: The applicant is not likely to default.




In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict the held-out rows with the fitted classifier.
y_pred = model.predict(X_test)

# Compute all three summaries first, then print them in a fixed order.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
class_report = classification_report(y_true=y_test, y_pred=y_pred)

# Accuracy is shown as a percentage with two decimals.
print(f"Accuracy: {accuracy * 100:.2f}%")

print("Confusion Matrix:")
print(conf_matrix)

print("Classification Report:")
print(class_report)


Accuracy: 88.67%
Confusion Matrix:
[[45000   170]
 [ 5617   283]]
Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     45170
           1       0.62      0.05      0.09      5900

    accuracy                           0.89     51070
   macro avg       0.76      0.52      0.51     51070
weighted avg       0.86      0.89      0.84     51070



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder


# Start again from the raw CSV so this cell does not depend on earlier mutations.
loan_data = pd.read_csv('Loan_default.csv')

# Text-valued columns that are replaced by integer codes.
categorical_cols = [
    'Education',
    'EmploymentType',
    'MaritalStatus',
    'HasMortgage',
    'HasDependents',
    'LoanPurpose',
    'HasCoSigner',
]

# A single encoder object is re-fitted for every column, so after the loop it
# only remembers the classes of the last column processed.
label_enc = LabelEncoder()
for col in categorical_cols:
    loan_data[col] = label_enc.fit_transform(loan_data[col])

# Target column, then the feature matrix without the 'LoanID' and 'Default' columns.
y = loan_data['Default']
X = loan_data.drop(columns=['LoanID', 'Default'])

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Random forest baseline; predictions on the held-out split are kept for scoring.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred_rf = model.predict(X_test)


# Single decision tree with default settings, for comparison.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)
y_pred_dt = decision_tree.predict(X_test)


# Gradient-boosted trees. `use_label_encoder` is deliberately not passed: the
# installed XGBoost ignores it and only emits a
# 'Parameters: { "use_label_encoder" } are not used.' warning.
xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

# Overall accuracy of each model on the held-out split.
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
accuracy_dt = accuracy_score(y_true=y_test, y_pred=y_pred_dt)
accuracy_xgb = accuracy_score(y_true=y_test, y_pred=y_pred_xgb)

# Confusion matrices for the same predictions.
conf_matrix_rf = confusion_matrix(y_true=y_test, y_pred=y_pred_rf)
conf_matrix_dt = confusion_matrix(y_true=y_test, y_pred=y_pred_dt)
conf_matrix_xgb = confusion_matrix(y_true=y_test, y_pred=y_pred_xgb)

# Per-class precision / recall / F1 text reports.
class_report_rf = classification_report(y_true=y_test, y_pred=y_pred_rf)
class_report_dt = classification_report(y_true=y_test, y_pred=y_pred_dt)
class_report_xgb = classification_report(y_true=y_test, y_pred=y_pred_xgb)


# --- Random forest ---
print(f"Random Forest Accuracy: {accuracy_rf * 100:.2f}%")
print(f"Confusion Matrix:\n{conf_matrix_rf}")
print(f"Classification Report:\n{class_report_rf}")

# --- Decision tree (the leading newline separates it from the previous section) ---
print(f"\nDecision Tree Accuracy: {accuracy_dt * 100:.2f}%")
print(f"Confusion Matrix:\n{conf_matrix_dt}")
print(f"Classification Report:\n{class_report_dt}")

# --- XGBoost ---
print(f"\nXGBoost Accuracy: {accuracy_xgb * 100:.2f}%")
print(f"Confusion Matrix:\n{conf_matrix_xgb}")
print(f"Classification Report:\n{class_report_xgb}")


Parameters: { "use_label_encoder" } are not used.



Random Forest Accuracy: 88.67%
Confusion Matrix:
[[45000   170]
 [ 5617   283]]
Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     45170
           1       0.62      0.05      0.09      5900

    accuracy                           0.89     51070
   macro avg       0.76      0.52      0.51     51070
weighted avg       0.86      0.89      0.84     51070


Decision Tree Accuracy: 80.26%
Confusion Matrix:
[[39623  5547]
 [ 4534  1366]]
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89     45170
           1       0.20      0.23      0.21      5900

    accuracy                           0.80     51070
   macro avg       0.55      0.55      0.55     51070
weighted avg       0.82      0.80      0.81     51070


XGBoost Accuracy: 88.59%
Confusion Matrix:
[[44737   433]
 [ 5396   504]]
Classification Report:
              precision    recall  f1-sco

In [None]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Fit LightGBM on the training split (fit returns the estimator) and predict the
# held-out rows.
lgb_model = lgb.LGBMClassifier(random_state=42).fit(X_train, y_train)
y_pred_lgb = lgb_model.predict(X_test)

# Summaries of the held-out predictions.
accuracy_lgb = accuracy_score(y_true=y_test, y_pred=y_pred_lgb)
conf_matrix_lgb = confusion_matrix(y_true=y_test, y_pred=y_pred_lgb)
class_report_lgb = classification_report(y_true=y_test, y_pred=y_pred_lgb)

# Accuracy as a percentage, followed by the matrix and the per-class report.
print(f"LightGBM Accuracy: {accuracy_lgb * 100:.2f}%")
print(f"Confusion Matrix:\n{conf_matrix_lgb}")
print(f"Classification Report:\n{class_report_lgb}")


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[LightGBM] [Info] Number of positive: 23753, number of negative: 180524
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012761 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1307
[LightGBM] [Info] Number of data points in the train set: 204277, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.116278 -> initscore=-2.028155
[LightGBM] [Info] Start training from score -2.028155
LightGBM Accuracy: 88.76%
Confusion Matrix:
[[44932   238]
 [ 5503   397]]
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.99      0.94     45170
           1       0.63      0.07      0.12      5900

    accuracy                           0.89     51070
   macro avg       0.76      0.53      0.53     51070
weighted avg       0.86      0.89      0.85     51070



In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
from catboost import CatBoostClassifier


# CatBoost with a fixed seed; verbose=0 is passed to keep training output quiet.
catboost_model = CatBoostClassifier(random_state=42, verbose=0)
catboost_model.fit(X_train, y_train)

# Predict the held-out rows and summarise the result.
y_pred_cat = catboost_model.predict(X_test)

accuracy_cat = accuracy_score(y_true=y_test, y_pred=y_pred_cat)
conf_matrix_cat = confusion_matrix(y_true=y_test, y_pred=y_pred_cat)
class_report_cat = classification_report(y_true=y_test, y_pred=y_pred_cat)

# Accuracy as a percentage, followed by the matrix and the per-class report.
print(f"CatBoost Accuracy: {accuracy_cat * 100:.2f}%")
print(f"Confusion Matrix:\n{conf_matrix_cat}")
print(f"Classification Report:\n{class_report_cat}")

CatBoost Accuracy: 88.76%
Confusion Matrix:
[[44861   309]
 [ 5433   467]]
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.99      0.94     45170
           1       0.60      0.08      0.14      5900

    accuracy                           0.89     51070
   macro avg       0.75      0.54      0.54     51070
weighted avg       0.86      0.89      0.85     51070



In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Base learners whose predictions feed the meta-model. `use_label_encoder` is
# deliberately not passed to XGBoost: the installed version ignores it and only
# emits a 'Parameters: { "use_label_encoder" } are not used.' warning, which the
# stacking procedure then repeats for every internal refit of the estimator.
base_estimators = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('xgb', xgb.XGBClassifier(eval_metric='logloss', random_state=42)),
    ('lgb', lgb.LGBMClassifier(random_state=42)),
    ('cat', CatBoostClassifier(random_state=42, verbose=0))
]


# Meta-model that combines the base learners' outputs.
final_estimator = LogisticRegression()

stacking_clf = StackingClassifier(estimators=base_estimators, final_estimator=final_estimator)


# Fits every base learner and the meta-model on the training split.
stacking_clf.fit(X_train, y_train)


y_pred_stack = stacking_clf.predict(X_test)


# Summaries of the held-out predictions.
accuracy_stack = accuracy_score(y_test, y_pred_stack)
conf_matrix_stack = confusion_matrix(y_test, y_pred_stack)
class_report_stack = classification_report(y_test, y_pred_stack)


print(f"Stacking Classifier Accuracy: {accuracy_stack * 100:.2f}%")
print("Confusion Matrix:")
print(conf_matrix_stack)
print("Classification Report:")
print(class_report_stack)


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 23753, number of negative: 180524
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011328 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1307
[LightGBM] [Info] Number of data points in the train set: 204277, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.116278 -> initscore=-2.028155
[LightGBM] [Info] Start training from score -2.028155


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 19002, number of negative: 144419
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008803 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1307
[LightGBM] [Info] Number of data points in the train set: 163421, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.116276 -> initscore=-2.028175
[LightGBM] [Info] Start training from score -2.028175
[LightGBM] [Info] Number of positive: 19002, number of negative: 144419
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009124 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1307
[LightGBM] [Info] Number of data points in the train set: 163421, number of used features: 16
[LightGBM] [In

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier


# Local imports keep this cell runnable on its own; both come from scikit-learn,
# which the notebook already depends on.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# KNN classifies by distance between rows, so a feature measured in large units
# would outweigh one measured in small units (the applicant features appear to
# mix amounts in the tens of thousands with ratios below 1 -- confirm against
# the data). Standardising inside a pipeline puts all features on a comparable
# scale, and the scaler is fitted on the training split only, so no test-set
# statistics leak into the model.
knn_model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))
knn_model.fit(X_train, y_train)

# The pipeline applies the same scaling to the held-out rows before predicting.
knn_predictions = knn_model.predict(X_test)


knn_accuracy = accuracy_score(y_test, knn_predictions)
print(f"KNN Model Accuracy: {knn_accuracy:.2f}")


print("KNN Classification Report:\n", classification_report(y_test, knn_predictions))
print("KNN Confusion Matrix:\n", confusion_matrix(y_test, knn_predictions))


KNN Model Accuracy: 0.87
KNN Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.98      0.93     45170
           1       0.25      0.05      0.08      5900

    accuracy                           0.87     51070
   macro avg       0.57      0.51      0.51     51070
weighted avg       0.81      0.87      0.83     51070

KNN Confusion Matrix:
 [[44305   865]
 [ 5619   281]]
