Import data from the database

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from xgboost import XGBClassifier
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Mount Google Drive at /content/drive so the database file referenced in the
# next cell can be read by path. NOTE(review): `google.colab` is presumably
# only importable inside a Colab runtime — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Connect to the SQLite database and load the modelling table.

# Replace with your database file path in Google Drive
db_path = '/content/drive/MyDrive/House Loan Analytics/my_database.db'

conn = sqlite3.connect(db_path)
try:
    # Load the whole Risk_classifier table into memory; it is the input of
    # the modelling cell below.
    df = pd.read_sql("SELECT * FROM Risk_classifier", conn)

    # Sanity check: list every table that exists in the database file.
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
    print(tables)
finally:
    # Close the connection even if one of the queries above raises, so the
    # database file is not left open by the kernel.
    conn.close()

[('raw_data',), ('Loan_Approval',), ('Interest_Rate_Features',), ('Risk_classifier',)]


Model

In [5]:
# Step 1: Define Target & Features (Risk Level Multiclass)
target = 'risk_level'

# Keep numeric features only (non-numeric columns are dropped, not encoded).
features = df.drop(columns=[target]).select_dtypes(include=[np.number])

# Step 2: Train-Test Split (Stratified for multi-class)
# The split is done BEFORE imputation so that no statistic computed from the
# test rows can leak into the training data.
X_train, X_test, y_train, y_test = train_test_split(
    features, df[target],
    test_size=0.2, random_state=42, stratify=df[target]
)

# Step 3: Handle Missing Values
# Medians are learned on the training rows only and re-used for every other
# frame, mirroring what would happen with genuinely unseen data.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)
# Keep `features` NaN-free as before, in case later cells rely on it.
features = features.fillna(train_medians)

# Step 4: Model Training (Random Forest & XGBoost)
# `use_label_encoder` is intentionally not passed: recent XGBoost releases no
# longer accept it and only emit a "Parameters ... are not used" warning.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(eval_metric='mlogloss', random_state=42)
}

# XGBoost expects class labels 0..n_classes-1. The offset is derived from the
# data instead of being hard-coded to 1.
# NOTE(review): assumes the labels are consecutive integers — confirm.
label_offset = y_train.min()
y_train_adj = y_train - label_offset
y_test_adj = y_test - label_offset  # not used below; kept for later cells

# Train and evaluate every model against the ORIGINAL label values.
for name, model in models.items():
    if name == "XGBoost":
        model.fit(X_train, y_train_adj)
        y_pred = model.predict(X_test) + label_offset  # Shift predictions back to original labels
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    print(f"\n{name} Model Performance (Multi-class Classification):")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}\n")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred), "\n")
    print("Classification Report:\n", classification_report(y_test, y_pred))

# Step 5: Feature Importance
rf_importances = models["Random Forest"].feature_importances_
xgb_importances = models["XGBoost"].feature_importances_

# Column names are taken from the frame the models were fitted on so the
# importances are guaranteed to line up with the right feature.
feature_importance_df = pd.DataFrame({
    "Feature": X_train.columns,
    "RandomForest_Importance": rf_importances,
    "XGBoost_Importance": xgb_importances
}).sort_values(by="XGBoost_Importance", ascending=False)

print("\nTop 10 Most Important Features (XGBoost):")
print(feature_importance_df.head(10))


Random Forest Model Performance (Multi-class Classification):
Accuracy: 0.9796

Confusion Matrix:
 [[   25   226    43     0     0]
 [    0 18091   744     0     0]
 [    0  1146 48029     2     0]
 [    0     1    15 10929     5]
 [    0     0     4     4 28105]] 

Classification Report:
               precision    recall  f1-score   support

           1       1.00      0.09      0.16       294
           2       0.93      0.96      0.94     18835
           3       0.98      0.98      0.98     49177
           4       1.00      1.00      1.00     10950
           5       1.00      1.00      1.00     28113

    accuracy                           0.98    107369
   macro avg       0.98      0.80      0.82    107369
weighted avg       0.98      0.98      0.98    107369



Parameters: { "use_label_encoder" } are not used.




XGBoost Model Performance (Multi-class Classification):
Accuracy: 0.9937

Confusion Matrix:
 [[  294     0     0     0     0]
 [    0 18734   101     0     0]
 [    0   579 48598     0     0]
 [    0     0     0 10950     0]
 [    0     0     0     0 28113]] 

Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00       294
           2       0.97      0.99      0.98     18835
           3       1.00      0.99      0.99     49177
           4       1.00      1.00      1.00     10950
           5       1.00      1.00      1.00     28113

    accuracy                           0.99    107369
   macro avg       0.99      1.00      1.00    107369
weighted avg       0.99      0.99      0.99    107369


Top 10 Most Important Features (XGBoost):
                                         Feature  RandomForest_Importance  \
82                   submission_of_application_3                 0.006565   
9                           