Import data from the SQLite database

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from xgboost import XGBClassifier
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [4]:
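# Google Colab only: uncomment the two lines below to mount Google Drive
# before reading the database file from it.
# from google.colab import drive
# drive.mount('/content/drive')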

In [5]:
# Connect to the SQLite database, load the modelling table into `df`,
# and list every table in the file as a sanity check.

# Path to the SQLite file; replace with your own location
# (for example a path inside a mounted Google Drive).
db_path = 'my_database.db'

conn = sqlite3.connect(db_path)
try:
    # Full contents of the Risk_classifier table; this is the modelling frame.
    df = pd.read_sql("SELECT * FROM Risk_classifier", conn)

    # sqlite_master holds the schema; pull the names of all tables.
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
finally:
    # Close the connection even when one of the queries above raises,
    # so the database file handle is never left open.
    conn.close()

print(tables)

[('raw_data',), ('Loan_Approval',), ('Interest_Rate_Features',), ('Property_Values',), ('Risk_classifier',)]


In [10]:
# Display the column names of the numeric feature frame.
# NOTE(review): `features` is assigned in the "Model" cell (In [8]) further
# down, so on a fresh kernel this cell only works after that cell has run.
features.columns

Index(['debt_to_income_ratio', 'loan_to_value_ratio', 'interest_rate',
       'loan_amount', 'rate_spread', 'total_loan_costs', 'origination_charges',
       'loan_term', 'income', 'property_value', 'applicant_credit_score_type',
       'co-applicant_credit_score_type', 'co-applicant_age',
       'applicant_age_above_62', 'co-applicant_age_above_62', 'loan_type_2',
       'loan_type_3', 'loan_type_4', 'loan_purpose_2', 'loan_purpose_4',
       'loan_purpose_5', 'loan_purpose_31', 'loan_purpose_32',
       'derived_loan_product_type_Conventional:Subordinate Lien',
       'derived_loan_product_type_FHA:First Lien',
       'derived_loan_product_type_FHA:Subordinate Lien',
       'derived_loan_product_type_FSA/RHS:First Lien',
       'derived_loan_product_type_FSA/RHS:Subordinate Lien',
       'derived_loan_product_type_VA:First Lien',
       'derived_loan_product_type_VA:Subordinate Lien', 'occupancy_type_2',
       'occupancy_type_3', 'derived_dwelling_category_Multifamily:Site-Built',
 

Model

In [8]:
# Train and evaluate multi-class risk-level classifiers on the numeric
# columns of `df`, then rank features by Random Forest importance.
#
# NOTE(review): the recorded run reports ~0.999 accuracy but only 0.62 recall
# for class 1. Verify that no feature is derived from `risk_level` (target
# leakage) and consider the class imbalance -- TODO confirm.

# Step 1: Define Target & Features (Risk Level Multiclass)
target = 'risk_level'
features = df.drop(columns=[target])

# Keep numeric features only
features = features.select_dtypes(include=[np.number])

# Step 2: Train-Test Split (Stratified for multi-class)
# The split happens BEFORE imputation so that no statistic of the held-out
# rows can influence what the model sees during training.
X_train, X_test, y_train, y_test = train_test_split(
    features, df[target],
    test_size=0.2, random_state=42, stratify=df[target]
)

# Step 3: Handle Missing Values
# Medians are learned from the training rows only and then applied to both
# partitions; computing them on the full frame would leak test information.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)
# Keep `features` free of NaNs for any other cell that reads it, using the
# same training-derived medians.
features = features.fillna(train_medians)

# Step 4: Model Training (Random Forest & XGBoost)
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    #"XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
}

# Train and Evaluate Models with corrected labels for XGBoost
for name, model in models.items():
    if name == "XGBoost":
        # Labels are shifted down by one for fitting and the predictions are
        # shifted back up, so metrics are reported on the original labels.
        y_train_adj = y_train - 1
        y_test_adj = y_test - 1

        model.fit(X_train, y_train_adj)
        y_pred = model.predict(X_test) + 1  # Shift predictions back to original labels
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    print(f"\n{name} Model Performance (Multi-class Classification):")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}\n")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred), "\n")
    print("Classification Report:\n", classification_report(y_test, y_pred))

# Step 5: Feature Importance
rf_importances = models["Random Forest"].feature_importances_
#xgb_importances = models["XGBoost"].feature_importances_

# Label importances with the columns the model was actually fitted on.
feature_importance_df = pd.DataFrame({
    "Feature": X_train.columns,
    "RandomForest_Importance": rf_importances,
#    "XGBoost_Importance": xgb_importances
}).sort_values(by="RandomForest_Importance", ascending=False)

print("\nTop 10 Most Important Features (RandomForest_Importance):")
print(feature_importance_df.head(10))


Random Forest Model Performance (Multi-class Classification):
Accuracy: 0.9989

Confusion Matrix:
 [[  183   110     1     0     0]
 [    0  4065     7     0     0]
 [    0     0 63939     1     0]
 [    0     0     0 10950     0]
 [    0     0     0     0 28113]] 

Classification Report:
               precision    recall  f1-score   support

           1       1.00      0.62      0.77       294
           2       0.97      1.00      0.99      4072
           3       1.00      1.00      1.00     63940
           4       1.00      1.00      1.00     10950
           5       1.00      1.00      1.00     28113

    accuracy                           1.00    107369
   macro avg       0.99      0.92      0.95    107369
weighted avg       1.00      1.00      1.00    107369


Top 10 Most Important Features (RandomForest_Importance):
                           Feature  RandomForest_Importance
1              loan_to_value_ratio                 0.355787
0             debt_to_income_ratio      

In [9]:
import joblib

# Persist the trained Random Forest by name. The bare loop variable `model`
# is simply whichever entry of `models` was iterated last, so saving it would
# silently pickle a different estimator if another model is added to the dict
# (e.g. XGBoost, which is fitted on labels shifted by -1).
joblib.dump(models["Random Forest"], 'high_risk_model.pkl')

['high_risk_model.pkl']