# Loan Risk Prediction Project

## Imports

In [17]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc


## Data Loading and Exploration

In [18]:
# Load the loan data from the JSON file (path is relative to the notebook)
dataset_path = '../data/loan_approval_dataset.json'
df = pd.read_json(dataset_path)

In [None]:
# Preview the first five rows of the dataset (5 is the default for head)
df.head(n=5)

In [None]:
# Get information about the dataset: prints a summary of the DataFrame
# (column names, non-null counts and dtypes) to the cell output
df.info()

In [None]:
# Count missing values per column
df.isna().sum()


## Data Cleaning and Preprocessing

In [21]:
# Encode categorical variables
# Each column gets its own LabelEncoder, stored in `encoders`, so the integer
# codes can be mapped back to the original labels later with
# encoders[col].inverse_transform(...). Refitting a single shared encoder
# would leave it holding only the classes of the last column processed.
categorical_cols = ['Married/Single', 'House_Ownership', 'Car_Ownership', 'Profession', 'CITY', 'STATE']
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le


## Data Visualization

In [None]:
# Numeric variable distributions
# One histogram (with a KDE overlay) per numeric column, laid out on a 2x3 grid.
numeric_cols = ['Income', 'Age', 'Experience', 'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS']
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
# Flatten row by row so panel k is the same cell as axes[k // 3, k % 3]
flat_axes = axes.ravel()
for ax, col in zip(flat_axes, numeric_cols):
    sns.histplot(df[col], ax=ax, kde=True)
    ax.set_title(f'Distribution of {col}')
# The grid has more panels than there are columns; hide the unused ones so
# the figure does not show an empty set of axes.
for ax in flat_axes[len(numeric_cols):]:
    ax.set_visible(False)
plt.tight_layout()
plt.show()

In [None]:
# Correlation matrix
# Pairwise correlations between the DataFrame's columns, drawn as an
# annotated heatmap.
corr_matrix = df.corr()
heatmap_options = dict(annot=True, cmap='coolwarm', linewidths=0.5)
plt.figure(figsize=(14, 12))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Correlation Matrix of Loan Approval Features', fontsize=16)
plt.show()

In [None]:
# Risk distribution by marital status
# Counts of rows per 'Married/Single' value, split by 'Risk_Flag'.
# NOTE: 'Married/Single' is label-encoded by the preprocessing cell, so the
# x-axis ticks show the integer codes rather than the original labels.
plt.figure(figsize=(12, 8))
sns.countplot(x='Married/Single', hue='Risk_Flag', data=df)
plt.title('Risk Distribution by Marital Status', fontsize=16)
plt.xlabel('Marital Status', fontsize=12)
plt.ylabel('Count', fontsize=12)
# Font sizes are given as numbers, consistent with the other plots in this file
plt.legend(title='Risk Flag', title_fontsize=12, fontsize=10)
plt.show()

## Feature Selection and Model Preparation

In [25]:
# Block: Model Preparation
# Features are every column except the target 'Risk_Flag'.
# NOTE(review): if the dataset carries a row-identifier column it is still
# part of X at this point -- confirm and drop it if so.
X = df.drop(columns='Risk_Flag')
y = df['Risk_Flag']
# Hold out 20% for testing. stratify=y keeps the Risk_Flag class proportions
# the same in the train and test sets, so evaluation is not skewed by an
# unlucky split when the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## Model Training

In [None]:
# Train a random forest of 100 trees on the training split; the fixed seed
# makes the fit reproducible.
forest_params = {'n_estimators': 100, 'random_state': 42}
rf_model = RandomForestClassifier(**forest_params)
rf_model.fit(X_train, y_train)

## Model Evaluation

In [27]:
# Make predictions: predicted class labels for the held-out test features,
# reused by the confusion matrix and classification report cells
y_pred = rf_model.predict(X_test)

In [None]:
# Confusion matrix
# Rows are the true labels and columns the predicted labels; cell values are
# integer counts.
plt.figure(figsize=(10, 8))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix', fontsize=16)
plt.xlabel('Predicted Label', fontsize=12)
plt.ylabel('True Label', fontsize=12)
plt.show()

In [None]:
# Classification report
# Print a heading followed by the text report for the test predictions.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

In [None]:
# ROC curve
# Use the scores from the second predict_proba column to trace the curve and
# compute its area.
class_probabilities = rf_model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)
curve_label = f'ROC Curve (AUC = {roc_auc:.2f})'

plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=curve_label)
# Dashed diagonal as a reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title('Receiver Operating Characteristic (ROC) Curve', fontsize=16)
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.legend(loc="lower right", fontsize=10)
plt.show()

In [None]:
# Feature importance
# Pair each feature column with the fitted model's importance score and sort
# from most to least important before plotting.
feature_importance = (
    pd.DataFrame({'feature': X.columns, 'importance': rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)

plt.figure(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=feature_importance)
plt.title('Feature Importance in Loan Risk Prediction', fontsize=16)
plt.xlabel('Importance', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.show()