In [None]:
# Basic imports
import numpy as np
import pandas as pd

# Machine learning models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, mean_squared_error, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Reproducibility: seed NumPy's global random number generator.
# The train/test split and the scikit-learn estimators used later in this
# notebook are additionally given an explicit random_state=42.
np.random.seed(42)

In [None]:
# Read the loan dataset and key every row by its customer identifier
df = pd.read_csv('../data/Task 3 and 4_Loan_Data.csv').set_index('customer_id')
df.head()

In [None]:
# Structural summary (column dtypes, non-null counts, memory usage).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
df.info()

# Descriptive statistics for the columns
print(df.describe())

In [None]:
# Inspect how balanced the target is: relative class frequencies first,
# then a bar chart of the raw counts per class
default_share = df['default'].value_counts(normalize=True)
print(default_share)

sns.countplot(data=df, x='default')
plt.show()

In [None]:
# Exploratory Data Analysis
# Pairwise relationships between all columns, coloured by the target class
sns.pairplot(df, hue='default')
plt.show()

In [None]:
# Correlation heatmap
# Create the figure explicitly so the intended 12x8 size is actually applied
# before seaborn draws onto the current axes.
plt.figure(figsize=(12, 8))
sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='coolwarm_r', vmin=-1, vmax=1)
plt.title('Feature correlation matrix')
plt.show()

In [None]:
# Data Preprocessing: separate the predictors from the 'default' target
y = df['default']
X = df.drop(columns='default')

# Hold out 20% of the rows for testing, stratified on the target so both
# partitions keep the same class proportions
X_train_df, X_test_df, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardise the features: the scaler is fitted on the training rows only
# and the same fitted transformation is then applied to both partitions
scaler = StandardScaler().fit(X_train_df)
X_train = scaler.transform(X_train_df)
X_test = scaler.transform(X_test_df)



In [None]:
# Candidate classifiers, keyed by the label used when reporting
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42, class_weight='balanced'),
    'Random Forest': RandomForestClassifier(random_state=42, class_weight='balanced'),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
}

# Fit every candidate on the scaled training data and score it on the held-out set
results = {}
for model_name in models:
    model = models[model_name]
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)                # hard class labels
    y_proba = model.predict_proba(X_test)[:, 1]   # second predict_proba column, used as the score

    results[model_name] = {
        'ROC AUC': roc_auc_score(y_test, y_proba),
        # NOTE(review): MSE is computed on the predicted labels, not on the
        # probabilities -- confirm this is the intended metric.
        'MSE': mean_squared_error(y_test, y_pred),
        'Confusion Matrix': confusion_matrix(y_test, y_pred),
    }

# Report one block of metrics per model
for model_name, metrics in results.items():
    print(
        f"Model: {model_name}",
        f"ROC AUC: {metrics['ROC AUC']:.4f}",
        f"MSE: {metrics['MSE']:.4f}",
        "Confusion Matrix:",
        metrics['Confusion Matrix'],
        "-" * 30,
        sep="\n",
    )

In [None]:
def calculate_expected_loss(model, X_input, loan_amount, recovery_rate=0.1):
    """Estimate the expected loss on a loan as PD * LGD * EAD.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict_proba``; column 1 of its output
        is taken as the probability of default (PD).
    X_input
        Feature rows, passed unchanged to ``model.predict_proba``.
    loan_amount
        Exposure at default (EAD), i.e. the amount outstanding on the loan.
    recovery_rate : float, default 0.1
        Fraction of the exposure recovered after a default; must lie in
        [0, 1]. Loss given default (LGD) is ``1 - recovery_rate``.

    Returns
    -------
    Expected loss for the FIRST row of ``X_input`` only; any further rows
    are scored by the model but not returned.

    Raises
    ------
    ValueError
        If ``recovery_rate`` is outside [0, 1] (it would otherwise yield a
        negative or inflated loss without any warning).
    """
    if not 0.0 <= recovery_rate <= 1.0:
        raise ValueError(
            f"recovery_rate must be between 0 and 1, got {recovery_rate!r}"
        )

    # Named prob_default rather than `pd` so the pandas alias is not shadowed.
    prob_default = model.predict_proba(X_input)[:, 1]  # Probability of Default
    exposure = loan_amount                             # Exposure at Default
    loss_given_default = 1 - recovery_rate             # Loss Given Default
    expected_loss = prob_default * loss_given_default * exposure
    return expected_loss[0]

# Example: expected loss for the first loan in the held-out set.
# The model is fed the scaled feature row, while the outstanding amount is
# read from the unscaled test frame.
example_loan = X_test[:1]
loan_amount = X_test_df['loan_amt_outstanding'].iloc[0]
el = calculate_expected_loss(models['Logistic Regression'], example_loan, loan_amount)
print(f"Expected Loss for the example loan: {el:.2f}")
