In [None]:
import pandas as pd

In [None]:
# Load the loan records from the CSV file (path is relative to the working directory).
df = pd.read_csv('Loan_Default.csv')
# Bare expression as the last line of the cell, so the notebook renders a preview of the frame.
df

### Dataset Overview
The dataset contains 34 columns and approximately 150,000 rows (samples).

Target variable: 'Status'
   - 0 => likely to pay back
   - 1 => not likely to pay back (risky customer)

This is a **credit risk modeling** problem.


### Business Context
- **False Negatives (FN)**: Risky borrower predicted as safe → **high financial loss**
- **False Positives (FP)**: Safe borrower predicted as risky → opportunity loss
- Therefore, **recall for defaulters (Status = 1)** is prioritized over accuracy.

### Feature Description
The dataset contains borrower demographics, loan characteristics, and collateral information.

Key feature groups:
- Borrower attributes: income, age, credit score, credit type
- Loan structure: loan amount, interest rate, amortization type, LTV
- Collateral details: property value, occupancy type, security type
- Application metadata: region, submission channel

Many features reflect lender risk assessment at loan origination.


In [None]:
# List the column labels of the loaded frame.
df.columns

In [None]:
# Print the per-column dtype and non-null count summary.
df.info()

In [None]:
# Summary statistics for every column; include='all' covers non-numeric columns too.
df.describe(include='all')

### Initial Inspection
- Examined data types, missing values, and value distributions.
- Identified categorical, numerical, ordinal, and binary features.
- Year and ID columns do not contribute to prediction and will be removed.


In [None]:
# Number of missing entries in each column.
df.isna().sum()

### Missing Value Analysis
- Numerical features contain missing values and outliers.
- Categorical features have missing or unknown categories.
- Median imputation is used for numerical variables.
- Missing categorical values are assigned an explicit `Unknown` category.


In [None]:
import seaborn as sns

# Row count per target class, to show the class balance.
sns.countplot(data=df, x='Status')


### Target Variable Distribution
- The dataset is **highly imbalanced**.
- Majority of borrowers are safe (Status = 0).
- Accuracy alone is misleading; recall and F1-score are more informative.


In [None]:
# Debt-to-income ratio (x) against income (y), coloured by loan status.
sns.scatterplot(data=df, x='dtir1', y='income', hue='Status')

### Income vs Debt-to-Income Ratio
- Higher income generally corresponds to lower DTI.
- Defaults occur even among high-income borrowers.
- Indicates income alone does not fully explain default risk.


In [None]:
# Distribution of LTV for each target class.
sns.boxplot(data=df, x='Status', y='LTV')

### Loan-to-Value (LTV) and Default Risk
- Higher LTV loans exhibit a greater tendency to default.
- Lower borrower equity increases lender exposure.


In [None]:
# Bar height is the mean of Status within each age group.
sns.barplot(data=df, x='age', y='Status')

### Age and Default Behavior
- Default rates vary across age groups.
- Relationship between age and default is non-linear.


In [None]:
import matplotlib.pyplot as plt

# Loan amount against income, coloured by loan status.
loan_income_ax = sns.scatterplot(data=df, x='loan_amount', y='income', hue='Status')
# Both axes on a log scale so the wide value ranges stay readable.
loan_income_ax.set_yscale("log")
loan_income_ax.set_xscale("log")


### Loan Amount vs Income
- Larger loans are generally issued to higher-income borrowers.
- Default risk increases when loan size is high relative to income.
- Log-scale visualization highlights affordability stress.


In [None]:
# Property value distribution per unit count, split by loan status.
sns.boxplot(data=df, x='total_units', y='property_value', hue='Status')

### Property Units and Collateral Value
- Property value increases with the number of units.
- Defaulted loans are associated with comparatively lower collateral value.


In [None]:
# Bar height is the mean of Status for each negative-amortization category.
sns.barplot(data=df, x='Neg_ammortization', y='Status')

### Negative Amortization Impact
- Loans with negative amortization show significantly higher default rates.
- Deferred interest structures increase borrower risk.


In [None]:
# Bucket raw credit scores into ordered risk bands; the new column is stored
# on df and plotted below.
credit_score_edges = [300, 580, 670, 740, 800, 850]
credit_band_labels = ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']
df['credit_bin'] = pd.cut(
    df['Credit_Score'],
    bins=credit_score_edges,
    labels=credit_band_labels,
    # Close the first interval on the left so a score of exactly 300 lands in
    # 'Poor' instead of silently becoming NaN. Scores outside [300, 850]
    # still fall outside every bin and stay NaN.
    include_lowest=True,
)

# Bar height is the mean of Status within each credit band.
sns.barplot(x='credit_bin', y='Status', data=df)

### Credit Score Risk Buckets
- Credit scores are grouped into ordinal risk categories.
- Default rates decrease as credit quality improves.


In [None]:
# Interest rate of each loan plotted against its Status value.
sns.scatterplot(data=df, x='rate_of_interest', y='Status')

### Interest Rate and Spread Analysis
- Higher interest rates and spreads are associated with higher default risk.
- Indicates lenders already price risk into loan terms.


### EDA Summary
- No single feature cleanly separates defaulters from non-defaulters.
- Default risk emerges from interactions between affordability, loan structure, and collateral.
- Machine learning models are appropriate for capturing these interactions.


In [None]:
# preprocessing
# Remove the ID, year and term columns from df in place.
df.drop(['ID', 'year', 'term'], axis='columns', inplace=True)

In [None]:
# Median-impute the numeric columns.
# The result is assigned back rather than calling fillna(inplace=True) on a
# column selection: that chained in-place form emits a FutureWarning on
# pandas 2.x and does not update df at all once Copy-on-Write is active.
median_fill_cols = [
    'income',
    'property_value',
    'LTV',
    'dtir1',
    'rate_of_interest',
    'Interest_rate_spread',
    'Upfront_charges',
]
# DataFrame.fillna with a Series fills each column with the value at its label.
df[median_fill_cols] = df[median_fill_cols].fillna(df[median_fill_cols].median())

# age is filled with a string label, so missing values become their own category.
df['age'] = df['age'].fillna('Unknown')

In [None]:
# Missing values in these categorical columns become an explicit 'Unknown' level.
for unknown_fill_col in ('loan_limit', 'approv_in_adv', 'loan_purpose'):
    df[unknown_fill_col] = df[unknown_fill_col].fillna('Unknown')

# Missing negative-amortization flags are filled with the 'not_neg' level.
df['Neg_ammortization'] = df['Neg_ammortization'].fillna('not_neg')

# Capture the missing rows BEFORE converting to strings: astype(str) can turn
# NaN into the literal text 'nan', after which fillna() finds nothing to fill
# and the rows end up in a 'nan' category instead of 'Unknown'.
credit_bin_missing = df['credit_bin'].isna()
df['credit_bin'] = df['credit_bin'].astype(str).mask(credit_bin_missing, 'Unknown')

# submission_of_application is imputed with its most frequent value.
mode_val = df['submission_of_application'].mode()[0]
df['submission_of_application'] = df['submission_of_application'].fillna(mode_val)


In [None]:
# feature engg
import numpy as np

# Ratio features. A zero denominator yields +/-inf here.
df['loan_to_income'] = df['loan_amount'] / df['income']
df['loan_to_property'] = df['loan_amount'] / df['property_value']

# Discretise LTV into (0, 60], (60, 80], (80, 90], (90, 100].
ltv_intervals = pd.cut(df['LTV'], bins=[0, 60, 80, 90, 100])
# Values outside (0, 100] have no bin. Mark them from the pre-conversion mask:
# astype(str) can turn NaN into the literal text 'nan', which a later fillna()
# would not catch.
df['ltv_bin'] = ltv_intervals.astype(str).mask(ltv_intervals.isna(), 'Unknown')

# log1p-transformed copies of the monetary and rate columns
# (new column name -> source column), created in this order.
log_feature_sources = {
    'log_income': 'income',
    'log_loan_amount': 'loan_amount',
    'log_upfront_charges': 'Upfront_charges',
    'log_rate_of_interest': 'rate_of_interest',
    'log_interest_rate_spread': 'Interest_rate_spread',
    'log_property_value': 'property_value',
}
for log_col, source_col in log_feature_sources.items():
    df[log_col] = np.log1p(df[source_col])

# log1p returns NaN for inputs below -1; fill those with the column median.
# Assigned back instead of fillna(inplace=True) on a column selection, which
# warns on pandas 2.x and is a no-op under Copy-on-Write.
df['log_interest_rate_spread'] = df['log_interest_rate_spread'].fillna(
    df['log_interest_rate_spread'].median()
)

### Feature Engineering
- Created financial ratios such as loan-to-income and loan-to-property.
- Applied logarithmic transformations to reduce skew and outlier impact.
- Discretized continuous variables into risk bins where appropriate.


In [None]:
# Columns removed from df before encoding.
drop_cols = [
    'rate_of_interest', 'Interest_rate_spread', 'Upfront_charges',
    'log_interest_rate_spread',
    'Neg_ammortization', 'approv_in_adv', 'interest_only',
]
df.drop(drop_cols, axis='columns', inplace=True)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Each ordinal column mapped to its categories in ascending order; the
# position in the list becomes the encoded value.
ordinal_levels = {
    'lump_sum_payment': ['not_lpsm', 'lpsm'],
    'open_credit': ['nopc', 'opc'],
    'Credit_Worthiness': ['l1', 'l2'],
    'total_units': ['1U', '2U', '3U', '4U'],
    'ltv_bin': ['(0, 60]', '(60, 80]', '(80, 90]', '(90, 100]'],
}
ordinal_cols = list(ordinal_levels.keys())
ordinal_categories = list(ordinal_levels.values())

# Any value not listed above is encoded as -1 rather than raising.
ordinal_encoder = OrdinalEncoder(
    categories=ordinal_categories,
    unknown_value=-1,
    handle_unknown='use_encoded_value',
)
encoded_ordinals = ordinal_encoder.fit_transform(df[ordinal_cols])
df[ordinal_cols] = encoded_ordinals




In [None]:
from sklearn.preprocessing import OneHotEncoder

# NOTE(review): this encoder instance is not used anywhere in the visible
# code; the encoding below is done by pd.get_dummies.
encoder = OneHotEncoder()

# Nominal columns expanded into indicator columns.
onehot_cols = [
    'loan_limit', 'Gender', 'loan_type', 'loan_purpose',
    'business_or_commercial', 'construction_type', 'occupancy_type',
    'Secured_by', 'credit_type', 'age', 'submission_of_application',
    'Region', 'Security_Type', 'credit_bin', 'co-applicant_credit_type',
]

# drop_first=True omits the first level of each column.
df = pd.get_dummies(df, columns=onehot_cols, drop_first=True)

### Encoding Strategy
- Ordinal encoding applied where natural ordering exists.
- One-hot encoding applied to nominal categorical variables.
- Encoding choices avoid introducing artificial relationships.


In [None]:
# Treat +/-inf ratios as missing, then impute with the median of the
# remaining values.
finite_loan_to_income = df['loan_to_income'].replace([np.inf, -np.inf], np.nan)
df['loan_to_income'] = finite_loan_to_income.fillna(finite_loan_to_income.median())

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target.
X = df.drop(columns='Status')
y = df['Status']

# Cast boolean columns to 0/1 integers.
bool_cols = X.select_dtypes(include='bool').columns
X[bool_cols] = X[bool_cols].astype(int)

# 80/20 hold-out split; stratify=y keeps the class ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y, test_size=0.2
)

In [None]:
# finding where infinity exists
# Per-column count of +/-inf entries in the training features.
np.isinf(X_train).sum()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import recall_score, f1_score, confusion_matrix, classification_report, accuracy_score

# All three models use class_weight='balanced' to weight classes inversely
# to their frequency.
modelDT = DecisionTreeClassifier(random_state=42, max_depth=2, class_weight='balanced')

# NOTE(review): modelRF and modelLR are created here but not fitted or scored
# in this cell.
modelRF = RandomForestClassifier(max_depth=3, random_state=42, n_estimators=100, class_weight='balanced')

modelLR = LogisticRegression(random_state=42, class_weight='balanced')

# Cross-validate on the training split only. Running CV on the full X, y would
# let the held-out test rows influence the CV estimate, so the test metrics
# below would no longer be an independent check.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    modelDT,
    X_train,
    y_train,
    cv=cv,
    scoring='recall'
)
print("DT CV recall scores:", scores)
print("DT mean CV recall:", scores.mean())


# Final fit on the training split, then evaluation on the untouched test split.
modelDT.fit(X_train, y_train)

y_pred = modelDT.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(recall_score(y_test, y_pred))


### Model Selection
- Decision Tree used as primary model for interpretability.
- Random Forest and Logistic Regression used for comparison.
- Class imbalance handled using `class_weight='balanced'`.

### Cross-Validation
- Stratified 5-fold cross-validation used.
- Recall chosen as the primary evaluation metric.
- Ensures model stability across different data splits.

### Model Evaluation
- Evaluated using recall, F1-score, confusion matrix, and accuracy.
- Emphasis placed on minimizing false negatives.


In [None]:
from sklearn.base import clone

# Randomly permuted copy of the target, used as meaningless labels.
y_shuffled = y.sample(frac=1.0, random_state=42).values

# Fit an unfitted copy of the estimator rather than modelDT itself.
# Refitting modelDT here would overwrite the tree trained on the real labels,
# and any later use of modelDT would see the shuffled-label model.
shuffled_label_model = clone(modelDT)
shuffled_label_model.fit(X_train, y_shuffled[:len(X_train)])
# Accuracy against the remaining shuffled labels.
shuffled_label_model.score(X_test, y_shuffled[len(X_train):])


### Validation: Label Shuffling
- Target labels were randomly shuffled and model retrained.
- Performance dropped to near-random levels.
- Indicates that there is no data leakage and that the model learns genuine signal from the features.


In [None]:
from sklearn.tree import plot_tree

# Draw the top levels of the fitted tree.
# feature_names is passed as a plain list: some scikit-learn releases validate
# this parameter strictly and reject a pandas Index.
plot_tree(
    modelDT,
    max_depth=2,
    feature_names=list(X.columns),
    class_names=['Non-default', 'Default'],
)


### Decision Tree Visualization
- Visualized top levels of the decision tree.
- Highlights key risk drivers used for classification.
