
# Task 2 — Credit Risk Prediction

**Objective:** Predict loan default/approval using the Kaggle Loan Prediction dataset.  
**Note:** Download `train.csv` from Kaggle and place it in `data/loan/` before running the notebook.


In [None]:

from pathlib import Path

import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier


In [None]:

# Load data
# Probe the known locations in priority order and keep the first path that exists.
data_dir = Path('data/loan')
search_paths = (
    data_dir / 'train.csv',
    Path('../data/loan/train.csv'),
    Path('/mnt/data/devhub_ds_tasks/data/loan/train.csv'),
)
train_csv = next((path for path in search_paths if path.exists()), None)
if train_csv is None:
    raise FileNotFoundError("Download Kaggle 'Loan Prediction' train.csv to ./data/loan/train.csv")

df = pd.read_csv(train_csv)

# Quick look at the raw frame: dimensions, first rows, and the number of
# missing values per column (largest count first).
print("Shape:", df.shape)
display(df.head())
missing_per_column = df.isna().sum().sort_values(ascending=False)
display(missing_per_column)


In [None]:

# Target & features
target_col = 'Loan_Status' if 'Loan_Status' in df.columns else None
if target_col is None:
    raise KeyError("Expected 'Loan_Status' in dataset.")

# Features: everything except the row identifier (when present) and the target.
X = df.drop(columns=[c for c in ['Loan_ID', target_col] if c in df.columns])

# Encode the target as Y -> 1, N -> 0. Labels are trimmed and upper-cased first
# so that values such as ' y' are still recognised.
label_map = {'Y': 1, 'N': 0}
y = df[target_col].astype(str).str.strip().str.upper().map(label_map)

# Series.map leaves NaN for every value that is not a key of label_map (this
# includes missing targets, which astype(str) turns into the string 'nan').
# Fail here with the offending labels instead of letting astype(int) raise an
# opaque "cannot convert non-finite values" error.
unmapped = y.isna()
if unmapped.any():
    bad_values = sorted(df.loc[unmapped, target_col].astype(str).unique())
    raise ValueError(
        f"Cannot encode {int(unmapped.sum())} row(s) of '{target_col}': "
        f"expected only Y/N labels but found {bad_values}."
    )
y = y.astype(int)

# Split the feature columns by dtype so each group gets its own preprocessing.
numeric = X.select_dtypes(include=[np.number]).columns.tolist()
categorical = X.select_dtypes(exclude=[np.number]).columns.tolist()

# Numeric columns: median imputation.
# Non-numeric columns: most-frequent imputation followed by one-hot encoding;
# handle_unknown='ignore' keeps transform from failing on categories that were
# not present when the encoder was fitted.
pre = ColumnTransformer([
    ('num', SimpleImputer(strategy='median'), numeric),
    ('cat', Pipeline([('imputer', SimpleImputer(strategy='most_frequent')),
                      ('ohe', OneHotEncoder(handle_unknown='ignore'))]), categorical)
])


In [None]:

# Models
RANDOM_STATE = 42  # one seed for both the split and the tree so reruns are repeatable

# Pipeline.fit fits its steps in place, so two pipelines built around the very
# same `pre` object would share (and overwrite) one fitted preprocessor.
# clone() gives each pipeline its own unfitted copy with identical parameters.
log_reg = Pipeline([('pre', clone(pre)), ('clf', LogisticRegression(max_iter=1000))])
tree = Pipeline([('pre', clone(pre)), ('clf', DecisionTreeClassifier(random_state=RANDOM_STATE))])

# 80/20 hold-out split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

log_reg.fit(X_train, y_train)
tree.fit(X_train, y_train)

preds_lr = log_reg.predict(X_test)
preds_tr = tree.predict(X_test)

# Hold-out accuracy for each baseline.
acc_lr = accuracy_score(y_test, preds_lr)
acc_tr = accuracy_score(y_test, preds_tr)
print(f"Accuracy — Logistic Regression: {acc_lr:.3f}")
print(f"Accuracy — Decision Tree     : {acc_tr:.3f}")


In [None]:

# Confusion matrix + report for best model
# Pick the model with the higher hold-out accuracy; ties go to Logistic Regression.
if acc_lr >= acc_tr:
    best_preds, best_name = preds_lr, "Logistic Regression"
else:
    best_preds, best_name = preds_tr, "Decision Tree"

# Pin the class order explicitly instead of letting it be inferred from the
# data, and show the original Loan_Status labels next to their encodings
# (the target was encoded as N -> 0, Y -> 1).
class_ids = [0, 1]
class_names = ["N (0)", "Y (1)"]

cm = confusion_matrix(y_test, best_preds, labels=class_ids)

fig, ax = plt.subplots()
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names).plot(ax=ax)
ax.set_title(f"Confusion Matrix — {best_name}")
plt.show()

print("\nClassification Report —", best_name)
print(classification_report(y_test, best_preds, labels=class_ids,
                            target_names=class_names, digits=3))



# Conclusion

- **Preprocessing:** Median (numeric) + most-frequent (categorical) imputation, then OHE.  
- **Performance:** Logistic Regression and Decision Tree both provide usable baselines; the confusion matrix shows that the majority class is predicted more reliably than the minority class.  
- **Insight:** Credit history, income, and loan amount are expected to matter most; engineering ratio features (e.g., loan amount relative to income) may provide additional lift.  
- **Next:** Add cross-validation and hyperparameter tuning; track ROC-AUC and recall for the positive class, as required by the risk policy.
