
# Task 2 — Credit Risk Prediction

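**Objective:** Predict credit risk for loan applicants. The target is the dataset's `Loan_Status` column (Y/N, encoded below as 1/0), which is used here as a proxy for whether an applicant would default.  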
**Dataset:** Kaggle *Loan Prediction* (train/test CSVs).  
**Notes:** This notebook assumes you've downloaded the dataset from Kaggle and placed `train.csv` (and optionally `test.csv`) under `./data/loan/`.


In [None]:

# Imports
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier


In [None]:

# Load data
# Look for train.csv in a few known locations and use the first one that exists.
data_dir = Path('data/loan')
candidate_paths = (
    data_dir / 'train.csv',
    Path('../data/loan/train.csv'),
    Path('/mnt/data/devhub_ds_tasks/data/loan/train.csv'),
)
train_csv = next((path for path in candidate_paths if path.exists()), None)

# Fail early with download instructions when none of the locations has the file.
if train_csv is None:
    raise FileNotFoundError(
        "Please download the Kaggle 'Loan Prediction' dataset and place train.csv at ./data/loan/train.csv"
    )

df = pd.read_csv(train_csv)
df.head()


In [None]:

# Basic EDA: dataset dimensions, column names, a preview, and per-column missing counts
print("Shape:", df.shape)
print("\nColumns:", list(df.columns))

# Missing values per column, most affected columns first.
missing_per_column = df.isna().sum().sort_values(ascending=False)
display(df.head(), missing_per_column)


In [None]:

# Target & features
# Common versions of this dataset use 'Loan_Status' (Y/N) as target.
target_col = 'Loan_Status' if 'Loan_Status' in df.columns else None
if target_col is None:
    raise KeyError("Could not find 'Loan_Status' column. Please confirm the dataset version.")

# Features: everything except the target and, when present, the 'Loan_ID' column.
X = df.drop(columns=[target_col, 'Loan_ID'] if 'Loan_ID' in df.columns else [target_col])

# Encode the target as 1 (Y) / 0 (N). Labels are trimmed and upper-cased first, and
# anything that is still not Y/N (including missing values) is reported explicitly
# rather than surfacing later as an opaque NaN-to-int cast failure.
y_encoded = df[target_col].astype(str).str.strip().str.upper().map({'Y': 1, 'N': 0})
unmapped = y_encoded.isna()
if unmapped.any():
    bad_labels = sorted(df.loc[unmapped, target_col].astype(str).unique())
    raise ValueError(
        f"Unexpected values in '{target_col}' (expected 'Y' or 'N'): {bad_labels} "
        f"in {int(unmapped.sum())} row(s). Clean or drop these rows before modelling."
    )
y = y_encoded.astype(int)

# Identify types: numeric dtypes vs. everything else (treated as categorical)
numeric = X.select_dtypes(include=[np.number]).columns.tolist()
categorical = X.select_dtypes(exclude=[np.number]).columns.tolist()

# Preprocess
# - numeric columns: fill missing values with the column median
# - categorical columns: fill missing values with the most frequent value, then one-hot
#   encode; categories unseen during fit are ignored at transform time
pre = ColumnTransformer([
    ('num', SimpleImputer(strategy='median'), numeric),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(handle_unknown='ignore'))
    ]), categorical)
])


In [None]:

# Hold out 20% of the rows for evaluation; stratify so both splits keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Each pipeline gets its own clone of the preprocessor. Pipeline.fit fits its steps in
# place, so sharing a single `pre` instance would mean fitting one model silently
# re-fits the transformer used by the other.

# Model: Logistic Regression (baseline)
log_reg = Pipeline([
    ('pre', clone(pre)),
    ('clf', LogisticRegression(max_iter=1000))
])

# Alternative: Decision Tree
tree = Pipeline([
    ('pre', clone(pre)),
    ('clf', DecisionTreeClassifier(random_state=42))
])

log_reg.fit(X_train, y_train)
tree.fit(X_train, y_train)

# Predictions on the held-out split
preds_lr = log_reg.predict(X_test)
preds_tr = tree.predict(X_test)

acc_lr = accuracy_score(y_test, preds_lr)
acc_tr = accuracy_score(y_test, preds_tr)
print(f"Accuracy — Logistic Regression: {acc_lr:.3f}")
print(f"Accuracy — Decision Tree     : {acc_tr:.3f}")


In [None]:

# Confusion matrix (best model)
# Pick the model with the higher held-out accuracy; ties go to Logistic Regression.
lr_is_best = acc_lr >= acc_tr
best_preds = preds_lr if lr_is_best else preds_tr
best_name = "Logistic Regression" if lr_is_best else "Decision Tree"

# Fix the class order and name the classes so the plot and report can be read on their
# own. The names follow the target encoding used for `y` (N -> 0, Y -> 1).
class_ids = [0, 1]
class_names = ["Loan_Status = N (0)", "Loan_Status = Y (1)"]

cm = confusion_matrix(y_test, best_preds, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot()
plt.title(f"Confusion Matrix — {best_name}")
plt.show()

print("\nClassification Report —", best_name)
print(classification_report(
    y_test, best_preds, labels=class_ids, target_names=class_names, digits=3
))


In [None]:

# Brief conclusion (edit as needed)
# Static summary text; it is not derived from the metrics computed above.
conclusion_text = """
• We handled missing values via median (numeric) and most_frequent (categorical), then OHE for categoricals.
• Baseline models show above-chance accuracy. Tune hyperparameters (grid search) and add cross-validation for improvements.
• Consider feature scaling for algorithms like SVM/LogReg and try ensemble models (RandomForest, XGBoost).
"""
print(conclusion_text)
