# Home Loan Data Analysis – Course-end Project 1
**Simplilearn Deep Learning | Predict loan repayment (default)**

**Objective:** Perform data preprocessing and build a deep learning model that predicts whether an applicant will repay a loan, using historical data. The dataset is highly imbalanced.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
print("TensorFlow:", tf.__version__)

## 1. Load Data & Data Quality
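Load the loan data. The target column is **TARGET**; this notebook treats 1 as repaid and 0 as default, and the class of interest is default (class 0). NOTE(review): confirm this coding against the dataset's data dictionary before interpreting the metrics — in the public Home Credit Default Risk data, which uses the same `SK_ID_CURR`/`TARGET` columns, TARGET = 1 marks clients with payment difficulties.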

In [None]:
# Load the dataset (the CSV is expected in the notebook's working directory).
df = pd.read_csv('loan_data (1).csv')
print("Shape:", df.shape)

# Class counts for the label column; these show how imbalanced the data is.
print("Target distribution (TARGET: 1=repaid, 0=default):")
print(df['TARGET'].value_counts())

# "\n" (not "\\n") so that a real blank line is emitted instead of the two
# literal characters backslash + n.
print("\nMissing values (top 15):")
print(df.isnull().sum().sort_values(ascending=False).head(15))

# Last expression of the cell: the notebook renders the first five rows.
df.head()

## 2. Data Preprocessing
Drop ID, drop columns with too many missing values, fill remaining, encode categoricals, and prepare numeric features.

In [None]:
# Remove the row identifier when present so it is never used as a feature.
if 'SK_ID_CURR' in df.columns:
    df = df.drop(columns=['SK_ID_CURR'])

# Work on a reproducible subsample of at most 50,000 rows to keep training
# fast (optional: use the full data by replacing the sample with `df`).
sample_size = min(50000, len(df))
df_work = df.sample(n=sample_size, random_state=42)

# Separate the label from the candidate features.
X_raw = df_work.drop(columns=['TARGET'])
y = df_work['TARGET']

# Keep a column only when at least half of its values are present.
thresh = 0.5 * len(X_raw)
X_clean = X_raw.dropna(axis=1, thresh=thresh)
print("Columns kept after dropping >50% missing:", X_clean.shape[1])

# Partition the surviving columns by dtype; the imputation and encoding
# steps treat the two groups differently.
numeric_cols = list(X_clean.select_dtypes(include=[np.number]).columns)
cat_cols = list(X_clean.select_dtypes(include=['object']).columns)
print("Numeric:", len(numeric_cols), "| Categorical:", len(cat_cols))

In [None]:
# Impute missing values (numeric -> median, categorical -> mode), then
# integer-encode the categorical columns so every feature is numeric.
# NOTE(review): the medians/modes and the label encoders are fitted on all
# sampled rows, i.e. before the train/test split performed later in this
# file, so held-out rows influence these statistics.
X_processed = X_clean.copy()

# Numeric columns: only rewrite those that actually contain gaps.
numeric_with_gaps = [
    col for col in numeric_cols
    if col in X_processed.columns and X_processed[col].isnull().any()
]
for col in numeric_with_gaps:
    col_median = X_processed[col].median()
    X_processed[col] = X_processed[col].fillna(col_median)

# Categorical columns: fill with the most frequent value ('Unknown' when no
# mode exists), then replace each category by an integer code.
for col in (name for name in cat_cols if name in X_processed.columns):
    modes = X_processed[col].mode()
    fill_value = modes.iloc[0] if len(modes) > 0 else 'Unknown'
    filled = X_processed[col].fillna(fill_value)
    X_processed[col] = LabelEncoder().fit_transform(filled.astype(str))

# Everything should be numeric now; keep only numeric dtypes as the final
# feature matrix and report its size and any leftover gaps.
X_final = X_processed.select_dtypes(include=[np.number])
print("Final feature matrix shape:", X_final.shape)
print("Remaining missing:", X_final.isnull().sum().sum())

## 3. Train/Test Split, Scale, SMOTE
Stratified split, standardize features, and apply SMOTE on training set only to handle class imbalance.

In [None]:
# Keep only fully populated rows, applying the same filter to the labels so
# features and target stay aligned.
mask = X_final.notnull().all(axis=1)
X_final = X_final[mask]
y = y[mask]

# Stratified 80/20 split preserves the class ratio in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the scaler on the training partition only, then apply it to both.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

# Oversample the training partition only; the test set keeps its original
# class distribution.
smote = SMOTE(random_state=42, k_neighbors=5)
X_train_bal, y_train_bal = smote.fit_resample(X_train_s, y_train)
balanced_counts = pd.Series(y_train_bal).value_counts().to_dict()
print("After SMOTE - train labels:", balanced_counts)

## 4. Deep Learning Model
Build a feedforward neural network with Dense layers, Dropout, and early stopping.

In [None]:
# Input width equals the number of columns in the balanced training matrix.
n_features = X_train_bal.shape[1]

# Hidden stack described as (units, dropout rate) pairs: each Dense/ReLU
# layer is followed by a Dropout layer with the paired rate.
hidden_spec = [(64, 0.3), (32, 0.2)]

network_layers = [layers.Input(shape=(n_features,))]
for units, drop_rate in hidden_spec:
    network_layers.append(layers.Dense(units, activation='relu'))
    network_layers.append(layers.Dropout(drop_rate))
# Single sigmoid unit: the network outputs one probability per row.
network_layers.append(layers.Dense(1, activation='sigmoid'))

model = keras.Sequential(network_layers)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train with early stopping on a stratified, shuffled validation hold-out.
#
# Why not `validation_split=0.2`: Keras takes the validation rows from the
# END of the arrays, before shuffling. imbalanced-learn's SMOTE returns the
# original rows followed by the synthetic minority rows (per its
# implementation; confirm for the installed version), so the last 20% would
# consist almost entirely of synthetic samples of a single class. val_loss,
# and therefore early stopping and `restore_best_weights`, would then be
# driven by one class only.
#
# NOTE(review): this hold-out still contains synthetic rows because it is
# carved out after SMOTE; splitting off validation data before resampling
# would give a cleaner estimate.
X_fit, X_val, y_fit, y_val = train_test_split(
    np.asarray(X_train_bal),
    np.asarray(y_train_bal),
    test_size=0.2,
    random_state=42,
    stratify=np.asarray(y_train_bal),
)

# Stop once val_loss has not improved for 5 epochs and roll back to the
# best weights seen.
early = callbacks.EarlyStopping(patience=5, restore_best_weights=True, monitor='val_loss')
history = model.fit(
    X_fit,
    y_fit,
    validation_data=(X_val, y_val),
    epochs=30,
    batch_size=256,
    callbacks=[early],
    verbose=1,
)

In [None]:
# Training history: loss (left panel) and accuracy (right panel), each
# showing the training curve and the validation curve.
plt.figure(figsize=(10, 4))
panels = [('loss', 'Loss'), ('accuracy', 'Accuracy')]
for position, (metric_key, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(history.history[metric_key], label='train')
    # Keras stores validation metrics under the 'val_' prefix.
    plt.plot(history.history['val_' + metric_key], label='val')
    plt.legend()
    plt.title(panel_title)
plt.tight_layout()
plt.show()

## 5. Evaluation on Test Set
Evaluate on original (unbalanced) test set using accuracy, precision, recall, F1, ROC-AUC.

In [None]:
# Score the held-out test set (original class distribution, no SMOTE).
y_pred_proba = model.predict(X_test_s)

# The single sigmoid unit yields one column per row; flatten it once so the
# thresholded labels and the ROC-AUC input are both plain 1-D vectors.
proba_1d = np.asarray(y_pred_proba).ravel()
y_pred = (proba_1d >= 0.5).astype(int)

# NOTE(review): precision/recall/F1 below use scikit-learn's default
# positive label (1). If the class of interest is 0, pass pos_label=0 or
# read the per-class rows of the classification report instead.
print("Accuracy:", round(accuracy_score(y_test, y_pred), 4))
print("Precision:", round(precision_score(y_test, y_pred, zero_division=0), 4))
print("Recall:", round(recall_score(y_test, y_pred, zero_division=0), 4))
print("F1-Score:", round(f1_score(y_test, y_pred, zero_division=0), 4))
print("ROC-AUC:", round(roc_auc_score(y_test, proba_1d), 4))

# Pin the label order so the hard-coded display names always line up with
# classes 0 and 1, even if one class is absent from the predictions.
class_ids = [0, 1]
class_names = ['Default', 'Repaid']

# "\n" (not "\\n") so a real blank line is printed before the report.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))

# Compute the confusion matrix once and reuse it for the text and the plot.
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
print("Confusion Matrix:")
print(cm)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()