# Lending Club Loan Data Analysis – Course-end Project 2
**Simplilearn Deep Learning | Predict loan default (2007–2015 historical data)**

**Objective:** Perform data preprocessing and build a deep learning model to predict whether a loan will default. The dataset is highly imbalanced.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks
import warnings
# Suppress all Python warnings for the rest of the session to keep cell output clean
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline
# Print the installed TensorFlow version for reproducibility
print("TensorFlow:", tf.__version__)

## 1. Load Data & Data Quality
Target: **not.fully.paid** (1 = default, 0 = repaid).

In [None]:
# Read the loan records and summarise shape, columns, class balance and missing values
df = pd.read_csv('loan_data.csv')
target_counts = df['not.fully.paid'].value_counts()
missing_per_column = df.isna().sum()

print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
print("Target distribution (not.fully.paid: 1=default, 0=repaid):")
print(target_counts)
print("Missing values:")
print(missing_per_column)
# Last expression of the cell: the notebook displays the first 10 rows
df.head(10)

## 2. Data Preprocessing
Encode the categorical `purpose` column, fill any missing values with column medians, and keep only numeric features.

In [None]:
# Target: not.fully.paid (1 = default, 0 = repaid)
y = df['not.fully.paid']
X_raw = df.drop(columns=['not.fully.paid'])
# One-hot encode 'purpose'. It is a nominal category, so integer label codes
# would impose an arbitrary ordering/distance between purposes that the network
# would treat as meaningful. (LabelEncoder is intended for targets, not features.)
if 'purpose' in X_raw.columns:
    # astype(str) keeps a missing purpose as its own 'nan' category
    X_raw['purpose'] = X_raw['purpose'].astype(str)
    # dtype=float so the dummy columns survive the numeric-only filter below
    # (newer pandas versions emit bool dummies by default, which are not np.number)
    X_raw = pd.get_dummies(X_raw, columns=['purpose'], prefix='purpose', dtype=float)
# Fill any missing numeric values with the column median
X_final = X_raw.fillna(X_raw.median(numeric_only=True))
# Keep numeric columns only; any remaining non-numeric column is dropped
X_final = X_final.select_dtypes(include=[np.number])
print("Feature matrix shape:", X_final.shape)
print("Missing:", X_final.isnull().sum().sum())

## 3. Train/Test Split, Scale, SMOTE
Stratified split, standardize, SMOTE on training set for class imbalance.

In [None]:
# Stratified 80/20 split so both sets keep the original class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardise features; statistics are learned from the training set only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

# SMOTE: keep k_neighbors strictly below the size of the smaller class (capped at 5)
n_repaid = (y_train == 0).sum()
n_default = (y_train == 1).sum()
min_class = min(n_repaid, n_default)
if min_class > 1:
    k = min(5, min_class - 1)
else:
    k = 1
smote = SMOTE(random_state=42, k_neighbors=k)
# Oversample the training data only; the test set is left untouched
X_train_bal, y_train_bal = smote.fit_resample(X_train_s, y_train)
balanced_counts = pd.Series(y_train_bal).value_counts().to_dict()
print("After SMOTE - train labels:", balanced_counts)

## 4. Deep Learning Model
Feedforward network with Dense layers, Dropout, EarlyStopping.

In [None]:
# Feedforward binary classifier: two hidden Dense+Dropout stages and a sigmoid output
n_features = X_train_bal.shape[1]
model = keras.Sequential()
model.add(layers.Input(shape=(n_features,)))
# (hidden units, dropout rate) for each hidden stage
for units, drop_rate in ((64, 0.3), (32, 0.2)):
    model.add(layers.Dense(units, activation='relu'))
    model.add(layers.Dropout(drop_rate))
# Single sigmoid unit outputs the predicted probability of class 1 (default)
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Keras `validation_split` takes the LAST fraction of the arrays as-is, before any
# shuffling. imbalanced-learn's SMOTE returns the original rows followed by the
# synthetic minority rows, so the last 20% would consist (almost) entirely of
# synthetic class-1 samples and `val_loss` would not reflect both classes.
# Carve out a shuffled, stratified validation set explicitly instead.
X_fit, X_val, y_fit, y_val = train_test_split(
    np.asarray(X_train_bal),
    np.asarray(y_train_bal),
    test_size=0.2,
    random_state=42,
    stratify=np.asarray(y_train_bal),
)
# Stop once val_loss has not improved for 5 epochs and roll back to the best weights
early = callbacks.EarlyStopping(patience=5, restore_best_weights=True, monitor='val_loss')
history = model.fit(
    X_fit,
    y_fit,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=256,
    callbacks=[early],
    verbose=1,
)

In [None]:
# Side-by-side training vs. validation curves for loss and accuracy
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
panels = (('loss', 'Loss'), ('accuracy', 'Accuracy'))
for ax, (metric, title) in zip(axes, panels):
    ax.plot(history.history[metric], label='train')
    ax.plot(history.history['val_' + metric], label='val')
    ax.legend()
    ax.set_title(title)
fig.tight_layout()
plt.show()

## 5. Evaluation on Test Set

In [None]:
# Predicted probability of default (class 1) for each test row; shape (n_samples, 1)
y_pred_proba = model.predict(X_test_s)
# Hard labels at the 0.5 decision threshold
y_pred = (y_pred_proba >= 0.5).astype(int).flatten()
print("Accuracy:", round(accuracy_score(y_test, y_pred), 4))
print("Precision:", round(precision_score(y_test, y_pred, zero_division=0), 4))
print("Recall:", round(recall_score(y_test, y_pred, zero_division=0), 4))
print("F1-Score:", round(f1_score(y_test, y_pred, zero_division=0), 4))
# ROC-AUC is computed from the probabilities (flattened to 1-D), not the hard labels
print("ROC-AUC:", round(roc_auc_score(y_test, y_pred_proba.ravel()), 4))
# Real newline escape here; the original "\\n" printed a literal backslash-n
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Repaid', 'Default']))
# Compute the confusion matrix once and reuse it for both the printout and the heatmap
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Repaid', 'Default'], yticklabels=['Repaid', 'Default'])
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()