# Lending Club Analysis
## Goal: Maximize profit

In [None]:
# Requirements: pandas, numpy, matplotlib, seaborn, scikit-learn, imbalanced-learn, xgboost, shap
# !pip install pandas numpy matplotlib seaborn scikit-learn imbalanced-learn xgboost shap

## Import Libraries

In [None]:
import os
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plotting defaults: figure size, base font size, seaborn grid theme
plt.rcParams["figure.figsize"] = (8, 5)
plt.rcParams["font.size"] = 11
sns.set_style("whitegrid")

# Directory that receives every artefact this notebook writes; created on
# first use and left untouched if it already exists
OUT = Path.cwd().joinpath("lc_outputs")
OUT.mkdir(parents=True, exist_ok=True)

# Input data, expected next to the notebook's working directory
CSV_PATH = Path.cwd().joinpath("Lending Club Data - DR_Demo_Lending_Club.csv")


## Load Dataset

In [None]:
# ---------------------------------------------------
# Read the CSV into a DataFrame and report its shape
# ---------------------------------------------------
print("Loading:", CSV_PATH)
df = pd.read_csv(CSV_PATH)

n_rows, n_cols = df.shape
print("Rows:", n_rows)
print("Columns:", n_cols)

# First few records, rendered by the notebook front end
display(df.head())


## Summary Statistics

In [None]:
# Per-column dtype and non-null count for the freshly loaded frame
df.info()

In [None]:
# Descriptive statistics with pandas defaults; as the last expression of the
# cell the resulting table is rendered as the cell output
df.describe()

## Target Analysis

In [None]:
# Name of the outcome column used by every later cell
target_col = "is_bad"

print("Using target:", target_col)
print("Target distribution:")
# Relative class frequencies; missing labels are reported as their own bucket
target_share = df[target_col].value_counts(normalize=True, dropna=False)
print(target_share)


## Feature Engineering

In [None]:
# -----------------------------------------------
# Basic cleaning: remove columns not used below
# -----------------------------------------------
# Identifier, free-text and other fields excluded from modelling
drop_candidates = [
    "Id",
    "emp_title",
    "Notes",
    "purpose",
    "zip_code",
    "addr_state",
    "mths_since_last_record",
    "initial_list_status",
    "collections_12_mths_ex_med",
    "policy_code",
]

print("Dropping:", drop_candidates)
df = df.drop(labels=drop_candidates, axis=1)


In [None]:
# ======================
# Convert date into age
# ======================
# Parse the earliest-credit-line date. errors='coerce' turns unparseable
# values into NaT so they are imputed below instead of aborting the cell.
parsed_cr_line = pd.to_datetime(
    df['earliest_cr_line'], format='%m/%d/%y', errors='coerce'
)

# Evaluate "now" once so the century fix and the age use the same instant.
as_of = datetime.now()

# A two-digit year is ambiguous: %y maps 00-68 to 20xx, so a line opened in
# 1965 comes back as 2065 and would yield a negative credit age. A credit line
# cannot be opened in the future, so move any such date back one century.
df['earliest_cr_line'] = parsed_cr_line.mask(
    parsed_cr_line > as_of,
    parsed_cr_line - pd.DateOffset(years=100),
)

# Calculate age in years
df['credit_age_years'] = (as_of - df['earliest_cr_line']).dt.days / 365.25

# Rows whose date could not be parsed (NaT) get the median age
df['credit_age_years'] = df['credit_age_years'].fillna(df['credit_age_years'].median())

# Check results
print(df[['earliest_cr_line', 'credit_age_years']].head())

# Convert emp_length to numeric; non-numeric entries become NaN
df['emp_length'] = pd.to_numeric(df['emp_length'], errors='coerce')
print(df['emp_length'].value_counts())

# Select all object-typed columns
object_cols = df.select_dtypes(include=['object']).columns

# Convert selected columns to 'category' dtype
for col in object_cols:
    df[col] = df[col].astype('category')

## Feature Selection

In [None]:
# -------------------------------------------------
# Feature selection: build candidate lists by dtype
# -------------------------------------------------

# Numeric predictors: int64/float64 columns, with the target left out
NUMERIC = [
    c
    for c in df.select_dtypes(include=["int64", "float64"]).columns
    if c != target_col
]

# Categorical predictors: object/category/bool columns with at most 50
# distinct values (the cap is adjustable)
CATEGORICAL = [
    c
    for c in df.select_dtypes(include=["object", "category", "bool"]).columns
    if df[c].nunique() <= 50
]

# Columns missing in more than 95% of rows are removed from the frame and
# from both feature lists
missing_share = df.isna().mean()
cols_to_drop = missing_share[missing_share > 0.95].index.tolist()
print("Drop columns with most missing values:", cols_to_drop)
if cols_to_drop:
    print("Dropping >95% missing:", cols_to_drop)
    df = df.drop(columns=cols_to_drop)
    NUMERIC = [c for c in NUMERIC if c not in cols_to_drop]
    CATEGORICAL = [c for c in CATEGORICAL if c not in cols_to_drop]

print("Numeric features chosen:", len(NUMERIC))
print("Categorical features chosen:", len(CATEGORICAL))

# Final model inputs: numeric first, then categorical (edit by hand if needed)
FEATURES = [*NUMERIC, *CATEGORICAL]
print("Total features to use:", len(FEATURES))

## Exploratory Data Analysis

In [None]:
# ----------------------------------
# Exploratory Data Analysis: summary
# ----------------------------------
print("\n--- EDA summary ---")
# Mean of the target column
print("Default rate:", df[target_col].mean())

# Missing-value counts for the model inputs and the target, largest first
columns_in_scope = FEATURES + [target_col]
missing = df[columns_in_scope].isna().sum()
missing = missing.sort_values(ascending=False)
print("Top missing counts:\n", missing.head(15))


### Numerical Features - Distribution

In [None]:
# Class balance of the target
fig, ax = plt.subplots(figsize=(4, 3))
sns.countplot(data=df, x=target_col, ax=ax)
ax.set_title("Target distribution (0 = good, 1 = default)")
fig.tight_layout()
plt.show()

# One 40-bin histogram per numeric feature (missing values excluded)
for col in NUMERIC:
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.histplot(df[col].dropna(), bins=40, kde=False, ax=ax)
    ax.set_title(f"Histogram: {col}")
    fig.tight_layout()
    plt.show()


### Correlation Analysis

In [None]:
# Heatmap of pairwise correlations between the numeric features and the target
if len(NUMERIC) > 1:
    # Absolute feature-to-feature correlations; not consumed by this cell
    num_corr = df[NUMERIC].corr().abs()

    # Signed correlations, target included, which is what gets drawn
    heatmap_cols = NUMERIC + [target_col]
    signed_corr = df[heatmap_cols].corr()

    plt.figure(figsize=(10, 8))
    sns.heatmap(signed_corr, annot=True, cmap="coolwarm", center=0)
    plt.title("Numeric Features - Correlations")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()


In [None]:
# Correlation of every numeric feature with the target, largest first.
# An empty Series is used when there are no numeric features, so the display
# line below never references an undefined name.
if NUMERIC:
    corr_with_target = df[NUMERIC].corrwith(df[target_col]).sort_values(ascending=False)
else:
    corr_with_target = pd.Series(dtype=float)

corr_with_target

### Numerical Features - Binned Analysis

In [None]:
# Default rate per quantile bin of each numeric feature
for col in NUMERIC:
    # Work on the complete cases of this feature/target pair only
    tmp = df[[col, target_col]].dropna()
    # Up to ten equal-frequency bins; repeated bin edges are merged
    tmp = tmp.assign(bin_index=pd.qcut(tmp[col], q=10, duplicates="drop"))
    br = tmp.groupby("bin_index")[target_col].mean()

    plt.figure(figsize=(6, 4))
    br.plot.bar()
    plt.title(f"Default rate by {col} decile")
    plt.ylabel("Default rate")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()


### Categorical Features - Top 10 values

In [None]:
# Mean of the target for the ten most frequent levels of each categorical feature
for col in CATEGORICAL:
    plt.figure(figsize=(6, 4))
    # value_counts sorts by frequency, most common first
    order = df[col].value_counts().head(10).index
    sns.barplot(data=df, x=col, y=target_col, order=order)
    plt.xticks(rotation=45, ha="right")
    plt.title(f"Default rate by {col} (top categories)")
    plt.tight_layout()
    plt.show()


## Address Class Imbalance

In [None]:
from sklearn.utils import resample

# Separate the rows by outcome
minority = df[df['is_bad'] == 1]  # Positive class - Default
majority = df[df['is_bad'] == 0]  # Negative class - Good

# Both classes are brought to the same size: the mean of the two class counts
target_size = (len(majority) + len(minority)) // 2
resample_args = {"n_samples": target_size, "random_state": 42}

# Majority is thinned without replacement; minority is grown with replacement.
# NOTE(review): sampling with replacement means this frame contains exact
# duplicate rows; any train/test split made from it would put copies of the
# same row on both sides.
majority_downsampled = resample(majority, replace=False, **resample_args)
minority_upsampled = resample(minority, replace=True, **resample_args)

# Stack the two halves, then shuffle and renumber the rows
balanced_df = pd.concat([majority_downsampled, minority_upsampled])
balanced_df = balanced_df.sample(frac=1, random_state=42)
balanced_df = balanced_df.reset_index(drop=True)
print(balanced_df.shape)

### Verify Class Balance

In [None]:
print("Using target:", target_col)
print("Target distribution:")
# Relative class frequencies after re-balancing
balanced_share = balanced_df[target_col].value_counts(normalize=True, dropna=False)
print(balanced_share)

## Preprocessing Pipeline

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Numeric columns: fill gaps with the column median, then standardise
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with a "MISSING" level, then one-hot encode;
# levels not seen during fit are ignored at transform time
cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="MISSING")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", dtype=np.float32)),
])

# Route each column list to its pipeline; columns in neither list are dropped
column_routes = [
    ("num", num_transformer, NUMERIC),
    ("cat", cat_transformer, CATEGORICAL),
]
preprocessor = ColumnTransformer(
    transformers=column_routes,
    remainder="drop",
    verbose_feature_names_out=False,
)

# Model inputs and integer target taken from the re-balanced frame
X = balanced_df[FEATURES].copy()
y = balanced_df[target_col].astype(int)

# Fit once on the full matrix
preprocessor.fit(X)
print("Preprocessor fitted.")


## Split the dataset for Training

In [None]:
# ===========================================
# Preprocessing pipelines & train/test split
# ===========================================
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Split the ORIGINAL rows rather than `balanced_df` / `X`: that frame was built
# by sampling the default class with replacement, so splitting it would place
# exact copies of the same row in both train and test and inflate test metrics.
labelled = df[df[target_col].isin([0, 1])]
y_all = labelled[target_col].astype(int)
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    labelled[FEATURES], y_all, test_size=0.30, random_state=42, stratify=y_all
)
X_test = X_test.copy()

# Re-balance the training partition only, with the same recipe as the
# class-imbalance cell (both classes resampled to the mean class size).
# The test partition is left un-resampled.
train_part = X_train_raw.assign(**{target_col: y_train_raw})
train_majority = train_part[train_part[target_col] == 0]
train_minority = train_part[train_part[target_col] == 1]
train_class_size = (len(train_majority) + len(train_minority)) // 2

train_balanced = pd.concat([
    resample(train_majority, replace=False, n_samples=train_class_size, random_state=42),
    resample(train_minority, replace=True, n_samples=train_class_size, random_state=42),
])
train_balanced = train_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

X_train = train_balanced[FEATURES].copy()
y_train = train_balanced[target_col].astype(int)
print("Train / Test:", X_train.shape, X_test.shape)


## Model Training

In [None]:
# ======================================================================
# Model training (Logistic Regression, RandomForest, XGBoost/GradBoost)
# ======================================================================
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from imblearn.over_sampling import SMOTE 
from imblearn.pipeline import Pipeline as ImbPipeline 

# Fitted pipelines keyed by display name; read by the evaluation and
# feature-importance cells
models = {}

# NOTE(review): instantiated but not referenced by any pipeline in this cell.
smote = SMOTE(random_state=42)

# Logistic Regression (balanced)
# random_state pins saga's data shuffling so repeated runs give the same fit,
# consistent with the seeded models below.
pipe_lr = Pipeline([
    ("preproc", preprocessor),
    ("clf", LogisticRegression(max_iter=1000, class_weight="balanced", solver="saga", random_state=42))
])
pipe_lr.fit(X_train, y_train)
models["LogisticRegression"] = pipe_lr
print("Trained LogisticRegression")

# Random Forest
# n_jobs=-1 uses every available core. The previous value of -10 resolves to
# (n_cpus + 1 - 10) workers, i.e. a single worker on machines with 10 or
# fewer cores.
pipe_rf = Pipeline([
    ("preproc", preprocessor),
    ("clf", RandomForestClassifier(n_estimators=1000, random_state=42, class_weight="balanced", n_jobs=-1))
])
pipe_rf.fit(X_train, y_train)
models["RandomForest"] = pipe_rf
print("Trained RandomForest")

# XGBoost
from xgboost import XGBClassifier
pipe_xgb = Pipeline([
    ("preproc", preprocessor),
    ("clf", XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42, n_estimators=1000, n_jobs=10))
])
pipe_xgb.fit(X_train, y_train)
models["XGBoost"] = pipe_xgb
print("Trained XGBoost")


## Model Evaluation

In [None]:
# ==================================
# Evaluation helper & results table
# ==================================
def evaluate_model(pipe, X_test, y_test):
    """Score a fitted classifier on a hold-out set.

    Parameters
    ----------
    pipe : estimator exposing ``predict`` and ``predict_proba``.
    X_test : feature matrix passed to both prediction methods.
    y_test : true labels aligned with ``X_test``.

    Returns
    -------
    dict
        ``accuracy``, ``precision``, ``recall``, ``f1`` and ``auc`` as
        scalars; ``confusion_matrix`` as a nested list; ``y_proba`` (second
        column of ``predict_proba``) and ``y_pred`` (output of ``predict``)
        as returned by the estimator.
    """
    hard_labels = pipe.predict(X_test)
    # Column 1 of predict_proba is used as the score for ROC AUC
    positive_scores = pipe.predict_proba(X_test)[:, 1]

    report = {
        "accuracy": accuracy_score(y_test, hard_labels),
        # zero_division=0: report 0 when a metric's denominator is zero
        "precision": precision_score(y_test, hard_labels, zero_division=0),
        "recall": recall_score(y_test, hard_labels, zero_division=0),
        "f1": f1_score(y_test, hard_labels, zero_division=0),
        "auc": roc_auc_score(y_test, positive_scores),
        "confusion_matrix": confusion_matrix(y_test, hard_labels).tolist(),
        "y_proba": positive_scores,
        "y_pred": hard_labels,
    }
    return report

# Metrics copied from each evaluation into the comparison table, in column order
reported_metrics = ("accuracy", "precision", "recall", "f1", "auc", "confusion_matrix")

results_list = []
for name, pipe in models.items():
    print("Evaluating:", name)
    res = evaluate_model(pipe, X_test, y_test)
    res_row = {"model": name}
    res_row.update((metric, res[metric]) for metric in reported_metrics)
    results_list.append(res_row)

# One row per model, best F1 first
results_df = pd.DataFrame(results_list)
results_df = results_df.sort_values(by="f1", ascending=False)
display(results_df)

# Persist the table alongside the other notebook outputs
results_df.to_csv(OUT / "model_comparison.csv", index=False)
print("Saved model_comparison.csv")

# Overlay the ROC curve of every trained model on one figure
from sklearn.metrics import roc_curve, auc

plt.figure()
for name, pipe in models.items():
    # Score = second column of predict_proba
    y_proba = pipe.predict_proba(X_test)[:, 1]
    fpr, tpr = roc_curve(y_test, y_proba)[:2]
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"{name} (AUC={roc_auc:.3f})")

# Diagonal reference line
plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curves")
plt.legend(loc="lower right")
plt.tight_layout()
plt.savefig(OUT / "roc_curves.png")
plt.show()
print("Saved AUC-ROC curve plot - roc_curves.png")

## Feature Importance

### Logistic Regression Feature Importance

In [None]:
# get the classifier from the pipeline
lr_clf = models['LogisticRegression'].named_steps['clf']

# Coefficients of the single decision function (first row of coef_)
feature_coefficients = lr_clf.coef_[0]

# Output column names of the preprocessing steps, aligned with the coefficients
feature_names = models['LogisticRegression'][:-1].get_feature_names_out()

# Tabular view, largest coefficient first
feature_importance = pd.DataFrame({'Feature': feature_names, 'Coefficient': feature_coefficients})
feature_importance = feature_importance.sort_values(by='Coefficient', ascending=False)

# Ascending order so the largest coefficient is drawn at the top of the chart
sorted_idx = np.argsort(feature_coefficients)
sorted_importances = feature_coefficients[sorted_idx]
sorted_feature_names = [feature_names[i] for i in sorted_idx]

plt.figure(figsize=(10, 8))
plt.barh(range(len(sorted_importances)), sorted_importances, tick_label=sorted_feature_names)
plt.xticks(rotation=90)
plt.title("Logistic Regression Feature Importances")
# Horizontal bars: bar length (x) is the coefficient, one feature per row (y)
plt.xlabel("Importance")
plt.ylabel("Features")
plt.tight_layout()
plt.show()

### RandomForest Feature Importance

In [None]:
# get the classifier from the pipeline
rf_clf = models['RandomForest'].named_steps['clf']

# get feature importances
feature_importances = rf_clf.feature_importances_

# Output column names of the preprocessing steps, aligned with the importances
feature_names = models['RandomForest'][:-1].get_feature_names_out()

# Sort features by importance (ascending)
sorted_idx = np.argsort(feature_importances)
sorted_importances = feature_importances[sorted_idx]
sorted_feature_names = [feature_names[i] for i in sorted_idx]

# The sort is ascending, so the most important features sit at the END of the
# arrays; slice from the tail to plot the top 20 (the head holds the 20 least
# important ones). The largest bar ends up at the top of the chart.
top_n = 20
top_importances = sorted_importances[-top_n:]
top_feature_names = sorted_feature_names[-top_n:]

plt.figure(figsize=(10, 8))
plt.barh(range(len(top_importances)), top_importances, tick_label=top_feature_names)
plt.xticks(rotation=90)
plt.title("Random Forest Feature Importances")
# Horizontal bars: bar length (x) is the importance, one feature per row (y)
plt.xlabel("Importance")
plt.ylabel("Features")
plt.tight_layout()
plt.show()

### XGBoost Feature Importance

In [None]:
# get the classifier from the pipeline
xgb_clf = models['XGBoost'].named_steps['clf']

# get feature importances
feature_importances = xgb_clf.feature_importances_

# Output column names of the preprocessing steps, aligned with the importances
feature_names = models['XGBoost'][:-1].get_feature_names_out()

# Sort features by importance (ascending)
sorted_idx = np.argsort(feature_importances)
sorted_importances = feature_importances[sorted_idx]
sorted_feature_names = [feature_names[i] for i in sorted_idx]

# The sort is ascending, so the most important features sit at the END of the
# arrays; slice from the tail to plot the top 20 (the head holds the 20 least
# important ones). The largest bar ends up at the top of the chart.
top_n = 20
top_importances = sorted_importances[-top_n:]
top_feature_names = sorted_feature_names[-top_n:]

plt.figure(figsize=(10, 8))
plt.barh(range(len(top_importances)), top_importances, tick_label=top_feature_names)
plt.xticks(rotation=90)
plt.title("XGBoost Feature Importances")
# Horizontal bars: bar length (x) is the importance, one feature per row (y)
plt.xlabel("Importance")
plt.ylabel("Features")
plt.tight_layout()
plt.show()