# Gold Layer Exploration and Logistic Regression Modeling

This notebook:
1. Loads and explores the gold layer feature store
2. Performs EDA on features and target variable
3. Prepares data for modeling (feature selection, train/test split)
4. Trains a logistic regression model
5. Evaluates model performance with metrics and visualizations

In [1]:
import glob
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import pyspark
import pyspark.sql.functions as F
from pyspark.sql.functions import col

from sklearn.calibration import calibration_curve
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    roc_auc_score, 
    roc_curve,
    precision_recall_curve,
    average_precision_score
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Plot defaults for the whole notebook: seaborn's white-grid theme and a 12x6 default canvas
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

In [3]:
# Start (or attach to) a local Spark session that uses every available core
spark = (
    pyspark.sql.SparkSession.builder
    .appName("ModelTraining")
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# Only surface ERROR-level Spark messages so cell output stays readable
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/09 11:25:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/11/09 11:25:11 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Configuration
# Location of the gold-layer feature store.
# NOTE(review): no later cell in this notebook reads GOLD_DIR, and FEATURE_DIR below
# holds the same path -- presumably a leftover; confirm before removing.
GOLD_DIR = "/app/datamart/gold/feature_store/"

# LOAD GOLD LAYER DATA

In [4]:
# Request a session with Parquet schema merging switched on.
# NOTE(review): an earlier cell already builds a session; getOrCreate() presumably
# returns that one, in which case the "explore_gold" app name never takes effect.
# Confirm, and consider folding both initialisation cells into one.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("explore_gold")
    .master("local[*]")
    .config("spark.sql.parquet.mergeSchema", "true")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/09 11:16:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Gold-layer store locations, all derived from one shared root
_GOLD_ROOT = "/app/datamart/gold"
FEATURE_DIR = f"{_GOLD_ROOT}/feature_store/"
LABEL_DIR = f"{_GOLD_ROOT}/label_store/"
APPLICATION_DIR = f"{_GOLD_ROOT}/application_store/"

In [6]:
# Print the application store's column names and types
application_preview = spark.read.parquet(APPLICATION_DIR)
application_preview.printSchema()

                                                                                

root
 |-- Customer_ID: string (nullable = true)
 |-- loan_amt: integer (nullable = true)
 |-- tenure: integer (nullable = true)
 |-- application_date: date (nullable = true)



In [7]:
# Load the application store (parquet under APPLICATION_DIR) as a Spark DataFrame
application = spark.read.parquet(APPLICATION_DIR)

In [8]:
# Sanity check: show every Customer_ID that appears on more than one application row
rows_per_customer = application.groupBy("Customer_ID").count()
rows_per_customer.filter(F.col("count") > 1).show()



+-----------+-----+
|Customer_ID|count|
+-----------+-----+
+-----------+-----+



                                                                                

In [9]:
# Read the parquet data under FEATURE_DIR into one Spark DataFrame and preview it.
# Despite its name, feature_files is a DataFrame, not a list of file paths.
feature_files = spark.read.parquet(FEATURE_DIR)
feature_files.show(n=20, truncate=True)

                                                                                

+-----------+--------+------+-------------+----------------+------------------+--------------------+------------------+-------------------------------------------------+------------------------------------------------+-------------------------------------------------+------------------------------------------------+--------------------------------------------------+-------------------------------------------------+-------------------+---------------+--------------+----------------------+--------------------------------+--------------------------+-----------------------------+--------------------------+-------------------------+------------------------------------+------------------------+-----------------------+------------------------+-------------------------+-------------------------+--------------------------+-------------+----------------+----+--------------+--------------+--------------+--------------+-----------+-----------+------------+-----------+-----------+------------+-----

In [10]:
# Read every label file matching the glob into one Spark DataFrame and preview it
label_files_pattern = f"{LABEL_DIR}gold_label_store_*.parquet"

label_files = spark.read.parquet(label_files_pattern)
label_files.show(n=20, truncate=True)

                                                                                

+--------------------+-----------+-----+----------+-------------+
|             loan_id|Customer_ID|label| label_def|snapshot_date|
+--------------------+-----------+-----+----------+-------------+
|CUS_0x1037_2023_0...| CUS_0x1037|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1069_2023_0...| CUS_0x1069|    0|30dpd_6mob|   2023-07-01|
|CUS_0x114a_2023_0...| CUS_0x114a|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1184_2023_0...| CUS_0x1184|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1297_2023_0...| CUS_0x1297|    1|30dpd_6mob|   2023-07-01|
|CUS_0x12fb_2023_0...| CUS_0x12fb|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1325_2023_0...| CUS_0x1325|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1341_2023_0...| CUS_0x1341|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1375_2023_0...| CUS_0x1375|    1|30dpd_6mob|   2023-07-01|
|CUS_0x13a8_2023_0...| CUS_0x13a8|    0|30dpd_6mob|   2023-07-01|
|CUS_0x13ef_2023_0...| CUS_0x13ef|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1440_2023_0...| CUS_0x1440|    0|30dpd_6mob|   2023-07-01|
|CUS_0x144

In [11]:
# Collect the Spark feature frame into pandas for analysis and modeling
print("\nConverting to Pandas...")
feature_pd = feature_files.toPandas()
feature_shape = feature_pd.shape
print(f"✓ Pandas DataFrame shape: {feature_shape}")


Converting to Pandas...


                                                                                

✓ Pandas DataFrame shape: (12500, 164)


In [12]:
# Collect the Spark label frame into pandas for analysis and modeling
print("\nConverting to Pandas...")
label_pd = label_files.toPandas()
label_shape = label_pd.shape
print(f"✓ Pandas DataFrame shape: {label_shape}")


Converting to Pandas...




✓ Pandas DataFrame shape: (21474, 5)


                                                                                

In [None]:
# Build the modeling frame `df` by attaching labels to features.
# The previous version called `df_spark.toPandas()`, but `df_spark` is never defined
# in this notebook, so the cell failed on a fresh kernel. The label store's target
# column is named `label`; it is renamed to `default_label`, the name every later
# cell reads.
# NOTE(review): joining on Customer_ID assumes one label row per customer. The label
# frame has more rows than the feature frame, so confirm the key (and whether
# snapshot_date should take part in the join) before trusting row counts.
print("\nConverting to Pandas...")
labels_for_join = label_pd.rename(columns={"label": "default_label"})
df = feature_pd.merge(
    labels_for_join,
    on="Customer_ID",
    how="inner",
    suffixes=("", "_label"),  # feature-side columns keep their names on a clash
)
print(f"✓ Pandas DataFrame shape: {df.shape}")

# BASIC EXPLORATION

In [None]:
# Headline numbers for the modeling frame: row count, column count, in-memory size
n_applications, n_columns = df.shape
memory_mb = df.memory_usage(deep=True).sum() / (1024**2)

print("\n--- Dataset Overview ---")
print(f"Total applications: {n_applications:,}")
print(f"Total features: {n_columns}")
print(f"Memory usage: {memory_mb:.2f} MB")

In [None]:
# Fail fast if the target column is missing.
# Raising (instead of calling exit() and stopping Spark) halts "Run All" at this cell
# while leaving the kernel and the Spark session alive, so the problem can be fixed
# and the cell re-run.
if 'default_label' not in df.columns:
    raise ValueError(
        "'default_label' column not found! "
        f"Available columns: {df.columns.tolist()}"
    )

In [None]:
# Class counts and default rate (in percent) of the target column
print("\n--- Target Variable Distribution ---")
target = df['default_label']
default_counts = target.value_counts()
default_rate = target.mean() * 100

n_non_default = default_counts.get(0, 0)
n_default = default_counts.get(1, 0)
print(f"Non-defaults (0): {n_non_default:,} ({100-default_rate:.2f}%)")
print(f"Defaults (1): {n_default:,} ({default_rate:.2f}%)")

In [None]:
# Treat a default rate below 10% or above 90% as an imbalanced target
is_imbalanced = default_rate < 10 or default_rate > 90
if not is_imbalanced:
    print("Not imbalanced")
else:
    print(f"⚠️  WARNING: Imbalanced dataset (default rate: {default_rate:.2f}%)")
    print("   Consider using class weights or resampling techniques")

In [None]:
# Report the span and number of distinct application dates, when that column exists
if 'application_date' in df.columns:
    application_dates = df['application_date']
    print("\n--- Application Date Range ---")
    print(f"From: {application_dates.min()}")
    print(f"To: {application_dates.max()}")
    print(f"Unique dates: {application_dates.nunique()}")

In [None]:
# Per-column null counts and percentages, keeping only columns that have nulls
print("\n--- Missing Values Summary ---")
null_counts = df.isnull().sum()
missing_df = (
    null_counts.to_frame('missing_count')
    .assign(missing_pct=lambda frame: (frame['missing_count'] / len(df)) * 100)
    .loc[lambda frame: frame['missing_count'] > 0]
    .sort_values('missing_pct', ascending=False)
)

if missing_df.empty:
    print("✓ No missing values found!")
else:
    print(f"\nFeatures with missing values: {len(missing_df)}")
    print("\nTop 10 features with most missing values:")
    print(missing_df.head(10))

# FEATURE ANALYSIS

In [None]:
# Identifier, date and target columns that must never be used as model inputs
exclude_cols = ['loan_id', 'Customer_ID', 'application_date', 'snapshot_date', 'default_label']

# Split the remaining columns by dtype into numeric and categorical candidates
numeric_features = [
    name
    for name in df.select_dtypes(include=[np.number]).columns
    if name not in exclude_cols
]
categorical_features = [
    name
    for name in df.select_dtypes(include=['object', 'category']).columns
    if name not in exclude_cols
]

print(f"\nNumeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")

In [None]:
# Hand-picked numeric features to summarise, restricted to those present in df
key_numeric = [
    name
    for name in (
        'DTI', 'log_Annual_Income', 'Annual_Income', 'Age',
        'Credit_History_Age_Year', 'Num_of_Loan_active',
        'Num_of_Delayed_Payment_12m', 'EMI_to_income', 'loan_amt',
    )
    if name in df.columns
]

print("\n--- Key Numeric Features Statistics ---")
df.loc[:, key_numeric].describe()

In [None]:
# Rank numeric features by the absolute value of their Pearson correlation with the target.
# The correlations are computed in one vectorised call. The previous per-column loop
# wrapped each computation in a bare `except: pass`, which silently hid any failure
# (for example a non-numeric target) and left an empty ranking.
print("\n--- Top 15 Features Correlated with Default ---")
candidate_cols = [name for name in numeric_features if name in df.columns]
target_corr = df[candidate_cols].corrwith(df['default_label']).dropna()

# List of (feature, signed correlation) pairs, strongest relationship first
correlations = sorted(target_corr.items(), key=lambda pair: abs(pair[1]), reverse=True)

print("\nFeature                              Correlation")
print("-" * 60)
for feat, corr in correlations[:15]:
    direction = "↑ Risk+" if corr > 0 else "↓ Risk-"
    print(f"{feat:35s} {corr:+.4f}  {direction}")

In [None]:
# Visualization: class counts plus default rate by income band and by age band
fig, (ax_target, ax_income, ax_age) = plt.subplots(1, 3, figsize=(15, 5))

# Panel 1: class counts in fixed 0 -> 1 order. value_counts() sorts by frequency, so
# without the reindex the tick labels and colours would be swapped whenever defaults
# outnumber non-defaults. Assumes the target is coded 0/1, as the labels below imply.
class_counts = df['default_label'].value_counts().reindex([0, 1], fill_value=0)
class_counts.plot(kind='bar', color=['green', 'red'], alpha=0.7, edgecolor='black', ax=ax_target)
ax_target.set_xlabel('Default Label')
ax_target.set_ylabel('Count')
ax_target.set_title('Target Variable Distribution')
ax_target.set_xticks([0, 1])
ax_target.set_xticklabels(['Non-Default (0)', 'Default (1)'], rotation=0)
ax_target.grid(True, alpha=0.3)

# Panels 2-3: default rate (%) per band; a panel is hidden when its column is absent
band_panels = [
    (ax_income, 'income_band', 'Income Band', 'coral'),
    (ax_age, 'age_band', 'Age Band', 'skyblue'),
]
for band_ax, band_col, band_label, bar_color in band_panels:
    if band_col not in df.columns:
        band_ax.set_visible(False)
        continue
    default_rate_by_band = df.groupby(band_col)['default_label'].mean() * 100
    default_rate_by_band.plot(kind='bar', color=bar_color, alpha=0.7, edgecolor='black', ax=band_ax)
    band_ax.set_xlabel(band_label)
    band_ax.set_ylabel('Default Rate (%)')
    band_ax.set_title(f'Default Rate by {band_label}')
    band_ax.tick_params(axis='x', rotation=45)
    band_ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('gold_target_distribution.png', dpi=150, bbox_inches='tight')
plt.show()
print("\n✓ Saved: gold_target_distribution.png")

In [None]:
# Visualization: per-feature histograms, split by default status
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

plot_features = ['DTI', 'Annual_Income', 'Age', 'Num_of_Delayed_Payment_12m', 
                'Credit_History_Age_Year', 'EMI_to_income']
plot_features = [f for f in plot_features if f in df.columns]

# Ratio features whose upper tail is clipped at the 95th percentile for readability
clipped_features = {'DTI', 'EMI_to_income'}
non_default_rows = df[df['default_label'] == 0]
default_rows = df[df['default_label'] == 1]

# zip() stops at whichever runs out first, so at most one panel per feature is drawn
for ax, feature in zip(axes, plot_features):
    non_default = non_default_rows[feature].dropna()
    default = default_rows[feature].dropna()

    if feature in clipped_features:
        non_default = non_default.clip(upper=non_default.quantile(0.95))
        default = default.clip(upper=default.quantile(0.95))

    ax.hist(non_default, bins=50, alpha=0.5, label='Non-Default',
            color='green', edgecolor='black', density=True)
    ax.hist(default, bins=50, alpha=0.5, label='Default',
            color='red', edgecolor='black', density=True)

    ax.set_xlabel(feature)
    ax.set_ylabel('Density')
    ax.set_title(f'{feature} Distribution by Default Status')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Hide panels that received no feature instead of leaving empty axes in the figure
for unused_ax in axes[len(plot_features):]:
    unused_ax.set_visible(False)

fig.tight_layout()
fig.savefig('gold_feature_distributions.png', dpi=150, bbox_inches='tight')
plt.show()
print("✓ Saved: gold_feature_distributions.png")


# DATA PREPARATION FOR MODELING

In [None]:
# Keep only numeric features whose share of missing values is below 50%
print("\n--- Feature Selection ---")

missing_share = df[numeric_features].isnull().sum() / len(df) * 100
valid_features = [feat for feat in numeric_features if missing_share[feat] < 50]

print(f"Total numeric features: {len(numeric_features)}")
print(f"Valid features (< 50% missing): {len(valid_features)}")

In [None]:
# Copy the feature matrix and target vector out of df so later edits cannot touch it
print("\n--- Creating Modeling Dataset ---")

X = df.loc[:, valid_features].copy()
y = df.loc[:, 'default_label'].copy()

for description, frame in (("Feature matrix", X), ("Target vector", y)):
    print(f"{description} shape: {frame.shape}")

In [None]:
# Handle missing values (simple imputation with the column median).
# Infinite values are converted to NaN first, so each median is computed from finite
# observations only and every gap is filled in one consistent pass. Before this
# change the medians were taken while +/-inf values were still in the data.
# As a result, "missing before" now also counts the infinite entries.
# NOTE(review): medians come from the full dataset, i.e. before the train/test split,
# so test rows influence the imputation -- consider fitting on the training set only.
print("\n--- Handling Missing Values ---")
X = X.replace([np.inf, -np.inf], np.nan)
missing_before = X.isnull().sum().sum()
X = X.fillna(X.median())
missing_after = X.isnull().sum().sum()
print(f"Missing values before imputation: {missing_before:,}")
print(f"Missing values after imputation: {missing_after:,}")

In [None]:
# Turn +/-inf into NaN, then fill those gaps with the median of the finite values
X_finite = X.replace([np.inf, -np.inf], np.nan)
X = X_finite.fillna(X_finite.median())

In [None]:
# Drop columns with at most one distinct value -- they cannot help separate the classes
print("\n--- Removing Constant Features ---")
distinct_counts = X.nunique()
constant_features = distinct_counts[distinct_counts <= 1].index.tolist()
if constant_features:
    print(f"Removing {len(constant_features)} constant features")
    X = X.drop(columns=constant_features)
    print(f"Features after removal: {len(X.columns)}")

In [None]:
# Rank features by |Pearson correlation| with the target and keep the strongest ones.
# NOTE(review): the ranking uses every row, including rows that later land in the test
# set -- consider ranking on the training split only to avoid selection leakage.
print("\n--- Feature Selection by Correlation ---")
abs_corr_by_feature = {
    name: abs(np.corrcoef(X[name], y)[0, 1])
    for name in X.columns
}
correlations_with_target = sorted(
    ((name, strength) for name, strength in abs_corr_by_feature.items() if not np.isnan(strength)),
    key=lambda pair: pair[1],
    reverse=True,
)

# Number of top-ranked features passed on to the model
top_n_features = 20
top_ranked = correlations_with_target[:top_n_features]
selected_features = [name for name, _ in top_ranked]

print(f"\nSelecting top {top_n_features} features by correlation:")
for rank, (feat, corr) in enumerate(top_ranked, start=1):
    print(f"  {rank:2d}. {feat:40s} {corr:.4f}")

X_selected = X[selected_features].copy()

print(f"\n✓ Final feature matrix shape: {X_selected.shape}")

In [None]:
# 70/30 split, stratified on the target so both parts keep the same default rate
print("\n--- Train-Test Split ---")
split_parts = train_test_split(
    X_selected,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Training set: {n_train:,} samples")
print(f"Test set: {n_test:,} samples")
print(f"Training default rate: {y_train.mean()*100:.2f}%")
print(f"Test default rate: {y_test.mean()*100:.2f}%")


In [None]:
# Standardise features; scaling statistics are learned from the training split only
print("\n--- Feature Scaling (Standardization) ---")
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✓ Features standardized (mean=0, std=1)")

# LOGISTIC REGRESSION MODEL

In [None]:
# Ratio of non-default to default rows in the training target (informational only)
n_train_defaults = y_train.sum()
n_train_non_defaults = len(y_train) - n_train_defaults
class_weight_ratio = n_train_non_defaults / n_train_defaults
print(f"\nClass imbalance ratio: {class_weight_ratio:.2f}:1 (non-default:default)")

In [None]:
# Fit a class-weighted logistic regression on the standardised training features
print("\nTraining Logistic Regression with class weights...")

lr_params = {
    'class_weight': 'balanced',  # reweight classes to offset the imbalance
    'max_iter': 1000,
    'random_state': 42,
    'solver': 'lbfgs',
}
lr_model = LogisticRegression(**lr_params).fit(X_train_scaled, y_train)

print("✓ Model trained successfully")

In [None]:
# Table of model coefficients, ordered by absolute size (largest effect first)
print("\n--- Feature Coefficients (Top 10) ---")
feature_importance = (
    pd.DataFrame({'feature': selected_features, 'coefficient': lr_model.coef_[0]})
    .assign(abs_coefficient=lambda frame: frame['coefficient'].abs())
    .sort_values('abs_coefficient', ascending=False)
)

print("\nFeature                              Coefficient   Impact")
print("-" * 70)
strongest = feature_importance.head(10)
for feature_name, coefficient in zip(strongest['feature'], strongest['coefficient']):
    if coefficient > 0:
        impact = "↑ Increases Risk"
    else:
        impact = "↓ Decreases Risk"
    print(f"{feature_name:35s} {coefficient:+8.4f}   {impact}")

# MODEL EVALUATION

In [None]:
# Hard class predictions and positive-class probabilities for both splits
scaled_splits = (X_train_scaled, X_test_scaled)

y_train_pred, y_test_pred = (lr_model.predict(matrix) for matrix in scaled_splits)

y_train_proba, y_test_proba = (
    lr_model.predict_proba(matrix)[:, 1] for matrix in scaled_splits
)

In [None]:
# Classification reports and ROC-AUC for both splits, then a train-vs-test gap check
class_names = ['Non-Default', 'Default']

print("\n--- Training Set Performance ---")
train_report = classification_report(y_train, y_train_pred, target_names=class_names)
print(train_report)
train_auc = roc_auc_score(y_train, y_train_proba)
print(f"ROC-AUC Score: {train_auc:.4f}")

print("\n--- Test Set Performance ---")
test_report_text = classification_report(y_test, y_test_pred, target_names=class_names)
print(test_report_text)
test_auc = roc_auc_score(y_test, y_test_proba)
print(f"ROC-AUC Score: {test_auc:.4f}")

# A train AUC more than 0.05 above the test AUC is reported as possible overfitting
auc_diff = train_auc - test_auc
generalizes = not (auc_diff > 0.05)
if generalizes:
    print(f"\n✓ Model generalizes well (AUC diff: {auc_diff:.4f})")
else:
    print(f"\n⚠️  WARNING: Possible overfitting detected (AUC diff: {auc_diff:.4f})")

# Test-set confusion matrix: rows are actual classes, columns are predicted classes
print("\n--- Confusion Matrix (Test Set) ---")
cm = confusion_matrix(y_test, y_test_pred)
(true_neg, false_pos), (false_neg, true_pos) = cm
print("\n                 Predicted")
print("               Non-Default  Default")
print(f"Actual Non-Default  {true_neg:6d}     {false_pos:6d}")
print(f"       Default      {false_neg:6d}     {true_pos:6d}")

# VISUALIZATIONS

In [None]:
# Six-panel evaluation dashboard for the test set.
# Every panel is drawn through its own Axes object rather than the implicit
# "current axes", and calibration_curve comes from the notebook's import cell
# instead of being imported half-way through this cell.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
(ax1, ax2, ax3), (ax4, ax5, ax6) = axes

# Plain array of true labels, so boolean masks index y_test_proba by position
y_test_values = np.asarray(y_test)

# 1. Confusion Matrix
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Non-Default', 'Default'],
            yticklabels=['Non-Default', 'Default'],
            ax=ax1)
ax1.set_title('Confusion Matrix (Test Set)')
ax1.set_ylabel('Actual')
ax1.set_xlabel('Predicted')

# 2. ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_test_proba)
ax2.plot(fpr, tpr, linewidth=2, label=f'ROC Curve (AUC = {test_auc:.4f})')
ax2.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random Classifier')
ax2.set_xlabel('False Positive Rate')
ax2.set_ylabel('True Positive Rate')
ax2.set_title('ROC Curve')
ax2.legend()
ax2.grid(True, alpha=0.3)

# 3. Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y_test, y_test_proba)
avg_precision = average_precision_score(y_test, y_test_proba)
ax3.plot(recall, precision, linewidth=2, label=f'PR Curve (AP = {avg_precision:.4f})')
ax3.set_xlabel('Recall')
ax3.set_ylabel('Precision')
ax3.set_title('Precision-Recall Curve')
ax3.legend()
ax3.grid(True, alpha=0.3)

# 4. Feature Coefficients: the 15 largest by magnitude, red = raises risk, green = lowers it
top_features = feature_importance.head(15).sort_values('coefficient')
colors = ['red' if x > 0 else 'green' for x in top_features['coefficient']]
bar_positions = range(len(top_features))
ax4.barh(bar_positions, top_features['coefficient'], color=colors, alpha=0.7, edgecolor='black')
ax4.set_yticks(bar_positions)
ax4.set_yticklabels(top_features['feature'], fontsize=8)
ax4.set_xlabel('Coefficient')
ax4.set_title('Top 15 Feature Coefficients')
ax4.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
ax4.grid(True, alpha=0.3)

# 5. Predicted Probability Distribution, split by the true class
ax5.hist(y_test_proba[y_test_values == 0], bins=50, alpha=0.5, label='Non-Default',
         color='green', edgecolor='black', density=True)
ax5.hist(y_test_proba[y_test_values == 1], bins=50, alpha=0.5, label='Default',
         color='red', edgecolor='black', density=True)
ax5.set_xlabel('Predicted Probability')
ax5.set_ylabel('Density')
ax5.set_title('Predicted Probability Distribution')
ax5.legend()
ax5.grid(True, alpha=0.3)

# 6. Calibration plot: observed positive rate per predicted-probability bin
fraction_of_positives, mean_predicted_value = calibration_curve(
    y_test, y_test_proba, n_bins=10
)
ax6.plot(mean_predicted_value, fraction_of_positives, marker='o', linewidth=2, label='Model')
ax6.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Perfect Calibration')
ax6.set_xlabel('Mean Predicted Probability')
ax6.set_ylabel('Fraction of Positives')
ax6.set_title('Calibration Plot')
ax6.legend()
ax6.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('logistic_regression_evaluation.png', dpi=150, bbox_inches='tight')
plt.show()
print("\n✓ Saved: logistic_regression_evaluation.png")

# MODEL SUMMARY

In [None]:
# Final text summary of the dataset, test-set performance and strongest coefficients.
# The classification report is computed once and reused; previously it was rebuilt
# three times, once for each metric printed below.
test_report = classification_report(y_test, y_test_pred, output_dict=True)
# The report is keyed by the class label rendered as a string; '1' is the default class
default_class_metrics = test_report['1']

print(f"""
📊 DATASET:
   • Total applications: {len(df):,}
   • Training samples: {len(X_train):,}
   • Test samples: {len(X_test):,}
   • Features used: {len(selected_features)}
   • Default rate: {df['default_label'].mean()*100:.2f}%

🎯 MODEL PERFORMANCE (Test Set):
   • ROC-AUC: {test_auc:.4f}
   • Precision (Default class): {default_class_metrics['precision']:.4f}
   • Recall (Default class): {default_class_metrics['recall']:.4f}
   • F1-Score (Default class): {default_class_metrics['f1-score']:.4f}

🔝 TOP 5 RISK FACTORS (Positive Coefficients):
""")

# feature_importance is already ordered by |coefficient|, so head(5) is the strongest five
risk_factors = feature_importance[feature_importance['coefficient'] > 0].head(5)
for idx, row in risk_factors.iterrows():
    print(f"   {row['feature']:35s} +{row['coefficient']:.4f}")

print(f"""
🛡️  TOP 5 PROTECTIVE FACTORS (Negative Coefficients):
""")

protective_factors = feature_importance[feature_importance['coefficient'] < 0].head(5)
for idx, row in protective_factors.iterrows():
    print(f"   {row['feature']:35s} {row['coefficient']:.4f}")

print("\n" + "="*80)
print("✅ EXPLORATION AND MODELING COMPLETED")
print("="*80)

# 💡 Next Steps:
   1. Try more advanced models (Random Forest, XGBoost, LightGBM)
   2. Perform hyperparameter tuning
   3. Implement cross-validation
   4. Feature engineering (polynomial features, interactions)
   5. Ensemble methods
   6. Deploy model to production

In [None]:
# Cleanup: stop the Spark session now that the analysis is finished.
# Any cell that uses `spark` after this point needs the session recreated first.
spark.stop()
print("\n✓ Spark session stopped")