# 🚀 Credit Approval ML Pipeline

> **Clean Architecture + MLOps-Ready Production Architecture**

Bu notebook, kredi onay tahmin modeli için tam ML pipeline içerir.

---

## 📋 Table of Contents

1. [Environment Setup](#1-environment-setup)
2. [Configuration](#2-configuration)
3. [Data Loading & Validation](#3-data-loading--validation)
4. [Feature Engineering](#4-feature-engineering)
5. [Model Training](#5-model-training)
6. [Model Evaluation](#6-model-evaluation)
7. [Results & Business Analysis](#7-results--business-analysis)

---
## 1. Environment Setup

In [None]:
# ============================================
# 1.1 Install Dependencies (Colab/Kaggle)
# ============================================

import sys
import os
from pathlib import Path

# Detect the hosting environment: Colab is recognised by `google.colab`
# already being present in sys.modules, Kaggle by the existence of /kaggle.
IN_COLAB = 'google.colab' in sys.modules
IN_KAGGLE = os.path.exists('/kaggle')

print(f"üåê Environment: {'Colab' if IN_COLAB else 'Kaggle' if IN_KAGGLE else 'Local'}")

# Install required packages on hosted runtimes only; nothing is installed
# when neither flag is set. The `!pip` line is an IPython shell escape, so
# this cell is only valid inside a notebook kernel.
if IN_COLAB or IN_KAGGLE:
    !pip install -q xgboost lightgbm catboost optuna scikit-learn pandas numpy matplotlib seaborn pyyaml

In [None]:
# ============================================
# 1.2 Mount Google Drive (Colab only)
# ============================================

# Only relevant on Colab: mount Google Drive and, when the project folder is
# present there, make it the working directory so relative paths resolve
# against it.
if IN_COLAB:
    # Imported lazily because the module only exists on Colab runtimes.
    from google.colab import drive
    drive.mount('/content/drive')

    # Change to the project directory if it exists; otherwise stay put silently.
    project_path = '/content/drive/MyDrive/credit-approval'
    if os.path.exists(project_path):
        os.chdir(project_path)
        print(f"üìÅ Changed to: {project_path}")

In [None]:
# ============================================
# 1.3 Import Libraries
# ============================================

import warnings
warnings.filterwarnings('ignore')

# Core
import numpy as np
import pandas as pd
from datetime import datetime
from dataclasses import dataclass, field
from typing import Dict, Any, Optional, List, Tuple
from pathlib import Path
import json
import gc
import logging

# ML
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Optional GPU libraries
try:
    import xgboost as xgb
    HAS_XGB = True
except ImportError:
    HAS_XGB = False

try:
    import lightgbm as lgb
    HAS_LGB = True
except ImportError:
    HAS_LGB = False

try:
    from catboost import CatBoostClassifier
    HAS_CAT = True
except ImportError:
    HAS_CAT = False

try:
    import optuna
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    HAS_OPTUNA = True
except ImportError:
    HAS_OPTUNA = False

print("‚úÖ Libraries imported")
print(f"   XGBoost: {HAS_XGB}, LightGBM: {HAS_LGB}, CatBoost: {HAS_CAT}, Optuna: {HAS_OPTUNA}")

In [None]:
# ============================================
# 1.4 Check GPU
# ============================================

import subprocess

def check_gpu():
    """Report whether an NVIDIA GPU is visible through ``nvidia-smi``.

    Runs ``nvidia-smi`` and, when it exits with code 0, prints every output
    line that mentions ``NVIDIA`` or ``MiB``.

    Returns:
        bool: True when ``nvidia-smi`` exits with code 0; False when the
        binary is missing or not executable, times out, or exits non-zero.
    """
    try:
        result = subprocess.run(
            ['nvidia-smi'], capture_output=True, text=True, timeout=30
        )
    except (OSError, subprocess.SubprocessError):
        # OSError: binary absent / not executable.
        # SubprocessError: includes TimeoutExpired for a hung driver query.
        result = None

    if result is not None and result.returncode == 0:
        print("‚úÖ GPU Available")
        # Show GPU info
        for line in result.stdout.split('\n'):
            if 'NVIDIA' in line or 'MiB' in line:
                print(f"   {line.strip()}")
        return True

    print("‚ö†Ô∏è No GPU detected, using CPU")
    return False


# Evaluated once at cell execution; later cells read this flag.
USE_GPU = check_gpu()

---
## 2. Configuration

In [None]:
# ============================================
# 2.1 Configuration Class
# ============================================

@dataclass
class Config:
    """Settings container for the whole pipeline.

    Creating an instance has side effects: when ``data_paths`` is empty the
    CSV locations are searched for on disk, and the output directory tree is
    created under ``output_dir``.
    """

    # Keys 'application' and 'credit' mapped to CSV file locations
    data_paths: Dict[str, str] = field(default_factory=dict)

    # Splitting / cross-validation / reproducibility
    cv_folds: int = 5
    test_size: float = 0.1
    val_size: float = 0.2
    random_state: int = 42
    n_jobs: int = -1

    # Hyper-parameter search budget
    optuna_trials: int = 30
    optuna_timeout: int = 600  # seconds (10 minutes)

    # Monetary assumptions used by the business analysis
    cost_false_positive: float = 5000
    cost_false_negative: float = 500
    revenue_per_approval: float = 1200

    # Root folder for every artefact the pipeline writes
    output_dir: str = "ml_pipeline_output"

    # Whether GPU-specific model parameters should be requested
    use_gpu: bool = True

    def __post_init__(self):
        """Fill in missing data paths, then make sure output folders exist."""
        self.data_paths = self.data_paths or self._find_data()
        self._create_output_dirs()

    def _find_data(self) -> Dict[str, str]:
        """Return the first known location where both CSV files exist.

        Candidates are probed in order: Colab Drive, Colab local, Kaggle,
        local ``data/raw`` and finally the working directory. When nothing
        matches, a warning is printed and bare file names are returned.
        """
        candidates = (
            # Colab Drive
            ('/content/drive/MyDrive/credit-approval/data/raw/application_record.csv',
             '/content/drive/MyDrive/credit-approval/data/raw/credit_record.csv'),
            # Colab local
            ('/content/application_record.csv', '/content/credit_record.csv'),
            # Kaggle
            ('/kaggle/input/credit-card-approval-prediction/application_record.csv',
             '/kaggle/input/credit-card-approval-prediction/credit_record.csv'),
            # Local
            ('data/raw/application_record.csv', 'data/raw/credit_record.csv'),
            ('application_record.csv', 'credit_record.csv'),
        )

        for app_csv, credit_csv in candidates:
            app_file = Path(app_csv)
            if not (app_file.exists() and Path(credit_csv).exists()):
                continue
            print(f"‚úÖ Data found: {app_file.parent}")
            return {'application': app_csv, 'credit': credit_csv}

        print("‚ö†Ô∏è Data not found, please set config.data_paths manually")
        return {'application': 'application_record.csv', 'credit': 'credit_record.csv'}

    def _create_output_dirs(self):
        """Create every output sub-folder (idempotent)."""
        subfolders = ('models', 'plots', 'results', 'logs', 'final_model')
        for folder in subfolders:
            target = Path(f"{self.output_dir}/{folder}")
            target.mkdir(parents=True, exist_ok=True)

# Build the shared configuration object. The GPU flag comes from the
# nvidia-smi probe; constructing Config also resolves data paths and creates
# the output folders (see Config.__post_init__).
config = Config(use_gpu=USE_GPU)
print(f"\nüìã Configuration loaded")
print(f"   Random state: {config.random_state}")
print(f"   CV folds: {config.cv_folds}")
print(f"   GPU: {config.use_gpu}")

---
## 3. Data Loading & Validation

In [None]:
# ============================================
# 3.1 Load Data
# ============================================

print("üì• Loading data...")

# Read both CSV files from the locations stored in config.data_paths.
app_data = pd.read_csv(config.data_paths['application'])
credit_data = pd.read_csv(config.data_paths['credit'])

print(f"\nüìä Application data: {app_data.shape}")
print(f"üìä Credit data: {credit_data.shape}")

# Show the first three rows of each table.
# `display` is not imported in this file — presumably the IPython/Jupyter
# builtin, so this cell assumes a notebook kernel.
display(app_data.head(3))
display(credit_data.head(3))

In [None]:
# ============================================
# 3.2 Data Validation
# ============================================

print("üîç Validating data...")

# Check required columns. Raise explicitly rather than using `assert`, which
# is stripped when Python runs with -O and would then skip validation.
required_columns = (
    (app_data, 'ID', "Missing ID in application data"),
    (credit_data, 'ID', "Missing ID in credit data"),
    (credit_data, 'MONTHS_BALANCE', "Missing MONTHS_BALANCE"),
    (credit_data, 'STATUS', "Missing STATUS"),
)
for frame, column, message in required_columns:
    if column not in frame.columns:
        raise ValueError(message)

# ID overlap between the two tables
app_ids = set(app_data['ID'].unique())
credit_ids = set(credit_data['ID'].unique())
common_ids = app_ids & credit_ids

# Guard the percentage against an empty application table (division by zero).
overlap_pct = len(common_ids) / len(app_ids) * 100 if app_ids else 0.0

print(f"\n‚úÖ Validation passed")
print(f"   Application IDs: {len(app_ids):,}")
print(f"   Credit IDs: {len(credit_ids):,}")
print(f"   Common IDs: {len(common_ids):,} ({overlap_pct:.1f}%)")

In [None]:
# ============================================
# 3.3 Create Target Variable (Temporal Split)
# ============================================

print("üéØ Creating target variable...")

BAD_STATUSES = ['2', '3', '4', '5']  # 60+ days overdue
# Rows with MONTHS_BALANCE below this value form the observation window;
# rows at or above it form the outcome ("future") window.
# NOTE(review): presumably MONTHS_BALANCE counts months back from the
# snapshot (0 = most recent) — confirm against the dataset description.
TEMPORAL_CUTOFF = -6

# Split credit data temporally
observed = credit_data[credit_data['MONTHS_BALANCE'] < TEMPORAL_CUTOFF]
future = credit_data[credit_data['MONTHS_BALANCE'] >= TEMPORAL_CUTOFF]

print(f"   Observed records: {len(observed):,}")
print(f"   Future records: {len(future):,}")

# Customers with at least one bad status inside the outcome window.
# STATUS is cast to str so numeric and string codes compare alike.
bad_customers = future[future['STATUS'].astype(str).isin(BAD_STATUSES)]['ID'].unique()

# Keep only customers that have records in both windows. In this cell
# `observed` is used solely for this eligibility filter.
valid_ids = set(observed['ID'].unique()) & set(future['ID'].unique())
print(f"   Valid customers: {len(valid_ids):,}")

# Target is 1 when the customer appears in bad_customers, else 0; the feature
# columns come from the application table only.
data = app_data[app_data['ID'].isin(valid_ids)].copy()
data['target'] = data['ID'].isin(bad_customers).astype(int)

print(f"\nüìä Target distribution:")
print(data['target'].value_counts())
print(f"   Bad rate: {data['target'].mean()*100:.2f}%")

---
## 4. Feature Engineering

In [None]:
# ============================================
# 4.1 Data Splitting
# ============================================

print("‚úÇÔ∏è Splitting data...")

# NOTE(review): only 'target' is dropped, so the 'ID' column stays in X and
# reaches the models as a feature — confirm this is intended.
X = data.drop('target', axis=1)
y = data['target']

# First split: train+val vs test (stratified on the target)
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=config.test_size, 
    random_state=config.random_state, stratify=y
)

# Second split: train vs val. val_size is expressed as a share of the full
# dataset, so it is rescaled to a share of the remaining train+val portion.
val_size_adj = config.val_size / (1 - config.test_size)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=val_size_adj,
    random_state=config.random_state, stratify=y_temp
)

print(f"\nüìä Split sizes:")
print(f"   Train: {len(X_train):,} ({len(X_train)/len(data)*100:.1f}%)")
print(f"   Val: {len(X_val):,} ({len(X_val)/len(data)*100:.1f}%)")
print(f"   Test: {len(X_test):,} ({len(X_test)/len(data)*100:.1f}%)")

In [None]:
# ============================================
# 4.2 Feature Engineering
# ============================================

class FeatureEngineer:
    """Derive, scale and encode features with a fit/transform workflow.

    ``fit`` learns one ``StandardScaler`` per numeric column and one
    ``LabelEncoder`` per object/category column from the frame it is given;
    ``transform`` re-applies those fitted objects to any frame.

    Attributes:
        scalers: column name -> fitted StandardScaler.
        encoders: column name -> fitted LabelEncoder.
        is_fitted: True once ``fit`` has completed.
        feature_names: numeric column names followed by categorical column
            names as seen during ``fit`` (this is not the column order of
            the frame returned by ``transform``).
    """

    def __init__(self):
        self.scalers = {}
        self.encoders = {}
        self.is_fitted = False
        self.feature_names = []

    def fit(self, X: pd.DataFrame) -> 'FeatureEngineer':
        """Learn scaling and encoding parameters from ``X``.

        Args:
            X: raw frame; it is copied, never modified.

        Returns:
            The fitted instance, so calls can be chained.
        """
        X_feat = self._create_features(X.copy())

        # Start clean so that a second fit cannot keep scalers/encoders for
        # columns that are no longer present.
        self.scalers = {}
        self.encoders = {}

        # One scaler per numeric column, fitted on the non-missing values only
        numeric_cols = X_feat.select_dtypes(include=[np.number]).columns.tolist()
        for col in numeric_cols:
            valid = X_feat[col].dropna()
            if len(valid) > 0:
                scaler = StandardScaler()
                scaler.fit(valid.values.reshape(-1, 1))
                self.scalers[col] = scaler

        # One encoder per categorical column, fitted on non-missing values as str
        cat_cols = X_feat.select_dtypes(include=['object', 'category']).columns.tolist()
        for col in cat_cols:
            valid = X_feat[col].dropna().astype(str)
            if len(valid) > 0:
                encoder = LabelEncoder()
                encoder.fit(valid)
                self.encoders[col] = encoder

        self.feature_names = numeric_cols + cat_cols
        self.is_fitted = True
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply derived features, fitted scalers and fitted encoders to ``X``.

        Missing numeric values are left out of scaling and become 0 in the
        result; categorical values not seen during ``fit`` (including missing
        ones, which are treated as 'Unknown') are encoded as -1 unless that
        label was itself seen.

        Args:
            X: raw frame; it is copied, never modified.

        Returns:
            A new frame with the same rows and no missing values.
        """
        X_feat = self._create_features(X.copy())

        # Scale numeric
        for col, scaler in self.scalers.items():
            if col not in X_feat.columns:
                continue
            valid_idx = X_feat[col].notna()
            if not valid_idx.any():
                continue
            # Cast first: assigning scaled floats into an integer column is an
            # implicit upcast, which pandas deprecates (PDEP-6).
            X_feat[col] = X_feat[col].astype(float)
            X_feat.loc[valid_idx, col] = scaler.transform(
                X_feat.loc[valid_idx, col].values.reshape(-1, 1)
            ).ravel()

        # Encode categorical with a single label -> code lookup per column
        # instead of one encoder.transform call per row.
        for col, encoder in self.encoders.items():
            if col not in X_feat.columns:
                continue
            code_of = {str(label): code for code, label in enumerate(encoder.classes_)}
            as_text = X_feat[col].fillna('Unknown').astype(str)
            X_feat[col] = as_text.map(code_of).fillna(-1).astype(int)

        return X_feat.fillna(0)

    def fit_transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Fit on ``X`` and return its transformed version."""
        return self.fit(X).transform(X)

    def _create_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add derived columns to ``df`` in place and return it.

        Each feature is only created when its source column(s) exist, so a
        frame with a subset of the raw columns is accepted.
        """
        # Age: DAYS_BIRTH is negated, so it is expected to be non-positive
        if 'DAYS_BIRTH' in df.columns:
            df['AGE_YEARS'] = (-df['DAYS_BIRTH'] / 365).astype(int)

        # Employment: positive DAYS_EMPLOYED counts as zero years / not employed
        if 'DAYS_EMPLOYED' in df.columns:
            days_employed = df['DAYS_EMPLOYED']
            # clip(upper=0) maps every positive value to 0 before negating,
            # matching the per-row rule "0 if x > 0 else int(-x / 365)".
            df['EMPLOYED_YEARS'] = (-days_employed.clip(upper=0) / 365).astype(int)
            df['IS_EMPLOYED'] = (days_employed < 0).astype(int)

        # Income
        if 'AMT_INCOME_TOTAL' in df.columns:
            df['INCOME_LOG'] = np.log1p(df['AMT_INCOME_TOTAL'])

        # Family
        if 'CNT_CHILDREN' in df.columns:
            df['HAS_CHILDREN'] = (df['CNT_CHILDREN'] > 0).astype(int)

        # Income per person; a family size of 0 is treated as 1 to avoid
        # dividing by zero.
        if 'AMT_INCOME_TOTAL' in df.columns and 'CNT_FAM_MEMBERS' in df.columns:
            fam = df['CNT_FAM_MEMBERS'].replace(0, 1)
            df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / fam

        return df

# Apply feature engineering: scalers and encoders are fitted on the training
# split only, then re-used unchanged for the validation and test splits.
print("üî¨ Engineering features...")

fe = FeatureEngineer()
X_train_fe = fe.fit_transform(X_train)
X_val_fe = fe.transform(X_val)
X_test_fe = fe.transform(X_test)

print(f"\n‚úÖ Feature engineering complete")
print(f"   Features: {len(fe.feature_names)}")
print(f"   Numeric scalers: {len(fe.scalers)}")
print(f"   Categorical encoders: {len(fe.encoders)}")

---
## 5. Model Training

In [None]:
# ============================================
# 5.1 Model Factory
# ============================================

def create_models(use_gpu: bool = False) -> Dict[str, Any]:
    """Build the dictionary of candidate classifiers.

    The three scikit-learn models are always included; XGBoost, LightGBM and
    CatBoost are added only when their import succeeded (``HAS_XGB``,
    ``HAS_LGB``, ``HAS_CAT``). Seeds and parallelism are read from the
    module-level ``config`` object.

    Args:
        use_gpu: when True, GPU-specific parameters are added to the
            XGBoost, LightGBM and CatBoost models.

    Returns:
        Mapping of model name to an unfitted estimator.
    """
    models = {
        'LogisticRegression': LogisticRegression(
            max_iter=1000, random_state=config.random_state, n_jobs=config.n_jobs
        ),
        'RandomForest': RandomForestClassifier(
            n_estimators=100, max_depth=10,
            random_state=config.random_state, n_jobs=config.n_jobs
        ),
        'GradientBoosting': GradientBoostingClassifier(
            n_estimators=100, max_depth=5, learning_rate=0.1,
            random_state=config.random_state
        ),
    }

    if HAS_XGB:
        xgb_params = {
            'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1,
            'random_state': config.random_state, 'n_jobs': config.n_jobs,
            'eval_metric': 'logloss'
        }
        if use_gpu:
            # XGBoost 2.0 deprecated tree_method='gpu_hist' in favour of
            # tree_method='hist' plus device='cuda'; older releases do not
            # know the `device` parameter, so pick by installed version.
            xgb_major = int(xgb.__version__.split('.')[0])
            if xgb_major >= 2:
                xgb_params['tree_method'] = 'hist'
                xgb_params['device'] = 'cuda'
            else:
                xgb_params['tree_method'] = 'gpu_hist'
        models['XGBoost'] = xgb.XGBClassifier(**xgb_params)

    if HAS_LGB:
        lgb_params = {
            'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1,
            'random_state': config.random_state, 'n_jobs': config.n_jobs, 'verbose': -1
        }
        if use_gpu:
            lgb_params['device'] = 'gpu'
        models['LightGBM'] = lgb.LGBMClassifier(**lgb_params)

    if HAS_CAT:
        cat_params = {
            'iterations': 100, 'depth': 6, 'learning_rate': 0.1,
            'random_seed': config.random_state, 'verbose': False
        }
        if use_gpu:
            cat_params['task_type'] = 'GPU'
        models['CatBoost'] = CatBoostClassifier(**cat_params)

    return models


# Instantiate every available candidate with the configured GPU preference.
models = create_models(config.use_gpu)
print(f"üì¶ Available models: {list(models.keys())}")

In [None]:
# ============================================
# 5.2 Train All Models
# ============================================

print("üèãÔ∏è Training models...\n")

# name -> metrics dict; failed models are recorded with success=False so one
# broken estimator does not abort the whole comparison.
results = {}

for name, model in models.items():
    print(f"   Training {name}...", end=" ")
    start = datetime.now()

    try:
        # Train
        model.fit(X_train_fe, y_train)

        # Predict
        y_pred = model.predict(X_val_fe)
        y_proba = model.predict_proba(X_val_fe) if hasattr(model, 'predict_proba') else None

        # Metrics. The target is binary, so roc_auc_score needs the 1-D score
        # of the positive class (column 1); passing the full (n_samples, 2)
        # predict_proba matrix raises a ValueError.
        val_acc = accuracy_score(y_val, y_pred)
        val_auc = roc_auc_score(y_val, y_proba[:, 1]) if y_proba is not None else 0
        val_f1 = f1_score(y_val, y_pred, average='weighted')

        # CV on the training split; 'roc_auc' is the binary scorer matching
        # the validation metric above.
        cv_scores = cross_val_score(model, X_train_fe, y_train, cv=config.cv_folds, scoring='roc_auc')

        duration = (datetime.now() - start).total_seconds()

        results[name] = {
            'model': model,
            'val_accuracy': val_acc,
            'val_auc': val_auc,
            'val_f1': val_f1,
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'duration': duration,
            'success': True
        }

        print(f"‚úÖ Acc={val_acc:.4f}, AUC={val_auc:.4f}, CV={cv_scores.mean():.4f}¬±{cv_scores.std():.4f} ({duration:.1f}s)")

    except Exception as e:
        # Deliberate best-effort: report the failure and continue with the
        # remaining models.
        print(f"‚ùå Failed: {str(e)[:50]}")
        results[name] = {'success': False, 'error': str(e)}

print(f"\n‚úÖ Training complete: {sum(r.get('success', False) for r in results.values())}/{len(models)} models")

---
## 6. Model Evaluation

In [None]:
# ============================================
# 6.1 Test Set Evaluation
# ============================================

print("üìä Evaluating on test set...\n")

# name -> test metrics plus raw predictions, for successfully trained models
test_results = {}

for name, result in results.items():
    if not result.get('success'):
        continue

    model = result['model']

    y_pred = model.predict(X_test_fe)
    y_proba = model.predict_proba(X_test_fe) if hasattr(model, 'predict_proba') else None

    test_acc = accuracy_score(y_test, y_pred)
    # Binary target: score with the positive-class column only; the full
    # (n_samples, 2) matrix makes roc_auc_score raise a ValueError.
    test_auc = roc_auc_score(y_test, y_proba[:, 1]) if y_proba is not None else 0
    test_f1 = f1_score(y_test, y_pred, average='weighted')

    test_results[name] = {
        'accuracy': test_acc,
        'auc': test_auc,
        'f1': test_f1,
        'predictions': y_pred,
        # Stored as returned by predict_proba (one column per class)
        'probabilities': y_proba
    }

    print(f"   {name}: Accuracy={test_acc:.4f}, AUC={test_auc:.4f}, F1={test_f1:.4f}")

In [None]:
# ============================================
# 6.2 Select Best Model
# ============================================

print("\nüèÜ Selecting best model...")

# Composite score = 0.5 * validation AUC + 0.3 * CV mean AUC + 0.2 * stability.
# Only validation/CV evidence is used for ranking: picking the winner by its
# test AUC would make the test metrics reported afterwards optimistically
# biased, because the test set would have influenced the choice.
model_scores = {}
for name in test_results:
    train_res = results[name]

    val_auc = train_res['val_auc']
    cv_mean = train_res['cv_mean']
    cv_std = train_res['cv_std']

    # Maps a CV standard deviation of 0 to 1.0 and larger spreads towards 0
    stability = 1 / (1 + cv_std)
    composite = 0.5 * val_auc + 0.3 * cv_mean + 0.2 * stability

    model_scores[name] = composite

if not model_scores:
    # max() on an empty dict would raise an unhelpful "empty sequence" error
    raise ValueError("No successfully trained model available for selection")

best_model_name = max(model_scores, key=model_scores.get)
best_model = results[best_model_name]['model']

print(f"\nü•á Best Model: {best_model_name}")
print(f"   Composite Score: {model_scores[best_model_name]:.4f}")
print(f"   Test AUC: {test_results[best_model_name]['auc']:.4f}")

In [None]:
# ============================================
# 6.3 Visualizations
# ============================================

# Two panels side by side: test-AUC ranking (left) and the confusion matrix
# of the selected model (right). The figure is saved and then shown.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# --- Left panel: one horizontal bar per model, the winner in green ---
model_names = list(test_results)
aucs = [test_results[label]['auc'] for label in model_names]
bar_colour = {True: '#2ecc71', False: '#3498db'}
colors = [bar_colour[label == best_model_name] for label in model_names]

bars = ax1.barh(model_names, aucs, color=colors)
ax1.set(xlabel='Test AUC', title='Model Comparison', xlim=(0, 1))

# Write each AUC value just past the end of its bar, vertically centred
for rect, score in zip(bars, aucs):
    label_x = rect.get_width() + 0.01
    label_y = rect.get_y() + rect.get_height() / 2
    ax1.text(label_x, label_y, f'{score:.4f}', va='center')

# --- Right panel: confusion matrix of the best model on the test set ---
cm = confusion_matrix(y_test, test_results[best_model_name]['predictions'])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax2)
ax2.set(
    xlabel='Predicted',
    ylabel='Actual',
    title=f'Confusion Matrix - {best_model_name}',
)

plt.tight_layout()
plt.savefig(f"{config.output_dir}/plots/model_comparison.png", dpi=150)
plt.show()

---
## 7. Results & Business Analysis

In [None]:
# ============================================
# 7.1 Business Impact Analysis
# ============================================

print("üí∞ Business Impact Analysis\n")

y_pred_best = test_results[best_model_name]['predictions']

# Confusion matrix elements. The positive class is target == 1 (bad credit),
# so fp = good customers predicted bad, fn = bad customers predicted good.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_best).ravel()

# Costs
# NOTE(review): the config field names are crossed relative to the matrix
# convention above — fp is priced with cost_false_negative (default 500) and
# fn with cost_false_positive (default 5000). The resulting amounts match the
# inline comments (rejecting good is cheap, approving bad is expensive), so
# presumably Config names the costs with "positive = approved" — confirm
# before renaming or swapping anything.
cost_fp_total = fp * config.cost_false_negative  # Rejected good customers
cost_fn_total = fn * config.cost_false_positive  # Approved bad customers
# Revenue is counted for correctly approved good customers only.
revenue = tn * config.revenue_per_approval

total_cost = cost_fp_total + cost_fn_total
net_profit = revenue - total_cost
# ROI is net profit relative to total error cost; 0 when there is no cost.
roi = (net_profit / total_cost * 100) if total_cost > 0 else 0

print(f"üìä Confusion Matrix:")
print(f"   True Positives: {tp:,} (correctly identified bad)")
print(f"   True Negatives: {tn:,} (correctly identified good)")
print(f"   False Positives: {fp:,} (rejected good customers)")
print(f"   False Negatives: {fn:,} (approved bad customers)")

print(f"\nüíµ Financial Impact:")
print(f"   Cost of rejecting good: ${cost_fp_total:,.0f}")
print(f"   Cost of approving bad: ${cost_fn_total:,.0f}")
print(f"   Revenue from approvals: ${revenue:,.0f}")
print(f"   ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ")
print(f"   Net Profit: ${net_profit:,.0f}")
print(f"   ROI: {roi:.1f}%")

In [None]:
# ============================================
# 7.2 Save Results
# ============================================

import joblib

print("üíæ Saving results...")

# Save best model
model_path = f"{config.output_dir}/final_model/model.joblib"
joblib.dump(best_model, model_path)
print(f"   ‚úÖ Model saved: {model_path}")

# Save feature engineer (needed to transform new data before predicting)
fe_path = f"{config.output_dir}/final_model/feature_engineer.joblib"
joblib.dump(fe, fe_path)
print(f"   ‚úÖ Feature engineer saved: {fe_path}")

# Save results summary.
# The confusion-matrix counts are NumPy integers, so with the default integer
# cost settings net_profit is an np.int64, which json cannot serialise
# (TypeError). Every metric is therefore coerced to a built-in float.
summary = {
    'best_model': best_model_name,
    'test_accuracy': float(test_results[best_model_name]['accuracy']),
    'test_auc': float(test_results[best_model_name]['auc']),
    'test_f1': float(test_results[best_model_name]['f1']),
    'net_profit': float(net_profit),
    'roi': float(roi),
    'timestamp': datetime.now().isoformat()
}

with open(f"{config.output_dir}/results/summary.json", 'w') as f:
    json.dump(summary, f, indent=2)

print(f"   ‚úÖ Summary saved")

In [None]:
# ============================================
# 7.3 Final Summary
# ============================================

# Console recap of the run: selected model, its test metrics, the business
# figures computed in 7.1 and the output folder. Nothing is computed here.
print("\n" + "="*60)
print("üéâ PIPELINE COMPLETE")
print("="*60)
print(f"\nüèÜ Best Model: {best_model_name}")
print(f"\nüìä Test Metrics:")
print(f"   Accuracy: {test_results[best_model_name]['accuracy']:.4f}")
print(f"   AUC: {test_results[best_model_name]['auc']:.4f}")
print(f"   F1 Score: {test_results[best_model_name]['f1']:.4f}")
print(f"\nüí∞ Business Impact:")
print(f"   Net Profit: ${net_profit:,.0f}")
print(f"   ROI: {roi:.1f}%")
print(f"\nüìÅ Outputs: {config.output_dir}/")
print("="*60)

---

## 📝 How to Make Predictions

```python
import joblib

# Load model and feature engineer
model = joblib.load('ml_pipeline_output/final_model/model.joblib')
fe = joblib.load('ml_pipeline_output/final_model/feature_engineer.joblib')

# Prepare new data
new_customer = pd.DataFrame([{
    'ID': 999,
    'DAYS_BIRTH': -10000,
    'DAYS_EMPLOYED': -2000,
    'AMT_INCOME_TOTAL': 150000,
    'CNT_CHILDREN': 1,
    'CNT_FAM_MEMBERS': 3,
    # ... other features
}])

# Transform and predict
X_new = fe.transform(new_customer)
prediction = model.predict(X_new)[0]
probability = model.predict_proba(X_new)[0]

print(f"Prediction: {'Bad Credit' if prediction == 1 else 'Good Credit'}")
print(f"Confidence: {max(probability)*100:.1f}%")
```