# 3. Model Training (Model Eğitimi)

Bu notebook'ta churn tahmin modellerini eğiteceğiz.

**Amaçlar:**
- Veriyi train/test olarak bölmek
- Baseline model eğitmek
- XGBoost model eğitmek
- Model performansını değerlendirmek
- En iyi modeli kaydetmek

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import sys
sys.path.append('..')

from src.data.loader import DataLoader
from src.data.preprocessor import DataPreprocessor
from src.features.engineer import FeatureEngineer
from src.models.trainer import ModelTrainer
from src.models.evaluator import ModelEvaluator
from src.config import settings

%matplotlib inline

## 3.1 Veri Hazırlama

In [None]:
# Load and preprocess data
loader = DataLoader(data_dir='../data')
preprocessor = DataPreprocessor()
engineer = FeatureEngineer()

# Prefer the cached, already-processed feature table; rebuild it from the raw
# sources only when that load fails.
# `except Exception` (instead of a bare `except:`) keeps Ctrl-C / kernel
# interrupts from being swallowed and silently triggering the fallback, and
# the failure reason is printed so a broken cache does not go unnoticed.
try:
    df = loader.load_csv('processed_features.csv', directory='processed')
    print("Loaded processed data")
except Exception as processed_err:
    print(f"Could not load processed data ({processed_err!r}); processing from raw data")

    # Raw source preference: load_customer_360() first, load_telco_churn()
    # as the fallback. A failure of the fallback itself is left to propagate.
    try:
        df = loader.load_customer_360()
    except Exception as customer_360_err:
        print(f"Could not load customer 360 data ({customer_360_err!r}); "
              "falling back to Telco churn data")
        df = loader.load_telco_churn()

    # Pipeline order matters: clean -> impute -> engineer features ->
    # encode categoricals -> scale numerics (encoders/scalers fitted here).
    df = preprocessor.clean_data(df)
    df = preprocessor.handle_missing_values(df)
    df = engineer.create_all_features(df)
    df = preprocessor.encode_categorical(df, fit=True)
    df = preprocessor.scale_numeric(df, fit=True)
    print("Processed from raw data")

print(f"Shape: {df.shape}")

In [None]:
# Prepare features and target
# The label column is 'churned' when the frame has it, otherwise 'Churn'.
if 'churned' in df.columns:
    churn_col = 'churned'
else:
    churn_col = 'Churn'

# Features are the numeric columns, minus the target and the identifier /
# timestamp / churn-detail columns listed below.
exclude_cols = [
    'customer_id',
    churn_col,
    'churn_date',
    'churn_reason',
    'created_at',
    'updated_at',
]
numeric_cols = df.select_dtypes(include=[np.number]).columns
feature_cols = [col for col in numeric_cols if col not in exclude_cols]

print(f"Number of features: {len(feature_cols)}")

# Copy the feature matrix so later edits to X do not touch df; cast the label to int.
X = df[feature_cols].copy()
y = df[churn_col].astype(int)

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"\nClass distribution:")
print(y.value_counts())

## 3.2 Train/Test Split

In [None]:
# Split data
# Test fraction and seed come from the project settings; stratifying on y
# keeps the class ratio the same in both partitions.
split_options = dict(
    test_size=settings.test_size,
    random_state=settings.random_state,
    stratify=y,
)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Relative class frequencies in the training partition.
print(f"\nTraining class distribution:")
print(y_train.value_counts(normalize=True))

## 3.3 Model Training

In [None]:
# Create the trainer; '../models' is handed to it as its model directory.
trainer = ModelTrainer(model_dir='../models')

# Fit every model the trainer provides on the training split, with
# hyperparameter tuning switched off.
models = trainer.train_all_models(
    X_train,
    y_train,
    tune_hyperparameters=False,
)

print(f"\nTrained models: {list(models.keys())}")

## 3.4 Cross-Validation

In [None]:
# Ask the trainer to compare its models on the training split; the returned
# table is reused by the plotting cell that follows.
comparison = trainer.compare_models(
    X_train,
    y_train,
)
print("Model Comparison (5-Fold CV):")
comparison  # bare final expression so the notebook renders the table

In [None]:
# Visualize comparison
# DataFrame.plot opens its own figure when no `ax` is passed, so a separate
# plt.figure(figsize=...) beforehand only leaves an empty figure behind and
# its size never reaches the chart. Passing `figsize` to the plot call sizes
# the figure that actually holds the bars.
comparison.plot(
    kind='bar',
    x='model',
    y='mean_roc_auc',
    yerr='std_roc_auc',  # error bars taken from the CV standard deviation column
    capsize=4,
    figsize=(10, 5),
)
plt.title('Model Comparison - ROC AUC')
plt.ylabel('ROC AUC Score')
plt.ylim(0.5, 1.0)  # start the axis at 0.5 (the random-guess ROC AUC level)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 3.5 Model Evaluation

In [None]:
# Evaluator configured with a 0.5 decision threshold.
evaluator = ModelEvaluator(threshold=0.5)

# Pick up the model the trainer recorded as best, together with its name.
best_model = trainer.best_model
best_model_name = trainer.best_model_name

print(f"Best Model: {best_model_name}")

# Column 1 of predict_proba is the score for the second (positive) class.
test_probabilities = best_model.predict_proba(X_test)
y_pred_proba = test_probabilities[:, 1]

# Compute the metrics on the held-out test split and print the report.
metrics = evaluator.evaluate(y_test.values, y_pred_proba)
evaluator.print_report()

In [None]:
# Print the evaluator's classification report for the test-set predictions.
print("\nDetailed Classification Report:")
report_text = evaluator.get_classification_report(y_test.values, y_pred_proba)
print(report_text)

In [None]:
# Find optimal threshold
# Ask the evaluator for the probability cutoff that does best on F1
# (metric='f1'); the call is unpacked into the cutoff and, presumably, the
# F1 score reached at that cutoff.
# NOTE(review): the search runs on y_test / y_pred_proba, i.e. the same test
# split used for the final metrics, so the F1 printed here is likely
# optimistic. Consider tuning the cutoff on a validation split instead.
optimal_threshold, best_f1 = evaluator.find_optimal_threshold(y_test.values, y_pred_proba, metric='f1')
print(f"\nOptimal Threshold: {optimal_threshold:.2f}")
print(f"Best F1 Score: {best_f1:.4f}")

In [None]:
# Table of evaluator metrics across a range of decision thresholds,
# computed on the test-set probabilities.
threshold_perf = evaluator.get_performance_at_thresholds(
    y_test.values,
    y_pred_proba,
)
threshold_perf  # bare final expression so the notebook renders the table

## 3.6 ROC Curve

In [None]:
from sklearn.metrics import roc_curve, auc

# One ROC curve per trained model, drawn on a shared set of axes.
fig, ax = plt.subplots(figsize=(10, 8))

for model_name, fitted_model in trainer.models.items():
    # Positive-class scores on the test split.
    test_scores = fitted_model.predict_proba(X_test)[:, 1]
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, test_scores)
    roc_auc = auc(false_pos_rate, true_pos_rate)
    ax.plot(false_pos_rate, true_pos_rate, lw=2, label=f'{model_name} (AUC = {roc_auc:.3f})')

# Diagonal reference line for a random classifier.
ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Random')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves - Model Comparison')
ax.legend(loc='lower right')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 3.7 Feature Importance

In [None]:
# Importance table from the trainer, labelled with the feature column names.
importance_df = trainer.get_feature_importance(feature_cols)

# Horizontal bar chart of the first 15 rows of the table.
top_features = importance_df.head(15)
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(top_features['feature'], top_features['importance'])
ax.set_xlabel('Importance')
ax.set_title('Top 15 Feature Importance')
ax.invert_yaxis()  # first row of the table ends up at the top of the chart
plt.tight_layout()
plt.show()

print("\nTop 10 Most Important Features:")
importance_df.head(10)

## 3.8 Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Turn the test probabilities into hard labels at the tuned threshold.
y_pred = (y_pred_proba >= optimal_threshold).astype(int)
cm = confusion_matrix(y_test, y_pred)

# Annotated heatmap: rows are actual classes, columns are predicted classes.
class_names = ['No Churn', 'Churn']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title(f'Confusion Matrix (threshold={optimal_threshold:.2f})')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.tight_layout()
plt.show()

## 3.9 Save Model

In [None]:
import shutil

# Persist the best model under an explicit version tag.
model_path = trainer.save_model(model_name=best_model_name, version='v1.0.0')
print(f"Model saved to: {model_path}")

# Keep a second copy next to it under a fixed "<name>_latest.pkl" filename.
latest_filename = f"{best_model_name}_latest.pkl"
latest_path = model_path.parent / latest_filename
shutil.copy(model_path, latest_path)
print(f"Also saved as: {latest_path}")

In [None]:
# Final recap: best model, its test metrics (from `metrics`), the tuned
# threshold, and where the model file was written.
banner = "=" * 50
print(banner)
print("MODEL TRAINING SUMMARY")
print(banner)
print(f"Best Model: {best_model_name}")

# (label, key in `metrics`) pairs, printed in this order with 4 decimals.
summary_rows = (
    ("ROC-AUC Score", "roc_auc"),
    ("F1 Score", "f1_score"),
    ("Accuracy", "accuracy"),
    ("Precision", "precision"),
    ("Recall", "recall"),
)
for label, key in summary_rows:
    print(f"{label}: {metrics[key]:.4f}")

print(f"Optimal Threshold: {optimal_threshold:.2f}")
print(f"\nModel saved to: {model_path}")