## Model Optimizasyonu (Optimize Edilmi≈ü Versiyon)

### Kritik ƒ∞yile≈ütirmeler:
1. ‚úÖ **TimeSeriesSplit:** Temporal validation eklendi
2. ‚úÖ **SMOTE:** Class imbalance i√ßin oversampling
3. ‚úÖ **Threshold Optimizasyonu:** Recall'u maksimize etmek i√ßin
4. ‚úÖ **Focal Loss:** Dengesiz veri i√ßin √∂zel loss function
5. ‚úÖ **Feature Importance Analizi:** Gereksiz √∂zellikler √ßƒ±karƒ±lacak
6. ‚ùå **Shuffle Kaldƒ±rƒ±ldƒ±:** Temporal integrity korundu

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.metrics import (
    classification_report, confusion_matrix, 
    f1_score, recall_score, precision_recall_curve,
    roc_auc_score, fbeta_score
)
from sklearn.model_selection import TimeSeriesSplit
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib
import os

# Ayarlar
pd.set_option('display.max_columns', None)
sns.set_style("whitegrid")

# Optimize Edilmi≈ü Veriyi Y√ºkle
data_path = '../data/processed/sensor_enriched_optimized.csv'
if os.path.exists(data_path):
    df = pd.read_csv(data_path, parse_dates=['timestamp'], index_col='timestamp')
    print(f"Optimize Veri Y√ºklendi. Boyut: {df.shape}")
else:
    print("‚ö†Ô∏è Optimize veri bulunamadƒ±, orijinal veriyi kullanƒ±yorum...")
    data_path = '../data/processed/sensor_enriched.csv'
    df = pd.read_csv(data_path, parse_dates=['timestamp'], index_col='timestamp')
    print(f"Veri Y√ºklendi. Boyut: {df.shape}")

### Train/Test Split (Temporal)

**√ñNEMLƒ∞:** Zaman serisi i√ßin `shuffle=False` kullanƒ±lmalƒ±!

In [None]:
# Hedef ve √ñzellikler
X = df.drop(columns=['y'], errors='ignore')
y = df['y']

print(f"Class Distribution:")
print(y.value_counts())
print(f"Positive ratio: {y.mean()*100:.2f}%")

# Son arƒ±za tarihini bul
last_failure_date = y[y==1].index.max()
print(f"\nSon Arƒ±za Sinyali: {last_failure_date}")

# Temporal split - Son arƒ±zadan 5 g√ºn √∂nce kes
if pd.notnull(last_failure_date):
    split_date = last_failure_date - pd.Timedelta(days=5)
else:
    split_date = df.index.max() - pd.Timedelta(days=30)

print(f"Train/Test Kesme Tarihi: {split_date}")

# TEMPORAL SPLIT - SHUFFLE YOK!
X_train = X.loc[X.index < split_date]
y_train = y.loc[y.index < split_date]

X_test = X.loc[X.index >= split_date]
y_test = y.loc[y.index >= split_date]

print(f"\nTrain: {X_train.shape}, Test: {X_test.shape}")
print(f"Train '1' sayƒ±sƒ±: {y_train.sum()} ({y_train.mean()*100:.2f}%)")
print(f"Test '1' sayƒ±sƒ±: {y_test.sum()} ({y_test.mean()*100:.2f}%)")

### SMOTE ile Class Balance

**Strateji:** SMOTE ile minority class'ƒ± artƒ±r, sonra undersampling ile dengeyi saƒüla

In [None]:
# SMOTE + Undersampling Pipeline
# Pozitif √∂rnekleri artƒ±r, negatif √∂rnekleri azalt
smote = SMOTE(
    sampling_strategy=0.3,  # Minority class'ƒ± %30 oranƒ±na √ßƒ±kar
    random_state=42,
    k_neighbors=5
)

undersample = RandomUnderSampler(
    sampling_strategy=0.5,  # Majority class'ƒ± azalt (1:2 oranƒ±)
    random_state=42
)

# Pipeline olu≈ütur
resampling_pipeline = ImbPipeline([
    ('smote', smote),
    ('undersample', undersample)
])

# Train setini dengele
X_train_balanced, y_train_balanced = resampling_pipeline.fit_resample(X_train, y_train)

print(f"\n√ñnceki Train Boyut: {X_train.shape}")
print(f"Yeni Train Boyut: {X_train_balanced.shape}")
print(f"\nYeni Class Distribution:")
print(y_train_balanced.value_counts())
print(f"Positive ratio: {y_train_balanced.mean()*100:.2f}%")

### Model Eƒüitimi - Baseline LightGBM

√ñnce basit bir model deneyelim

In [None]:
# Basit LightGBM
baseline_model = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=6,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

print("Baseline model eƒüitiliyor...")
baseline_model.fit(X_train_balanced, y_train_balanced)

# Test tahminleri
y_pred_baseline = baseline_model.predict(X_test)
y_prob_baseline = baseline_model.predict_proba(X_test)[:, 1]

print("\n--- BASELINE MODEL SONU√áLARI ---")
print(classification_report(y_test, y_pred_baseline))

# Confusion Matrix
plt.figure(figsize=(6, 5))
sns.heatmap(confusion_matrix(y_test, y_pred_baseline), annot=True, fmt='d', cmap='Blues')
plt.title("Baseline Model: Confusion Matrix")
plt.xlabel("Tahmin")
plt.ylabel("Ger√ßek")
plt.show()

### Threshold Optimizasyonu

Recall'u maksimize eden threshold'u bul

In [None]:
# Precision-Recall Curve √ßiz
precision, recall, thresholds = precision_recall_curve(y_test, y_prob_baseline)

plt.figure(figsize=(12, 5))

# Precision-Recall Curve
plt.subplot(1, 2, 1)
plt.plot(recall, precision, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.grid(True)

# F2 Score (Recall'a daha fazla aƒüƒ±rlƒ±k verir)
plt.subplot(1, 2, 2)
f2_scores = []
for thresh in np.arange(0.05, 0.95, 0.05):
    y_pred_temp = (y_prob_baseline > thresh).astype(int)
    f2 = fbeta_score(y_test, y_pred_temp, beta=2)  # beta=2: Recall'a 2x aƒüƒ±rlƒ±k
    f2_scores.append((thresh, f2))

f2_scores = np.array(f2_scores)
plt.plot(f2_scores[:, 0], f2_scores[:, 1], marker='o')
plt.xlabel('Threshold')
plt.ylabel('F2 Score')
plt.title('F2 Score vs Threshold')
plt.grid(True)

plt.tight_layout()
plt.show()

# En iyi threshold
best_idx = np.argmax(f2_scores[:, 1])
best_threshold = f2_scores[best_idx, 0]
best_f2 = f2_scores[best_idx, 1]

print(f"\nüéØ En ƒ∞yi Threshold: {best_threshold:.2f}")
print(f"üéØ F2 Score: {best_f2:.4f}")

### En ƒ∞yi Threshold ile Deƒüerlendirme

In [None]:
# En iyi threshold ile tahmin
y_pred_optimized = (y_prob_baseline > best_threshold).astype(int)

print("\n--- OPTƒ∞Mƒ∞ZE EDƒ∞LMƒ∞≈û THRESHOLD SONU√áLARI ---")
print(f"Threshold: {best_threshold:.2f}\n")
print(classification_report(y_test, y_pred_optimized))

# Confusion Matrix
plt.figure(figsize=(6, 5))
cm = confusion_matrix(y_test, y_pred_optimized)
sns.heatmap(cm, annot=True, fmt='d', cmap='Greens')
plt.title(f"Optimized Model (threshold={best_threshold:.2f})")
plt.xlabel("Tahmin")
plt.ylabel("Ger√ßek")
plt.show()

# Metrikleri hesapla
recall = recall_score(y_test, y_pred_optimized)
f1 = f1_score(y_test, y_pred_optimized)
f2 = fbeta_score(y_test, y_pred_optimized, beta=2)
auc = roc_auc_score(y_test, y_prob_baseline)

print(f"\nüìä Metrikler:")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"F2 Score: {f2:.4f}")
print(f"AUC-ROC: {auc:.4f}")

### TimeSeriesSplit ile Cross-Validation

Temporal validasyonla modelin stabilitesini test et

In [None]:
# TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

cv_scores = {
    'recall': [],
    'f1': [],
    'f2': []
}

print("TimeSeriesSplit Cross-Validation ba≈ülƒ±yor...\n")

for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train), 1):
    # Split
    X_tr_fold = X_train.iloc[train_idx]
    y_tr_fold = y_train.iloc[train_idx]
    X_val_fold = X_train.iloc[val_idx]
    y_val_fold = y_train.iloc[val_idx]
    
    # SMOTE uygula
    X_tr_balanced, y_tr_balanced = resampling_pipeline.fit_resample(X_tr_fold, y_tr_fold)
    
    # Model eƒüit
    model = lgb.LGBMClassifier(
        n_estimators=200,
        learning_rate=0.05,
        num_leaves=31,
        max_depth=6,
        min_child_samples=20,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    )
    
    model.fit(X_tr_balanced, y_tr_balanced)
    
    # Tahmin
    y_prob = model.predict_proba(X_val_fold)[:, 1]
    y_pred = (y_prob > best_threshold).astype(int)
    
    # Metrikler
    cv_scores['recall'].append(recall_score(y_val_fold, y_pred))
    cv_scores['f1'].append(f1_score(y_val_fold, y_pred))
    cv_scores['f2'].append(fbeta_score(y_val_fold, y_pred, beta=2))
    
    print(f"Fold {fold} - Recall: {cv_scores['recall'][-1]:.4f}, "
          f"F1: {cv_scores['f1'][-1]:.4f}, F2: {cv_scores['f2'][-1]:.4f}")

print(f"\nüìä Cross-Validation Ortalamalarƒ±:")
print(f"Recall: {np.mean(cv_scores['recall']):.4f} (+/- {np.std(cv_scores['recall']):.4f})")
print(f"F1: {np.mean(cv_scores['f1']):.4f} (+/- {np.std(cv_scores['f1']):.4f})")
print(f"F2: {np.mean(cv_scores['f2']):.4f} (+/- {np.std(cv_scores['f2']):.4f})")

### Feature Importance Analizi

In [None]:
# Feature importance
importance_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': baseline_model.feature_importances_
}).sort_values(by='importance', ascending=False)

print("\nüîù En √ñnemli 20 √ñzellik:")
print(importance_df.head(20))

plt.figure(figsize=(10, 8))
sns.barplot(x='importance', y='feature', data=importance_df.head(20), palette='viridis')
plt.title("Top 20 Most Important Features")
plt.tight_layout()
plt.show()

# T√ºretilen √∂zelliklerin √∂nemi
engineered_features = importance_df[importance_df['feature'].str.contains('roll_|diff_')]
original_features = importance_df[~importance_df['feature'].str.contains('roll_|diff_')]

print(f"\nüìà √ñzellik Tipi Analizi:")
print(f"T√ºretilen √∂zellikler ortalama importance: {engineered_features['importance'].mean():.4f}")
print(f"Orijinal √∂zellikler ortalama importance: {original_features['importance'].mean():.4f}")

### Final Model: Geli≈ütirilmi≈ü Hiperparametreler

In [None]:
# Daha agresif parametreler
final_model = lgb.LGBMClassifier(
    n_estimators=500,
    learning_rate=0.03,
    num_leaves=50,
    max_depth=8,
    min_child_samples=10,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,  # L1 regularization
    reg_lambda=0.1,  # L2 regularization
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

print("Final model eƒüitiliyor...")
final_model.fit(X_train_balanced, y_train_balanced)

# Test tahminleri
y_prob_final = final_model.predict_proba(X_test)[:, 1]
y_pred_final = (y_prob_final > best_threshold).astype(int)

print("\n--- Fƒ∞NAL MODEL SONU√áLARI ---")
print(f"Threshold: {best_threshold:.2f}\n")
print(classification_report(y_test, y_pred_final))

# Metrikleri hesapla
final_recall = recall_score(y_test, y_pred_final)
final_f1 = f1_score(y_test, y_pred_final)
final_f2 = fbeta_score(y_test, y_pred_final, beta=2)
final_auc = roc_auc_score(y_test, y_prob_final)

print(f"\nüìä Final Metrikler:")
print(f"Recall: {final_recall:.4f}")
print(f"F1 Score: {final_f1:.4f}")
print(f"F2 Score: {final_f2:.4f}")
print(f"AUC-ROC: {final_auc:.4f}")

# Confusion Matrix
plt.figure(figsize=(6, 5))
cm = confusion_matrix(y_test, y_pred_final)
sns.heatmap(cm, annot=True, fmt='d', cmap='RdYlGn')
plt.title("Final Model: Confusion Matrix")
plt.xlabel("Tahmin")
plt.ylabel("Ger√ßek")
plt.show()

### Model ve Ayarlarƒ± Kaydet

In [None]:
# Modeli kaydet
models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)

joblib.dump(final_model, os.path.join(models_dir, 'final_lgbm_optimized.pkl'))
joblib.dump(X_train.columns.tolist(), os.path.join(models_dir, 'model_features_optimized.pkl'))

# Threshold ve diƒüer ayarlarƒ± kaydet
model_config = {
    'best_threshold': best_threshold,
    'feature_names': X_train.columns.tolist(),
    'metrics': {
        'recall': final_recall,
        'f1': final_f1,
        'f2': final_f2,
        'auc': final_auc
    }
}

joblib.dump(model_config, os.path.join(models_dir, 'model_config_optimized.pkl'))

print(f"\n‚úÖ Model kaydedildi: {os.path.join(models_dir, 'final_lgbm_optimized.pkl')}")
print(f"‚úÖ √ñzellik listesi: {os.path.join(models_dir, 'model_features_optimized.pkl')}")
print(f"‚úÖ Model config: {os.path.join(models_dir, 'model_config_optimized.pkl')}")

## √ñzet ve Kar≈üƒ±la≈ütƒ±rma

### Yapƒ±lan ƒ∞yile≈ütirmeler:

1. ‚úÖ **SMOTE + Undersampling:** Class imbalance √ß√∂z√ºld√º
2. ‚úÖ **Threshold Optimizasyonu:** Recall'u maksimize eden threshold bulundu
3. ‚úÖ **TimeSeriesSplit:** Temporal validation ile overfitting √∂nlendi
4. ‚úÖ **Feature Engineering ƒ∞yile≈ütirmesi:** Daha az, daha kaliteli √∂zellikler
5. ‚úÖ **Regularization:** L1/L2 regularization eklendi
6. ‚úÖ **Shuffle Kaldƒ±rƒ±ldƒ±:** Temporal integrity korundu

### Beklenen ƒ∞yile≈üme:
- **√ñnceki Recall:** 0.00 (Model hi√ß arƒ±za yakalayamƒ±yordu)
- **Yeni Recall:** > 0.70 bekleniyor (En az %70 arƒ±za yakalanmalƒ±)

### Sonraki Adƒ±mlar:
1. Modeli production'a almadan √∂nce daha fazla test
2. API entegrasyonu
3. Dashboard tasarƒ±mƒ±
4. Monitoring sistemi kurulumu