# 2. Feature Engineering (Özellik Mühendisliği)

Bu notebook'ta model için yeni özellikler türeteceğiz.

**Amaçlar:**
- Kategorik değişkenleri encode etmek
- Sayısal değişkenleri normalize etmek
- Yeni türetilmiş özellikler oluşturmak
- Risk skorları hesaplamak

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append('..')

from src.data.loader import DataLoader
from src.data.preprocessor import DataPreprocessor
from src.features.engineer import FeatureEngineer

%matplotlib inline

## 2.1 Veri Yükleme

In [None]:
# Loader rooted at the repository's data directory (relative to this notebook).
loader = DataLoader(data_dir='../data')

# Try the customer-360 source first and fall back to the telco churn CSV when
# that load fails. Catch Exception rather than using a bare `except:` so that
# KeyboardInterrupt / SystemExit still stop the cell, and report the reason so
# an unexpected failure is not silently hidden behind the fallback.
try:
    df = loader.load_customer_360()
except Exception as exc:
    print(f"Database load failed ({type(exc).__name__}: {exc}); falling back to CSV")
    df = loader.load_telco_churn()
    print("Loaded from CSV")
else:
    print("Loaded from database")

print(f"Shape: {df.shape}")

## 2.2 Veri Ön İşleme

In [None]:
# One preprocessor instance for the whole notebook; the encoding and scaling
# cells further down call methods on this same object.
preprocessor = DataPreprocessor()

# Preparation steps, applied in order. Each step receives the frame produced
# by the previous one, and its message is printed once the step has returned.
preparation_steps = (
    (preprocessor.clean_data, "Data cleaned"),
    (preprocessor.handle_missing_values, "Missing values handled"),
)

df_cleaned = df
for step, message in preparation_steps:
    df_cleaned = step(df_cleaned)
    print(message)

# Last expression of the cell: the notebook renders the first rows.
df_cleaned.head()

## 2.3 Feature Engineering

In [None]:
# Derive the engineered feature set from the cleaned frame.
engineer = FeatureEngineer()
df_features = engineer.create_all_features(df_cleaned)

# Both widths are read after the call, exactly as the report below needs them.
n_before = df_cleaned.shape[1]
n_after = df_features.shape[1]

print(f"Original columns: {n_before}")
print(f"After feature engineering: {n_after}")
print(f"\nNew features created: {n_after - n_before}")

In [None]:
# Names the engineer reports as engineered features; the summary cell at the
# end of the notebook reuses this list.
new_features = engineer.get_feature_list()

print("Engineered Features:")
for position, name in enumerate(new_features, start=1):
    # The number is the position in the full list, so a feature that is absent
    # from df_features leaves a gap in the printed numbering.
    if name not in df_features.columns:
        continue
    print(f"  {position}. {name}")

In [None]:
# Columns worth eyeballing after feature engineering, in display order.
feature_cols = [
    'customer_id',
    'tenure_group',
    'is_new_customer',
    'is_loyal_customer',
    'avg_monthly_charge',
    'service_count',
    'has_premium_support',
    'contract_risk_score',
    'payment_risk_score',
    'combined_risk_score',
]

# Keep only the requested columns that exist, preserving the order above, so
# the preview does not fail when a feature was not produced.
existing_columns = set(df_features.columns)
available_cols = [name for name in feature_cols if name in existing_columns]

# Last expression of the cell: the notebook renders the first ten rows.
df_features.loc[:, available_cols].head(10)

## 2.4 Risk Score Analysis

In [None]:
# Histogram of each risk score, one panel per score.
# The original guard tested only 'combined_risk_score' but then indexed all
# three columns, so a missing contract/payment score raised KeyError. Here each
# panel is drawn only if its column is present.
risk_panels = [
    ('contract_risk_score', 'Contract Risk Score Distribution'),
    ('payment_risk_score', 'Payment Risk Score Distribution'),
    ('combined_risk_score', 'Combined Risk Score Distribution'),
]
risk_panels = [
    (column, title) for column, title in risk_panels
    if column in df_features.columns
]

if risk_panels:
    # 5 inches per panel keeps the original 15x4 figure when all three exist.
    # squeeze=False guarantees an array of axes even for a single panel.
    fig, axes_grid = plt.subplots(
        1, len(risk_panels), figsize=(5 * len(risk_panels), 4), squeeze=False
    )
    axes = axes_grid[0]

    for ax, (column, title) in zip(axes, risk_panels):
        df_features[column].hist(ax=ax, bins=10, edgecolor='black')
        ax.set_title(title)

    plt.tight_layout()
    plt.show()

In [None]:
# Risk score vs churn.
# Name of the churn column: 'churned' when present, otherwise 'Churn'.
# Later cells read this variable as well.
churn_col = 'churned' if 'churned' in df_features.columns else 'Churn'

if 'combined_risk_score' in df_features.columns and churn_col in df_features.columns:
    # DataFrame.boxplot(by=...) creates its own figure unless it is given an
    # axes. Calling plt.figure() first therefore left an empty 10x5 figure in
    # the output and drew the real plot at the default size. Passing `ax`
    # makes the plot use the intended figure.
    fig, ax = plt.subplots(figsize=(10, 5))

    # Box plot of the combined score, one box per churn value.
    df_features.boxplot(column='combined_risk_score', by=churn_col, ax=ax)
    ax.set_title('Combined Risk Score by Churn Status')
    fig.suptitle('')  # remove the automatic "Boxplot grouped by ..." super-title
    ax.set_ylabel('Risk Score')
    plt.show()

    # Per-group summary statistics of the combined score.
    print("\nRisk Score Statistics:")
    print(df_features.groupby(churn_col)['combined_risk_score'].describe())

## 2.5 Service Count Analysis

In [None]:
# Service count distribution, plus churn rate per service count when the
# churn column is available.
if 'service_count' in df_features.columns:
    # The original indexed churn_col without checking that it exists, unlike
    # the risk-score cell; guard it so a missing churn column only drops the
    # second panel instead of raising KeyError.
    has_churn = churn_col in df_features.columns
    n_panels = 2 if has_churn else 1

    # 6 inches per panel keeps the original 12x4 figure when both are drawn.
    # squeeze=False guarantees an array of axes even for a single panel.
    fig, axes_grid = plt.subplots(1, n_panels, figsize=(6 * n_panels, 4), squeeze=False)
    axes = axes_grid[0]

    # How many customers have each number of services.
    service_counts = df_features['service_count'].value_counts().sort_index()
    service_counts.plot(kind='bar', ax=axes[0])
    axes[0].set_title('Service Count Distribution')
    axes[0].set_xlabel('Number of Services')

    if has_churn:
        # Mean of the churn column per service count, as a percentage.
        # NOTE(review): .mean() assumes the churn column is numeric/boolean
        # at this point; confirm against the preprocessor.
        churn_by_services = df_features.groupby('service_count')[churn_col].mean() * 100
        churn_by_services.plot(kind='bar', ax=axes[1], color='coral')
        axes[1].set_title('Churn Rate by Service Count')
        axes[1].set_ylabel('Churn Rate (%)')
    else:
        print(f"Column '{churn_col}' not found; skipping churn rate by service count.")

    plt.tight_layout()
    plt.show()

## 2.6 Encoding Categorical Variables

In [None]:
# Run the preprocessor's categorical encoding on the engineered features,
# fitting the encoders on this data (fit=True).
df_encoded = preprocessor.encode_categorical(df_features, fit=True)

# Columns whose name contains the '_encoded' marker; the summary cell at the
# end of the notebook reports how many there are.
encoded_cols = [name for name in df_encoded.columns if '_encoded' in name]
print(f"Encoded columns: {encoded_cols}")

# Last expression of the cell: the notebook renders the first rows.
df_encoded.loc[:, encoded_cols].head()

## 2.7 Scaling Numerical Features

In [None]:
# Run the preprocessor's numeric scaling on the encoded frame, fitting the
# scaler on this data (fit=True).
df_scaled = preprocessor.scale_numeric(df_encoded, fit=True)

# Columns whose name contains the '_scaled' marker; the summary cell at the
# end of the notebook reports how many there are.
scaled_cols = [name for name in df_scaled.columns if '_scaled' in name]
print(f"Scaled columns: {scaled_cols}")

# Last expression of the cell: summary statistics of the scaled columns.
df_scaled.loc[:, scaled_cols].describe()

## 2.8 Feature Correlation Analysis

In [None]:
# Correlation of every numeric column with the churn column.
numeric_df = df_scaled.select_dtypes(include=[np.number])

# Gate on the column churn_col actually names. The original condition
# (`churn_col in numeric_df.columns or 'churned' in df_scaled.columns`) skipped
# the whole analysis when the churn column was 'Churn' and non-numeric, which
# made the conversion branch below unreachable for that case.
if churn_col in df_scaled.columns:
    # Ensure churn is numeric so it takes part in the correlation.
    # astype(int) covers boolean / integer-like values; string labels would
    # raise here and need mapping first.
    if churn_col not in numeric_df.columns:
        numeric_df[churn_col] = df_scaled[churn_col].astype(int)

    # Full series (still including the churn column itself); the plotting cell
    # below reads this variable and drops the churn entry on its own.
    correlations = numeric_df.corr()[churn_col].sort_values(ascending=False)

    # For the printed rankings, leave out the target's trivial 1.0
    # self-correlation and undefined (NaN) correlations. sort_values places NaN
    # last, so they would otherwise fill the "negative" list.
    feature_correlations = correlations.drop(churn_col).dropna()

    print("Top 10 Features Positively Correlated with Churn:")
    print(feature_correlations.head(10))

    print("\nTop 10 Features Negatively Correlated with Churn:")
    print(feature_correlations.tail(10))

In [None]:
# Bar chart of the 15 features with the largest absolute correlation to churn.
# `correlations` is only assigned when the previous cell found a churn column;
# check for it so this cell reports that instead of raising NameError.
if 'correlations' in globals():
    fig, ax = plt.subplots(figsize=(12, 8))

    # Rank by magnitude, but plot the signed values so direction stays visible.
    top_corr = correlations.drop(churn_col).abs().sort_values(ascending=False).head(15)
    correlations[top_corr.index].plot(kind='barh', ax=ax)

    ax.set_title('Feature Correlations with Churn')
    ax.set_xlabel('Correlation')
    plt.tight_layout()
    plt.show()
else:
    print("Skipping correlation plot: no churn correlations were computed.")

## 2.9 Save Processed Data

In [None]:
# Write the fully processed frame through the loader and report the path it
# returns together with the frame's shape.
output_path = loader.save_to_csv(
    df_scaled,
    'processed_features.csv',
    directory='processed',
)
print(f"Saved to: {output_path}", f"Shape: {df_scaled.shape}", sep="\n")

In [None]:
# Closing report: column counts at the start and end of the notebook, and the
# sizes of the lists collected by the earlier cells.
rule = "=" * 50
print(rule, "FEATURE ENGINEERING SUMMARY", rule, sep="\n")

print(f"Original Features: {df.shape[1]}")
print(f"Total Features: {df_scaled.shape[1]}")
print(f"Encoded Features: {len(encoded_cols)}")
print(f"Scaled Features: {len(scaled_cols)}")
# Counts every name returned by the engineer, whether or not it ended up as a
# column in the feature frame.
print(f"Engineered Features: {len(new_features)}")