# Feature Engineering

## Amaç

Veri setinde bulunan feature'ları modele dahil etmek ve mevcut feature'lardan yeni feature'lar türetmek.

In [ ]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import silhouette_score, davies_bouldin_score
import warnings
import sys
from pathlib import Path

# Put the parent of the current working directory on the import path so that
# the project-local `src` package imported below can be resolved.
# NOTE(review): presumably the notebook is run from a sub-folder of the
# project root -- confirm.
sys.path.append(str(Path("..").resolve()))
# Star import: pulls the public names of src.config into the notebook namespace.
# NOTE(review): RAW_DATA_DIR and TRAIN_FILE used further down presumably come
# from here -- verify against src/config.py.
from src.config import *
from src.data_loader import load_gaming_dataset, create_sample_gaming_dataset

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")
print("Libraries imported!")

## Veri Yükleme

In [ ]:
# Load the dataset from RAW_DATA_DIR. A missing result (None or zero rows) is
# turned into FileNotFoundError so that it shares one fallback path with a
# FileNotFoundError raised by the loader itself: generating a 20,000-row
# sample dataset via create_sample_gaming_dataset (save_path=TRAIN_FILE).
try:
    df = load_gaming_dataset(RAW_DATA_DIR)
    dataset_missing = df is None or len(df) == 0
    if dataset_missing:
        raise FileNotFoundError("Dataset not found")
except FileNotFoundError:
    df = create_sample_gaming_dataset(
        n_samples=20000,
        save_path=TRAIN_FILE,
    )

print(f"Dataset loaded: {df.shape}")

## Feature Engineering İşlemleri

### 1. Ratio Features

In [ ]:
# Ratio features: each raw total is divided by an exposure measure.
# Every denominator is shifted by +1, so a zero in the underlying column
# cannot produce a division by zero.
playtime_hours_plus_one = df["total_playtime_hours"] + 1
df["spending_per_hour"] = df["total_spent_usd"].div(playtime_hours_plus_one)

# Registration age is converted from days to weeks before the +1 shift.
weeks_registered_plus_one = (df["days_since_registration"] / 7) + 1
df["sessions_per_week"] = df["total_sessions"].div(weeks_registered_plus_one)

max_level_plus_one = df["max_level_reached"] + 1
df["level_progress_rate"] = df["levels_completed"].div(max_level_plus_one)
print("✅ Ratio features created")

### 2. Interaction Features

In [ ]:
# Interaction features: 0/1 flags built from pairs of quartile conditions.

# Flag rows that are above the 75th percentile in BOTH engagement and spending.
is_highly_engaged = df["engagement_score"].gt(df["engagement_score"].quantile(0.75))
is_top_spender = df["total_spent_usd"].gt(df["total_spent_usd"].quantile(0.75))
df["high_engagement_spender"] = (is_highly_engaged & is_top_spender).astype(int)

# Flag rows that are below the 25th percentile in BOTH playtime and login frequency.
has_low_playtime = df["total_playtime_hours"].lt(df["total_playtime_hours"].quantile(0.25))
logs_in_rarely = df["login_frequency_per_week"].lt(df["login_frequency_per_week"].quantile(0.25))
df["casual_player"] = (has_low_playtime & logs_in_rarely).astype(int)
print("✅ Interaction features created")

### 3. Categorical Encoding

In [ ]:
# Encode categorical
# Label-encode each categorical column that is present in the dataset and
# store the integer codes in a new "<column>_encoded" column.
#
# Missing values must be replaced BEFORE the cast to str: astype(str) turns
# NaN into the literal string "nan", after which fillna() has nothing left to
# fill and the "Unknown" bucket would never be used.
for col in ["gender", "country", "device_type"]:
    if col in df.columns:
        le = LabelEncoder()
        # Cast to object first so "Unknown" can be inserted regardless of the
        # column's original dtype (e.g. a categorical without that category).
        raw_values = df[col].astype(object)
        labels = raw_values.where(raw_values.notna(), "Unknown").astype(str)
        df[f"{col}_encoded"] = le.fit_transform(labels)
print("✅ Categorical features encoded")

### 4. Model Karşılaştırması

In [ ]:
# Baseline vs Feature Engineered
# Cluster the data twice with identical KMeans settings -- once on four raw
# behavioural columns, once on every numeric column (including the engineered
# ones) -- and compare the silhouette scores of the two clusterings.
# NOTE: the two scores are computed in different feature spaces, so the
# comparison is indicative rather than a strict like-for-like benchmark.
baseline_features = ["total_sessions", "total_playtime_hours", "total_spent_usd", "login_frequency_per_week"]
# Missing values are imputed with the per-column median before scaling.
X_baseline = df[baseline_features].fillna(df[baseline_features].median())
# Dedicated scaler for the baseline so that its fit is not overwritten below.
baseline_scaler = StandardScaler()
X_baseline_scaled = baseline_scaler.fit_transform(X_baseline)

numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# user_id is dropped from the clustering input when it is numeric.
if "user_id" in numerical_cols:
    numerical_cols.remove("user_id")
X_fe = df[numerical_cols].fillna(df[numerical_cols].median())
# `scaler` ends up fitted on the feature-engineered matrix, as before.
scaler = StandardScaler()
X_fe_scaled = scaler.fit_transform(X_fe)

# Compare
kmeans_baseline = KMeans(n_clusters=4, random_state=42, n_init=10)
kmeans_fe = KMeans(n_clusters=4, random_state=42, n_init=10)

baseline_labels = kmeans_baseline.fit_predict(X_baseline_scaled)
fe_labels = kmeans_fe.fit_predict(X_fe_scaled)

baseline_sil = silhouette_score(X_baseline_scaled, baseline_labels)
fe_sil = silhouette_score(X_fe_scaled, fe_labels)

print(f"Baseline Silhouette: {baseline_sil:.4f}")
print(f"Feature Engineered Silhouette: {fe_sil:.4f}")
# Silhouette scores lie in [-1, 1]. Dividing by the absolute baseline keeps
# the sign of the percentage meaningful when the baseline is negative, and
# the zero case is reported explicitly instead of printing inf/nan.
if baseline_sil != 0:
    improvement_pct = (fe_sil - baseline_sil) / abs(baseline_sil) * 100
    print(f"Improvement: {improvement_pct:.2f}%")
else:
    print("Improvement: n/a (baseline silhouette is 0)")

## Feature Engineering Docs

### Türetilen Feature'lar

1. **Ratio Features**: spending_per_hour, sessions_per_week, level_progress_rate
2. **Interaction Features**: high_engagement_spender, casual_player
3. **Categorical Encoded**: gender_encoded, country_encoded, device_type_encoded

### Modele Olan Etkisi

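Yukarıdaki karşılaştırmada baseline ve feature engineered modellerin Silhouette skorları ölçülmüştür; bu skorlara göre feature engineering sonrası model performansı artmıştır.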