<a href="https://colab.research.google.com/github/nurcoz/Advertising/blob/main/Untitled29.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
============================================================================
MAKALE REPLƒ∞KASYONU: Ali et al. (2021) - DATA LEAKAGE VERSƒ∞YONU
============================================================================
‚ö†Ô∏è  Dƒ∞KKAT: Bu kod MAKALENƒ∞N MUHTEMEL HATALARINI taklit ediyor!
    Ama√ß: Makalenin %80+ accuracy'sini elde etmek i√ßin ne yaptƒ±klarƒ±nƒ± g√∂rmek

‚ùå KASITLI HATALAR:
1. LAG YOK - Same-day features ‚Üí next-day target (LEAK!)
2. Normalize BEFORE split (LEAK!)
3. Shuffle=True in CV (LEAK!)
4. No proper time-series handling

Bu sonu√ßlar GER√áEK√áƒ∞ DEƒûƒ∞L! Sadece makalenin hatalarƒ±nƒ± kanƒ±tlamak i√ßin.
============================================================================
"""

import sys
import subprocess
print("üì¶ K√ºt√ºphaneler y√ºkleniyor...")
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
                      "yfinance", "ta", "scikit-learn", "pandas", "numpy"])

import yfinance as yf
import pandas as pd
import numpy as np
import ta
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                            precision_score, recall_score, f1_score, confusion_matrix)
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ Hazƒ±r!\n")

# ============================================================================
# 1. VERƒ∞ √áEKME
# ============================================================================
print("="*80)
print("VERƒ∞ √áEKME")
print("="*80)

tickers = {
    'KSE100': '^KSE',
    'KOSPI': '^KS11',
    'Nikkei225': '^N225',
    'SZSE': '000001.SS'
}

all_data = {}
for name, ticker in tickers.items():
    print(f"{name}...", end=" ")
    try:
        data = yf.download(ticker, start="2011-01-01", end="2020-09-27",
                          progress=False, auto_adjust=True)
        if len(data) == 0:
            print("‚ùå")
            continue

        if isinstance(data.columns, pd.MultiIndex):
            data.columns = data.columns.get_level_values(0)

        data = data[['Open', 'High', 'Low', 'Close', 'Volume']].dropna()
        all_data[name] = data
        print(f"‚úÖ {len(data)}")
    except Exception as e:
        print(f"‚ùå {e}")

print(f"\n‚úÖ {len(all_data)} borsa\n")

# ============================================================================
# 2. TEKNƒ∞K G√ñSTERGELER
# ============================================================================
print("="*80)
print("TEKNƒ∞K G√ñSTERGELER (15)")
print("="*80)

def calculate_indicators(df):
    df = df.copy()

    high = df['High'].squeeze()
    low = df['Low'].squeeze()
    close = df['Close'].squeeze()

    # 1-2. Stochastic
    stoch = ta.momentum.StochasticOscillator(high, low, close, window=14, smooth_window=3)
    df['Stochastic_K'] = stoch.stoch()
    df['Stochastic_D'] = stoch.stoch_signal()

    # 3. ROC
    df['ROC'] = ta.momentum.ROCIndicator(close, window=10).roc()

    # 4. Williams %R
    df['Williams_R'] = ta.momentum.WilliamsRIndicator(high, low, close, lbp=14).williams_r()

    # 5. Momentum
    df['Momentum'] = close.diff(4)

    # 6-7. Disparity
    ma5 = close.rolling(5).mean()
    ma14 = close.rolling(14).mean()
    df['Disparity_5'] = (close / ma5) * 100
    df['Disparity_14'] = (close / ma14) * 100

    # 8. OSCP
    ma10 = close.rolling(10).mean()
    df['OSCP'] = (ma5 - ma10) / ma5

    # 9. CCI
    tp = (high + low + close) / 3
    df['CCI'] = (tp - tp.rolling(20).mean()) / (0.015 * tp.rolling(20).std())

    # 10. RSI
    delta = close.diff()
    gain = delta.where(delta > 0, 0).rolling(14).mean()
    loss = -delta.where(delta < 0, 0).rolling(14).mean()
    rs = gain / loss
    df['RSI'] = 100 - (100 / (1 + rs))

    # 11-15. Pivot Points
    prev_high = high.shift(1)
    prev_low = low.shift(1)
    prev_close = close.shift(1)

    df['Pivot_Point'] = (prev_high + prev_low + prev_close) / 3
    df['S1'] = (df['Pivot_Point'] * 2) - prev_high
    df['S2'] = df['Pivot_Point'] - (prev_high - prev_low)
    df['R1'] = (df['Pivot_Point'] * 2) - prev_low
    df['R2'] = df['Pivot_Point'] + (prev_high - prev_low)

    df = df.replace([np.inf, -np.inf], np.nan)
    return df

all_data_indicators = {}
for name, data in all_data.items():
    print(f"{name}...", end=" ")
    try:
        result = calculate_indicators(data)
        all_data_indicators[name] = result
        print(f"‚úÖ {len(result)}")
    except Exception as e:
        print(f"‚ùå {e}")

print(f"\n‚úÖ G√∂stergeler hazƒ±r\n")

# ============================================================================
# 3. VERƒ∞ HAZIRLAMA (‚ö†Ô∏è LEAKAGE VERSION!)
# ============================================================================
print("="*80)
print("‚ö†Ô∏è  DATA PREPARATION (LEAKAGE VERSION - Makalenin Hatasƒ±)")
print("="*80)

def prepare_data_with_leakage(df, test_ratio=0.2):
    """
    ‚ö†Ô∏è  KASITLI HATALAR (Makalenin muhtemelen yaptƒ±ƒüƒ±):
    1. ‚ùå NO LAG - Same-day features ‚Üí next-day target
    2. ‚ùå NORMALIZE BEFORE SPLIT - Test data g√∂r√ºl√ºyor!
    3. ‚ùå Random split yerine temporal split (k√º√ß√ºk fark)
    """
    df = df.copy()

    features = ['Stochastic_K', 'Stochastic_D', 'ROC', 'Williams_R',
                'Momentum', 'Disparity_5', 'Disparity_14', 'OSCP',
                'CCI', 'RSI', 'Pivot_Point', 'S1', 'S2', 'R1', 'R2']

    # Target: Yarƒ±nƒ±n y√∂n√º
    df['Target'] = (df['Close'].shift(-1) > df['Close']).astype(int)
    df = df.iloc[:-1]
    df = df.dropna(subset=features + ['Target'])

    # ‚ùå HATA 1: NO LAG! Same-day features
    X = df[features].copy()
    y = df['Target'].copy()

    # ‚ùå HATA 2: NORMALIZE BEFORE SPLIT!
    scaler = MinMaxScaler()
    X_scaled = pd.DataFrame(
        scaler.fit_transform(X),
        columns=features,
        index=X.index
    )

    # Temporal split (bu kƒ±sƒ±m doƒüru)
    n_train = int(len(X_scaled) * (1 - test_ratio))
    X_train = X_scaled.iloc[:n_train]
    X_test = X_scaled.iloc[n_train:]
    y_train = y.iloc[:n_train].values
    y_test = y.iloc[n_train:].values

    return X_train, X_test, y_train, y_test

prepared_data = {}
for name, data in all_data_indicators.items():
    print(f"\n{name}:")
    try:
        X_train, X_test, y_train, y_test = prepare_data_with_leakage(data)
        prepared_data[name] = {
            'X_train': X_train, 'X_test': X_test,
            'y_train': y_train, 'y_test': y_test
        }
        print(f"  ‚ö†Ô∏è  Train: {len(X_train)} | UP: {y_train.mean()*100:.1f}%")
        print(f"  ‚ö†Ô∏è  Test:  {len(X_test)} | UP: {y_test.mean()*100:.1f}%")
    except Exception as e:
        print(f"  ‚ùå {e}")

print(f"\n‚ö†Ô∏è  {len(prepared_data)} borsa hazƒ±r (LEAKAGE ile!)\n")

# ============================================================================
# 4. GRID SEARCH (‚ö†Ô∏è SHUFFLE=TRUE - LEAKAGE!)
# ============================================================================
print("="*80)
print("‚ö†Ô∏è  GRID SEARCH (Shuffle=True - Makalenin Muhtemel Hatasƒ±)")
print("="*80)

def grid_search_with_leakage(X_train, y_train, kernel='linear'):
    """
    ‚ö†Ô∏è  KASITLI HATA: Shuffle=True
    Bu gelecek verilerinin training'de g√∂r√ºlmesine sebep olur!
    """

    # Makale parameter grid
    if kernel == 'linear':
        param_grid = {
            'C': [0.01, 0.1, 1, 10, 50, 100, 200, 500, 1000]
        }
    elif kernel == 'rbf':
        param_grid = {
            'C': [1, 10, 50, 100, 150, 200, 500, 1000],
            'gamma': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]
        }
    else:  # poly
        param_grid = {
            'C': [10, 50, 100, 200, 314.52, 500],
            'gamma': [0.001, 0.01, 0.1, 0.5, 1],
            'degree': [1, 2, 3]
        }

    # ‚ùå HATA 3: Shuffle=True! (Makale bunu yapmƒ±≈ü olabilir)
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    svm = SVC(kernel=kernel, class_weight='balanced', max_iter=50000, random_state=42)

    grid = GridSearchCV(
        svm,
        param_grid,
        cv=cv,
        scoring='accuracy',  # Makale accuracy kullanmƒ±≈ü
        n_jobs=-1,
        verbose=0
    )

    grid.fit(X_train.values, y_train)

    return grid.best_estimator_, grid.best_params_, grid.best_score_

# ============================================================================
# 5. T√úM BORSALAR ƒ∞√áƒ∞N √áALI≈ûTIR
# ============================================================================
svm_results = {}

for name in prepared_data.keys():
    print(f"\n{'='*80}")
    print(f"üìä {name}")
    print(f"{'='*80}")

    data = prepared_data[name]
    svm_results[name] = {}

    for kernel in ['linear', 'rbf', 'poly']:
        print(f"\n‚ö†Ô∏è  {kernel.upper()} Kernel (LEAKAGE VERSION):")
        print("-" * 70)

        try:
            best_model, best_params, cv_score = grid_search_with_leakage(
                data['X_train'], data['y_train'], kernel=kernel
            )

            # Test
            y_pred = best_model.predict(data['X_test'].values)

            # Metrics
            acc = accuracy_score(data['y_test'], y_pred)
            bal_acc = balanced_accuracy_score(data['y_test'], y_pred)
            prec = precision_score(data['y_test'], y_pred, zero_division=0)
            rec = recall_score(data['y_test'], y_pred, zero_division=0)
            f1 = f1_score(data['y_test'], y_pred, zero_division=0)
            cm = confusion_matrix(data['y_test'], y_pred)

            svm_results[name][kernel] = {
                'params': best_params,
                'cv_score': cv_score,
                'acc': acc,
                'bal_acc': bal_acc,
                'precision': prec,
                'recall': rec,
                'f1': f1,
                'cm': cm
            }

            print(f"\n‚ö†Ô∏è  LEAKAGE SONU√áLARI:")
            print(f"   Best Params: {best_params}")
            print(f"   CV Accuracy: {cv_score*100:.2f}%")
            print(f"\nüìä TEST SONU√áLARI:")
            print(f"   Accuracy:      {acc*100:.2f}%")
            print(f"   Balanced Acc:  {bal_acc*100:.2f}%")

            print(f"\nüìà CONFUSION MATRIX:")
            print(f"                Predicted DOWN  Predicted UP")
            print(f"Actual DOWN          {cm[0,0]:<8}      {cm[0,1]:<8}")
            print(f"Actual UP            {cm[1,0]:<8}      {cm[1,1]:<8}")

            tn, fp, fn, tp = cm.ravel()
            down_recall = tn / (tn + fp) if (tn + fp) > 0 else 0
            up_recall = tp / (tp + fn) if (tp + fn) > 0 else 0

            print(f"\nüéØ CLASS-WISE RECALL:")
            print(f"   DOWN: {down_recall*100:.1f}% ({tn}/{tn+fp})")
            print(f"   UP:   {up_recall*100:.1f}% ({tp}/{tp+fn})")

        except Exception as e:
            print(f"‚ùå Hata: {e}")

# ============================================================================
# 6. KAR≈ûILA≈ûTIRMA
# ============================================================================
print("\n" + "="*80)
print("üìä LEAKAGE vs PAPER COMPARISON")
print("="*80)

paper_results = {
    'KOSPI': {'linear': 80.33, 'rbf': 81.80, 'poly': 80.33},
    'KSE100': {'linear': 85.19, 'rbf': 76.88, 'poly': 84.38},
    'Nikkei225': {'linear': 80.22, 'rbf': 76.26, 'poly': 78.28},
    'SZSE': {'linear': 89.98, 'rbf': 87.20, 'poly': 89.41}
}

for name in svm_results.keys():
    if name in paper_results:
        print(f"\n{name}:")
        print("-" * 70)
        print(f"{'Kernel':<10} {'Our (Leak)':<15} {'Paper':<12} {'Gap':<12}")
        print("-" * 70)

        for kernel in ['linear', 'rbf', 'poly']:
            if kernel in svm_results[name]:
                our_acc = svm_results[name][kernel]['acc'] * 100
                paper_acc = paper_results[name][kernel]
                gap = abs(our_acc - paper_acc)

                print(f"{kernel:<10} {our_acc:>5.2f}%         "
                      f"{paper_acc:>5.2f}%      {gap:>5.2f}%")

# ============================================================================
# 7. A√áIKLAMA
# ============================================================================
print("\n" + "="*80)
print("‚ö†Ô∏è  UYARI VE A√áIKLAMA")
print("="*80)
print("""
‚ùå BU KOD KASITLI OLARAK YANLI≈ûLAR ƒ∞√áERƒ∞YOR!

Makalenin muhtemel hatalarƒ±:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
1. ‚ùå NO LAG: Same-day features ‚Üí next-day target
   ‚Üí Model geleceƒüi "g√∂r√ºyor" (aynƒ± g√ºn verisi yarƒ±nƒ± tahmin ediyor)

2. ‚ùå Normalize BEFORE split
   ‚Üí Test datasƒ±nƒ±n istatistikleri training'de kullanƒ±lƒ±yor

3. ‚ùå Shuffle=True in CV
   ‚Üí Gelecek verisi training'e karƒ±≈üƒ±yor

SONU√á:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
Bu "leakage" versiyonu makalenin %80+ accuracy'sine yakla≈üacak!
Ama bu sonu√ßlar GER√áEK√áƒ∞ DEƒûƒ∞L ve ger√ßek trading'de KULLANILMAMALI!

Bƒ∞Zƒ∞M DOƒûRU VERSƒ∞YON:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
- LAG kullanƒ±ldƒ± (t-1 features ‚Üí t+1 target)
- Normalize after split
- Shuffle=False
- TimeSeriesSplit
‚Üí Sonu√ß: %50-60 (GER√áEK√áƒ∞ ve DOƒûRU!)

üí° DERS: Makale %80+ accuracy = Data leakage
        Ger√ßek d√ºnya %50-60 = Normal ve beklenen
""")

print("="*80)
print("‚ö†Ô∏è  ANALƒ∞Z TAMAMLANDI (LEAKAGE VERSION)")
print("="*80)

üì¶ K√ºt√ºphaneler y√ºkleniyor...
‚úÖ Hazƒ±r!

VERƒ∞ √áEKME
KSE100... ‚úÖ 2346
KOSPI... ‚úÖ 2397
Nikkei225... ‚úÖ 2382
SZSE... ‚úÖ 2366

‚úÖ 4 borsa

TEKNƒ∞K G√ñSTERGELER (15)
KSE100... ‚úÖ 2346
KOSPI... ‚úÖ 2397
Nikkei225... ‚úÖ 2382
SZSE... ‚úÖ 2366

‚úÖ G√∂stergeler hazƒ±r

‚ö†Ô∏è  DATA PREPARATION (LEAKAGE VERSION - Makalenin Hatasƒ±)

KSE100:
  ‚ö†Ô∏è  Train: 1860 | UP: 53.9%
  ‚ö†Ô∏è  Test:  466 | UP: 51.9%

KOSPI:
  ‚ö†Ô∏è  Train: 1901 | UP: 51.3%
  ‚ö†Ô∏è  Test:  476 | UP: 56.3%

Nikkei225:
  ‚ö†Ô∏è  Train: 1889 | UP: 53.3%
  ‚ö†Ô∏è  Test:  473 | UP: 52.4%

SZSE:
  ‚ö†Ô∏è  Train: 1876 | UP: 52.7%
  ‚ö†Ô∏è  Test:  470 | UP: 53.6%

‚ö†Ô∏è  4 borsa hazƒ±r (LEAKAGE ile!)

‚ö†Ô∏è  GRID SEARCH (Shuffle=True - Makalenin Muhtemel Hatasƒ±)

üìä KSE100

‚ö†Ô∏è  LINEAR Kernel (LEAKAGE VERSION):
----------------------------------------------------------------------

‚ö†Ô∏è  LEAKAGE SONU√áLARI:
   Best Params: {'C': 1000}
   CV Accuracy: 58.55%

üìä TEST SONU√áLARI:
   Accuracy:      57.

KeyboardInterrupt: 