# 🎯 COMPLEXITY (2021) - FINAL FIXED VERSION

**Article:** Ali, M., et al. (2021). Predicting the Direction Movement of Financial Time Series Using Artificial Neural Network and Support Vector Machine. *Complexity*, 2021.

**✅ Fixes Applied:**
- MultiIndex DataFrame issue resolved
- Proper Series handling for all calculations
- Comprehensive evaluation metrics
- TimeSeriesSplit for cross-validation
- Detailed visualizations

**🚀 Quick Start:** Runtime → Run all

In [None]:
# Install and import libraries
!pip install yfinance -q

import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ Libraries loaded successfully!")

In [None]:
# Download data
# Fetches daily quotes for the "^KSE" ticker into `data` and prints a short
# summary (row count, date range, first rows, shape, missing-value total).
print("="*70)
print("üì• DOWNLOADING DATA...")
print("="*70)

# Download KSE-100 data
data = yf.download("^KSE", start="2011-01-01", end="2020-09-27", progress=False)

# Fail fast with an actionable message: without this guard a download that
# yields no rows only surfaces below as an opaque IndexError from
# data.index[0].
if data is None or data.empty:
    raise RuntimeError(
        "No data returned for ticker '^KSE' (2011-01-01 to 2020-09-27); "
        "check the ticker symbol and the network connection."
    )

# CRITICAL: Fix MultiIndex columns
# Drops the second column level so columns can be addressed by a single name
# (presumably the dropped level is the ticker symbol - verify against the
# installed yfinance version).
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.droplevel(1)
    print("‚úÖ MultiIndex fixed")

print(f"‚úÖ Downloaded {len(data)} days of data")
print(f"üìÖ Date range: {data.index[0]} ‚Üí {data.index[-1]}")
print(f"\nüìä First 5 rows:")
print(data.head())
print(f"\nüìä Data shape: {data.shape}")
print(f"‚ùì Missing values: {data.isnull().sum().sum()}")

In [None]:
# Technical Indicators - FIXED VERSION
# Builds the feature/label frame `df` from the downloaded quotes in `data`:
# 15 indicator columns plus the binary next-day direction label `Target`.
print("\n" + "="*70)
print("üîß CALCULATING TECHNICAL INDICATORS...")
print("="*70)

df = data.copy()

# Convert to Series (handle MultiIndex issues)
# If a column lookup still returns a DataFrame, take its first column.
close = df['Close'].values if isinstance(df['Close'], pd.Series) else df['Close'].iloc[:, 0].values
high = df['High'].values if isinstance(df['High'], pd.Series) else df['High'].iloc[:, 0].values
low = df['Low'].values if isinstance(df['Low'], pd.Series) else df['Low'].iloc[:, 0].values

# Create clean DataFrame
df = pd.DataFrame({
    'Close': close,
    'High': high,
    'Low': low
}, index=data.index)

# NOTE: the 1e-10 terms added to denominators below guard against division
# by zero (e.g. a flat 14-day range).

# 1-2. Stochastic Oscillator
low_14 = df['Low'].rolling(14).min()
high_14 = df['High'].rolling(14).max()
df['Stochastic_K'] = 100 * ((df['Close'] - low_14) / (high_14 - low_14 + 1e-10))
df['Stochastic_D'] = df['Stochastic_K'].rolling(3).mean()

# 3. Rate of Change (ROC) over 10 rows, in percent
df['ROC'] = ((df['Close'] / df['Close'].shift(10)) - 1) * 100

# 4. Williams %R
df['Williams_R'] = -100 * ((high_14 - df['Close']) / (high_14 - low_14 + 1e-10))

# 5. Momentum (difference to the close 4 rows earlier)
df['Momentum'] = df['Close'] - df['Close'].shift(4)

# 6-7. Disparity Index
ma5 = df['Close'].rolling(5).mean()
ma14 = df['Close'].rolling(14).mean()
df['Disparity_5'] = ((df['Close'] - ma5) / (ma5 + 1e-10)) * 100
df['Disparity_14'] = ((df['Close'] - ma14) / (ma14 + 1e-10)) * 100

# 8. OSCP (Oscillator of a Short-term Cycle)
ma10 = df['Close'].rolling(10).mean()
df['OSCP'] = ((ma5 - ma10) / (ma5 + 1e-10)) * 100

# 9. Commodity Channel Index (CCI)
tp = (df['High'] + df['Low'] + df['Close']) / 3
ma_tp = tp.rolling(20).mean()
# raw=True hands each window to the lambda as a NumPy array instead of
# building a Series per window; the mean absolute deviation is the same.
md = tp.rolling(20).apply(lambda x: np.abs(x - x.mean()).mean(), raw=True)
df['CCI'] = (tp - ma_tp) / (0.015 * md + 1e-10)

# 10. Relative Strength Index (RSI)
# Uses plain 14-row rolling means of gains and losses.
delta = df['Close'].diff()
gain = (delta.where(delta > 0, 0)).rolling(14).mean()
loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
rs = gain / (loss + 1e-10)
df['RSI'] = 100 - (100 / (1 + rs))

# 11-15. Pivot Points (using previous day's data)
prev_high = df['High'].shift(1)
prev_low = df['Low'].shift(1)
prev_close = df['Close'].shift(1)

df['Pivot_Point'] = (prev_high + prev_low + prev_close) / 3
df['S1'] = (df['Pivot_Point'] * 2) - prev_high
df['S2'] = df['Pivot_Point'] - (prev_high - prev_low)
df['R1'] = (df['Pivot_Point'] * 2) - prev_low
df['R2'] = df['Pivot_Point'] + (prev_high - prev_low)

# Target: Next day's direction (1=Up, 0=Down)
next_close = df['Close'].shift(-1)
df['Target'] = (next_close > df['Close']).astype(int)

# Rows with no known next close (always the final row) have no real label:
# comparing against NaN yields False, which would otherwise be stored as a
# spurious "Down" and survive dropna() because the int cast hides the NaN.
# Drop those rows instead of training/evaluating on a made-up label.
df = df[next_close.notna().to_numpy()]

# Remove NaN and infinite values
df = df.replace([np.inf, -np.inf], np.nan).dropna()

print(f"‚úÖ {len(df)} rows prepared")
print(f"\nüìä Target distribution:")
print(df['Target'].value_counts(normalize=True))
print(f"\nüî¢ Sample indicators:")
print(df[['RSI', 'CCI', 'Momentum', 'Pivot_Point', 'Target']].head())

In [None]:
# Data Preparation
# Turns `df` into NumPy feature/label arrays, splits them chronologically
# and rescales the features with a scaler fitted on the training part only.
print("\n" + "="*70)
print("üìä DATA PREPARATION...")
print("="*70)

# Model inputs; this order is the column order of X.
feature_cols = [
    'Stochastic_K', 'Stochastic_D', 'ROC', 'Williams_R',
    'Momentum', 'Disparity_5', 'Disparity_14', 'OSCP',
    'CCI', 'RSI',
    'Pivot_Point', 'S1', 'S2', 'R1', 'R2',
]

dates = df.index
y = df['Target'].values
X = df[feature_cols].values

# Chronological split (80/20): the first 80% of rows train, the rest test.
train_size = int(len(X) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Normalization (fit on train only)
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print(f"Train: {len(X_train)} samples | Up: {sum(y_train)}/{len(y_train)} ({sum(y_train)/len(y_train)*100:.1f}%)")
print(f"Test:  {len(X_test)} samples | Up: {sum(y_test)}/{len(y_test)} ({sum(y_test)/len(y_test)*100:.1f}%)")
print(f"Train period: {dates[:train_size][0]} ‚Üí {dates[:train_size][-1]}")
print(f"Test period:  {dates[train_size:][0]} ‚Üí {dates[train_size:][-1]}")

In [None]:
# Evaluation Function
def evaluate_model(model, X_test, y_test, model_name):
    """Evaluate a fitted binary classifier on held-out data and print a report.

    Labels are encoded as 0 = 'Down' and 1 = 'Up'. The confusion matrix and
    the classification report are pinned to both labels, so the matrix is
    always 2x2 and the report never fails when only one class appears in
    ``y_test`` / the predictions.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X_test: Feature matrix to predict on.
        y_test: True 0/1 labels aligned with ``X_test``.
        model_name: Heading printed above the metrics.

    Returns:
        dict with keys ``'accuracy'``, ``'precision'``, ``'recall'``, ``'f1'``
        (floats), ``'predictions'`` (output of ``model.predict``) and
        ``'confusion_matrix'`` (2x2 array, rows = true, columns = predicted).
    """
    class_labels = [0, 1]
    class_names = ['Down', 'Up']

    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"\n{'='*60}")
    print(f"{model_name}")
    print(f"{'='*60}")
    print(f"Accuracy:  {acc:.4f} ({acc*100:.2f}%)")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1-Score:  {f1:.4f}")

    # Explicit labels keep the matrix 2x2 even if one class is absent.
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    print(f"\nConfusion Matrix:")
    print(cm)
    print(f"\nClassification Report:")
    # Without `labels`, a single-class evaluation makes target_names mismatch
    # the detected classes and raises ValueError. zero_division=0 matches the
    # scalar metrics above.
    print(classification_report(
        y_test, y_pred,
        labels=class_labels, target_names=class_names, zero_division=0,
    ))

    return {
        'accuracy': acc, 'precision': prec, 'recall': rec, 'f1': f1,
        'predictions': y_pred, 'confusion_matrix': cm
    }

print("‚úÖ Evaluation function defined")

In [None]:
# SVM Models - Article Parameters
# Fits one SVC per kernel with fixed hyper-parameters and stores the hold-out
# metrics returned by evaluate_model in `results`, keyed by kernel name.
print("\n" + "="*70)
print("ü§ñ SVM MODELS (ARTICLE PARAMETERS)")
print("="*70)

results = {}

# 1. Linear SVM
print("\n[1/3] Linear SVM (C=964.7736)...")
linear_params = {'kernel': 'linear', 'C': 964.7736, 'random_state': 42}
svm_linear = SVC(**linear_params).fit(X_train, y_train)  # fit() returns the estimator
results['Linear'] = evaluate_model(svm_linear, X_test, y_test, "LINEAR SVM")
print(f"üìù Article reports: 85.19%")

# 2. RBF SVM
print("\n[2/3] RBF SVM (C=137.20, gamma=60.51)...")
rbf_params = {'kernel': 'rbf', 'C': 137.20, 'gamma': 60.51, 'random_state': 42}
svm_rbf = SVC(**rbf_params).fit(X_train, y_train)
results['RBF'] = evaluate_model(svm_rbf, X_test, y_test, "RBF SVM")
print(f"üìù Article reports: 76.88%")

# 3. Polynomial SVM
print("\n[3/3] Polynomial SVM (C=314.52, degree=2, coef0=0.5554)...")
poly_params = {
    'kernel': 'poly', 'C': 314.52, 'degree': 2, 'coef0': 0.5554,
    'random_state': 42,
}
svm_poly = SVC(**poly_params).fit(X_train, y_train)
results['Polynomial'] = evaluate_model(svm_poly, X_test, y_test, "POLYNOMIAL SVM")
print(f"üìù Article reports: 84.38%")

In [None]:
# Grid Search with TimeSeriesSplit
# Tunes each kernel on the training set with a 4-split TimeSeriesSplit, then
# scores the search object on the test set; metrics land in `results_grid`.
print("\n" + "="*70)
print("üîç GRID SEARCH (TimeSeriesSplit)")
print("="*70)

tscv = TimeSeriesSplit(n_splits=4)
results_grid = {}

# Settings shared by all three searches.
search_options = dict(cv=tscv, scoring='accuracy', n_jobs=-1, verbose=0)

# 1. Linear SVM
print("\n[1/3] Linear Grid Search...")
linear_grid = {'C': [0.1, 1, 10, 100, 500, 964.7736, 1000]}
grid_linear = GridSearchCV(SVC(kernel='linear', random_state=42), linear_grid, **search_options)
grid_linear.fit(X_train, y_train)
print(f"Best params: {grid_linear.best_params_} | CV score: {grid_linear.best_score_:.4f}")
results_grid['Linear'] = evaluate_model(grid_linear, X_test, y_test, "LINEAR SVM (Grid)")

# 2. RBF SVM
print("\n[2/3] RBF Grid Search...")
rbf_grid = {
    'C': [1, 10, 100, 137.20],
    'gamma': [0.01, 0.1, 1, 10, 60.51, 'scale'],
}
grid_rbf = GridSearchCV(SVC(kernel='rbf', random_state=42), rbf_grid, **search_options)
grid_rbf.fit(X_train, y_train)
print(f"Best params: {grid_rbf.best_params_} | CV score: {grid_rbf.best_score_:.4f}")
results_grid['RBF'] = evaluate_model(grid_rbf, X_test, y_test, "RBF SVM (Grid)")

# 3. Polynomial SVM
print("\n[3/3] Polynomial Grid Search...")
poly_grid = {
    'C': [10, 100, 314.52, 500],
    'degree': [2, 3],
    'coef0': [0, 0.5554, 1.0],
}
grid_poly = GridSearchCV(SVC(kernel='poly', random_state=42), poly_grid, **search_options)
grid_poly.fit(X_train, y_train)
print(f"Best params: {grid_poly.best_params_} | CV score: {grid_poly.best_score_:.4f}")
results_grid['Polynomial'] = evaluate_model(grid_poly, X_test, y_test, "POLYNOMIAL SVM (Grid)")

In [None]:
# Final Comparison
# Prints a table of accuracies (in percent) per kernel: the article's figure,
# the fixed-parameter run (`results`) and the grid-search run
# (`results_grid`), followed by their averages and a verdict on the gap.
article = {'Linear': 0.8519, 'RBF': 0.7688, 'Polynomial': 0.8438}

rule = "-"*80
banner = "\n" + "="*80

print(banner)
print("üìä FINAL RESULTS COMPARISON")
print("="*80)
print(f"\n{'Model':<15} {'Article':<12} {'Exact':<12} {'Grid':<12} {'Diff':<10}")
print(rule)

# `article` iterates in insertion order: Linear, RBF, Polynomial.
for m in article:
    art, exact, grid = (
        article[m] * 100,
        results[m]['accuracy'] * 100,
        results_grid[m]['accuracy'] * 100,
    )
    diff = exact - art

    print(f"{m:<15} {art:>8.2f}%    {exact:>8.2f}%    {grid:>8.2f}%    {diff:>+7.2f}%")

avg_art = np.mean(list(article.values())) * 100
avg_exact = np.mean([results[name]['accuracy'] for name in article]) * 100
avg_grid = np.mean([results_grid[name]['accuracy'] for name in article]) * 100

print(rule)
print(f"{'AVERAGE':<15} {avg_art:>8.2f}%    {avg_exact:>8.2f}%    {avg_grid:>8.2f}%    {avg_exact-avg_art:>+7.2f}%")
print(banner)

# Interpretation
# `gap` is the absolute difference of the averages, in percentage points.
gap = abs(avg_exact - avg_art)
if gap <= 5:
    print("‚úÖ EXCELLENT: Results closely match the article!")
elif gap <= 10:
    print("‚úÖ GOOD: Results are reasonably close to the article.")
else:
    print("‚ö†Ô∏è MODERATE: Significant difference likely due to:")
    for reason in (
        "   1. Different data source (Yahoo Finance vs. article's source)",
        "   2. KSE-100 data quality/availability issues",
        "   3. Different market periods",
        "   4. Preprocessing differences",
    ):
        print(reason)
    print(f"\nüí° Suggestion: Test with SPY for better data quality:")
    print("   data = yf.download('SPY', start='2011-01-01', end='2020-09-27')")

In [None]:
# Visualizations
# Draws a 2x2 figure: accuracy bars, F1 bars, the confusion matrix of the
# most accurate grid-search model, and each model's accuracy gap to the
# article.
print("\nüìä Creating visualizations...")

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
ax_acc, ax_f1 = axes[0]
ax_cm, ax_gap = axes[1]

# 1. Accuracy Comparison
models = ['Linear', 'RBF', 'Polynomial']
x = np.arange(len(models))
width = 0.25

art_vals = [article[m] for m in models]
exact_vals = [results[m]['accuracy'] for m in models]
grid_vals = [results_grid[m]['accuracy'] for m in models]

# One bar series per source: (horizontal offset, heights, label, colour).
accuracy_series = [
    (-width, art_vals, 'Article', '#2ecc71'),
    (0, exact_vals, 'Exact Params', '#3498db'),
    (width, grid_vals, 'Grid Search', '#e74c3c'),
]
for offset, heights, label, color in accuracy_series:
    ax_acc.bar(x + offset, heights, width, label=label, alpha=0.8, color=color)
ax_acc.set_ylabel('Accuracy', fontsize=12)
ax_acc.set_title('Model Performance Comparison', fontweight='bold', fontsize=14)
ax_acc.set_xticks(x)
ax_acc.set_xticklabels(models)
ax_acc.legend()
ax_acc.grid(axis='y', alpha=0.3)
ax_acc.set_ylim([0.4, 1.0])

# 2. F1-Score Comparison
f1_exact = [results[m]['f1'] for m in models]
f1_grid = [results_grid[m]['f1'] for m in models]

f1_series = [
    (-width/2, f1_exact, 'Exact Params', '#3498db'),
    (width/2, f1_grid, 'Grid Search', '#e74c3c'),
]
for offset, heights, label, color in f1_series:
    ax_f1.bar(x + offset, heights, width, label=label, alpha=0.8, color=color)
ax_f1.set_ylabel('F1-Score', fontsize=12)
ax_f1.set_title('F1-Score Comparison', fontweight='bold', fontsize=14)
ax_f1.set_xticks(x)
ax_f1.set_xticklabels(models)
ax_f1.legend()
ax_f1.grid(axis='y', alpha=0.3)

# 3. Best Model Confusion Matrix
# "Best" means the highest test accuracy among the grid-search results.
best_model = max(results_grid, key=lambda k: results_grid[k]['accuracy'])
cm = results_grid[best_model]['confusion_matrix']

class_names = ['Down', 'Up']
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm,
            xticklabels=class_names, yticklabels=class_names)
ax_cm.set_title(f'Confusion Matrix - Best Model ({best_model})', fontweight='bold', fontsize=14)
ax_cm.set_ylabel('True Label')
ax_cm.set_xlabel('Predicted Label')

# 4. Accuracy Gap Analysis
# Gaps are absolute differences in percentage points; bars are coloured by
# the same 5 / 10 thresholds that are drawn as dashed lines.
gaps = [abs(results[m]['accuracy'] - article[m]) * 100 for m in models]
colors = ['red' if g > 10 else 'orange' if g > 5 else 'green' for g in gaps]

ax_gap.bar(models, gaps, alpha=0.8, color=colors)
ax_gap.set_ylabel('Accuracy Gap (%)', fontsize=12)
ax_gap.set_title('Difference from Article', fontweight='bold', fontsize=14)
ax_gap.grid(axis='y', alpha=0.3)
ax_gap.axhline(y=5, color='orange', linestyle='--', linewidth=2, label='¬±5% threshold')
ax_gap.axhline(y=10, color='red', linestyle='--', linewidth=2, label='¬±10% threshold')
ax_gap.legend()

plt.tight_layout()
plt.show()

print("‚úÖ Visualizations complete!")

In [None]:
# Summary Report
# Prints the metrics of the best grid-search model, the dataset sizes, a
# verdict comparing the average fixed-parameter accuracy with the article's
# average, and a list of technical notes.
report_rule = "="*80

print("\n" + report_rule)
print("üìù SUMMARY REPORT")
print(report_rule)

# Metrics dict of the model picked as `best_model`.
best_metrics = results_grid[best_model]

print(f"\nüéØ Best Model: {best_model} SVM")
print(f"   Accuracy:  {best_metrics['accuracy']:.4f} ({best_metrics['accuracy']*100:.2f}%)")
print(f"   F1-Score:  {best_metrics['f1']:.4f}")
print(f"   Precision: {best_metrics['precision']:.4f}")
print(f"   Recall:    {best_metrics['recall']:.4f}")

print(f"\nüìä Data Information:")
print(f"   Total samples: {len(df)}")
print(f"   Train samples: {len(X_train)}")
print(f"   Test samples:  {len(X_test)}")
print(f"   Class balance (test): {sum(y_test)/len(y_test)*100:.1f}% Up")

print(f"\nüí° Key Findings:")
# Branch order matters: "better" first, then "at least 95% of the article's
# average", otherwise report the shortfall.
if avg_exact > avg_art:
    print(f"   ‚úÖ Our implementation performs {avg_exact-avg_art:.2f}% better than reported")
elif avg_exact > avg_art * 0.95:
    print(f"   ‚úÖ Results closely match the article (within 5%)")
else:
    print(f"   ‚ö†Ô∏è Performance gap: {avg_art-avg_exact:.2f}% lower than article")
    print(f"   This is normal due to data source differences")

print(f"\nüî¨ Technical Notes:")
for note in (
    "   - TimeSeriesSplit used for proper temporal validation",
    "   - MinMaxScaler normalization applied",
    "   - 80/20 chronological train/test split",
    "   - All indicators computed with proper handling of edge cases",
):
    print(note)

print("\n" + report_rule)
print("‚úÖ ANALYSIS COMPLETE!")
print(report_rule)