# Import Libraries 

In [67]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

# Download Stocks/Indices

In [171]:
tickers = ["MSFT", "AMZN", "^GSPC", "^DJI"]
data = yf.download(tickers, start="2003-01-01", end="2012-01-01", auto_adjust=False)

[*********************100%***********************]  4 of 4 completed


# Create features 

In [174]:
def continuous_data_features(df):
    """Compute 10 technical indicators exactly as per paper"""
    indicators = pd.DataFrame(index=df.index)
    close = df["Adj Close"]
    high = df["High"]
    low = df["Low"]
    vol = df["Volume"]
    
    # 1. Simple Moving Average 10 day
    indicators["SMA10"] = close.rolling(window=10).mean()
    
    # 2. Weighted Moving Average 10 day
    weights = np.arange(1, 11)
    indicators["WMA10"] = close.rolling(10).apply(lambda x: np.dot(x, weights)/weights.sum(), raw=True)
    
    # 3. Momentum 10 day
    indicators["MOM10"] = close - close.shift(9)
    
    # 4. Stochastic %K 14 day
    lowest_14 = low.rolling(14).min()
    highest_14 = high.rolling(14).max()
    indicators["STOCHK"] = 100 * (close - lowest_14) / (highest_14 - lowest_14 + 1e-10)
    
    # 5. Stochastic %D (3-day SMA of %K)
    indicators["STOCHD"] = indicators["STOCHK"].rolling(3).mean()
    
    # 6. RSI 14 day
    delta = close.diff()
    up = delta.clip(lower=0)
    down = -1 * delta.clip(upper=0)
    rs = up.rolling(14).mean() / (down.rolling(14).mean() + 1e-10)
    indicators["RSI14"] = 100 - (100 / (1 + rs))
    
    # 7. MACD (12-26 EMA)
    ema12 = close.ewm(span=12, adjust=False).mean()
    ema26 = close.ewm(span=26, adjust=False).mean()
    indicators["MACD"] = ema12 - ema26
    
    # 8. Williams %R 14 day
    indicators["WILLR"] = -100 * (highest_14 - close) / (highest_14 - lowest_14 + 1e-10)
    
    # 9. A/D Oscillator
    clv = ((close - low) - (high - close)) / (high - low + 1e-10)
    adl = (clv * vol).cumsum()
    indicators["ADOSC"] = adl - adl.shift(20)
    
    # 10. CCI 20 day
    tp = (high + low + close) / 3
    sma = tp.rolling(20).mean()
    mad = (tp - sma).abs().rolling(20).mean()
    indicators["CCI20"] = (tp - sma) / (0.015 * mad + 1e-10)
    
    return indicators

In [176]:
def trend_deterministic_data(indicators, df):
    """Convert continuous indicators to trend signals (+1 or -1)"""
    trend = pd.DataFrame(index=indicators.index)
    close = df["Adj Close"]
    
    trend["SMA10_T"] = np.where(close > indicators["SMA10"], 1, -1)
    trend["WMA10_T"] = np.where(close > indicators["WMA10"], 1, -1)
    trend["MOM10_T"] = np.where(indicators["MOM10"] > 0, 1, -1)
    trend["STOCHK_T"] = np.where(indicators["STOCHK"] > indicators["STOCHK"].shift(1), 1, -1)
    trend["STOCHD_T"] = np.where(indicators["STOCHD"] > indicators["STOCHD"].shift(1), 1, -1)
    
    rsi = indicators["RSI14"]
    trend["RSI14_T"] = np.select(
        [rsi > 70, rsi < 30, rsi > rsi.shift(1)],
        [-1, 1, 1],
        default=-1
    )
    
    trend["MACD_T"] = np.where(indicators["MACD"] > indicators["MACD"].shift(1), 1, -1)
    trend["WILLR_T"] = np.where(indicators["WILLR"] > indicators["WILLR"].shift(1), 1, -1)
    trend["ADOSC_T"] = np.where(indicators["ADOSC"] > indicators["ADOSC"].shift(1), 1, -1)
    
    cci = indicators["CCI20"]
    trend["CCI20_T"] = np.select(
        [cci > 200, cci < -200, cci > cci.shift(1)],
        [-1, 1, 1],
        default=-1
    )
    
    return trend


In [178]:
def create_target(df, horizon=1):
    """
    Create target based on next day close vs current day close.
    1 = up, 0 = down. NO threshold filtering (as per paper).
    """
    df = df.copy()
    df["Future_Close"] = df["Adj Close"].shift(-horizon)
    df["Target"] = np.where(df["Future_Close"] > df["Adj Close"], 1, 0)
    return df[["Target"]]


In [180]:
def year_stratified_split(df, y, train_ratio=0.5):
    """
    Split data by year maintaining class balance within each year.
    This matches the paper's methodology exactly.
    """
    df["Year"] = df.index.year
    train_idx = []
    test_idx = []
    
    for year in sorted(df["Year"].unique()):
        year_mask = df["Year"] == year
        year_indices = np.where(year_mask)[0]
        
        # Get up and down days in this year
        year_y = y.iloc[year_indices]
        up_indices = year_indices[year_y == 1]
        down_indices = year_indices[year_y == 0]
        
        # Split each class by ratio
        split_up = int(len(up_indices) * train_ratio)
        split_down = int(len(down_indices) * train_ratio)
        
        # Add to train/test
        train_idx.extend(up_indices[:split_up])
        train_idx.extend(down_indices[:split_down])
        test_idx.extend(up_indices[split_up:])
        test_idx.extend(down_indices[split_down:])
    
    train_idx = np.sort(train_idx)
    test_idx = np.sort(test_idx)
    
    X_train = df.iloc[train_idx]
    X_test = df.iloc[test_idx]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]
    
    return X_train, X_test, y_train, y_test

In [182]:
results_summary = []

for ticker in tickers:
    print(f"\n{'='*70}")
    print(f"Processing {ticker}")
    print('='*70)
    
    df = data.xs(ticker, axis=1, level=1).copy()
    y_data = create_target(df, horizon=1)
    
    indicators = continuous_data_features(df)
    trend = trend_deterministic_data(indicators, df)
    
    # Combine all data
    df_combined = df[["Adj Close"]].join(indicators).join(trend).join(y_data)
    df_combined = df_combined.dropna()
    
    # Print class distribution
    print(f"\nClass distribution (all data):")
    total = len(df_combined)
    up_count = (df_combined['Target']==1).sum()
    down_count = (df_combined['Target']==0).sum()
    print(f"  Up (1):   {up_count} ({100*up_count/total:.1f}%)")
    print(f"  Down (0): {down_count} ({100*down_count/total:.1f}%)")
    
    continuous_cols = list(indicators.columns)
    trend_cols = list(trend.columns)
    
    y = df_combined["Target"]
    
    # ==== TEST CONTINUOUS FEATURES ====
    X_cont = df_combined[continuous_cols].copy()
    X_train_c, X_test_c, y_train_c, y_test_c = year_stratified_split(X_cont, y, 0.5)
    
    # Normalize using train set statistics
    train_min = X_train_c.min()
    train_max = X_train_c.max()
    X_train_c_norm = 2 * (X_train_c - train_min) / (train_max - train_min + 1e-10) - 1
    X_test_c_norm = 2 * (X_test_c - train_min) / (train_max - train_min + 1e-10) - 1
    
    rf_cont = RandomForestClassifier(
        n_estimators=100,
        max_depth=20,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features='sqrt',
        random_state=42,
        n_jobs=-1
    )
    rf_cont.fit(X_train_c_norm, y_train_c)
    y_pred_c = rf_cont.predict(X_test_c_norm)
    
    cont_acc = accuracy_score(y_test_c, y_pred_c)
    cont_prec = precision_score(y_test_c, y_pred_c, zero_division=0)
    cont_rec = recall_score(y_test_c, y_pred_c, zero_division=0)
    cont_f1 = f1_score(y_test_c, y_pred_c, zero_division=0)
    
    results_summary.append({
        "Ticker": ticker,
        "Model": "RF_Continuous",
        "Accuracy": cont_acc,
        "Precision": cont_prec,
        "Recall": cont_rec,
        "F1": cont_f1
    })
    
    print(f"\nRandom Forest - Continuous Features")
    print(f"Accuracy: {cont_acc:.3f} | Precision: {cont_prec:.3f} | Recall: {cont_rec:.3f} | F1: {cont_f1:.3f}")
    print(classification_report(y_test_c, y_pred_c, digits=3))
    
    # ==== TEST TREND FEATURES ====
    X_disc = df_combined[trend_cols].copy()
    X_train_d, X_test_d, y_train_d, y_test_d = year_stratified_split(X_disc, y, 0.5)
    
    rf_disc = RandomForestClassifier(
        n_estimators=100,
        max_depth=20,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features='sqrt',
        random_state=42,
        n_jobs=-1
    )
    rf_disc.fit(X_train_d, y_train_d)
    y_pred_d = rf_disc.predict(X_test_d)
    
    disc_acc = accuracy_score(y_test_d, y_pred_d)
    disc_prec = precision_score(y_test_d, y_pred_d, zero_division=0)
    disc_rec = recall_score(y_test_d, y_pred_d, zero_division=0)
    disc_f1 = f1_score(y_test_d, y_pred_d, zero_division=0)
    
    results_summary.append({
        "Ticker": ticker,
        "Model": "RF_Trend",
        "Accuracy": disc_acc,
        "Precision": disc_prec,
        "Recall": disc_rec,
        "F1": disc_f1
    })
    
    print(f"\nRandom Forest - Trend Features")
    print(f"Accuracy: {disc_acc:.3f} | Precision: {disc_prec:.3f} | Recall: {disc_rec:.3f} | F1: {disc_f1:.3f}")
    print(classification_report(y_test_d, y_pred_d, digits=3))

print("\n" + "="*70)
print("SUMMARY")
print("="*70)
results_df = pd.DataFrame(results_summary)
print(results_df.to_string(index=False))
print("\n" + "="*70)
print(f"Average Accuracy (Continuous): {results_df[results_df['Model']=='RF_Continuous']['Accuracy'].mean():.3f}")
print(f"Average Accuracy (Trend):      {results_df[results_df['Model']=='RF_Trend']['Accuracy'].mean():.3f}")


Processing MSFT

Class distribution (all data):
  Up (1):   1109 (49.8%)
  Down (0): 1120 (50.2%)

Random Forest - Continuous Features
Accuracy: 0.521 | Precision: 0.518 | Recall: 0.540 | F1: 0.529
              precision    recall  f1-score   support

           0      0.525     0.504     0.514       562
           1      0.518     0.540     0.529       556

    accuracy                          0.521      1118
   macro avg      0.522     0.522     0.521      1118
weighted avg      0.522     0.521     0.521      1118


Random Forest - Trend Features
Accuracy: 0.518 | Precision: 0.516 | Recall: 0.487 | F1: 0.501
              precision    recall  f1-score   support

           0      0.519     0.548     0.533       562
           1      0.516     0.487     0.501       556

    accuracy                          0.518      1118
   macro avg      0.518     0.518     0.517      1118
weighted avg      0.518     0.518     0.517      1118


Processing AMZN

Class distribution (all data):
  U