# Import Libraries

In [7]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.utils import resample, shuffle
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Download Stocks/Indices 

In [10]:
tickers = ["MSFT", "AMZN", "^GSPC", "^DJI"]
data = yf.download(tickers, start="2003-01-01", end="2012-01-01", auto_adjust=False)
ohlcv = {t: data.xs(t, axis=1, level=1) for t in tickers}

[*********************100%***********************]  4 of 4 completed


# Create Features 

In [13]:
def compute_indicators(df):
    indicators = pd.DataFrame(index=df.index)
    
    close = df["Adj Close"]
    high = df["High"]
    low = df["Low"]
    vol = df["Volume"]
    
    # Simple Moving Average (10-day)
    indicators["SMA10"] = close.rolling(window=10).mean()
    
    # Weighted Moving Average (10-day)
    weights = np.arange(1, 11)
    indicators["WMA10"] = close.rolling(10).apply(lambda x: np.dot(x, weights) / weights.sum(), raw=True)
    
    # Momentum (10-day)
    indicators["MOM10"] = close - close.shift(9)
    
    # Stochastic %K (14-day)
    lowest_low = low.rolling(14).min()
    highest_high = high.rolling(14).max()
    indicators["STOCHK"] = 100 * (close - lowest_low) / (highest_high - lowest_low)
    
    # Stochastic %D (3-day SMA of %K)
    indicators["STOCHD"] = indicators["STOCHK"].rolling(3).mean()
    
    # Relative Strength Index (14-day)
    delta = close.diff()
    up = delta.clip(lower=0).rolling(14).mean()
    down = -delta.clip(upper=0).rolling(14).mean()
    rs = up / down
    indicators["RSI14"] = 100 - (100 / (1 + rs))
    
    # MACD (12-26 EMA difference)
    ema12 = close.ewm(span=12, adjust=False).mean()
    ema26 = close.ewm(span=26, adjust=False).mean()
    indicators["MACD"] = ema12 - ema26
    
    # Williams %R (14-day)
    indicators["WILLR"] = -100 * (highest_high - close) / (highest_high - lowest_low)
    
    # Accumulation/Distribution Oscillator
    clv = ((close - low) - (high - close)) / (high - low)
    clv = clv.fillna(0) 
    indicators["ADOSC"] = (clv * vol).cumsum()
    
    # Commodity Channel Index (20-day)
    tp = (high + low + close) / 3
    sma_tp = tp.rolling(20).mean()
    mad = (tp - sma_tp).abs().rolling(20).mean()
    indicators["CCI20"] = (tp - sma_tp) / (0.015 * mad)
    
    return indicators

In [15]:
def compute_trend_deterministic(indicators, prices):
    #computes trend deterministic data 
    trends = pd.DataFrame(index=indicators.index)
    
    # 1. SMA: +1 if price > SMA, -1 otherwise
    trends['SMA10'] = np.where(prices > indicators['SMA10'], 1, -1)
    
    # 2. WMA: +1 if price > WMA, -1 otherwise
    trends['WMA10'] = np.where(prices > indicators['WMA10'], 1, -1)
    
    # 3. Momentum: +1 if positive, -1 if negative
    trends['MOM10'] = np.where(indicators['MOM10'] > 0, 1, -1)
    
    # 4. Stochastic %K: +1 if increasing, -1 if decreasing
    trends['STOCHK'] = np.where(
        indicators['STOCHK'] > indicators['STOCHK'].shift(1), 1, -1
    )
    
    # 5. Stochastic %D: +1 if increasing, -1 if decreasing
    trends['STOCHD'] = np.where(
        indicators['STOCHD'] > indicators['STOCHD'].shift(1), 1, -1
    )
    
    # 6. RSI: Special rules for overbought/oversold
    rsi = indicators['RSI14']
    rsi_trend = np.where(rsi > rsi.shift(1), 1, -1)
    rsi_trend = np.where(rsi > 70, -1, rsi_trend)  # Overbought
    rsi_trend = np.where(rsi < 30, 1, rsi_trend)   # Oversold
    trends['RSI14'] = rsi_trend
    
    # 7. MACD: +1 if increasing, -1 if decreasing
    trends['MACD'] = np.where(
        indicators['MACD'] > indicators['MACD'].shift(1), 1, -1
    )
    
    # 8. Williams %R: +1 if increasing, -1 if decreasing
    trends['WILLR'] = np.where(
        indicators['WILLR'] > indicators['WILLR'].shift(1), 1, -1
    )
    
    # 9. A/D Oscillator: +1 if increasing, -1 if decreasing
    trends['ADOSC'] = np.where(
        indicators['ADOSC'] > indicators['ADOSC'].shift(1), 1, -1
    )
    
    # 10. CCI: Special rules for overbought/oversold
    cci = indicators['CCI20']
    cci_trend = np.where(cci > cci.shift(1), 1, -1)
    cci_trend = np.where(cci > 200, -1, cci_trend)   # Overbought
    cci_trend = np.where(cci < -200, 1, cci_trend)   # Oversold
    trends['CCI20'] = cci_trend
    
    return trends



In [17]:
def normalize_features(X_train, X_test):
    """Normalize continuous features to [-1, +1] range"""
    X_train_norm = X_train.copy()
    X_test_norm = X_test.copy()
    
    for col in X_train.columns:
        min_val = X_train[col].min()
        max_val = X_train[col].max()
        
        if max_val > min_val:  # Avoid division by zero
            X_train_norm[col] = 2 * (X_train[col] - min_val) / (max_val - min_val) - 1
            X_test_norm[col] = 2 * (X_test[col] - min_val) / (max_val - min_val) - 1
    
    return X_train_norm, X_test_norm


In [19]:
# Add after computing trends, before training:
print("\nFeature Statistics:")
print(f"Continuous features - any NaN: {indicators_final.isna().any().any()}")
print(f"Discrete features - any NaN: {trends_final.isna().any().any()}")
print(f"\nDiscrete feature value distribution:")
print((trends_final == 1).sum() / len(trends_final))  # Should be around 0.5

print(f"\nTarget distribution:")
print(f"Train: {y_train.value_counts(normalize=True)}")
print(f"Test: {y_test.value_counts(normalize=True)}")


Feature Statistics:


NameError: name 'indicators_final' is not defined

In [25]:
results_summary = []

for ticker in tickers:
    print(f"\n{'='*60}")
    print(f"Processing {ticker}")
    print('='*60)
    
    df = ohlcv[ticker].copy()
    
    # Compute continuous indicators
    indicators = compute_indicators(df)
    
    # Create target: 1 if next day's price goes up, 0 otherwise
    target = (df["Adj Close"].shift(-1) > df["Adj Close"]).astype(int)
    
    # Remove NaN values
    valid_idx = indicators.dropna().index
    indicators_clean = indicators.loc[valid_idx]
    target_clean = target.loc[valid_idx]
    prices_clean = df["Adj Close"].loc[valid_idx]
    
    # Compute trend deterministic representation
    trends = compute_trend_deterministic(indicators_clean, prices_clean)
    trends_clean = trends.dropna()
    
    # Align all data
    common_idx = trends_clean.index.intersection(target_clean.index)
    indicators_final = indicators_clean.loc[common_idx]
    trends_final = trends_clean.loc[common_idx]
    target_final = target_clean.loc[common_idx]

    print(f"Dataset size: {len(common_idx)} samples")
    print(f"Target distribution - Up: {target_final.sum()}, Down: {len(target_final) - target_final.sum()}")


    X_train_cont, X_test_cont, y_train, y_test, idx_train, idx_test = train_test_split(indicators_final, 
                                                                                       target_final, 
                                                                                       common_idx, 
                                                                                       test_size=0.2, 
                                                                                       shuffle=False,
                                                                                        random_state=42)
    # ============================================
    # APPROACH 1: Continuous-valued Input (GaussianNB)
    # ============================================
    print(f"\n--- Continuous Representation (GaussianNB) ---")
    
    X_train_cont = indicators_final.loc[idx_train]
    X_test_cont = indicators_final.loc[idx_test]
    y_train = target_final.loc[idx_train]
    y_test = target_final.loc[idx_test]
    
    # Normalize to [-1, +1] as per paper
    X_train_norm, X_test_norm = normalize_features(X_train_cont, X_test_cont)
    
    # Train GaussianNB
    gnb = GaussianNB()
    gnb.fit(X_train_norm, y_train)
    y_pred_cont = gnb.predict(X_test_norm)
    
    acc_cont = accuracy_score(y_test, y_pred_cont)
    f1_cont = f1_score(y_test, y_pred_cont, average='weighted')
    
    print(f"Accuracy: {acc_cont:.4f}")
    print(f"F-measure: {f1_cont:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred_cont, target_names=['Down', 'Up']))
    
    # ============================================
    # APPROACH 2: Trend Deterministic Input (BernoulliNB)
    # ============================================
    print(f"\n--- Discrete Representation (BernoulliNB) ---")
    
    X_train_disc = trends_final.loc[train_idx]
    X_test_disc = trends_final.loc[test_idx]
    
    # Convert from {-1, +1} to {0, 1} for BernoulliNB
    X_train_binary = ((X_train_disc + 1) / 2).astype(int)
    X_test_binary = ((X_test_disc + 1) / 2).astype(int)
    
    # Train BernoulliNB
    bnb = BernoulliNB()
    bnb.fit(X_train_binary, y_train)
    y_pred_disc = bnb.predict(X_test_binary)
    
    acc_disc = accuracy_score(y_test, y_pred_disc)
    f1_disc = f1_score(y_test, y_pred_disc, average='weighted')
    
    print(f"Accuracy: {acc_disc:.4f}")
    print(f"F-measure: {f1_disc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred_disc, target_names=['Down', 'Up']))
    
    # Store results
    results_summary.append({
        'Ticker': ticker,
        'Continuous_Accuracy': acc_cont,
        'Continuous_F1': f1_cont,
        'Discrete_Accuracy': acc_disc,
        'Discrete_F1': f1_disc,
        'Improvement': acc_disc - acc_cont
    })



Processing MSFT
Dataset size: 2229 samples
Target distribution - Up: 1108, Down: 1121

--- Continuous Representation (GaussianNB) ---


NameError: name 'f1_score' is not defined

In [27]:
print("FINAL SUMMARY")
results_df = pd.DataFrame(results_summary)
print(results_df.to_string(index=False))
print(f"\nAverage Continuous Accuracy: {results_df['Continuous_Accuracy'].mean():.4f}")
print(f"Average Discrete Accuracy: {results_df['Discrete_Accuracy'].mean():.4f}")
print(f"Average Improvement: {results_df['Improvement'].mean():.4f}")

FINAL SUMMARY
Empty DataFrame
Columns: []
Index: []


KeyError: 'Continuous_Accuracy'