In [11]:
from pathlib import Path

import numpy as np
import pandas as pd
import pandas_ta as ta
import yfinance as yf
from sklearn.preprocessing import StandardScaler

# Run configuration: the symbols to download and the date range requested from yfinance.
# The first saved window further down starts on 2013-01-01, so the two months
# before it presumably act as warm-up for the rolling indicators -- TODO confirm.
tickers = ['SPY', 'IWM', 'DIA']
start_date = '2012-11-01'
end_date = '2024-12-31'

# Manual Calculation Functions

In [12]:
def calculate_cmfi(input_df, window):
    """Chaikin Money Flow over a rolling window.

    Parameters
    ----------
    input_df : DataFrame with 'High', 'Low', 'Close' and 'Volume' columns.
    window : int, number of rows in each rolling sum.

    Returns
    -------
    Series aligned with ``input_df``: rolling sum of money-flow volume divided
    by the rolling sum of volume. The first ``window - 1`` rows are NaN, and a
    window whose total volume is zero yields NaN.
    """
    high = input_df['High']
    low = input_df['Low']
    close = input_df['Close']
    volume = input_df['Volume']

    bar_range = high - low
    raw_multiplier = ((close - low) - (high - close)) / bar_range
    # A bar with High == Low divides by zero (NaN or +/-inf), and one such row
    # would poison every rolling window that contains it. Such a bar carries no
    # directional information, so give it a neutral multiplier of 0.
    # Rows with missing prices keep their NaN (NaN != 0 evaluates to True).
    mf_multiplier = raw_multiplier.where(bar_range != 0, 0.0)

    mf_volume = mf_multiplier * volume
    return mf_volume.rolling(window=window).sum() / volume.rolling(window=window).sum()

def calculate_dmi(input_df, window):
    """Average Directional Index (ADX) built from the directional-movement lines.

    Parameters
    ----------
    input_df : DataFrame with 'High', 'Low' and 'Close' columns.
    window : int; every smoothing step is an exponential mean with
        ``alpha = 1 / window`` and ``adjust=False``.

    Returns
    -------
    Series aligned with ``input_df`` holding the smoothed DX (i.e. ADX).
    The first row is NaN because the true range needs the previous close.
    """
    idx = input_df.index
    high = input_df['High']
    low = input_df['Low']
    prev_close = input_df['Close'].shift(1)

    def _smooth(series):
        # One shared definition of the exponential smoothing used throughout.
        return series.ewm(alpha=1/window, adjust=False).mean()

    # True range: the widest of today's range and the two gaps from the prior close.
    gap_from_high = abs(high - prev_close)
    gap_from_low = abs(low - prev_close)
    true_range = pd.Series(
        np.maximum(high - low, np.maximum(gap_from_high, gap_from_low)),
        index=idx,
    )

    # Directional movement: only the dominant, positive move of the day counts.
    rise = high.diff()
    fall = -low.diff()
    plus_dm = pd.Series(np.where((rise > fall) & (rise > 0), rise, 0), index=idx)
    minus_dm = pd.Series(np.where((fall > rise) & (fall > 0), fall, 0), index=idx)

    smoothed_tr = _smooth(true_range)
    plus_di = 100 * (_smooth(plus_dm) / smoothed_tr)
    minus_di = 100 * (_smooth(minus_dm) / smoothed_tr)

    dx = 100 * abs(plus_di - minus_di) / (plus_di + minus_di)
    return _smooth(dx)  # ADX

def calculate_psar_window(high, low, initial_af=0.02, max_af=0.2, step=0.02):
    """Compute one Parabolic SAR series from high/low prices.

    The series starts in an uptrend with SAR at the first low and the extreme
    point (EP) at the first high. On each bar the SAR moves towards the EP by
    the acceleration factor (AF). When price crosses the SAR the trend flips:
    SAR jumps to the old EP, EP restarts at the current bar's extreme and AF
    resets to ``initial_af``. Otherwise a new extreme raises AF by ``step``,
    capped at ``max_af``.

    Parameters
    ----------
    high, low : pandas Series of equal length; the result reuses ``high.index``.
    initial_af : AF at the start and after every reversal.
    max_af : upper bound for AF.
    step : AF increment applied whenever a new extreme is made.

    Returns
    -------
    Series of SAR values indexed like ``high``; empty if the input is empty.
    """
    # Plain float arrays avoid a pandas .iloc lookup on every loop iteration.
    highs = np.asarray(high, dtype=float)
    lows = np.asarray(low, dtype=float)
    n_bars = len(highs)
    if n_bars == 0:
        # Nothing to seed the recursion with; previously this raised IndexError.
        return pd.Series([], index=high.index, dtype=float)

    psar = np.empty(n_bars, dtype=float)
    psar[0] = lows[0]
    ep = highs[0]
    af = initial_af
    uptrend = True

    for i in range(1, n_bars):
        prev_psar = psar[i - 1]

        if uptrend:
            current_psar = prev_psar + af * (ep - prev_psar)
            if lows[i] < current_psar:
                # Price fell through the SAR: flip to a downtrend.
                uptrend = False
                current_psar = ep
                ep = lows[i]
                af = initial_af
            elif highs[i] > ep:
                ep = highs[i]
                af = min(af + step, max_af)
        else:
            current_psar = prev_psar - af * (prev_psar - ep)
            if highs[i] > current_psar:
                # Price rose through the SAR: flip to an uptrend.
                uptrend = True
                current_psar = ep
                ep = highs[i]
                af = initial_af
            elif lows[i] < ep:
                ep = lows[i]
                af = min(af + step, max_af)

        psar[i] = current_psar

    return pd.Series(psar, index=high.index)

def calculate_all_psar(df, windows=range(6, 21)):
    """Add one ``PSAR_<n>`` column to ``df`` per window length and return ``df``.

    PSAR has no window parameter, so each length ``n`` is mapped to an AF step
    of ``0.02 * 14 / n``: shorter windows accelerate faster, and ``n = 14``
    gives the default 0.02 step. ``df`` is modified in place.
    """
    highs, lows = df['High'], df['Low']
    for length in windows:
        af_step = 0.02 * (14 / length)  # 14-day baseline
        df[f'PSAR_{length}'] = calculate_psar_window(highs, lows, step=af_step)
    return df

In [13]:
def apply_labelling(df, upper_mult=2.0, lower_mult=1.5, max_days=15):
    """Volatility-adjusted triple-barrier labels with first-hit priority.

    For each row the barriers are ``Close + upper_mult * ATR`` and
    ``Close - lower_mult * ATR``. The next ``max_days`` closes are scanned and
    the label records which barrier is touched first:

    * 2 -- upper barrier touched first
    * 0 -- lower barrier touched first
    * 1 -- neither barrier touched within ``max_days`` (time barrier)
    * NaN -- the row cannot be labelled: either its ATR is undefined, or fewer
      than ``max_days`` future rows exist. These rows used to be reported as
      1 and 0 respectively, which are indistinguishable from real labels.

    Parameters
    ----------
    df : DataFrame with 'High', 'Low' and 'Close' columns. Modified in place:
        'ATR' and 'Label' columns are added.
    upper_mult, lower_mult : barrier distances in ATR multiples.
    max_days : look-ahead horizon in rows; also used as the ATR length.

    Returns
    -------
    The same ``df`` object with the two new columns.
    """
    close = df['Close'].to_numpy(dtype=float)

    # Calculate ATR using pandas_ta
    df['ATR'] = ta.atr(df['High'], df['Low'], df['Close'], length=max_days)
    atr = df['ATR'].to_numpy(dtype=float)

    # Start from NaN so rows that are never evaluated are not mistaken for class 0.
    labels = np.full(len(close), np.nan)
    for i in range(len(close) - max_days):
        if np.isnan(atr[i]):
            # Barriers are undefined during the ATR warm-up; leave unlabelled.
            continue

        upper = close[i] + upper_mult * atr[i]
        lower = close[i] - lower_mult * atr[i]
        future_window = close[i + 1:i + max_days + 1]

        touched = (future_window >= upper) | (future_window <= lower)
        if not touched.any():
            labels[i] = 1  # Time barrier
            continue

        first_touch = int(np.argmax(touched))
        # If both barriers coincide on the same bar, the lower barrier wins,
        # matching the original tie-break.
        labels[i] = 0 if future_window[first_touch] <= lower else 2

    df['Label'] = labels
    return df

# Get Data, Indicators & Labels

In [14]:
# Download daily bars for every ticker; group_by='ticker' lets each symbol be
# selected as data[ticker] below.
data = yf.download(tickers, start=start_date, end=end_date, interval='1d', group_by='ticker')

# DataFrame.to_csv does not create missing directories, so make sure the
# target exists before any window is written.
output_dir = Path('./data/test_years')
output_dir.mkdir(parents=True, exist_ok=True)

for ticker in tickers:
    print(f"\nProcessing {ticker}...")
    df = data[ticker].dropna().copy()
    df = df.reset_index()
    df['Date'] = pd.to_datetime(df['Date'])

    high, low, close = df['High'], df['Low'], df['Close']

    # Collect every indicator in a dict and concatenate once at the end,
    # instead of inserting columns one at a time.
    indicator_data = {}

    for n in range(6, 21):
        # Moving averages
        indicator_data[f'SMA_{n}'] = ta.sma(close, length=n)
        indicator_data[f'EMA_{n}'] = ta.ema(close, length=n)
        indicator_data[f'WMA_{n}'] = ta.wma(close, length=n)
        indicator_data[f'HMA_{n}'] = ta.hma(close, length=n)
        indicator_data[f'TEMA_{n}'] = ta.tema(close, length=n)
        # Momentum indicators
        indicator_data[f'RSI_{n}'] = ta.rsi(close, length=n)
        indicator_data[f'Williams_%R_{n}'] = ta.willr(high, low, close, length=n)
        indicator_data[f'CMO_{n}'] = ta.cmo(close, length=n)
        indicator_data[f'ROC_{n}'] = ta.roc(close, length=n)
        # Composite indicators: the slow length is twice the fast one, capped at 26.
        slow = min(2*n, 26)
        macd = ta.macd(close, fast=n, slow=slow, signal=9)
        indicator_data[f'MACD_{n}'] = macd[f'MACD_{n}_{slow}_9']
        ppo = ta.ppo(close, fast=n, slow=slow, signal=9)
        indicator_data[f'PPO_{n}'] = ppo[f'PPO_{n}_{slow}_9']
        indicator_data[f'CCI_{n}'] = ta.cci(high, low, close, length=n)
        # Volume/trend
        indicator_data[f'CMFI_{n}'] = calculate_cmfi(df, n)
        indicator_data[f'DMI_{n}'] = calculate_dmi(df, n)

    # Add PSAR (windowed): AF step scaled inversely with n, 14-day baseline.
    psar_data = {f'PSAR_{n}': calculate_psar_window(high, low, step=0.02*(14/n))
                 for n in range(6, 21)}
    indicator_data.update(psar_data)

    # Combine all at once
    full_df = pd.concat([df, pd.DataFrame(indicator_data, index=df.index)], axis=1)

    # Apply labelling (adds the 'ATR' and 'Label' columns)
    full_df = apply_labelling(full_df)

    # Drop raw price/volume columns that are not used as features
    full_df = full_df.drop(columns=['Open', 'High', 'Low', 'Volume'])

    # Create 8-year rolling windows: 2013-2020, 2014-2021, ..., 2017-2024.
    # Each file is named after the last two years of its window.
    for year in range(2013, 2018):
        window_start = f"{year}-01-01"
        window_end = f"{year+7}-12-31"
        window_df = full_df[(full_df['Date'] >= window_start) & (full_df['Date'] <= window_end)]

        if not window_df.empty:
            window_df.to_csv(output_dir / f'{ticker}_{year+6}_{year+7}.csv', index=False)
            print(f"Saved {ticker}_{year+6}_{year+7}.csv")

[*********************100%***********************]  3 of 3 completed



Processing SPY...
Saved SPY_2019_2020.csv
Saved SPY_2020_2021.csv
Saved SPY_2021_2022.csv
Saved SPY_2022_2023.csv
Saved SPY_2023_2024.csv

Processing IWM...
Saved IWM_2019_2020.csv
Saved IWM_2020_2021.csv
Saved IWM_2021_2022.csv
Saved IWM_2022_2023.csv
Saved IWM_2023_2024.csv

Processing DIA...
Saved DIA_2019_2020.csv
Saved DIA_2020_2021.csv
Saved DIA_2021_2022.csv
Saved DIA_2022_2023.csv
Saved DIA_2023_2024.csv


# Train Test Split

In [15]:
# DataFrame.to_csv does not create missing directories, so make sure the
# target exists before any file is written.
normalised_dir = Path('./data/normalised')
normalised_dir.mkdir(parents=True, exist_ok=True)

for ticker in tickers:
    for year in range(2019, 2024):
        df = pd.read_csv(f'./data/test_years/{ticker}_{year}_{year+1}.csv')
        df['Date'] = pd.to_datetime(df['Date'])

        # Split into periods: everything up to year-2 trains, year-1 validates,
        # year onwards is the test set.
        # .copy() makes each split an independent frame, so the .loc
        # assignments below do not write into slices of df
        # (avoids SettingWithCopyWarning).
        row_year = df['Date'].dt.year
        train = df[row_year <= year-2].copy()
        val = df[row_year == year-1].copy()
        test = df[row_year >= year].copy()

        # Identify columns to normalize (all except Date/Close/Label)
        exclude_cols = ['Date', 'Close', 'Label']
        feature_cols = [col for col in df.columns if col not in exclude_cols]

        # Fit the scaler on the training rows only, then apply the same
        # statistics to validation and test rows.
        scaler = StandardScaler()
        train.loc[:, feature_cols] = scaler.fit_transform(train[feature_cols])
        val.loc[:, feature_cols] = scaler.transform(val[feature_cols])
        test.loc[:, feature_cols] = scaler.transform(test[feature_cols])

        # Combine back and save
        normalised_df = pd.concat([train, val, test], axis=0)
        normalised_df.to_csv(normalised_dir / f'{ticker}_{year}_{year+1}.csv', index=False)

        print(f"Saved {ticker}_{year}_{year+1}.csv")

print('Normalised data saved.')

Saved SPY_2019_2020.csv
Saved SPY_2020_2021.csv
Saved SPY_2021_2022.csv
Saved SPY_2022_2023.csv
Saved SPY_2023_2024.csv
Saved IWM_2019_2020.csv
Saved IWM_2020_2021.csv
Saved IWM_2021_2022.csv
Saved IWM_2022_2023.csv
Saved IWM_2023_2024.csv
Saved DIA_2019_2020.csv
Saved DIA_2020_2021.csv
Saved DIA_2021_2022.csv
Saved DIA_2022_2023.csv
Saved DIA_2023_2024.csv
Normalised data saved.
