In [20]:
from pathlib import Path

import numpy as np
import pandas as pd
import pandas_ta as ta
import yfinance as yf
from sklearn.preprocessing import StandardScaler

# Define the tickers and the date range
tickers = ['XLF', 'XLU', 'QQQ', 'SPY', 'XLP', 'EWZ', 'EWH', 'XLY', 'XLE']
# Starts before the 2014-01-01 export range used further down
# (presumably so the indicators have warm-up history -- confirm).
start_date = '2013-11-01'
# yfinance treats `end` as exclusive, so the day after the last wanted
# session (2024-12-31) must be passed for that session to be downloaded.
end_date = '2025-01-01'

# Manual Calculation Functions

In [21]:
# Calculate Chaikin Money Flow Indicator (CMFI)
def calculate_cmfi(input_df, window=20):
    """Return the Chaikin Money Flow series for an OHLCV frame.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must provide 'High', 'Low', 'Close' and 'Volume' columns.
        The frame is only read, never modified.
    window : int, default 20
        Number of rows summed in both the numerator and the denominator.

    Returns
    -------
    pandas.Series
        Rolling sum of money-flow volume divided by the rolling sum of
        volume. The first ``window - 1`` entries are NaN.
    """
    high = input_df['High']
    low = input_df['Low']
    close = input_df['Close']
    volume = input_df['Volume']

    price_range = high - low
    has_range = price_range != 0

    # A bar with High == Low would give 0/0 = NaN, and that single NaN would
    # blank out every rolling window that contains it. Such a bar carries no
    # money-flow information, so its multiplier is set to 0 instead.
    # Genuinely missing prices (NaN range) are left as NaN.
    mf_multiplier = ((close - low) - (high - close)) / price_range.where(has_range)
    mf_multiplier = mf_multiplier.where(has_range, 0.0)

    mf_volume = mf_multiplier * volume
    return mf_volume.rolling(window=window).sum() / volume.rolling(window=window).sum()





# Calculate Directional Movement Indicator (DMI)
def calculate_dmi(input_df, period=14):
    """Return an ADX-style trend-strength series built from +DI / -DI.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must provide 'High', 'Low' and 'Close' columns. The frame is only
        read, never modified.
    period : int, default 14
        Span passed to every exponential moving average in the calculation.

    Returns
    -------
    pandas.Series
        EWM-smoothed DX, named 'DX'. The first row is NaN because the true
        range needs the previous close. DX itself is NaN wherever
        ``+DI + -DI`` is zero or undefined.

    Notes
    -----
    Smoothing uses ``ewm(span=period, adjust=False)``.
    NOTE(review): Wilder's original smoothing corresponds to
    ``alpha = 1 / period`` rather than ``span = period`` -- confirm which
    one is intended before comparing against other ADX implementations.
    """
    high = input_df['High']
    low = input_df['Low']
    prev_close = input_df['Close'].shift(1)

    def _smooth(series):
        # Single place that defines the smoothing used at every stage.
        return series.ewm(span=period, adjust=False).mean()

    # True range: widest of today's range and the two gaps to the prior close.
    true_range = np.maximum(
        high - low,
        np.maximum(abs(high - prev_close), abs(low - prev_close)),
    )

    up_move = high.diff()
    down_move = low.shift(1) - low

    # Only the dominant, positive move counts as directional movement.
    plus_dm = pd.Series(
        np.where((up_move > down_move) & (up_move > 0), up_move, 0),
        index=input_df.index,
    )
    minus_dm = pd.Series(
        np.where((down_move > up_move) & (down_move > 0), down_move, 0),
        index=input_df.index,
    )

    smoothed_tr = _smooth(true_range)
    plus_di = 100 * (_smooth(plus_dm) / smoothed_tr)
    minus_di = 100 * (_smooth(minus_dm) / smoothed_tr)

    dx = 100 * (abs(plus_di - minus_di) / (plus_di + minus_di))
    return _smooth(dx).rename('DX')  # Use ADX as the DMI column





# Calculate Parabolic SAR (PSAR)
def calculate_psar(high, low, af_start=0.02, af_step=0.02, af_max=0.2):
    """Compute a Parabolic SAR value for every bar.

    Parameters
    ----------
    high, low : sequence of float (e.g. pandas.Series)
        Bar highs and lows, read positionally; index labels are ignored.
    af_start : float, default 0.02
        Acceleration factor used at the start and after every reversal.
    af_step : float, default 0.02
        Amount added to the acceleration factor whenever a new extreme
        point is made in the direction of the trend.
    af_max : float, default 0.2
        Upper bound for the acceleration factor.

    Returns
    -------
    list
        One SAR value per input bar. Empty when both inputs are empty.

    Notes
    -----
    The series is seeded as an uptrend with SAR = first low and
    extreme point = first high.
    NOTE(review): the classic formulation also clamps the SAR so it never
    moves inside the previous two bars' range; that clamp is not applied
    here -- confirm whether it is wanted.
    """
    # Plain arrays avoid the per-element .iloc overhead inside the loop.
    highs = np.asarray(high, dtype=float)
    lows = np.asarray(low, dtype=float)
    if highs.size == 0 and lows.size == 0:
        return []

    psar = [lows[0]]  # Start with the first low as the initial SAR
    ep = highs[0]  # Extreme Point (EP) starts as the first high
    af = af_start
    uptrend = True  # Assume the trend starts as rising

    for i in range(1, len(highs)):
        prev_psar = psar[-1]

        if uptrend:
            # Rising SAR moves toward the highest high seen in this trend.
            current_psar = prev_psar + af * (ep - prev_psar)
            if lows[i] < current_psar:
                # Price pierced the SAR: flip to a downtrend, restart from the old EP.
                uptrend = False
                current_psar = ep
                ep = lows[i]
                af = af_start
            elif highs[i] > ep:
                ep = highs[i]
                af = min(af + af_step, af_max)
        else:
            # Falling SAR moves toward the lowest low seen in this trend.
            current_psar = prev_psar - af * (prev_psar - ep)
            if highs[i] > current_psar:
                # Price pierced the SAR: flip to an uptrend, restart from the old EP.
                uptrend = True
                current_psar = ep
                ep = highs[i]
                af = af_start
            elif lows[i] < ep:
                ep = lows[i]
                af = min(af + af_step, af_max)

        psar.append(current_psar)

    return psar





def apply_labelling(data, window_size=11):
    """Add a 'Label' column of BUY / SELL / HOLD based on local extrema.

    For every row that ends a full window of ``window_size`` closes, the
    window is inspected:
        - SELL: the (first) maximum close sits at the window's middle index.
        - BUY:  the (first) minimum close sits at the window's middle index.
        - HOLD: otherwise, or when the window contains a NaN close.
    SELL is checked before BUY. Rows that do not yet have a full window
    behind them are labelled HOLD.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a 'Close' column. The frame is modified in place: a
        'Label' column is added or overwritten.
    window_size : int, default 11
        Number of consecutive closes per window; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.

    Notes
    -----
    NOTE(review): the label is written on the *last* row of the window,
    not on the middle row whose extremum was tested, so it trails the
    extremum by ``window_size // 2`` rows -- confirm this lag is intended.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    close_prices = data['Close'].to_numpy(dtype=float)
    labels = ['HOLD'] * len(close_prices)

    for window_end_index in range(window_size - 1, len(close_prices)):
        window_begin_index = window_end_index + 1 - window_size
        window_middle_index = (window_begin_index + window_end_index) // 2
        window = close_prices[window_begin_index:window_end_index + 1]

        # An extremum is undefined when a close is missing; keep HOLD.
        if np.isnan(window).any():
            continue

        # argmax / argmin return the first occurrence, like list.index did.
        max_index = window_begin_index + int(np.argmax(window))
        min_index = window_begin_index + int(np.argmin(window))

        if max_index == window_middle_index:
            labels[window_end_index] = 'SELL'
        elif min_index == window_middle_index:
            labels[window_end_index] = 'BUY'

    # Add labels to the DataFrame
    data['Label'] = labels
    return data

# Get Data, Indicators & Labels

In [None]:
# Download the data
data = yf.download(tickers, start=start_date, end=end_date, interval='1d', group_by='ticker')

# Make sure the output directory exists so the first to_csv call cannot fail
# on a fresh checkout.
indicators_dir = Path('./data/indicators')
indicators_dir.mkdir(parents=True, exist_ok=True)

# Loop through each ticker and calculate indicators
for ticker in tickers:
    print(f"Processing {ticker}...")
    # Take an explicit copy: columns are added below, and assigning onto a
    # frame derived from `data` would otherwise be a chained assignment.
    ticker_data = data[ticker].dropna().copy()

    # Calculate indicators (pandas_ta defaults are used for every length)
    ticker_data['RSI'] = ta.rsi(ticker_data['Close'])
    ticker_data['Williams %R'] = ta.willr(ticker_data['High'], ticker_data['Low'], ticker_data['Close'])
    ticker_data['SMA'] = ta.sma(ticker_data['Close'])
    ticker_data['EMA'] = ta.ema(ticker_data['Close'])
    ticker_data['WMA'] = ta.wma(ticker_data['Close'])
    ticker_data['HMA'] = ta.hma(ticker_data['Close'])
    ticker_data['TEMA'] = ta.tema(ticker_data['Close'])
    ticker_data['CCI'] = ta.cci(ticker_data['High'], ticker_data['Low'], ticker_data['Close'])
    ticker_data['CMO'] = ta.cmo(ticker_data['Close'])
    ticker_data['MACD'] = ta.macd(ticker_data['Close'])['MACD_12_26_9']
    ticker_data['PPO'] = ta.ppo(ticker_data['Close'])['PPO_12_26_9']
    ticker_data['ROC'] = ta.roc(ticker_data['Close'])
    ticker_data['CMFI'] = calculate_cmfi(ticker_data)
    ticker_data['DMI'] = calculate_dmi(ticker_data)
    ticker_data['Parabolic SAR'] = calculate_psar(ticker_data['High'], ticker_data['Low'])

    # Apply labelling algorithm (needs 'Close', so it runs before the drop below)
    ticker_data = apply_labelling(ticker_data)

    # Drop the raw price/volume columns; only indicators and the label are exported
    ticker_data = ticker_data.drop(columns=['Open', 'High', 'Low', 'Close', 'Volume'])

    # Extract data for 2014-2024
    ticker_data.loc['2014-01-01':'2024-12-31'].to_csv(indicators_dir / f'{ticker}_indicators.csv')

    # Checkpoint
    print(f"Saved {ticker}_indicators.csv")

# Done!
print("Processing complete.")

[*********************100%***********************]  9 of 9 completed


Processing XLF...
Saved XLF_indicators.csv
Processing XLU...
Saved XLU_indicators.csv
Processing QQQ...
Saved QQQ_indicators.csv
Processing SPY...
Saved SPY_indicators.csv
Processing XLP...
Saved XLP_indicators.csv
Processing EWZ...
Saved EWZ_indicators.csv
Processing EWH...
Saved EWH_indicators.csv
Processing XLY...
Saved XLY_indicators.csv
Processing XLE...
Saved XLE_indicators.csv
Processing complete.


# Generate 15-Day Windows

In [28]:
# Build one row per day holding the preceding window of every feature column
def generate_15_day_windows(data, label_column, window_size=15):
    """Turn a daily frame into rows of lagged feature windows.

    For every position ``i`` from ``window_size`` to the end of ``data``,
    the output row holds, per feature column, the list of the
    ``window_size`` values immediately before ``i`` (row ``i`` itself is
    excluded), plus the label and the 'Date' of row ``i``.

    Feature columns are all columns except ``label_column`` and 'Date'.
    Output columns are ordered: features, then the label, then 'Date'.
    """
    lag_columns = [
        name for name in data.columns
        if name != label_column and name != 'Date'
    ]
    # Materialise each feature column once; windows are plain list slices.
    column_values = {name: data[name].tolist() for name in lag_columns}
    label_series = data[label_column]
    date_series = data['Date']

    def build_row(position):
        first = position - window_size
        record = {name: column_values[name][first:position] for name in lag_columns}
        record[label_column] = label_series.iloc[position]
        record['Date'] = date_series.iloc[position]
        return record

    records = [build_row(position) for position in range(window_size, len(data))]
    return pd.DataFrame(records)

# Train Test Split

In [29]:
# Define the sliding window parameters
start_year = 2014
end_year = 2024
window_size = 6  # 6 years per window

# Make sure the output directory exists so the first to_csv call cannot fail
# on a fresh checkout.
test_years_dir = Path('./data/test_years')
test_years_dir.mkdir(parents=True, exist_ok=True)

# Loop through each ticker and build its normalized train/test windows
for ticker in tickers:
    print(f"Processing {ticker}...")

    data = pd.read_csv(f'./data/indicators/{ticker}_indicators.csv', parse_dates=['Date'])

    # Iterate through the sliding windows (one per possible first year)
    for start in range(start_year, end_year - window_size + 2):
        # Define the start and end years for the current window
        window_start = start
        window_end = start + window_size - 1

        # Filter the data for the current window
        in_window = (data['Date'].dt.year >= window_start) & (data['Date'].dt.year <= window_end)
        window_data = data[in_window]

        # Split into training (all years before the last) and test (final year) sets.
        # Explicit copies: both frames are written to below, and assigning into
        # a boolean-filtered slice is a chained assignment.
        train_data = window_data[window_data['Date'].dt.year < window_end].copy()
        test_data = window_data[window_data['Date'].dt.year == window_end].copy()

        # Extract feature columns (exclude Date and Label columns)
        feature_columns = [col for col in window_data.columns if col not in ['Date', 'Label']]

        # Fit the scaler on the training years only, so no test-year
        # statistics leak into the normalization
        scaler = StandardScaler()
        train_data.loc[:, feature_columns] = scaler.fit_transform(train_data[feature_columns])

        # Apply the same transformation to the test set
        test_data.loc[:, feature_columns] = scaler.transform(test_data[feature_columns])

        # Combine the normalized training and test sets (train rows first)
        combined_data = pd.concat([train_data, test_data])

        # Generate 15 day lagged windows
        lagged_data = generate_15_day_windows(combined_data, "Label")

        # Save the lagged data; the file is named after the window's test year
        lagged_data.to_csv(test_years_dir / f'{ticker}_{window_end}.csv', index=False)

        print(f"Saved data for window {window_start}-{window_end}.")

Processing XLF...
Saved data for window 2014-2019.
Saved data for window 2015-2020.
Saved data for window 2016-2021.
Saved data for window 2017-2022.
Saved data for window 2018-2023.
Saved data for window 2019-2024.
Processing XLU...
Saved data for window 2014-2019.
Saved data for window 2015-2020.
Saved data for window 2016-2021.
Saved data for window 2017-2022.
Saved data for window 2018-2023.
Saved data for window 2019-2024.
Processing QQQ...
Saved data for window 2014-2019.
Saved data for window 2015-2020.
Saved data for window 2016-2021.
Saved data for window 2017-2022.
Saved data for window 2018-2023.
Saved data for window 2019-2024.
Processing SPY...
Saved data for window 2014-2019.
Saved data for window 2015-2020.
Saved data for window 2016-2021.
Saved data for window 2017-2022.
Saved data for window 2018-2023.
Saved data for window 2019-2024.
Processing XLP...
Saved data for window 2014-2019.
Saved data for window 2015-2020.
Saved data for window 2016-2021.
Saved data for wind