In [13]:
import yfinance as yf
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
from collections import defaultdict
import nltk
from nltk import CFG

# List of 10 tech stocks
#tech_stocks = ['MSFT', 'GOOGL', 'AMZN', 'META', 'TSLA', 'NVDA', 'NFLX', 'ADBE', 'INTC']
tech_stocks = ['MSFT', 'GOOGL', 'AMZN', 'META']

# Download data for these stocks
intraday_data = yf.download(tickers=tech_stocks, period='5d', interval='1m')

# Save to CSV for further analysis (optional)
intraday_data.to_csv(f'tech_stocks_intraday_data.csv')

[*********************100%***********************]  4 of 4 completed


In [16]:
adj_close_data = intraday_data['Adj Close']
adj_close_data.fillna(method='ffill', inplace=True)
adj_close_data.columns = [f'{stock}_last_price' for stock in adj_close_data.columns]

# Calculate the percentage return for each stock as the percentage change of 'last price'
stock_returns = adj_close_data.pct_change() * 100
stock_returns.columns = [f'{col}_return' for col in adj_close_data.columns]

# Normalize the last_price columns using Min-Max normalization
normalized_data = adj_close_data.copy()

# Perform Min-Max normalization for each 'last_price' column
for column in normalized_data.columns:
    # Apply Min-Max normalization formula
    min_value = normalized_data[column].min()
    max_value = normalized_data[column].max()
    normalized_data[column] = (normalized_data[column] - min_value) / (max_value - min_value)

combined_data = pd.concat([normalized_data, stock_returns], axis=1)

# Drop rows with any NaN values
combined_data.dropna(inplace=True)

# Display the cleaned data
combined_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adj_close_data.fillna(method='ffill', inplace=True)


Unnamed: 0_level_0,AMZN_last_price,GOOGL_last_price,META_last_price,MSFT_last_price,AMZN_last_price_return,GOOGL_last_price_return,META_last_price_return,MSFT_last_price_return
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2024-10-07 13:31:00+00:00,0.266685,0.993287,0.891456,0.565603,-0.102431,-0.032731,-0.044943,0.004819
2024-10-07 13:32:00+00:00,0.277894,0.930542,0.884333,0.589547,0.058786,-0.306030,-0.024154,0.065014
2024-10-07 13:33:00+00:00,0.242961,0.904174,0.936887,0.613474,-0.183094,-0.128999,0.178259,0.064928
2024-10-07 13:34:00+00:00,0.217414,0.953004,1.000000,0.634308,-0.134145,0.239197,0.213690,0.056500
2024-10-07 13:35:00+00:00,0.191346,0.900512,0.874755,0.677296,-0.137069,-0.256522,-0.423157,0.116506
...,...,...,...,...,...,...,...,...
2024-10-11 19:55:00+00:00,0.917101,0.403687,0.417240,0.627658,-0.065841,-0.104102,-0.029613,-0.060043
2024-10-11 19:56:00+00:00,0.913973,0.418335,0.385313,0.633865,-0.015864,0.073558,-0.110035,0.016824
2024-10-11 19:57:00+00:00,0.910844,0.412842,0.386051,0.652925,-0.015875,-0.027564,0.002544,0.051658
2024-10-11 19:58:00+00:00,0.894161,0.412842,0.366895,0.648049,-0.084632,0.000000,-0.066092,-0.013207


In [19]:
# Use quantiles to classify the data into A, B, C, D, E
def classify_quantiles(pct_change, q1, q2, q3, q4):
    if pct_change > q4:
        return 'A'  # Highest range
    elif pct_change > q3:
        return 'B'  # Second highest range
    elif pct_change > q2:
        return 'C'  # Middle range
    elif pct_change > q1:
        return 'D'  # Second lowest range
    else:
        return 'E'  # Lowest range

# Apply quantile-based classification to each ticker
classified_data_quantiles = pct_change_data.copy()

for ticker in pct_change_data.columns:
    q1 = pct_change_data[ticker].quantile(0.2)
    q2 = pct_change_data[ticker].quantile(0.4)
    q3 = pct_change_data[ticker].quantile(0.6)
    q4 = pct_change_data[ticker].quantile(0.8)
    
    # Apply classification for each stock
    classified_data_quantiles[ticker] = pct_change_data[ticker].apply(classify_quantiles, args=(q1, q2, q3, q4))

# Display the classified data
print(classified_data_quantiles.head())

Ticker                    ADBE AMZN GOOGL INTC META MSFT NFLX NVDA TSLA
Datetime                                                               
2024-10-07 13:31:00+00:00    B    E     D    A    E    C    E    A    E
2024-10-07 13:32:00+00:00    A    A     E    E    D    A    E    E    E
2024-10-07 13:33:00+00:00    A    E     E    A    A    A    E    A    B
2024-10-07 13:34:00+00:00    E    E     A    A    A    A    E    A    A
2024-10-07 13:35:00+00:00    B    E     E    E    E    A    E    E    A


In [21]:
for stock in classified_data_quantiles.columns:
    combined_data[f'{stock}_quantile_feature'] = classified_data_quantiles[stock]

# Now, we need to create the target label by shifting the quantile classifications by one time step
# This will create columns like '<stock>_target_label'
for stock in classified_data_quantiles.columns:
    combined_data[f'{stock}_target_label'] = classified_data_quantiles[stock].shift(-1)

# Drop any rows with NaN values that result from the shift
combined_data.dropna(inplace=True)
combined_data

Unnamed: 0_level_0,AMZN_last_price,GOOGL_last_price,META_last_price,MSFT_last_price,AMZN_last_price_return,GOOGL_last_price_return,META_last_price_return,MSFT_last_price_return,ADBE_quantile_feature,AMZN_quantile_feature,...,TSLA_quantile_feature,ADBE_target_label,AMZN_target_label,GOOGL_target_label,INTC_target_label,META_target_label,MSFT_target_label,NFLX_target_label,NVDA_target_label,TSLA_target_label
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-10-07 13:31:00+00:00,0.266685,0.993287,0.891456,0.565603,-0.102431,-0.032731,-0.044943,0.004819,B,E,...,E,A,A,E,E,D,A,E,E,E
2024-10-07 13:32:00+00:00,0.277894,0.930542,0.884333,0.589547,0.058786,-0.306030,-0.024154,0.065014,A,A,...,E,A,E,E,A,A,A,E,A,B
2024-10-07 13:33:00+00:00,0.242961,0.904174,0.936887,0.613474,-0.183094,-0.128999,0.178259,0.064928,A,E,...,B,E,E,A,A,A,A,E,A,A
2024-10-07 13:34:00+00:00,0.217414,0.953004,1.000000,0.634308,-0.134145,0.239197,0.213690,0.056500,E,E,...,A,B,E,E,E,E,A,E,E,A
2024-10-07 13:35:00+00:00,0.191346,0.900512,0.874755,0.677296,-0.137069,-0.256522,-0.423157,0.116506,B,E,...,A,D,E,E,C,A,D,E,A,E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-11 19:54:00+00:00,0.930093,0.424439,0.425834,0.649822,-0.140066,-0.113159,0.045705,0.002404,E,E,...,A,E,E,E,E,D,E,C,B,D
2024-10-11 19:55:00+00:00,0.917101,0.403687,0.417240,0.627658,-0.065841,-0.104102,-0.029613,-0.060043,E,E,...,D,B,D,A,C,E,B,E,D,C
2024-10-11 19:56:00+00:00,0.913973,0.418335,0.385313,0.633865,-0.015864,0.073558,-0.110035,0.016824,B,D,...,C,C,D,D,E,C,A,C,C,E
2024-10-11 19:57:00+00:00,0.910844,0.412842,0.386051,0.652925,-0.015875,-0.027564,0.002544,0.051658,C,D,...,E,C,E,C,C,E,D,E,E,E


In [36]:
import nltk
from nltk import CFG, Nonterminal

# Recursive language parser to fully expand non-terminals
def language_parser(nt, grammar, level=0):
    indent = '  ' * level  # Indent debug output to show recursion level
    #print(f"{indent}Expanding non-terminal: {nt}")
    exprList = []

    # Get the production rules for the current non-terminal
    productions = [prod for prod in grammar.productions() if prod.lhs() == nt]
    #print(f"{indent}Found productions: {productions}")

    # Loop over all production rules for the non-terminal
    for production in productions:
        rhs_symbols = production.rhs()
        #print(f"{indent}Processing production: {production}")

        # Temporary list for combining sub-expressions
        sub_expr_combinations = [[]]

        for sym in rhs_symbols:
            new_combinations = []
            # Check if this symbol is a non-terminal
            if isinstance(sym, Nonterminal):
                #print(f"{indent}Recursing into non-terminal: {sym}")
                # Recursively expand this non-terminal and append the expanded result
                expanded_sub_exprs = language_parser(sym, grammar, level+1)
                for sub_expr in expanded_sub_exprs:
                    for combination in sub_expr_combinations:
                        new_combinations.append(combination + [sub_expr])
            else:
                # If it's a terminal, directly add it to the sub-expressions
                #print(f"{indent}Terminal found: {sym}")
                for combination in sub_expr_combinations:
                    new_combinations.append(combination + [str(sym)])

            sub_expr_combinations = new_combinations

        # Combine sub-expressions into full expressions
        for combination in sub_expr_combinations:
            full_expr = ' '.join(combination)
            #print(f"{indent}Generated expression: {full_expr}")
            exprList.append(full_expr)

    # Remove duplicates from the generated expressions
    exprList = list(set(exprList))
    #print(f"{indent}Unique expressions at level {level}: {exprList}")

    return exprList

# Define the grammars for Families 1-4
grammar_family_1 = CFG.fromstring("""
  L3 -> L2 '/' 'lag(L2, l)' | L2 '/' L2 | L2
  L2 -> L1 '-' 'lag(L1, l)' | L1 '/' 'sma(L1, n)' | 'meandev(L1, n)' | 'sum(L1, n)' | L1
  L1 -> 'L' 
""")

grammar_family_2 = CFG.fromstring("""
  L4 -> L3 '/' L3 | '(L3 - L3)' | L3
  L3 -> 'ema(L2, n)' | 'sma(L2, n)' | 'wma(L2, n)' | 'sma(ema(L2, n), n)' | L2
  L2 -> 'diff(L1)' | 'lag(L1, l)'
  L1 -> 'L' 
""")

grammar_family_3 = CFG.fromstring("""
  L4 -> L3 '/' L3 | '(L3 + L3)' | L3
  L3 -> 'roc(L2, n)' | 'rsi(L2, n)' | 'macd(L2, n)' | 'boll(L2, n)' | L2
  L2 -> 'diff(L1)' | 'lag(L1, l)'
  L1 -> 'L' 
""")

grammar_family_4 = CFG.fromstring("""
  L4 -> L3 '/' L3 | '(L3 - L3)' | L3
  L3 -> 'sd(L2, n)' | 'variance(L2, n)' | 'mean(L2, n)' | 'kurtosis(L2, n)' | L2
  L2 -> 'diff(L1)' | 'lag(L1, l)'
  L1 -> 'L'  
""")

# Generate expressions for Family 1 starting from L3
exprList_family1 = language_parser(Nonterminal('L3'), grammar_family_1)

# Generate expressions for Family 2 starting from L4
exprList_family2 = language_parser(Nonterminal('L4'), grammar_family_2)

# Generate expressions for Family 3 starting from L4
exprList_family3 = language_parser(Nonterminal('L4'), grammar_family_3)

# Generate expressions for Family 4 starting from L4
#print("\nGenerating expressions for Family 4:")
exprList_family4 = language_parser(Nonterminal('L4'), grammar_family_4)

# Display expanded expressions for Family 1
print("\nFamily 1 Expanded Expressions:")
for expr in exprList_family1:
    print(expr)

# Display expanded expressions for Family 2
print("\nFamily 2 Expanded Expressions:")
for expr in exprList_family2:
    print(expr)

# Display expanded expressions for Family 3
print("\nFamily 3 Expanded Expressions:")
for expr in exprList_family3:
    print(expr)

# Display expanded expressions for Family 4
print("\nFamily 4 Expanded Expressions:")
for expr in exprList_family4:
    print(expr)


Family 1 Expanded Expressions:
L / sma(L1, n) / meandev(L1, n)
L - lag(L1, l) / L
L - lag(L1, l) / lag(L2, l)
sum(L1, n) / lag(L2, l)
L - lag(L1, l) / meandev(L1, n)
L - lag(L1, l) / L / sma(L1, n)
L / L - lag(L1, l)
L / sma(L1, n) / L - lag(L1, l)
sum(L1, n) / L - lag(L1, l)
meandev(L1, n)
meandev(L1, n) / L / sma(L1, n)
L / L
sum(L1, n) / meandev(L1, n)
meandev(L1, n) / sum(L1, n)
L / lag(L2, l)
L
L / L / sma(L1, n)
sum(L1, n) / sum(L1, n)
L - lag(L1, l) / L - lag(L1, l)
L / sum(L1, n)
L - lag(L1, l)
meandev(L1, n) / lag(L2, l)
L / sma(L1, n) / L / sma(L1, n)
L / sma(L1, n)
meandev(L1, n) / L - lag(L1, l)
meandev(L1, n) / meandev(L1, n)
L / meandev(L1, n)
L / sma(L1, n) / L
meandev(L1, n) / L
L / sma(L1, n) / sum(L1, n)
sum(L1, n) / L
sum(L1, n) / L / sma(L1, n)
L - lag(L1, l) / sum(L1, n)
L / sma(L1, n) / lag(L2, l)
sum(L1, n)

Family 2 Expanded Expressions:
lag(L1, l) / sma(ema(L2, n), n)
lag(L1, l) / ema(L2, n)
(L3 - L3)
lag(L1, l) / lag(L1, l)
sma(L2, n) / lag(L1, l)
sma(L2, n) 

In [42]:
import pandas as pd
import numpy as np

# Define helper functions for common operations used in the grammar expressions
def lag(series, l):
    return series.shift(l)

def sma(series, n):
    return series.rolling(window=n).mean()

def ema(series, n):
    return series.ewm(span=n, adjust=False).mean()

def wma(series, n):
    weights = np.arange(1, n+1)
    return series.rolling(n).apply(lambda prices: np.dot(prices, weights) / weights.sum(), raw=True)

def diff(series):
    return series.diff()

def meandev(series, n):
    return series.rolling(window=n).apply(lambda x: np.mean(np.abs(x - np.mean(x))), raw=True)

def sum_series(series, n):
    return series.rolling(window=n).sum()

def roc(series, n):
    return series.pct_change(periods=n) * 100

def rsi(series, n):
    delta = series.diff(1)
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)
    avg_gain = gain.rolling(window=n).mean()
    avg_loss = loss.rolling(window=n).mean()
    rs = avg_gain / avg_loss
    return 100 - (100 / (1 + rs))

def macd(series, n_fast=12, n_slow=26):
    ema_fast = ema(series, n_fast)
    ema_slow = ema(series, n_slow)
    return ema_fast - ema_slow

def boll(series, n):
    sma_val = sma(series, n)
    std = series.rolling(window=n).std()
    upper_band = sma_val + 2 * std
    lower_band = sma_val - 2 * std
    return upper_band, lower_band

def sd(series, n):
    return series.rolling(window=n).std()

def variance(series, n):
    return series.rolling(window=n).var()

def mean(series, n):
    return sma(series, n)

def kurtosis(series, n):
    return series.rolling(window=n).kurt()

# Apply expression to stock
def apply_expression_to_stock(stock_series, expression, n=14, l=1):
    """
    Applies the expression generated from grammar to the stock's price series.
    - stock_series: pandas Series of stock prices
    - expression: grammar-generated expression as a string
    - n: default window size for moving averages, etc.
    - l: default lag period
    """
    try:
        # Split the expression into parts and check for the required operations
        if 'lag' in expression:
            # Example: L - lag(L, l)
            if '-' in expression:
                return stock_series - lag(stock_series, l)
            elif '/' in expression:
                return stock_series / lag(stock_series, l)
            else:
                return lag(stock_series, l)
        elif 'sma' in expression:
            if '/' in expression:
                return stock_series / sma(stock_series, n)
            else:
                return sma(stock_series, n)
        elif 'ema' in expression:
            return ema(stock_series, n)
        elif 'wma' in expression:
            return wma(stock_series, n)
        elif 'diff' in expression:
            return diff(stock_series)
        elif 'meandev' in expression:
            return meandev(stock_series, n)
        elif 'sum' in expression:
            return sum_series(stock_series, n)
        elif 'roc' in expression:
            return roc(stock_series, n)
        elif 'rsi' in expression:
            return rsi(stock_series, n)
        elif 'macd' in expression:
            return macd(stock_series)
        elif 'boll' in expression:
            upper_band, lower_band = boll(stock_series, n)
            return upper_band  # Return upper band as an example
        elif 'sd' in expression:
            return sd(stock_series, n)
        elif 'variance' in expression:
            return variance(stock_series, n)
        elif 'mean' in expression:
            return mean(stock_series, n)
        elif 'kurtosis' in expression:
            return kurtosis(stock_series, n)
        else:
            return stock_series  # Fallback to the original series if no special operation
    except Exception as e:
        print(f"Error evaluating expression {expression}: {e}")
        return pd.Series(np.nan, index=stock_series.index)  # Return NaN series in case of error

# Loop through each expanded expression in a grammar family
for expression in exprList_family1:  
    for stock in tech_stocks: 
        stock_column = f'{stock}_last_price'  # Dynamically refer to each stock's last price column
        stock_series = combined_data[stock_column]  
        # Create a unique column name for the new feature
        feature_name = f'{stock}_{expression}'.replace('(', '').replace(')', '').replace(',', '').replace(' ', '_').replace('-', 'minus')
        result = apply_expression_to_stock(stock_series, expression, n=14, l=1)
        combined_data[feature_name] = result
        
combined_data.dropna(inplace=True)

combined_data

Unnamed: 0_level_0,AMZN_last_price,GOOGL_last_price,META_last_price,MSFT_last_price,AMZN_last_price_return,GOOGL_last_price_return,META_last_price_return,MSFT_last_price_return,ADBE_quantile_feature,AMZN_quantile_feature,...,AMZN_L_minus_lagL1_l_/_sumL1_n,META_L_minus_lagL1_l_/_sumL1_n,MSFT_L_/_smaL1_n_/_lagL2_l,GOOGL_L_/_smaL1_n_/_lagL2_l,AMZN_L_/_smaL1_n_/_lagL2_l,META_L_/_smaL1_n_/_lagL2_l,MSFT_sumL1_n,GOOGL_sumL1_n,AMZN_sumL1_n,META_sumL1_n
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-10-07 13:44:00+00:00,0.092285,0.823609,0.760689,0.511524,-0.038614,-0.125133,-0.036358,-0.000147,E,E,...,-0.007298,-0.010678,0.999894,0.969997,0.926710,0.986157,8.202452,12.413051,2.400862,11.767509
2024-10-07 13:45:00+00:00,0.088635,0.818945,0.704813,0.516843,-0.019319,-0.022938,-0.190323,0.014464,E,D,...,-0.003650,-0.055876,1.010398,0.994337,0.960448,0.926546,8.153692,12.238709,2.222811,11.580866
2024-10-07 13:46:00+00:00,0.071950,0.800416,0.731829,0.545212,-0.088324,-0.091150,0.092197,0.077134,A,E,...,-0.016684,0.027016,1.054890,0.977374,0.811762,1.038331,8.109357,12.108583,2.016867,11.428362
2024-10-07 13:47:00+00:00,0.060480,0.788892,0.743616,0.533314,-0.060775,-0.056741,0.040189,-0.032326,D,E,...,-0.011470,0.011787,0.978176,0.985602,0.840579,1.016107,8.029197,11.993300,1.834386,11.235090
2024-10-07 13:48:00+00:00,0.077164,0.783326,0.701377,0.513296,0.088456,-0.027419,-0.143957,-0.054402,B,A,...,0.016684,-0.042239,0.962465,0.992945,1.275868,0.943198,7.908185,11.823622,1.694136,10.936468
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-11 19:54:00+00:00,0.930093,0.424439,0.425834,0.649822,-0.140066,-0.113159,0.045705,0.002404,E,E,...,-0.027676,0.013259,1.001367,0.949482,0.971104,1.032138,8.659118,6.486586,13.300774,5.715140
2024-10-11 19:55:00+00:00,0.917101,0.403687,0.417240,0.627658,-0.065841,-0.104102,-0.029613,-0.060043,E,E,...,-0.012991,-0.008595,0.965894,0.951108,0.986032,0.979817,8.683053,6.406021,13.278356,5.737489
2024-10-11 19:56:00+00:00,0.913973,0.418335,0.385313,0.633865,-0.015864,0.073558,-0.110035,0.016824,B,D,...,-0.003128,-0.031927,1.009888,1.036285,0.996589,0.923481,8.714525,6.338273,13.247074,5.727368
2024-10-11 19:57:00+00:00,0.910844,0.412842,0.386051,0.652925,-0.015875,-0.027564,0.002544,0.051658,C,D,...,-0.003130,0.000737,1.030069,0.986870,0.996576,1.001914,8.783230,6.274552,13.210859,5.704774


In [43]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

def create_sequences(data, feature_columns, target_column, sequence_length=50):
    """
    Create sequences of features and corresponding labels.
    
    :param data: DataFrame containing the features and target columns.
    :param feature_columns: List of feature columns for the stock.
    :param target_column: The column that contains the target labels (shifted by 1 time step).
    :param sequence_length: The length of each input sequence.
    
    :return: Two numpy arrays (X, y) where X is the input features and y is the target labels.
    """
    X, y = [], []
    
    for i in range(len(data) - sequence_length):
        # Get the input sequence of features
        X_sequence = data[feature_columns].iloc[i:i+sequence_length].values
        # Get the corresponding target label (the label for the next time step)
        y_label = data[target_column].iloc[i + sequence_length]
        
        X.append(X_sequence)
        y.append(y_label)
    
    return np.array(X), np.array(y)

# Define the sequence length
sequence_length = 50

# Initialize dictionaries to store sequences for each stock
X_data = {}
y_data = {}

# List of stock symbols
stocks = ['AMZN', 'GOOGL', 'META', 'MSFT']  # Add other stocks as needed

# Iterate over each stock to create sequences
for stock in stocks:
    feature_columns = [col for col in combined_data.columns if stock in col and '_last_price' not in col]
    target_column = f'{stock}_quantile_feature'  # The target column is the quantile label
    
    # Create sequences for the current stock
    X, y = create_sequences(combined_data, feature_columns, target_column, sequence_length)
    
    # Store the sequences in the dictionary
    X_data[stock] = X
    y_data[stock] = y

# Now you have X_data and y_data for each stock
# Example: X_data['AMZN'] and y_data['AMZN'] contain the input features and target labels for AMZN

# Concatenate all stocks into one dataset for training/testing
X_all = np.concatenate([X_data[stock] for stock in stocks], axis=0)
y_all = np.concatenate([y_data[stock] for stock in stocks], axis=0)

# Perform an 80/20 train-test split
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, shuffle=False)

# Print the shapes of the train and test sets
print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')

X_train shape: (6032, 50, 37), y_train shape: (6032,)
X_test shape: (1508, 50, 37), y_test shape: (1508,)
