In [4]:
import yfinance as yf
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
from collections import defaultdict
import nltk
from nltk import CFG

# List of 10 tech stocks
tech_stocks = ['MSFT', 'GOOGL', 'AMZN', 'META', 'TSLA', 'NVDA', 'NFLX', 'ADBE', 'INTC']
#tech_stocks = ['MSFT', 'GOOGL', 'AMZN', 'META']

# Download data for these stocks
intraday_data = yf.download(tickers=tech_stocks, period='5d', interval='1m')

# Save to CSV for further analysis (optional)
intraday_data.to_csv(f'tech_stocks_intraday_data.csv')

[*********************100%***********************]  9 of 9 completed


In [87]:
adj_close_data = intraday_data['Adj Close']
adj_close_data.fillna(method='ffill', inplace=True)
adj_close_data.columns = [f'{stock}_last_price' for stock in adj_close_data.columns]


# Calculate the percentage return for each stock as the percentage change of 'last price'
stock_returns = adj_close_data.pct_change() * 100
stock_returns.columns = [f'{col}_return' for col in adj_close_data.columns]

# Normalize the last_price columns using Min-Max normalization
normalized_data = adj_close_data.copy()

# Perform Min-Max normalization for each 'last_price' column
for column in normalized_data.columns:
    # Apply Min-Max normalization formula
    min_value = normalized_data[column].min()
    max_value = normalized_data[column].max()
    normalized_data[column] = (normalized_data[column] - min_value) / (max_value - min_value)

combined_data = pd.concat([normalized_data, stock_returns], axis=1)

# Drop rows with any NaN values
combined_data.dropna(inplace=True)

# Display the cleaned data
combined_data

Unnamed: 0_level_0,ADBE_last_price,AMZN_last_price,GOOGL_last_price,INTC_last_price,META_last_price,MSFT_last_price,NFLX_last_price,NVDA_last_price,TSLA_last_price,ADBE_last_price_return,AMZN_last_price_return,GOOGL_last_price_return,INTC_last_price_return,META_last_price_return,MSFT_last_price_return,NFLX_last_price_return,NVDA_last_price_return,TSLA_last_price_return
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-10-14 13:31:00+00:00,0.381557,0.683613,0.236624,0.946969,0.772676,0.525117,0.461030,0.724517,0.768031,0.137536,-0.290469,0.006098,0.211951,0.104492,-0.008401,-0.174106,0.277534,0.836971
2024-10-14 13:32:00+00:00,0.427929,0.705262,0.287035,0.896465,0.796408,0.561052,0.468394,0.751360,0.640350,0.226917,0.068858,0.149460,-0.423005,0.102647,0.111324,0.080868,0.232715,-0.590941
2024-10-14 13:33:00+00:00,0.422235,0.656952,0.269547,0.835858,0.725575,0.542118,0.455914,0.748844,0.645712,-0.027801,-0.153549,-0.051772,-0.509774,-0.306054,-0.058589,-0.136947,-0.021759,0.024962
2024-10-14 13:34:00+00:00,0.501555,0.711140,0.310698,0.821212,0.739977,0.566099,0.455788,0.763100,0.521452,0.387378,0.172495,0.121890,-0.123823,0.062420,0.074250,-0.001385,0.123330,-0.578384
2024-10-14 13:35:00+00:00,0.548336,0.695253,0.348764,0.830808,0.755159,0.616693,0.456418,0.772324,0.447368,0.227579,-0.050484,0.112611,0.081224,0.065756,0.156541,0.006927,0.079705,-0.346841
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-18 19:55:00+00:00,0.071797,0.707811,0.113169,0.497475,0.070455,0.537866,0.961393,0.763520,0.673490,-0.081790,-0.007728,0.000000,-0.109768,0.003466,-0.009574,0.019676,-0.021731,-0.031702
2024-10-18 19:56:00+00:00,0.081763,0.703580,0.120369,0.502525,0.094202,0.539413,0.967792,0.763939,0.675438,0.049517,-0.013446,0.021423,0.043957,0.105926,0.004791,0.066612,0.003626,0.009057
2024-10-18 19:57:00+00:00,0.073423,0.715238,0.138889,0.510101,0.103544,0.556415,0.974997,0.761004,0.696881,-0.041417,0.037057,0.055102,0.065902,0.041627,0.052647,0.074957,-0.025361,0.099670
2024-10-18 19:58:00+00:00,0.069966,0.711490,0.124485,0.510101,0.093423,0.555642,0.968006,0.760417,0.688109,-0.017177,-0.011909,-0.042832,0.000000,-0.045080,-0.002394,-0.072672,-0.005076,-0.040732


In [88]:
# Calculate the percentage change for each stock
pct_change_data = adj_close_data.pct_change() * 100  # Convert to percentage
pct_change_data.dropna(inplace=True)
pct_change_data.columns = [column.replace("_last_price", "") for column in pct_change_data.columns]

# Use quantiles to classify the data into A, B, C, D, E
def classify_quantiles(pct_change, q1, q2, q3, q4):
    if pct_change > q4:
        return 'A'  # Highest range
    elif pct_change > q3:
        return 'B'  
    elif pct_change > q2:
        return 'C'  
    elif pct_change > q1:
        return 'D'  
    else:
        return 'E'  # Lowest range

# Apply quantile-based classification to each ticker
classified_data_quantiles = pct_change_data.copy()

for ticker in pct_change_data.columns:
    q1 = pct_change_data[ticker].quantile(0.2)
    q2 = pct_change_data[ticker].quantile(0.4)
    q3 = pct_change_data[ticker].quantile(0.6)
    q4 = pct_change_data[ticker].quantile(0.8)
    
    # Apply classification for each stock
    classified_data_quantiles[ticker] = pct_change_data[ticker].apply(classify_quantiles, args=(q1, q2, q3, q4))
classified_data_quantiles.head()

Unnamed: 0_level_0,ADBE,AMZN,GOOGL,INTC,META,MSFT,NFLX,NVDA,TSLA
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-10-14 13:31:00+00:00,A,E,C,A,A,D,E,A,A
2024-10-14 13:32:00+00:00,A,A,A,E,A,A,A,A,E
2024-10-14 13:33:00+00:00,D,E,E,E,E,E,E,D,B
2024-10-14 13:34:00+00:00,A,A,A,E,A,A,C,A,E
2024-10-14 13:35:00+00:00,A,E,A,A,A,A,C,A,E


In [89]:
for stock in classified_data_quantiles.columns:
    combined_data[f'{stock}_quantile_feature'] = classified_data_quantiles[stock]

# Now, we need to create the target label by shifting the quantile classifications by one time step
# This will create columns like '<stock>_target_label'
for stock in classified_data_quantiles.columns:
    combined_data[f'{stock}_target_label'] = classified_data_quantiles[stock].shift(-1)

combined_data.dropna(inplace=True)
combined_data

Unnamed: 0_level_0,ADBE_last_price,AMZN_last_price,GOOGL_last_price,INTC_last_price,META_last_price,MSFT_last_price,NFLX_last_price,NVDA_last_price,TSLA_last_price,ADBE_last_price_return,...,TSLA_quantile_feature,ADBE_target_label,AMZN_target_label,GOOGL_target_label,INTC_target_label,META_target_label,MSFT_target_label,NFLX_target_label,NVDA_target_label,TSLA_target_label
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-10-14 13:31:00+00:00,0.381557,0.683613,0.236624,0.946969,0.772676,0.525117,0.461030,0.724517,0.768031,0.137536,...,A,A,A,A,E,A,A,A,A,E
2024-10-14 13:32:00+00:00,0.427929,0.705262,0.287035,0.896465,0.796408,0.561052,0.468394,0.751360,0.640350,0.226917,...,E,D,E,E,E,E,E,E,D,B
2024-10-14 13:33:00+00:00,0.422235,0.656952,0.269547,0.835858,0.725575,0.542118,0.455914,0.748844,0.645712,-0.027801,...,B,A,A,A,E,A,A,C,A,E
2024-10-14 13:34:00+00:00,0.501555,0.711140,0.310698,0.821212,0.739977,0.566099,0.455788,0.763100,0.521452,0.387378,...,E,A,E,A,A,A,A,C,A,E
2024-10-14 13:35:00+00:00,0.548336,0.695253,0.348764,0.830808,0.755159,0.616693,0.456418,0.772324,0.447368,0.227579,...,E,A,A,B,E,A,A,E,A,E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-18 19:54:00+00:00,0.088271,0.710243,0.113169,0.510101,0.069678,0.540958,0.959503,0.766035,0.680312,0.077814,...,A,E,C,C,E,C,D,B,D,D
2024-10-18 19:55:00+00:00,0.071797,0.707811,0.113169,0.497475,0.070455,0.537866,0.961393,0.763520,0.673490,-0.081790,...,D,A,D,B,B,A,C,A,C,C
2024-10-18 19:56:00+00:00,0.081763,0.703580,0.120369,0.502525,0.094202,0.539413,0.967792,0.763939,0.675438,0.049517,...,C,D,A,A,A,A,A,A,D,A
2024-10-18 19:57:00+00:00,0.073423,0.715238,0.138889,0.510101,0.103544,0.556415,0.974997,0.761004,0.696881,-0.041417,...,A,D,D,E,C,E,C,E,C,D


In [90]:
import nltk
from nltk import CFG, Nonterminal

# Recursive language parser to fully expand non-terminals
def language_parser(nt, grammar, level=0):
    #indent = '  ' * level  # Indent debug output to show recursion level
    #print(f"{indent}Expanding non-terminal: {nt}")
    exprList = []

    # Get the production rules for the current non-terminal
    productions = [prod for prod in grammar.productions() if prod.lhs() == nt]
    #print(f"{indent}Found productions: {productions}")

    # Loop over all production rules for the non-terminal
    for production in productions:
        rhs_symbols = production.rhs()
        #print(f"{indent}Processing production: {production}")

        # Temporary list for combining sub-expressions
        sub_expr_combinations = [[]]

        for sym in rhs_symbols:
            new_combinations = []
            # Check if this symbol is a non-terminal
            if isinstance(sym, Nonterminal):
                # Recursively expand this non-terminal and append the expanded result
                expanded_sub_exprs = language_parser(sym, grammar, level+1)
                for sub_expr in expanded_sub_exprs:
                    for combination in sub_expr_combinations:
                        new_combinations.append(combination + [sub_expr])
            else:
                # Handle symbols like 'ema(L2, n)' which contain non-terminals and need recursive expansion
                if '(' in sym and ')' in sym:
                    inner_nonterminal = sym.split('(')[1].split(',')[0].strip()  # Extract non-terminal from within parentheses
                    #print(f"{indent}Expanding inner non-terminal: {inner_nonterminal}")

                    # Recursively expand the inner non-terminal
                    expanded_inner = language_parser(Nonterminal(inner_nonterminal), grammar, level+1)
                    for expansion in expanded_inner:
                        new_combinations.append([sym.replace(inner_nonterminal, expansion)])
                else:
                    # If it's a simple terminal like 'L', directly add it to the sub-expressions
                    #print(f"{indent}Terminal found: {sym}")
                    for combination in sub_expr_combinations:
                        new_combinations.append(combination + [str(sym)])

            sub_expr_combinations = new_combinations

        # Combine sub-expressions into full expressions
        for combination in sub_expr_combinations:
            full_expr = ' '.join(combination)
            #print(f"{indent}Generated expression: {full_expr}")
            exprList.append(full_expr)

    # Remove duplicates from the generated expressions
    exprList = list(set(exprList))
    #print(f"{indent}Unique expressions at level {level}: {exprList}")

    return exprList

# Define the grammars for Families 1-4
grammar_family_1 = CFG.fromstring("""
  L3 -> L2 '/' 'lag(L2, l)' | L2 '/' L2 | L2
  L2 -> L1 '-' 'lag(L1, l)' | L1 '/' 'sma(L1, n)' | 'meandev(L1, n)' | 'sum(L1, n)' | L1
  L1 -> 'L' 
""")

grammar_family_2 = CFG.fromstring("""
  L4 -> L3 '/' L3 | '(L3 - L3)' | L3
  L3 -> 'ema(L2, n)' | 'sma(L2, n)' | 'wma(L2, n)' | 'sma(ema(L2, n), n)' | L2
  L2 -> 'diff(L1)' | 'lag(L1, l)'
  L1 -> 'L' 
""")

grammar_family_3 = CFG.fromstring("""
  L4 -> L3 '/' L3 | '(L3 + L3)' | L3
  L3 -> 'roc(L2, n)' | 'rsi(L2, n)' | 'macd(L2, n)' | 'boll(L2, n)' | L2
  L2 -> 'diff(L1)' | 'lag(L1, l)'
  L1 -> 'L' 
""")

grammar_family_4 = CFG.fromstring("""
  L4 -> L3 '/' L3 | '(L3 - L3)' | L3
  L3 -> 'sd(L2, n)' | 'variance(L2, n)' | 'mean(L2, n)' | 'kurtosis(L2, n)' | L2
  L2 -> 'diff(L1)' | 'lag(L1, l)'
  L1 -> 'L'  
""")

# Generate expressions for Family 1 starting from L3
exprList_family1 = language_parser(Nonterminal('L3'), grammar_family_1)

# Generate expressions for Family 2 starting from L4
exprList_family2 = language_parser(Nonterminal('L4'), grammar_family_2)

# Generate expressions for Family 3 starting from L4
exprList_family3 = language_parser(Nonterminal('L4'), grammar_family_3)

# Generate expressions for Family 4 starting from L4
exprList_family4 = language_parser(Nonterminal('L4'), grammar_family_4)

# Display expanded expressions for Family 1
print("\nFamily 1 Expanded Expressions:")
for expr in exprList_family1:
    print(expr)

# Display expanded expressions for Family 2
print("\nFamily 2 Expanded Expressions:")
for expr in exprList_family2:
    print(expr)

# Display expanded expressions for Family 3
print("\nFamily 3 Expanded Expressions:")
for expr in exprList_family3:
    print(expr)

# Display expanded expressions for Family 4
print("\nFamily 4 Expanded Expressions:")
for expr in exprList_family4:
    print(expr)


Family 1 Expanded Expressions:
L / sum(L, n)
lag(L, l) / lag(L, l)
sum(L, n) / lag(L, l)
sum(L, n) / sum(L, n)
sum(L, n) / meandev(L, n)
sum(L, n) / L
sma(L, n)
lag(L, l) / meandev(L, n)
lag(sum(L, n), l)
sum(L, n)
L / meandev(L, n)
meandev(L, n) / sma(L, n)
sma(L, n) / meandev(L, n)
lag(lag(L, l), l)
L / lag(L, l)
sma(L, n) / sum(L, n)
meandev(L, n) / sum(L, n)
lag(L, l) / L
L / sma(L, n)
lag(L, l) / sma(L, n)
sma(L, n) / lag(L, l)
sma(L, n) / sma(L, n)
lag(meandev(L, n), l)
lag(sma(L, n), l)
lag(L, l) / sum(L, n)
lag(L, l)
meandev(L, n) / L
meandev(L, n) / meandev(L, n)
meandev(L, n) / lag(L, l)
L / L
meandev(L, n)
sum(L, n) / sma(L, n)
sma(L, n) / L
L

Family 2 Expanded Expressions:
ema(lag(L, l), n) / wma(lag(L, l), n)
lag(L, l) / lag(L, l)
sma(lag(L, l), n) / sma(lag(L, l), n)
sma(lag(L, l), n) / wma(lag(L, l), n)
wma(lag(L, l), n)
lag(L, l) / sma(lag(L, l), n)
sma(lag(L, l), n) / ema(lag(L, l), n)
wma(lag(L, l), n) / sma(lag(L, l), n)
sma(lag(L, l), n) / lag(L, l)
wma(lag(L, l),

In [91]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

import re

def evaluate_token(token, stock_series, n=14, l=1):
    """
    Evaluates a single token in the expression.
    Example tokens: 'L', 'lag(L, l)', 'sma(L1, n)'
    """
    if token == 'L':
        return stock_series
    elif 'lag' in token:
        match = re.search(r'lag\(L1?, l\)', token)
        if match:
            return lag(stock_series, l)
    elif 'sma' in token:
        match = re.search(r'sma\(L1?, n\)', token)
        if match:
            return sma(stock_series, n)
    elif 'ema' in token:
        match = re.search(r'ema\(L1?, n\)', token)
        if match:
            return ema(stock_series, n)
    elif 'wma' in token:
        match = re.search(r'wma\(L1?, n\)', token)
        if match:
            return wma(stock_series, n)
    elif 'meandev' in token:
        match = re.search(r'meandev\(L1?, n\)', token)
        if match:
            return meandev(stock_series, n)
    elif 'sum' in token:
        match = re.search(r'sum\(L1?, n\)', token)
        if match:
            return sum_series(stock_series, n)
    elif 'diff' in token:
        match = re.search(r'diff\(L1?\)', token)
        if match:
            return diff(stock_series)
    elif 'roc' in token:
        match = re.search(r'roc\(L1?, n\)', token)
        if match:
            return roc(stock_series, n)
    elif 'rsi' in token:
        match = re.search(r'rsi\(L1?, n\)', token)
        if match:
            return rsi(stock_series, n)
    elif 'macd' in token:
        match = re.search(r'macd\(L1?\)', token)
        if match:
            return macd(stock_series)
    elif 'boll' in token:
        match = re.search(r'boll\(L1?, n\)', token)
        if match:
            return boll(stock_series, n)[0]  # Returning the upper band
    elif 'sd' in token:
        match = re.search(r'sd\(L1?, n\)', token)
        if match:
            return sd(stock_series, n)
    elif 'variance' in token:
        match = re.search(r'variance\(L1?, n\)', token)
        if match:
            return variance(stock_series, n)
    elif 'mean' in token:
        match = re.search(r'mean\(L1?, n\)', token)
        if match:
            return mean(stock_series, n)
    elif 'kurtosis' in token:
        match = re.search(r'kurtosis\(L1?, n\)', token)
        if match:
            return kurtosis(stock_series, n)
    return stock_series


# Define helper functions for common operations used in the grammar expressions
def lag(series, l):
    return series.shift(l)

def sma(series, n):
    return series.rolling(window=n).mean()

def ema(series, n):
    return series.ewm(span=n, adjust=False).mean()

def wma(series, n):
    weights = np.arange(1, n+1)
    return series.rolling(n).apply(lambda prices: np.dot(prices, weights) / weights.sum(), raw=True)

def diff(series):
    return series.diff()

def meandev(series, n):
    return series.rolling(window=n).apply(lambda x: np.mean(np.abs(x - np.mean(x))), raw=True)

def sum_series(series, n):
    return series.rolling(window=n).sum()

def roc(series, n):
    return series.pct_change(periods=n) * 100

def rsi(series, n):
    delta = series.diff(1)
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)
    avg_gain = gain.rolling(window=n).mean()
    avg_loss = loss.rolling(window=n).mean()
    rs = avg_gain / avg_loss
    return 100 - (100 / (1 + rs))

def macd(series, n_fast=12, n_slow=26):
    ema_fast = ema(series, n_fast)
    ema_slow = ema(series, n_slow)
    return ema_fast - ema_slow

def boll(series, n):
    sma_val = sma(series, n)
    std = series.rolling(window=n).std()
    upper_band = sma_val + 2 * std
    lower_band = sma_val - 2 * std
    return upper_band, lower_band

def sd(series, n):
    return series.rolling(window=n).std()

def variance(series, n):
    return series.rolling(window=n).var()

def mean(series, n):
    return sma(series, n)

def kurtosis(series, n):
    return series.rolling(window=n).kurt()

def apply_expression_to_stock(stock_series, expression, n=14, l=1):
    """
    Applies the expression generated from the grammar to the stock's price series.
    This function handles more complex expressions with multiple operations.
    """
    try:
        # Split the expression by operations
        tokens = re.split(r'([\-\+\/\*])', expression)  # Split by arithmetic operators
        
        # Start with the first token
        result = evaluate_token(tokens[0].strip(), stock_series, n, l)
        
        # Apply each operation sequentially
        i = 1
        while i < len(tokens):
            operator = tokens[i].strip()
            next_token = tokens[i + 1].strip()
            
            # Evaluate the next token
            next_value = evaluate_token(next_token, stock_series, n, l)
            
            # Apply the operation
            if operator == '-':
                result = result - next_value
            elif operator == '+':
                result = result + next_value
            elif operator == '/':
                result = result / next_value
            elif operator == '*':
                result = result * next_value
            
            i += 2  # Move to the next operator
        
        return result
    except Exception as e:
        print(f"Error evaluating expression {expression}: {e}")
        return pd.Series(np.nan, index=stock_series.index)  # Return NaN series in case of error
    

# Example usage
for expression in exprList_family1:  
    for stock in tech_stocks: 
        stock_column = f'{stock}_last_price'  # Dynamically refer to each stock's last price column
        stock_series = combined_data[stock_column]  
        # Create a unique column name for the new feature
        feature_name = f'{stock}_{expression}'.replace('(', '').replace(')', '').replace(',', '').replace(' ', '_').replace('-', 'minus')
        result = apply_expression_to_stock(stock_series, expression, n=14, l=1)
        combined_data[feature_name] = result

# Drop rows with missing values
combined_data.dropna(inplace=True)


# for current quantile feature, convert it to numerical
quantile_mapping = {
    'A': 4,
    'B': 3,
    'C': 2,
    'D': 1,
    'E': 0
}

# Iterate over each stock in the quantile columns
for stock in classified_data_quantiles.columns:
    quantile_column = f'{stock}_quantile_feature'
    
    # Apply the mapping to convert the quantile feature values to numerical labels
    combined_data[quantile_column] = combined_data[quantile_column].map(quantile_mapping)
    
    quantile_column= f'{stock}_target_label'
    # Apply the mapping to convert the quantile target values to numerical labels
    combined_data[quantile_column] = combined_data[quantile_column].map(quantile_mapping)    
    
combined_data

Unnamed: 0_level_0,ADBE_last_price,AMZN_last_price,GOOGL_last_price,INTC_last_price,META_last_price,MSFT_last_price,NFLX_last_price,NVDA_last_price,TSLA_last_price,ADBE_last_price_return,...,INTC_smaL_n_/_L,MSFT_L,GOOGL_L,AMZN_L,META_L,TSLA_L,NVDA_L,NFLX_L,ADBE_L,INTC_L
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-10-14 13:44:00+00:00,0.632945,0.752256,0.541150,0.888888,0.802257,0.765842,0.489301,0.724727,0.405459,0.135862,...,0.942517,0.765842,0.541150,0.752256,0.802257,0.405459,0.724727,0.489301,0.632945,0.888888
2024-10-14 13:45:00+00:00,0.657759,0.780184,0.541150,0.887071,0.867649,0.770479,0.500504,0.737104,0.360692,0.119950,...,0.939625,0.770479,0.541150,0.780184,0.867649,0.360692,0.737104,0.500504,0.657759,0.887071
2024-10-14 13:46:00+00:00,0.657352,0.811825,0.545266,0.906565,0.863371,0.788254,0.497103,0.752962,0.387944,-0.001966,...,0.920215,0.788254,0.545266,0.811825,0.863371,0.387944,0.752962,0.497103,0.657352,0.906565
2024-10-14 13:47:00+00:00,0.664266,0.790175,0.510287,0.931919,0.842741,0.805803,0.497103,0.763109,0.384016,0.033385,...,0.902542,0.805803,0.510287,0.790175,0.842741,0.384016,0.763109,0.497103,0.664266,0.931919
2024-10-14 13:48:00+00:00,0.713080,0.795171,0.544236,0.983788,0.831064,0.813756,0.479658,0.765658,0.437875,0.235606,...,0.866761,0.813756,0.544236,0.795171,0.831064,0.437875,0.765658,0.479658,0.713080,0.983788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-18 19:54:00+00:00,0.088271,0.710243,0.113169,0.510101,0.069678,0.540958,0.959503,0.766035,0.680312,0.077814,...,0.903233,0.540958,0.113169,0.710243,0.069678,0.680312,0.766035,0.959503,0.088271,0.510101
2024-10-18 19:55:00+00:00,0.071797,0.707811,0.113169,0.497475,0.070455,0.537866,0.961393,0.763520,0.673490,-0.081790,...,0.934562,0.537866,0.113169,0.707811,0.070455,0.673490,0.763520,0.961393,0.071797,0.497475
2024-10-18 19:56:00+00:00,0.081763,0.703580,0.120369,0.502525,0.094202,0.539413,0.967792,0.763939,0.675438,0.049517,...,0.934142,0.539413,0.120369,0.703580,0.094202,0.675438,0.763939,0.967792,0.081763,0.502525
2024-10-18 19:57:00+00:00,0.073423,0.715238,0.138889,0.510101,0.103544,0.556415,0.974997,0.761004,0.696881,-0.041417,...,0.929463,0.556415,0.138889,0.715238,0.103544,0.696881,0.761004,0.974997,0.073423,0.510101


In [92]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

def create_sequences(data, feature_columns, target_column, sequence_length=50):
    """
    Create sequences of features and corresponding labels.
    
    :param data: DataFrame containing the features and target columns.
    :param feature_columns: List of feature columns for the stock.
    :param target_column: The column that contains the target labels (shifted by 1 time step).
    :param sequence_length: The length of each input sequence.
    
    :return: Two numpy arrays (X, y) where X is the input features and y is the target labels.
    """
    X, y = [], []
    
    for i in range(len(data) - sequence_length):
        # Get the input sequence of features
        X_sequence = data[feature_columns].iloc[i:i+sequence_length].values
        # Get the corresponding target label (the label for the next time step)
        y_label = data[target_column].iloc[i + sequence_length]
        
        X.append(X_sequence)
        y.append(y_label)
    
    return np.array(X), np.array(y)

# Define the sequence length
sequence_length = 50

# Initialize dictionaries to store sequences for each stock
X_data = {}
y_data = {}



# Iterate over each stock to create sequences
for stock in tech_stocks:
    feature_columns = [col for col in combined_data.columns if stock in col and '_last_price' not in col]
    target_column = f'{stock}_quantile_feature'  # The target column is the quantile label
    
    # Create sequences for the current stock
    X, y = create_sequences(combined_data, feature_columns, target_column, sequence_length)
    
    # Store the sequences in the dictionary
    X_data[stock] = X
    y_data[stock] = y

# Now you have X_data and y_data for each stock
# Example: X_data['AMZN'] and y_data['AMZN'] contain the input features and target labels for AMZN

# Concatenate all stocks into one dataset for training/testing
X_all = np.concatenate([X_data[stock] for stock in tech_stocks], axis=0)
y_all = np.concatenate([y_data[stock] for stock in tech_stocks], axis=0)

# Perform an 80/20 train-test split
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, shuffle=False)

# Print the shapes of the train and test sets
print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')

X_train shape: (13449, 50, 36), y_train shape: (13449,)
X_test shape: (3363, 50, 36), y_test shape: (3363,)


In [93]:
# Replace +inf with a large finite number and -inf with a small finite number
X_train = np.where(np.isinf(X_train), np.nan, X_train)
X_test = np.where(np.isinf(X_test), np.nan, X_test)

# Replace NaNs (which now include original NaNs and the infs replaced) with the column means 
# For replacing with column mean
col_mean = np.nanmean(X_train, axis=0)
X_train = np.where(np.isnan(X_train), col_mean, X_train)
X_test = np.where(np.isnan(X_test), col_mean, X_test)

In [95]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score

# Dataset class to feed data into DataLoader
class StockDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return torch.tensor(self.X[idx], dtype=torch.float32), torch.tensor(self.y[idx], dtype=torch.long)

# Positional Encoding class
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)  # Shape: (max_len, 1, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x shape: (sequence_length, batch_size, d_model)
        x = x + self.pe[:x.size(0), :]
        return x

# Transformer Model for stock sequence prediction
class StockTransformerModel(nn.Module):
    def __init__(self, feature_size, num_heads, num_layers, num_classes, dropout=0.1):
        super(StockTransformerModel, self).__init__()
        
        self.embedding = nn.Linear(feature_size, 64)  # Project input features to 64-dimensional space
        self.pos_encoder = PositionalEncoding(64)     # Positional Encoding
        
        # Transformer Encoder layer
        encoder_layer = nn.TransformerEncoderLayer(d_model=64, nhead=num_heads, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.fc = nn.Linear(64, num_classes)  # Final classification layer

    def forward(self, x):
        # x shape: (batch_size, sequence_length, feature_size)
        x = self.embedding(x)  # Transform input features to higher dimension
        
        # Transpose to (sequence_length, batch_size, feature_size) for Transformer
        x = x.transpose(0, 1)
        
        # Add positional encoding
        x = self.pos_encoder(x)
        
        # Pass through Transformer encoder
        encoded_output = self.transformer_encoder(x)
        
        # Take the output of the last time step for classification
        final_output = encoded_output[-1, :, :]
        
        # Output class logits
        return self.fc(final_output)

# Model configuration parameters
feature_size = 36  # This corresponds to the number of input features
num_heads = 4  # Number of attention heads
num_layers = 2  # Number of transformer layers
num_classes = 5  # Assuming you have quantile labels from A-E, i.e., 5 classes

# Initialize the model
model = StockTransformerModel(feature_size=feature_size, num_heads=num_heads, num_layers=num_layers, num_classes=num_classes)
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Convert data into PyTorch Datasets and DataLoaders
batch_size = 32
train_dataset = StockDataset(X_train, y_train)
test_dataset = StockDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Training loop
def train_model(model, criterion, optimizer, train_loader, num_epochs=10):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.train()

    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)

            # Loss calculation
            loss = criterion(outputs, labels)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            # Track the loss
            running_loss += loss.item()

        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}')

# Evaluation
def evaluate_model(model, test_loader):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.eval()

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            # Forward pass
            outputs = model(inputs)

            # Get predictions from the output logits
            _, preds = torch.max(outputs, 1)
            
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate accuracy
    accuracy = accuracy_score(all_labels, all_preds)
    print(f'Test Accuracy: {accuracy * 100:.2f}%')

# Train the model
train_model(model, criterion, optimizer, train_loader, num_epochs=10)

# Evaluate the model
evaluate_model(model, test_loader)


Epoch [1/10], Loss: 1.6191
Epoch [2/10], Loss: 1.5929
Epoch [3/10], Loss: 1.5858
Epoch [4/10], Loss: 1.5844
Epoch [5/10], Loss: 1.5813
Epoch [6/10], Loss: 1.5780
Epoch [7/10], Loss: 1.1584
Epoch [8/10], Loss: 0.2773
Epoch [9/10], Loss: 0.1361
Epoch [10/10], Loss: 0.1113
Test Accuracy: 97.35%
