In [19]:
import yfinance as yf
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
from collections import defaultdict
import nltk
from nltk import CFG

# List of 10 tech stocks
tech_stocks = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'META', 'TSLA', 'NVDA', 'NFLX', 'ADBE', 'INTC']

# Download data for these stocks
intraday_data = yf.download(tickers=tech_stocks, period='5d', interval='1m')

# Save to CSV for further analysis (optional)
intraday_data.to_csv(f'tech_stocks_intraday_data.csv')


[*********************100%***********************]  10 of 10 completed


In [8]:
adj_close_data = intraday_data['Adj Close']
# Check if there are still missing values
#print(adj_close_data.isnull().sum())
# Display the first few rows
adj_close_data.fillna(method='ffill', inplace=True)
adj_close_data

# Calculate the percentage change for each stock
pct_change_data = adj_close_data.pct_change() * 100  # Convert to percentage

# Display the first few rows of percentage change data
pct_change_data.dropna(inplace=True)

pct_change_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adj_close_data.fillna(method='ffill', inplace=True)


Ticker,AAPL,ADBE,AMZN,GOOGL,INTC,META,MSFT,NFLX,NVDA,TSLA
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2024-09-30 13:31:00+00:00,0.409591,0.122715,-0.026313,0.415044,0.167859,0.014131,0.324643,0.100054,0.537281,0.995144
2024-09-30 13:32:00+00:00,-0.125025,0.085609,0.444065,0.370776,-0.167578,0.307701,0.133169,0.074261,-0.053073,0.116293
2024-09-30 13:33:00+00:00,0.250357,0.058329,-0.181108,-0.048448,-0.062943,0.014107,0.127794,-0.013770,0.036021,0.017602
2024-09-30 13:34:00+00:00,0.018087,-0.065089,-0.034149,-0.101482,-0.441321,0.047597,-0.106652,0.063969,0.144005,0.384194
2024-09-30 13:35:00+00:00,-0.267770,0.127345,0.144552,-0.141018,0.021511,-0.010571,0.095638,0.289352,0.058523,-0.385003
...,...,...,...,...,...,...,...,...,...,...
2024-10-04 19:55:00+00:00,0.142807,-0.056208,0.107491,0.035878,-0.088509,0.080190,-0.103192,-0.119295,0.016576,0.019956
2024-10-04 19:56:00+00:00,-0.052236,0.042425,0.013427,-0.020902,0.110740,-0.026002,0.000000,-0.004171,0.016005,-0.039947
2024-10-04 19:57:00+00:00,0.088207,0.103558,0.104674,0.095813,-0.066377,0.075498,0.036034,0.044445,0.015923,-0.006043
2024-10-04 19:58:00+00:00,-0.033048,-0.030541,0.010727,-0.008973,-0.022135,-0.031013,-0.038403,-0.026378,-0.055923,0.022006


In [9]:
# Use quantiles to classify the data
# Use quantiles to classify the data into A, B, C, D, E
def classify_quantiles(pct_change, q1, q2, q3, q4):
    if pct_change > q4:
        return 'A'  # Highest range
    elif pct_change > q3:
        return 'B'  # Second highest range
    elif pct_change > q2:
        return 'C'  # Middle range
    elif pct_change > q1:
        return 'D'  # Second lowest range
    else:
        return 'E'  # Lowest range

# Apply quantile-based classification to each ticker
classified_data_quantiles = pct_change_data.copy()

for ticker in pct_change_data.columns:
    q1 = pct_change_data[ticker].quantile(0.2)
    q2 = pct_change_data[ticker].quantile(0.4)
    q3 = pct_change_data[ticker].quantile(0.6)
    q4 = pct_change_data[ticker].quantile(0.8)
    
    # Apply classification for each stock
    classified_data_quantiles[ticker] = pct_change_data[ticker].apply(classify_quantiles, args=(q1, q2, q3, q4))

# Display the classified data
print(classified_data_quantiles.head())

Ticker                    AAPL ADBE AMZN GOOGL INTC META MSFT NFLX NVDA TSLA
Datetime                                                                    
2024-09-30 13:31:00+00:00    A    A    D     A    A    B    A    A    A    A
2024-09-30 13:32:00+00:00    E    A    A     A    E    A    A    A    D    A
2024-09-30 13:33:00+00:00    A    A    E     E    D    C    A    D    B    C
2024-09-30 13:34:00+00:00    B    E    D     E    E    B    E    A    A    A
2024-09-30 13:35:00+00:00    E    A    A     E    C    D    A    A    B    E


In [16]:
# Create continuous sequences for each stock
def create_continuous_sequences(data):
    continuous_sequences = {}
    for ticker in data.columns:
        stock_data = data[ticker].tolist()
        continuous_sequences[ticker] = stock_data
    return continuous_sequences

continuous_sequences = create_continuous_sequences(classified_data_quantiles)
print("Continuous Sequence for AAPL:", continuous_sequences['AAPL'][:20])  # Display first 20 tokens for AAPL

Continuous Sequence for AAPL: ['A', 'E', 'A', 'B', 'E', 'A', 'C', 'A', 'B', 'D', 'E', 'D', 'A', 'E', 'A', 'E', 'A', 'C', 'E', 'E']


In [24]:
# Grammar Definition (CFG)
grammar = CFG.fromstring("""
    S -> A S | B S | C S | D S | E S | A | B | C | D | E
    A -> 'A'
    B -> 'B'
    C -> 'C'
    D -> 'D'
    E -> 'E'
""")
parser = nltk.ChartParser(grammar)

# Parsing function for stock sequences
def parse_sequences(sequences, parser):
    parsed_trees = {}
    for stock, seq in sequences.items():
        stock_parsed_trees = []
        for token_sequence in seq:
            trees = list(parser.parse(token_sequence))
            stock_parsed_trees.extend(trees)
        parsed_trees[stock] = stock_parsed_trees
    return parsed_trees

parsed_trees = parse_sequences(continuous_sequences, parser)

for tree in parsed_trees['AAPL'][:5]:  # Displaying the first 5 parse trees for AAPL
    print(tree)


(S (A A))
(S (E E))
(S (A A))
(S (B B))
(S (E E))


# Create a base-line training with transformer without CFG Grammar

In [46]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.model_selection import train_test_split

# Padding function for sequences
def pad_sequence(seq, seq_length, pad_value):
    if len(seq) < seq_length:
        return [pad_value] * (seq_length - len(seq)) + seq
    return seq

# Create input-output pairs with padding where necessary
def create_input_output_pairs(sequence, seq_length=20, pad_value=None):
    inputs = []
    outputs = []
    
    print(f"Processing sequence of length: {len(sequence)}")
    
    if len(sequence) >= seq_length + 1:
        for i in range(len(sequence) - seq_length):
            input_chunk = sequence[i:i + seq_length]
            output_token = sequence[i + seq_length]
            inputs.append(input_chunk)
            outputs.append(output_token)
    else:
        print(f"Sequence too short, length: {len(sequence)}. Padding applied.")
        padded_seq = pad_sequence(sequence, seq_length, pad_value)
        inputs.append(padded_seq)
        outputs.append(sequence[-1])  # Use the last real token as the output
    
    return inputs, outputs

# Example token-to-index conversion dictionary including padding
token_to_idx = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, '<PAD>': 5}
pad_value = token_to_idx['<PAD>']  # Padding with a special token '<PAD>'
# Process the continuous_sequences to generate input-output pairs
input_sequences = []
output_sequences = []

for ticker, sequence in continuous_sequences.items():
    print(f"Processing {ticker}, sequence length: {len(sequence)}")
    inputs, outputs = create_input_output_pairs(sequence, seq_length=20, pad_value=pad_value)
    input_sequences.extend(inputs)
    output_sequences.extend(outputs)

# Convert input/output to numeric for easier model training
input_sequences = [[token_to_idx[token] for token in seq] for seq in input_sequences]
output_sequences = [token_to_idx[token] for token in output_sequences]

# Check how many input-output pairs were created
print(f"Total input-output pairs created: {len(input_sequences)}")

# Print a few sample input-output pairs for inspection
for i in range(5):
    print(f"Sample {i + 1} - Input: {input_sequences[i]}, Output: {output_sequences[i]}")


Processing AAPL, sequence length: 1948
Processing sequence of length: 1948
Processing ADBE, sequence length: 1948
Processing sequence of length: 1948
Processing AMZN, sequence length: 1948
Processing sequence of length: 1948
Processing GOOGL, sequence length: 1948
Processing sequence of length: 1948
Processing INTC, sequence length: 1948
Processing sequence of length: 1948
Processing META, sequence length: 1948
Processing sequence of length: 1948
Processing MSFT, sequence length: 1948
Processing sequence of length: 1948
Processing NFLX, sequence length: 1948
Processing sequence of length: 1948
Processing NVDA, sequence length: 1948
Processing sequence of length: 1948
Processing TSLA, sequence length: 1948
Processing sequence of length: 1948
Total input-output pairs created: 19280
Sample 1 - Input: [0, 4, 0, 1, 4, 0, 2, 0, 1, 3, 4, 3, 0, 4, 0, 4, 0, 2, 4, 4], Output: 4
Sample 2 - Input: [4, 0, 1, 4, 0, 2, 0, 1, 3, 4, 3, 0, 4, 0, 4, 0, 2, 4, 4, 4], Output: 2
Sample 3 - Input: [0, 1, 4, 0

In [47]:
# Split data into train and test sets
train_inputs, test_inputs, train_outputs, test_outputs = train_test_split(
    input_sequences, output_sequences, test_size=0.2, random_state=42
)

# Convert to PyTorch tensors
train_inputs = torch.tensor(train_inputs, dtype=torch.long)
train_outputs = torch.tensor(train_outputs, dtype=torch.long)
test_inputs = torch.tensor(test_inputs, dtype=torch.long)
test_outputs = torch.tensor(test_outputs, dtype=torch.long)

# Create Dataset and DataLoader
class StockDataset(Dataset):
    def __init__(self, inputs, outputs):
        self.inputs = inputs
        self.outputs = outputs
    
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, idx):
        return self.inputs[idx], self.outputs[idx]

train_dataset = StockDataset(train_inputs, train_outputs)
test_dataset = StockDataset(test_inputs, test_outputs)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# Define Transformer Model for Sequence Prediction
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model=64, nhead=2, num_encoder_layers=2, dim_feedforward=128, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.transformer = nn.Transformer(
            d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, dim_feedforward=dim_feedforward, dropout=dropout
        )
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.src_mask = None

    def forward(self, src):
        # Embed the input tokens
        src = self.embedding(src)
        # Create the source mask
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            self.src_mask = self.generate_square_subsequent_mask(len(src)).to(src.device)
        # Apply the transformer model
        output = self.transformer(src, src, src_mask=self.src_mask)
        # Output the final token predictions
        output = self.fc_out(output[-1])  # Take the final output from the sequence
        return output

    def generate_square_subsequent_mask(self, sz):
        mask = torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
        return mask

# Model Parameters
vocab_size = len(token_to_idx)
model = TransformerModel(vocab_size)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop
def train_model(model, train_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        for batch_idx, (inputs, targets) in enumerate(train_loader):
            optimizer.zero_grad()
            outputs = model(inputs.T)  # Transpose for transformer (batch, seq_len) -> (seq_len, batch)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss/len(train_loader)}")

# Train the model
train_model(model, train_loader, criterion, optimizer, epochs=10)

# Evaluation Loop
def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            outputs = model(inputs.T)  # Transpose for transformer
            _, predicted = torch.max(outputs, 1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()
    accuracy = correct / total
    print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Evaluate the model
evaluate_model(model, test_loader)



Epoch 1/10, Loss: 1.6043064816858759
Epoch 2/10, Loss: 1.5871620645661573
Epoch 3/10, Loss: 1.5838352602547134
Epoch 4/10, Loss: 1.5812721850961076
Epoch 5/10, Loss: 1.5801316294432675
Epoch 6/10, Loss: 1.5791233211632092
Epoch 7/10, Loss: 1.5784592225343854
Epoch 8/10, Loss: 1.5785940712418298
Epoch 9/10, Loss: 1.57769234744345
Epoch 10/10, Loss: 1.5782311212472402
Test Accuracy: 26.14%
