In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torch.utils.data import Dataset
from transformers import BertModel, BertTokenizer
import pandas as pd
import numpy as np
import talib

# 1. Install required packages
# pip install pandas numpy ta-lib torch transformers

In [2]:
# 2. Preprocess the data using TA-Lib
def preprocess_data(df):
    df['SMA'] = talib.SMA(
            df['Stock Close'], timeperiod=14)
    df['EMA'] = talib.EMA(
        df['Stock Close'], timeperiod=14)
    df['RSI'] = talib.RSI(
        df['Stock Close'], timeperiod=14)
    df['MACD'], _, _ = talib.MACD(
        df['Stock Close'], fastperiod=12, slowperiod=26, signalperiod=9)
    df['ATR'] = talib.ATR(
        df['Stock High'], df['Stock Low'], df['Stock Close'], timeperiod=14)
    df["ADX"] = talib.ADX(
        df["Stock High"].values, 
        df["Stock Low"].values, 
        df["Stock Close"].values
        )
    df.dropna(inplace=True)
    print('data has been preprocessed')
    return df

In [3]:
# 3. Create a custom Dataset class
class MultimodalDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_text_length):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_text_length = max_text_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        text = row["Article Text"]
        ohlcv_data = row[["Stock Open", "Stock High", "Stock Low", "Stock Close", "volume", "SMA", "RSI", "ADX", "EMA", "ATR", "MACD"]].values.astype(np.float32)

        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_text_length,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt",
            truncation=True,
        )

        input_ids = inputs["input_ids"].squeeze()
        attention_mask = inputs["attention_mask"].squeeze()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "ohlcv_data": torch.tensor(ohlcv_data),
            "target": torch.tensor(row["overall_sentiment_score"].astype(np.float32)),
        }

In [12]:
# 4. Define the multimodal model architecture
class MultimodalModel(nn.Module):
    def __init__(self, text_model_name, num_lstm_layers=1, hidden_size=128, dropout_rate=0.2):
        super(MultimodalModel, self).__init__()
        self.bert = BertModel.from_pretrained(text_model_name)
        self.lstm = nn.LSTM(input_size=11, hidden_size=hidden_size, num_layers=num_lstm_layers, batch_first=True)
        self.fc = nn.Linear(896, 1)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, input_ids, attention_mask, ohlcv_data):
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_embedding = bert_output.last_hidden_state[:, 0, :]

        ohlcv_data = ohlcv_data.unsqueeze(1)
        lstm_output, _ = self.lstm(ohlcv_data)
        lstm_output = lstm_output[:, -1, :]

        combined = torch.cat([text_embedding, lstm_output], dim=1)
        combined = self.dropout(combined)
        output = self.fc(combined)
        return output.squeeze()

In [13]:
# 5. Train the model (example)
def train_model():
    # Load and preprocess data
    df = pd.read_csv("data/stock_news_data_AAPL.csv")
    # df = pd.read_csv("data/train.csv")
    df = preprocess_data(df)
    # Initialize tokenizer, dataset, and dataloader
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    dataset = MultimodalDataset(df, tokenizer, max_text_length=128)
    train_loader = data.DataLoader(dataset, batch_size=16, shuffle=False)

    # Initialize the model, loss function, and optimizer
    model = MultimodalModel("bert-base-uncased")
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-4)

    # Train the model
    num_epochs = 10

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0.0

        for batch in train_loader:
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            ohlcv_data = batch["ohlcv_data"]
            target = batch["target"]

            # Forward pass
            outputs = model(input_ids, attention_mask, ohlcv_data)
            
            loss = criterion(outputs, target.clone().detach())

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / len(train_loader)}")

# train_model()

In [14]:
def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    epoch_loss = 0.0
    correct_predictions = 0
    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        ohlcv_data = batch["ohlcv_data"].to(device)
        sentiment_labels = batch["target"].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask, ohlcv_data)
        loss = criterion(outputs, sentiment_labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        correct_predictions += torch.sum(outputs == sentiment_labels)

    accuracy = correct_predictions.double() / len(dataloader.dataset)
    return epoch_loss / len(dataloader), accuracy


In [15]:
from sklearn.metrics import classification_report
def evaluate(model, dataloader, criterion, device):
    model.eval()
    epoch_loss = 0.0
    correct_predictions = 0
    allpreds = []
    alllabels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            ohlcv_data = batch["ohlcv_data"].to(device)
            sentiment_labels = batch["target"].to(device)
            alllabels.extend(sentiment_labels.cpu().numpy())

            outputs = model(input_ids, attention_mask, ohlcv_data)
            _, preds = torch.max(outputs, dim=1)
            allpreds.extend(preds.cpu().numpy())
            loss = criterion(outputs, sentiment_labels)

            epoch_loss += loss.item()
            correct_predictions += torch.sum(preds == sentiment_labels)

    print(classification_report(alllabels, allpreds))
    accuracy = correct_predictions.double() / len(dataloader.dataset)
    return epoch_loss / len(dataloader), accuracy


In [16]:
def train_and_evaluate(model, train_dataloader, val_dataloader, criterion, optimizer, device, num_epochs):
    for epoch in range(num_epochs):
        train_loss, train_acc = train_epoch(model, train_dataloader, criterion, optimizer, device)
        val_loss, val_acc = evaluate(model, val_dataloader, criterion, device)
        print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss}, Train Acc: {train_acc}, Val Loss: {val_loss}, Val Acc: {val_acc}")


In [17]:
train_df, val_df = pd.read_csv("data/train.csv"),pd.read_csv("data/test.csv")

train_df = preprocess_data(train_df)
val_df = preprocess_data(val_df)

data has been preprocessed
data has been preprocessed


In [22]:

# Initialize tokenizer, dataset, and dataloader
tokenizer = BertTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
train_dataset = MultimodalDataset(train_df, tokenizer, max_text_length=128)
val_dataset = MultimodalDataset(val_df, tokenizer, max_text_length=128)
train_loader = data.DataLoader(train_dataset, batch_size=16, shuffle=False)
val_loader = data.DataLoader(val_dataset, batch_size=16, shuffle=False)

# Initialize the model, loss function, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultimodalModel("distilbert-base-uncased-finetuned-sst-2-english").to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Train and evaluate the model
num_epochs = 10

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing BertModel: ['distilbert.transformer.layer.5.output_layer_norm.bias', 'distilbert.transformer.layer.5.ffn.lin2.bias', 'distilbert.transformer.layer.3.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.2.sa_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin2.weight', 'distilbert.transformer.layer.4.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.5.ffn.lin1.weight', 'distilbert.transformer.layer.5.attention.q_lin.weight', 'distilbert.embeddings.word_embeddings.weight', 'distilbert.transformer.layer.3.ffn.lin1.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.5.sa_layer_norm.bias', 'distilbert.transformer.layer.3.attention.v_lin.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilber

In [23]:
train_and_evaluate(model, train_loader, val_loader, criterion, optimizer, device, num_epochs)

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)