In [2]:
import torch
# Show which CUDA toolkit version this PyTorch build reports.
print(torch.version.cuda)

# Pick the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

11.8
Using device: cuda


In [3]:
import pandas as pd
from transformers import BertTokenizer
from sklearn.preprocessing import MinMaxScaler

import torch
from torch import nn
from transformers import BertModel

# Read the two input tables: daily headlines and daily stock quotes.
news_data = pd.read_csv("news.csv")
stock_data = pd.read_csv("table.csv")

# Pretrained WordPiece tokenizer matching the BERT checkpoint used by the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def _join_headlines(row):
    """Join one row's non-null headline cells into a single space-separated string."""
    return " ".join(row.dropna())


def _encode_text(text):
    """Tokenize one string, padded/truncated to 512 tokens, returned as PyTorch tensors."""
    return tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )


# Merge every column from the third onward (the Top1-Top25 headlines) into one
# text field per row.
news_data['combined_text'] = news_data.iloc[:, 2:].apply(_join_headlines, axis=1)

# One tokenizer output per row; each tensor carries a leading batch axis of 1
# because of return_tensors="pt".
news_data['tokens'] = news_data['combined_text'].apply(_encode_text)

# Rescale the numeric stock columns to [0, 1] in place.
# NOTE(review): the scaler is fit on every row, so any later train/test split
# would share scaling statistics — confirm this is intended.
scaler = MinMaxScaler()
scaled_columns = ['Open', 'Close', 'High', 'Low', 'Volume']
stock_data[scaled_columns] = scaler.fit_transform(stock_data[scaled_columns])


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
class StockPredictor(nn.Module):
    """Binary classifier that fuses a BERT text embedding with numeric stock features.

    The pooled BERT output and a small MLP encoding of the stock features are
    concatenated and passed through a second MLP whose single output is
    squashed by a sigmoid.

    Args:
        bert_model_name: Name or path of the pretrained BERT checkpoint to load.
        num_stock_features: Number of numeric stock features per sample.
    """

    def __init__(self, bert_model_name='bert-base-uncased', num_stock_features=5):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.stock_fc = nn.Sequential(
            nn.Linear(num_stock_features, 64),
            nn.ReLU(),
            nn.Linear(64, 32)
        )
        # Read the embedding width from the loaded checkpoint instead of
        # hard-coding 768, so other BERT sizes work without edits.
        text_dim = self.bert.config.hidden_size
        self.fc = nn.Sequential(
            nn.Linear(text_dim + 32, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, tokens, stock_features):
        """Return one sigmoid-activated score per sample.

        Args:
            tokens: Mapping of tokenizer outputs, unpacked as keyword arguments
                into the BERT model.
            stock_features: Float tensor whose last dimension equals
                ``num_stock_features``; concatenated with the text embedding
                along dim 1, so it must be 2-D with a matching first dimension.

        Returns:
            Tensor with a trailing dimension of size 1 holding values in (0, 1).
        """
        # Text embeddings
        bert_output = self.bert(**tokens).pooler_output
        # Stock features
        stock_output = self.stock_fc(stock_features)
        # Concatenate and predict
        combined = torch.cat((bert_output, stock_output), dim=1)
        return torch.sigmoid(self.fc(combined))


In [14]:
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, AdamW
import torch
# Seed the global generator. torch.manual_seed covers the CPU RNG (which drives
# DataLoader shuffling and layer initialisation) as well as CUDA, whereas
# torch.cuda.manual_seed alone leaves the CPU generator unseeded.
torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Define Dataset Class
class StockDataset(Dataset):
    """Map-style dataset pairing tokenized news, stock features and labels by index.

    Args:
        news: Indexable, sized collection with one tokenized-news entry per sample.
        stocks: Array-like of numeric stock features, one row per sample;
            converted to a float32 tensor.
        labels: Array-like of targets, one per sample; converted to a float32
            tensor.

    Raises:
        ValueError: If ``news``, ``stocks`` and ``labels`` do not all have the
            same number of samples.
    """

    def __init__(self, news, stocks, labels):
        self.news = news
        self.stocks = torch.tensor(stocks, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)

        # Fail fast: __len__ is driven by labels only, so a length mismatch
        # would otherwise surface as an IndexError mid-epoch or silently drop
        # trailing rows.
        sizes = (len(self.news), len(self.stocks), len(self.labels))
        if len(set(sizes)) != 1:
            raise ValueError(
                "news, stocks and labels must have the same length, got "
                f"{sizes[0]}, {sizes[1]} and {sizes[2]}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(news, stock_features, label)`` triple stored at ``idx``."""
        return self.news[idx], self.stocks[idx], self.labels[idx]

# Gather the three parallel inputs for the dataset.
# NOTE(review): news rows and stock rows are paired purely by position; this
# assumes both CSVs cover the same dates in the same order — confirm.
train_news = list(news_data['tokens'])  # one tokenizer output per row
train_stocks = stock_data[['Open', 'High', 'Low', 'Close', 'Volume']].to_numpy()
train_labels = news_data['Label'].to_numpy()

# Wrap everything in the Dataset consumed by the DataLoader.
train_dataset = StockDataset(news=train_news, stocks=train_stocks, labels=train_labels)

# Define Collate Function
def collate_fn(batch):
    """Merge a list of ``(news, stocks, label)`` samples into batched tensors.

    Each sample's ``news`` is a mapping of tensors; tensors under the same key
    are concatenated along dim 0. Stock features and labels are stacked along
    a new leading dimension.

    Returns:
        Tuple ``(news, stocks, labels)`` where ``news`` is a dict of tensors.
    """
    # The first sample defines which tokenizer fields are present.
    field_names = batch[0][0].keys()

    news = {}
    for name in field_names:
        per_sample = [sample[0][name] for sample in batch]
        news[name] = torch.cat(per_sample, dim=0)

    stocks = torch.stack([sample[1] for sample in batch])
    labels = torch.stack([sample[2] for sample in batch])
    return news, stocks, labels

# Create DataLoader: shuffled mini-batches of 16, assembled by collate_fn.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

# Define Model. Move it to the selected device; without this the model and
# every batch stay on the CPU even when a GPU is reported as available.
model = StockPredictor().to(device)
loss_fn = nn.BCELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training Loop
for epoch in range(5):
    model.train()
    for batch_index, (news, stocks, labels) in enumerate(train_loader, start=1):
        # Inputs must live on the same device as the model parameters.
        news = {key: value.to(device) for key, value in news.items()}
        stocks = stocks.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        # Drop only the trailing size-1 dimension. A bare squeeze() would also
        # collapse the batch dimension when the final batch holds one sample,
        # producing a shape mismatch with labels.
        outputs = model(news, stocks).squeeze(-1)
        loss = loss_fn(outputs, labels.float())
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch+1}, Batch: {batch_index}, Loss: {loss.item()}")
            

Using device: cuda
Epoch: 1, Batch: 1, Loss: 0.6669305562973022
Epoch: 1, Batch: 2, Loss: 0.7550961971282959
Epoch: 1, Batch: 3, Loss: 0.6785579919815063
Epoch: 1, Batch: 4, Loss: 0.6741546988487244
Epoch: 1, Batch: 5, Loss: 0.7743962407112122
Epoch: 1, Batch: 6, Loss: 0.7721146941184998
Epoch: 1, Batch: 7, Loss: 0.6914326548576355
Epoch: 1, Batch: 8, Loss: 0.6880688667297363
Epoch: 1, Batch: 9, Loss: 0.7214998006820679
Epoch: 1, Batch: 10, Loss: 0.6713804006576538
Epoch: 1, Batch: 11, Loss: 0.6932873129844666
Epoch: 1, Batch: 12, Loss: 0.6563521027565002
Epoch: 1, Batch: 13, Loss: 0.6166840195655823
Epoch: 1, Batch: 14, Loss: 0.6846472024917603
Epoch: 1, Batch: 15, Loss: 0.8005673885345459
Epoch: 1, Batch: 16, Loss: 0.7182035446166992
Epoch: 1, Batch: 17, Loss: 0.7665877938270569
Epoch: 1, Batch: 18, Loss: 0.7620524168014526
Epoch: 1, Batch: 19, Loss: 0.7028602361679077
Epoch: 1, Batch: 20, Loss: 0.7382911443710327
Epoch: 1, Batch: 21, Loss: 0.6687521934509277
Epoch: 1, Batch: 22, Los