In [9]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m339.5 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp312-cp312-macosx_11_0_arm64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp312-cp312-macosx_11_0_arm64.whl.metadata (3.

In [11]:
import pandas as pd
from transformers import BertTokenizer
from sklearn.preprocessing import MinMaxScaler

# Read the two input tables: news headlines and daily stock prices.
news_data = pd.read_csv("news.csv")
stock_data = pd.read_csv("table.csv")

# Tokenizer that matches the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def _join_headlines(row):
    """Join the non-null cells of one row into a single space-separated string."""
    return " ".join(row.dropna())


def _encode_text(text):
    """Tokenize one string, padded/truncated to 512 tokens, as PyTorch tensors."""
    return tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors="pt")


# Every column from the third one onward is treated as a headline column
# (Top1-Top25 per the original note) and merged into one text per row.
headline_columns = news_data.iloc[:, 2:]
news_data['combined_text'] = headline_columns.apply(_join_headlines, axis=1)

# One tokenizer output per row, stored alongside the text it was built from.
news_data['tokens'] = news_data['combined_text'].apply(_encode_text)

# Rescale the price/volume columns with a MinMaxScaler fitted on the full table.
scaler = MinMaxScaler()
price_columns = ['Open', 'Close', 'High', 'Low', 'Volume']
stock_data[price_columns] = scaler.fit_transform(stock_data[price_columns])


In [12]:
import torch
from torch import nn
from transformers import BertModel

class StockPredictor(nn.Module):
    """Binary predictor that fuses a BERT text embedding with stock features.

    The text branch is a pretrained BERT encoder whose pooled output is used as
    the text embedding. The stock branch is a small MLP mapping the numeric
    features to a 32-dimensional vector. Both are concatenated and passed
    through a final MLP ending in a single sigmoid unit.

    Args:
        bert_model_name: Name or path of the pretrained BERT checkpoint to load.
            Defaults to 'bert-base-uncased'.
        num_stock_features: Number of numeric stock features per sample.
            Defaults to 5.
    """

    def __init__(self, bert_model_name='bert-base-uncased', num_stock_features=5):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        # Read the embedding width from the loaded checkpoint instead of
        # hard-coding 768, so the head stays consistent with the encoder.
        text_dim = self.bert.config.hidden_size
        self.stock_fc = nn.Sequential(
            nn.Linear(num_stock_features, 64),
            nn.ReLU(),
            nn.Linear(64, 32)
        )
        self.fc = nn.Sequential(
            nn.Linear(text_dim + 32, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, tokens, stock_features):
        """Return one sigmoid probability per sample.

        Args:
            tokens: Mapping of tokenizer outputs; it is unpacked as keyword
                arguments into the BERT encoder.
            stock_features: Tensor of stock features whose last dimension
                matches ``num_stock_features``.

        Returns:
            Tensor with a trailing dimension of size 1 holding values in (0, 1).
        """
        # Text embeddings
        bert_output = self.bert(**tokens).pooler_output
        # Stock features
        stock_output = self.stock_fc(stock_features)
        # Concatenate along the feature axis and predict
        combined = torch.cat((bert_output, stock_output), dim=1)
        return torch.sigmoid(self.fc(combined))


In [20]:
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer
import torch

# Define Dataset Class
class StockDataset(Dataset):
    """Map-style dataset yielding (news, stock features, label) per sample.

    Args:
        news: Indexable sequence with one tokenized-news entry per sample;
            stored as given.
        stocks: Array-like of numeric stock features, one row per sample;
            converted to a float32 tensor.
        labels: Array-like of targets, one per sample; converted to a float32
            tensor.

    Raises:
        ValueError: If the three inputs do not contain the same number of
            samples.
    """

    def __init__(self, news, stocks, labels):
        self.news = news
        self.stocks = torch.tensor(stocks, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)

        # The three containers are indexed in lockstep by __getitem__, so a
        # length mismatch would pair samples wrongly or fail mid-epoch.
        sizes = (len(self.news), len(self.stocks), len(self.labels))
        if len(set(sizes)) != 1:
            raise ValueError(
                "news, stocks and labels must have the same length, got "
                f"{sizes[0]}, {sizes[1]} and {sizes[2]}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the (news, stock features, label) triple at position ``idx``."""
        return self.news[idx], self.stocks[idx], self.labels[idx]

# Collect the per-sample inputs for the dataset.
# NOTE(review): news rows and stock rows come from two separately loaded
# tables and are paired purely by row position here - confirm that both
# tables list the same days in the same order.
train_news = list(news_data['tokens'])  # one dict-like tokenizer output per row
train_stocks = stock_data[['Open', 'High', 'Low', 'Close', 'Volume']].to_numpy()
train_labels = news_data['Label'].to_numpy()

# Wrap the three aligned containers in the Dataset
train_dataset = StockDataset(news=train_news, stocks=train_stocks, labels=train_labels)

# Define Collate Function
def collate_fn(batch):
    """Merge a list of (news, stocks, label) samples into one batch.

    For every key of the first sample's news mapping, the per-sample tensors
    are concatenated along dimension 0. Stock features and labels are stacked
    along a new leading dimension.

    Returns:
        Tuple ``(news, stocks, labels)`` where ``news`` is a plain dict of
        batched tensors.
    """
    first_encoding = batch[0][0]
    news = {}
    for field in first_encoding.keys():
        news[field] = torch.cat([tokens[field] for tokens, _, _ in batch], dim=0)
    stocks = torch.stack([features for _, features, _ in batch])
    labels = torch.stack([target for _, _, target in batch])
    return news, stocks, labels

# Create DataLoader: shuffled mini-batches of 16 samples assembled by collate_fn
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

# Define Model
model = StockPredictor()
# The model already applies a sigmoid, so plain BCELoss on probabilities is used.
loss_fn = nn.BCELoss()
# No import of a bare `AdamW` name is visible in this notebook, so reference
# the optimizer through the already imported torch package.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training Loop
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    for batch in train_loader:
        news, stocks, labels = batch
        optimizer.zero_grad()
        # Drop only the trailing size-1 output axis. A bare squeeze() would
        # also drop the batch axis when the last batch holds a single sample,
        # and BCELoss would then reject the shape mismatch with the labels.
        outputs = model(news, stocks).squeeze(-1)
        loss = loss_fn(outputs, labels.float())
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    # Report progress so a long-running epoch is not silent.
    mean_loss = epoch_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch + 1}/{num_epochs} - mean loss: {mean_loss:.4f}")


KeyboardInterrupt: 