In [2]:
import torch
print(torch.version.cuda)


if not torch.backends.mps.is_available():
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")

else:
    mps_device = torch.device("mps")
print(f"Using device: {mps_device}")

None
Using device: mps


In [3]:
import pandas as pd
from transformers import BertTokenizer
from sklearn.preprocessing import MinMaxScaler

import torch
from torch import nn
from transformers import BertModel

# Load datasets
news_data = pd.read_csv("news.csv")

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Combine Top1-Top25 into a single string
news_data['combined_text'] = news_data.iloc[:, 2:].apply(lambda x: " ".join(x.dropna()), axis=1)

# Tokenize headlines
news_data['tokens'] = news_data['combined_text'].apply(
    lambda x: tokenizer(x, padding='max_length', truncation=True, max_length=512, return_tensors="pt")
)

labels = news_data['Label'].values  
# Ensure this column is properly set for stock movement


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
class StockPredictor(nn.Module):
    def __init__(self):
        super(StockPredictor, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.fc = nn.Sequential(
            nn.Linear(768, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1),
        )
    
    def forward(self, tokens):
        # Text embeddings
        bert_output = self.bert(**tokens).pooler_output
        return torch.sigmoid(self.fc(bert_output))


In [6]:
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, AdamW
import torch
torch.cuda.manual_seed(42)
# Define Dataset Class
class NewsDataset(Dataset):
    def __init__(self, news, labels):
        self.news = news
        self.labels = torch.tensor(labels, dtype=torch.float32)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.news[idx], self.labels[idx]

# Prepare tokenized text data
train_news = news_data['tokens'].tolist()  # This contains a list of dictionaries
train_labels = news_data['Label'].values

# Create Dataset
train_dataset = NewsDataset(train_news, train_labels)

# Define Collate Function
def collate_fn(batch):
    news = {key: torch.cat([b[0][key] for b in batch], dim=0) for key in batch[0][0].keys()}  # Tokenized text
    labels = torch.stack([b[1] for b in batch])  # Labels
    return news, labels

# Create DataLoader
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# Define Model
model = StockPredictor()
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# Training Loop
for epoch in range(5):
    model.train()
    batch_index = 0
    for batch in train_loader:
        news, labels = batch
        optimizer.zero_grad()
        outputs = model(news).squeeze()
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        batch_index += 1
        print(f"Epoch: {epoch+1}, Batch: {batch_index}, Loss: {loss.item()}")
            

Epoch: 1, Batch: 1, Loss: 0.6927637457847595
Epoch: 1, Batch: 2, Loss: 0.7190260887145996
Epoch: 1, Batch: 3, Loss: 0.6920912861824036
Epoch: 1, Batch: 4, Loss: 0.6124520301818848
Epoch: 1, Batch: 5, Loss: 0.7123312950134277
Epoch: 1, Batch: 6, Loss: 0.6286718845367432
Epoch: 1, Batch: 7, Loss: 0.6994006633758545
Epoch: 1, Batch: 8, Loss: 0.73701012134552
Epoch: 1, Batch: 9, Loss: 0.7220447063446045
Epoch: 1, Batch: 10, Loss: 0.7002747058868408
Epoch: 1, Batch: 11, Loss: 0.788399875164032
Epoch: 1, Batch: 12, Loss: 0.6481130123138428
Epoch: 1, Batch: 13, Loss: 0.6981464624404907
Epoch: 1, Batch: 14, Loss: 0.7241858243942261


KeyboardInterrupt: 