<a href="https://colab.research.google.com/github/pei0217/fin_hw7_week11/blob/main/fin_hw7_week11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
# 匯入必要套件
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# 下載 VADER Lexicon
nltk.download('vader_lexicon')

# ======= 數據處理 =======

# 1. 加載數據
news_data = pd.read_csv('Combined_News_DJIA(train).csv')
stock_data = pd.read_csv('DJIA_table(train).csv')

# 2. 處理日期格式
news_data['Date'] = pd.to_datetime(news_data['Date'])
stock_data['Date'] = pd.to_datetime(stock_data['Date'], format='%d-%m-%Y', errors='coerce')

# 3. 合併數據
merged_data = pd.merge(stock_data, news_data, on='Date')

# 4. 提取新聞情緒
sia = SentimentIntensityAnalyzer()
merged_data['Sentiment'] = merged_data[['Top1', 'Top2', 'Top3']].fillna('').apply(
    lambda row: sum(sia.polarity_scores(row[col])['compound'] for col in ['Top1', 'Top2', 'Top3']) / 3, axis=1
)

# 5. 增加技術指標特徵
merged_data['Moving_Avg_5'] = merged_data['Close'].rolling(window=5).mean()
merged_data['Volatility'] = merged_data['Close'].rolling(window=5).std()
merged_data.fillna(0, inplace=True)  # 填充缺失值

# 特徵選擇
features = ['Open', 'High', 'Low', 'Close', 'Volume', 'Sentiment', 'Moving_Avg_5', 'Volatility']
scaler = MinMaxScaler()
merged_data[features] = scaler.fit_transform(merged_data[features])

# 6. 標籤
labels = merged_data['Label']

# 7. 分割數據集
X = merged_data[features].values
y = labels.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ======= 模型構建 =======

# 8. 構建數據集
train_data = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
test_data = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

# 9. 定義改進的雙向 LSTM 模型
class BiLSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes, dropout=0.5):
        super(BiLSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size * 2, num_classes)  # 雙向 LSTM 輸出加倍

    def forward(self, x):
        h_0 = torch.zeros(num_layers * 2, x.size(0), hidden_size).to(x.device)  # 雙向 LSTM
        c_0 = torch.zeros(num_layers * 2, x.size(0), hidden_size).to(x.device)
        out, _ = self.lstm(x.unsqueeze(1), (h_0, c_0))  # 添加時間維度
        out = self.dropout(out[:, -1, :])  # Dropout
        out = self.fc(out)
        return out

# 模型參數
input_size = len(features)
hidden_size = 128  # 增加隱藏層神經元數
num_layers = 3     # 增加 LSTM 層數
num_classes = 2
dropout_rate = 0.5  # 增加 Dropout
learning_rate = 0.0005
model = BiLSTMModel(input_size, hidden_size, num_layers, num_classes, dropout_rate)

# 10. 定義損失函數和優化器
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# ======= 訓練模型 =======
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

epochs = 15
for epoch in range(epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

# ======= 評估模型 =======
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        _, predicted = torch.max(outputs.data, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

print(f"Accuracy: {100 * correct / total:.2f}%")


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Epoch 1/15, Loss: 0.6779
Epoch 2/15, Loss: 0.7108
Epoch 3/15, Loss: 0.6710
Epoch 4/15, Loss: 0.7192
Epoch 5/15, Loss: 0.6923
Epoch 6/15, Loss: 0.6950
Epoch 7/15, Loss: 0.7081
Epoch 8/15, Loss: 0.6816
Epoch 9/15, Loss: 0.6864
Epoch 10/15, Loss: 0.6930
Epoch 11/15, Loss: 0.6890
Epoch 12/15, Loss: 0.6976
Epoch 13/15, Loss: 0.6345
Epoch 14/15, Loss: 0.7170
Epoch 15/15, Loss: 0.6509
Accuracy: 60.05%


In [12]:
import nltk
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True