### 🌐 Weekly Macro Indicator Download

This section downloads weekly data for key macroeconomic signals that are used as input features for the model:

| Indicator        | Source Symbol | Description |
|------------------|---------------|-------------|
| **VIX**          | `^VIX`        | CBOE Volatility Index (market fear gauge) |
| **10Y Yield**    | `^TNX`        | 10-Year U.S. Treasury yield (interest rate proxy) |
| **USD Index**    | `DX-Y.NYB`    | Strength of the U.S. dollar |
| **Crude Oil**    | `CL=F`        | WTI Crude Oil futures price |

All indicators are:
- Downloaded at **weekly frequency** using Yahoo Finance
- Aligned on the same date index as the ETF data

In addition, the 10-year yield (`^TNX`) is converted to a percentage by multiplying the downloaded value by `0.1`.


In [None]:
import os
import pandas as pd
import yfinance as yf
from datetime import datetime, timedelta

# Look-back horizon, in years, shared by the downloads below.
YEARS = 10

# Universe of ETF tickers to download.
etf_list = [
    'XLK', 'XLF', 'XLV', 'XLE', 'XLI', 'XLY', 'XLP', 'XLRE', 'XLU', 'XLB', 'XLC',
    'SOXX', 'SH', 'DOG', 'RWM', 'ITA', 'JETS', 'QQQ', 'VOO'
]

# Download window as ISO date strings: YEARS * 52 weeks ending today.
end_date = datetime.today().strftime('%Y-%m-%d')
lookback = timedelta(weeks=YEARS*52)
start_date = (datetime.today() - lookback).strftime('%Y-%m-%d')

print(f"📅 Downloading data from {start_date} to {end_date}")

# The output folder must already exist one level above the working directory;
# it is checked here, not created.
dataset_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'dataset'))
if not os.path.isdir(dataset_path):
    raise FileNotFoundError(f"🚫 'dataset/' folder not found at {dataset_path}")

# One dict per price field, keyed by ticker; each value is a single-field frame
# whose column label has been renamed to the ticker.
adjclose_data = {}
volume_data = {}
high_data = {}
low_data = {}
field_targets = (
    ('Adj Close', adjclose_data),
    ('Volume', volume_data),
    ('High', high_data),
    ('Low', low_data),
)

# Download each ETF; tickers that return no rows are silently left out.
for symbol in etf_list:
    print(f"⬇️ Downloading {symbol}...")
    data = yf.download(symbol, start=start_date, end=end_date, interval='1wk', auto_adjust=False)
    if data.empty:
        continue
    for field, bucket in field_targets:
        bucket[symbol] = data[[field]].rename(columns={field: symbol})
# Merge and clean
def combine_and_save(data_dict, filename):
    """Merge per-ticker frames side by side, clean the result and write it to CSV.

    Args:
        data_dict: Mapping whose values are DataFrames to concatenate column-wise.
        filename: Name of the CSV file to create inside the module-level
            ``dataset_path`` directory.

    Returns:
        The cleaned DataFrame: values coerced to numeric (unparseable cells
        become NaN), index parsed as datetimes, duplicate index entries reduced
        to their first occurrence, rows sorted by index, and rows that are
        entirely NaN removed.
    """
    merged = pd.concat(data_dict.values(), axis=1).apply(pd.to_numeric, errors='coerce')
    merged.index = pd.to_datetime(merged.index, errors='coerce')

    first_occurrence = ~merged.index.duplicated(keep='first')
    merged = merged[first_occurrence].sort_index()
    merged = merged.dropna(axis=0, how='all')

    merged.to_csv(os.path.join(dataset_path, filename))
    print(f"✅ Saved: {filename}")
    return merged

# Save all ETF fields; each call writes one CSV and returns the cleaned frame.
price_df = combine_and_save(adjclose_data, 'etf_prices_weekly.csv')
volume_df = combine_and_save(volume_data, 'etf_volume_weekly.csv')
high_df = combine_and_save(high_data, 'etf_high_weekly.csv')
low_df = combine_and_save(low_data, 'etf_low_weekly.csv')

# Preview (a notebook only renders the last expression of a cell, so this has
# no visible effect while more code follows it).
price_df.head()

# Macro indicator tickers on Yahoo Finance
macro_tickers = {
    'VIX': '^VIX',               # Volatility Index
    '10Y_Yield': '^TNX',         # 10-Year Treasury Yield (multiply by 0.1)
    'USD_Index': 'DX-Y.NYB',     # U.S. Dollar Index
    'WTI_Crude': 'CL=F'          # Crude Oil (WTI)
}

# Date range matching the ETF download window
end_date = datetime.today().strftime('%Y-%m-%d')
start_date = (datetime.today() - timedelta(weeks=YEARS*52)).strftime('%Y-%m-%d')

# Download weekly data
macro_data = {}
for name, ticker in macro_tickers.items():
    print(f"Downloading {name} ({ticker})...")
    data = yf.download(ticker, start=start_date, end=end_date, interval='1wk', auto_adjust=False)
    if data.empty:
        # Same policy as the ETF loop: an indicator with no rows is left out
        # instead of breaking the concatenation below.
        print(f"⚠️ No data returned for {name} ({ticker}); skipping.")
        continue
    macro_data[name] = data[['Close']].rename(columns={'Close': name})

if not macro_data:
    raise ValueError("🛑 No macro indicator data could be downloaded.")

# Combine all macro indicators into one DataFrame
macro_df = pd.concat(macro_data.values(), axis=1)

# Each downloaded frame contributes exactly one column, so relabel the columns
# with the indicator names that were actually downloaded. Doing this before the
# rescaling keeps the column lookup below unambiguous.
macro_df.columns = pd.Index(list(macro_data.keys()))

# Fix 10Y yield scale
# NOTE(review): confirm the scale of the ^TNX values returned by Yahoo Finance
# before relying on this 0.1 factor.
if '10Y_Yield' in macro_df.columns:
    macro_df['10Y_Yield'] = macro_df['10Y_Yield'] * 0.1

# Drop rows where any indicator is missing
macro_df.dropna(inplace=True)

# De-duplicate and sort BEFORE saving so the CSV matches the in-memory frame
# (previously the file was written first and could keep duplicate or unsorted rows).
macro_df = macro_df[~macro_df.index.duplicated(keep='first')]
macro_df.sort_index(inplace=True)

# Save to CSV
macro_save_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'dataset', 'macro_indicators_weekly.csv'))
macro_df.to_csv(macro_save_path)
print(f"✅ Macro indicators saved to: {macro_save_path}")

# Preview
macro_df.head()


### 🧠 Feature Engineering

This section prepares input features for the machine learning model.

#### 📈 ETF-Specific Features:
For each ETF, we will compute:
- **1-week return**: Short-term price movement
- **3-week return**: Medium-term trend
- **6-week return**: Momentum across a longer window
- **Streak**: Number of up weeks within the last 3 weeks (a rolling count, not necessarily consecutive)

#### 🌐 Macro Indicators:
From the macro_df, we already have:
- **VIX**
- **10Y Treasury Yield**
- **USD Index**
- **Crude Oil Price**

These will be aligned with the ETF data by date and merged in.

#### 📦 Resulting Feature Matrix:
For each ETF on each week:
- One row = a snapshot of that ETF and macro environment
- Target = the **next week's return** for that ETF


In [None]:
import pandas as pd
import os
from ta import momentum, trend, volume

# === Paths ===
# Both input CSVs live in ../dataset relative to the current working directory.
_dataset_dir = os.path.join(os.getcwd(), '..', 'dataset')
price_path = os.path.abspath(os.path.join(_dataset_dir, 'etf_prices_weekly.csv'))
volume_path = os.path.abspath(os.path.join(_dataset_dir, 'etf_volume_weekly.csv'))
# Ticker count of the most recently loaded CSV; overwritten by load_etf_csv.
num_etf = 0

# === Helper to load ETF CSVs ===
def load_etf_csv(path, name='[unknown]'):
    """Load a CSV that has three header lines into a date-indexed numeric frame.

    The ticker symbols are taken from the second line of the file (first cell
    excluded); the first three lines are then skipped and the remaining rows are
    read as data, with the first column parsed as the date index.

    Args:
        path: Location of the CSV file.
        name: Label used only in the progress and error messages.

    Returns:
        DataFrame indexed by ``Date`` with one numeric column per ticker;
        values that cannot be parsed become NaN.

    Raises:
        Exception: Any error raised while reading or parsing is reported via
            ``print`` and then re-raised unchanged.

    Side effects:
        Sets the module-level ``num_etf`` to the number of tickers found.
    """
    global num_etf
    try:
        top_rows = pd.read_csv(path, header=None, nrows=2)
        tickers = top_rows.iloc[1].tolist()[1:]
        n_tickers = len(tickers)

        raw = pd.read_csv(path, skiprows=3, header=None)
        # Keep the date column plus one column per ticker; drop anything beyond.
        raw = raw.iloc[:, :n_tickers + 1]
        raw.columns = ['Date', *tickers]
        raw['Date'] = pd.to_datetime(raw['Date'], errors='coerce')
        frame = raw.set_index('Date').apply(pd.to_numeric, errors='coerce')

        num_etf = n_tickers
        print(f"✅ Loaded {name} with {n_tickers} tickers")
        return frame
    except Exception as e:
        print(f"❌ Failed to load {name}: {e}")
        raise

# === Load data ===
price_df = load_etf_csv(price_path, name='ETF Prices')
volume_df = load_etf_csv(volume_path, name='ETF Volume')

# === Feature storage ===
features_all = []  # one feature frame per successfully processed ticker
skipped = []       # tickers with no volume column or that raised while processing

# === Feature generation loop ===
for symbol in price_df.columns:
    if symbol not in volume_df.columns:
        print(f"⚠️ Skipping {symbol}: volume data missing.")
        skipped.append(symbol)
        continue

    df = pd.DataFrame(index=price_df.index)
    df['close'] = price_df[symbol]
    df['volume'] = volume_df[symbol]

    try:
        close = df['close']
        traded = df['volume']

        # === Return-based technical indicators ===
        for weeks in (1, 3, 6):
            df[f'{symbol}_ret_{weeks}w'] = close.pct_change(weeks)

        # Stochastic %K/%D and Williams %R over a 14-period window, computed
        # from the close series only (no separate high/low series is used here).
        rolling_high = close.rolling(window=14).max()
        rolling_low = close.rolling(window=14).min()
        rolling_range = rolling_high - rolling_low
        stoch_k = 100 * (close - rolling_low) / rolling_range
        df[f'{symbol}_stoch_k'] = stoch_k
        df[f'{symbol}_stoch_d'] = stoch_k.rolling(window=3).mean()
        df[f'{symbol}_williams_r'] = -100 * (rolling_high - close) / rolling_range

        # Indicators from the `ta` package; CCI is fed the close for all three inputs.
        df[f'{symbol}_cci'] = trend.cci(high=close, low=close, close=close, window=20)
        df[f'{symbol}_rsi'] = momentum.rsi(close, window=14)
        df[f'{symbol}_obv'] = volume.on_balance_volume(close, traded)

        df[f'{symbol}_macd'] = trend.macd(close)
        df[f'{symbol}_macd_signal'] = trend.macd_signal(close)
        df[f'{symbol}_macd_diff'] = trend.macd_diff(close)

        # === Price/volume and its variation ===
        df[f'{symbol}_price_change'] = close.pct_change(1)
        df[f'{symbol}_volume_change'] = traded.pct_change(1)

        # === Short-term KST (custom) ===
        # Weighted sum of smoothed rates of change: (lag, smoothing window, weight).
        kst_spec = ((10, 10, 1), (15, 10, 2), (20, 10, 3), (30, 15, 4))
        kst_terms = [
            close.pct_change(lag).rolling(smooth).mean() * weight
            for lag, smooth, weight in kst_spec
        ]
        df[f'{symbol}_kst_short'] = kst_terms[0] + kst_terms[1] + kst_terms[2] + kst_terms[3]

        # Keep only the derived columns; Index.difference also sorts them by name.
        derived_cols = df.columns.difference(['close', 'volume'])
        feature_df = df[derived_cols].copy()

        features_all.append(feature_df)
        valid_rows = feature_df.dropna(how='all').shape[0]
        print(f"📈 {symbol}: {valid_rows} valid rows")

    except Exception as e:
        print(f"❌ Error processing {symbol}: {e}")
        skipped.append(symbol)

# === Final merge ===
if not features_all:
    raise ValueError("🛑 No valid ETF features generated.")

features_df = pd.concat(features_all, axis=1).sort_index()
is_first_row = ~features_df.index.duplicated(keep='first')
features_df = features_df[is_first_row]

print("📀 Final feature shape:", features_df.shape)

# === Save to CSV ===
# The file name embeds num_etf, which load_etf_csv set from the last CSV it read.
base_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'dataset'))
fname = os.path.join(base_dir, f'weekly_{num_etf}_etf_tech_features.csv')
features_df.to_csv(fname)
print(f"✅ Saved features to: {fname}")

In [179]:
import pandas as pd
import numpy as np
from datetime import datetime

# === Load datasets ===
price_df = pd.read_csv('../dataset/etf_prices_weekly.csv', index_col=0)
volume_df = pd.read_csv('../dataset/etf_volume_weekly.csv', index_col=0)
high_df = pd.read_csv('../dataset/etf_high_weekly.csv', index_col=0)
low_df = pd.read_csv('../dataset/etf_low_weekly.csv', index_col=0)

# === Clean and convert ===
# Each frame is cleaned in place: coerce to numeric, drop empty rows/columns,
# parse the index as dates, sort, then drop every row that still has a gap.
for df in [price_df, volume_df, high_df, low_df]:
    df[:] = df.apply(pd.to_numeric, errors='coerce')
    df.dropna(axis=0, how='all', inplace=True)
    df.dropna(axis=1, how='all', inplace=True)
    df.index = pd.to_datetime(df.index, errors='coerce')
    df.sort_index(inplace=True)
    df.dropna(inplace=True)

# === Feature generation ===
# One frame per ticker, assembled column-wise. The previous implementation built
# one dict per (ticker, week) in a Python loop, recomputed pct_change for every
# row (quadratic work) and indexed the series by position; assembling whole
# columns is linear and aligns every feature on the close-price dates by label.
# It also looked up the nearest macro row for each date but never used the
# result, so that lookup (and its swallow-all except) has been removed.
symbol_frames = []

for symbol in price_df.columns:
    close = price_df[symbol]
    high = high_df[symbol]
    low = low_df[symbol]
    # Zero volume is treated as missing so the logarithm below stays finite.
    volume = pd.to_numeric(volume_df[symbol], errors='coerce').replace(0, np.nan)

    returns_1w = close.pct_change(1)
    returns_3w = close.pct_change(3)
    returns_6w = close.pct_change(6)
    # Number of up weeks within the trailing 3-week window (0-3).
    streak = (returns_1w > 0).astype(int).rolling(3).sum()

    log_volume = np.log(volume)
    log_volume_norm = log_volume / log_volume.rolling(5).mean()

    # Weekly high-low range relative to the close, then smoothed variants.
    shock_amplify_raw = (high - low) / close
    shock_amplify = shock_amplify_raw.rolling(3).mean()
    shock_amplify_1w = shock_amplify.shift(1)
    shock_amplify_3w = shock_amplify.rolling(3).mean()
    shock_delta = shock_amplify.diff()

    # 1 when the raw range exceeds its 10-week mean by more than 2 standard deviations.
    vol_flag = (
        shock_amplify_raw > (shock_amplify_raw.rolling(10).mean() +
                             2 * shock_amplify_raw.rolling(10).std())
    ).astype(int)

    # Stochastic K/D over a 9-week window with exponential smoothing.
    rsv = (close - low.rolling(9).min()) / (high.rolling(9).max() - low.rolling(9).min()) * 100
    k = rsv.ewm(com=2).mean()
    d = k.ewm(com=2).mean()

    # +1 when both K and D are below 30, -1 when both are above 70, else 0.
    kd_signal = pd.Series(0, index=close.index)
    kd_signal[(k < 30) & (d < 30)] = 1
    kd_signal[(k > 70) & (d > 70)] = -1

    ema12 = close.ewm(span=12).mean()
    ema26 = close.ewm(span=26).mean()
    macd = ema12 - ema26
    macd_slope = macd.diff()

    momentum_2w = close.pct_change(2)
    roc_5w = close.pct_change(5)

    kd_x_shock = kd_signal * shock_amplify_3w
    streak_x_ret6 = streak * returns_6w

    # Target: the following week's 1-week return (NaN for the final week).
    target_next_week = returns_1w.shift(-1)

    frame = pd.DataFrame(
        {
            'Return_1w': returns_1w,
            'Return_3w': returns_3w,
            'Return_6w': returns_6w,
            'Streak_Up': streak,
            'LogVolumeNorm': log_volume_norm,
            'Shock_Amplify': shock_amplify,
            'Shock_Amplify_1w': shock_amplify_1w,
            'Shock_Amplify_3w': shock_amplify_3w,
            'Shock_Delta': shock_delta,
            'Vol_Flag': vol_flag,
            'KD_Signal': kd_signal,
            'MACD': macd,
            'MACD_Slope': macd_slope,
            'ROC_5w': roc_5w,
            'Momentum_2w': momentum_2w,
            'KD_Signal_x_Shock3w': kd_x_shock,
            'Streak_x_Return6w': streak_x_ret6,
            'Target_Next_Week_Return': target_next_week,
            'Direction': (target_next_week > 0).astype(int),
        },
        index=close.index,
    )
    # Identifier columns go first so the column order matches the saved CSV layout.
    frame.insert(0, 'ETF', symbol)
    frame.insert(0, 'Date', close.index)
    symbol_frames.append(frame)

if not symbol_frames:
    raise ValueError("🛑 No ETF columns available for feature generation.")

# === Assemble + Clip Outliers ===
feature_df = pd.concat(symbol_frames, ignore_index=True)
feature_df.dropna(inplace=True)

# Clip each feature to 1st–99th percentile
for col in feature_df.columns:
    if col not in ['Date', 'ETF', 'Direction']:
        lower = feature_df[col].quantile(0.01)
        upper = feature_df[col].quantile(0.99)
        feature_df[col] = feature_df[col].clip(lower, upper)

# Optional: Flag outlier conditions
# A row is flagged when more than 3 of its values lie over 2.5 standard
# deviations from the column mean.
values = feature_df.drop(columns=['Date', 'ETF', 'Direction'])
z_scores = (values - values.mean()) / values.std()
del values
feature_df['Edge_Flag'] = (np.abs(z_scores) > 2.5).sum(axis=1) > 3

# Save
feature_df.to_csv('../dataset/etf_features.csv', index=False)
print("✅ Feature CSV with outlier clipping and edge flag saved.")


✅ Feature CSV with outlier clipping and edge flag saved.


### 📌 Deep Sector Rotation Strategy with Shock-Aware Early Exit

This strategy builds on the "Deep Sector Rotation" approach proposed in [SSRN-4280640](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4280640), with the following modifications:

---

#### 🧠 Core Model (MLP)

- A multi-layer perceptron (MLP) is trained to predict next-week returns for each ETF independently.
- Features include:
  - Past 1w, 3w, 6w returns
  - Volume (log normalized)
  - Macro indicators (VIX, 10Y yield, USD index, oil)
  - Streak up count (3-week up trend)
  - Shock Amplify features:
    - This week
    - 1-week lag
    - 3-week average

---

#### 🔁 Weekly Rotation Rule (baseline)

- Each week (e.g., Monday), predict returns for all ETFs using the MLP.
- Rank the ETFs by predicted return.
- Buy top-N (e.g., 3) ETFs.
- Hold for 1 week (unless overridden by shock rule below).

---

#### ⚡ Shock Amplify Early Exit Rule (custom addition)

- Each day (or evaluation step), check for ETFs in the portfolio with:
  - `Shock_Amplify_3w` > +10% or < -10%
- If triggered:
  - Sell that ETF immediately.
  - Immediately start a new turn (predict again, re-select top-N).

---

#### 💼 Goal

- Combine deep learning-based prediction with handcrafted rules for volatility control.
- Achieve more stable and responsive ETF swing trading performance.


In [59]:
import os
import re
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from typing import List
from torch.utils.data import Dataset, DataLoader

# 1. Data preparation
TEST_PERCENTAGE = 0.2
etf_list = [
    'XLK', 'XLF', 'XLV', 'XLE', 'XLI', 'XLY', 'XLP', 'XLRE', 'XLU', 'XLB', 'XLC',
    'SOXX', 'SH', 'DOG', 'RWM', 'ITA', 'JETS', 'QQQ', 'VOO'
]

# ETF feature table: one row per date, one group of feature columns per ETF.
base_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'dataset'))
# Match the whole file name (with the dot escaped) so that only real feature
# files are selected, and sort so the choice does not depend on listing order.
feature_pattern = re.compile(r'weekly_(\d+)_etf_tech_features\.csv')
feature_file = sorted(f for f in os.listdir(base_dir) if feature_pattern.fullmatch(f) is not None)
if not feature_file:
    raise FileNotFoundError(
        f"No 'weekly_<N>_etf_tech_features.csv' file found in {base_dir}"
    )
# Missing feature values are replaced by 0.
etf_data = pd.read_csv(os.path.join(base_dir, feature_file[0])).fillna(0)

# Macro indicator table, indexed by its first column.
macro_data = pd.read_csv(os.path.join(base_dir, 'macro_indicators_weekly.csv'), index_col=0)

# 2. 資料集 (Dataset) 定義

class StockDataset(Dataset):
    """Sliding-window samples of per-ETF features paired with macro features.

    Every sample covers ``seq_len`` consecutive rows of one ETF's feature
    columns and the same row positions of the macro table; its target is that
    ETF's ``<symbol>_price_change`` value on the row right after the window.

    The samples are stored in one list (ETF by ETF, each in row order). The
    first ``train_len`` samples form the train split and the remaining
    ``test_len`` samples the test split; ``train()`` / ``test()`` choose which
    split ``__len__`` and ``__getitem__`` expose.

    Args:
        etf_list: ETF symbols; only its length is used, to derive how many
            feature columns belong to each ETF.
        etf_df: Frame with a ``Date`` column followed by equally sized groups
            of feature columns, one group per ETF.
        macro_df: Macro indicator frame.
            NOTE(review): rows are matched to ``etf_df`` by position, not by
            date — confirm both frames cover the same dates in the same order.
        test_percentage: Fraction of all samples assigned to the test split.
    """

    def __init__(self, etf_list: List, etf_df: pd.DataFrame, macro_df: pd.DataFrame, test_percentage: float = 0.2):
        self.etf_list = etf_list
        self.etf_df: pd.DataFrame = etf_df
        self.macro_df: pd.DataFrame = macro_df
        self.num_etf_features = 0
        self.num_macros = len(macro_df.columns)

        # Build every sample up front, then derive the split sizes.
        self.processed_data = self._preprocess_data()
        self.test_len = int(len(self.processed_data) * test_percentage)
        self.train_len = len(self.processed_data) - self.test_len
        self.train_test_flag = 0  # 0 for train, 1 for test

    def _preprocess_data(self):
        """Return the list of sample dicts built from ``etf_df`` and ``macro_df``."""
        num_etfs = len(self.etf_list)
        # Read the columns from the frame given to this instance. The previous
        # code read a module-level ``etf_data`` here, which broke any dataset
        # built from a different frame or when that global did not exist.
        columns = self.etf_df.columns
        self.num_etf_features = (len(columns) - 1) // num_etfs  # Minus 1 for Date

        processed_list = []  # list of samples
        dates_full = self.etf_df['Date'].values
        macro_features_full = self.macro_df.values

        seq_len = 12  # window length in rows (e.g. 12 weeks)
        interval = 2  # step between consecutive window end positions

        for i in range(1, len(columns), self.num_etf_features):
            etf_features_full = self.etf_df[columns[i:i + self.num_etf_features]]
            # The symbol is the part of the group's first column name before the first "_".
            etf_symbol = etf_features_full.columns.values[0].split("_")[0]
            targets_full = etf_features_full[f'{etf_symbol}_price_change'].values.reshape(-1, 1)

            for j in range(seq_len, len(etf_features_full), interval):  # sliding window
                start_index = j - seq_len
                end_index = j

                etf_features = etf_features_full.values[start_index:end_index]  # past seq_len rows of ETF features
                target = targets_full[end_index]  # value on the row right after the window
                dates = ",".join(dates_full[start_index:end_index])  # dates of the window rows, comma-joined
                macro_features = macro_features_full[start_index:end_index]  # same row positions of the macro table

                processed_list.append({
                    'etf_features': torch.tensor(etf_features, dtype=torch.float32),
                    'macro_features': torch.tensor(macro_features, dtype=torch.float32),
                    'targets': torch.tensor(target, dtype=torch.float32),  # one-element tensor
                    'dates': dates,
                    'etf_symbol': etf_symbol
                })
        return processed_list

    def train(self):
        """Expose the train split."""
        self.train_test_flag = 0

    def test(self):
        """Expose the test split."""
        self.train_test_flag = 1

    def __len__(self):
        return self.train_len if self.train_test_flag == 0 else self.test_len

    def __getitem__(self, idx):
        # Test samples are stored after the train samples in the same list.
        return self.processed_data[idx] if self.train_test_flag == 0 else self.processed_data[self.train_len + idx]

# 4. Model training (simplified example)

# Hyperparameters
TRANSFORMER_DIM = 96
NUM_HEADS = 2
NUM_LAYERS = 2
BATCH_SIZE = 16
OUTPUT_DIM = 1  # next-week return, a single value
LEARNING_RATE = 1e-5
NUM_EPOCHS = 100

# Build the dataset, expose its train split, and wrap it in a shuffling loader.
dataset = StockDataset(
    etf_list=etf_list,
    etf_df=etf_data,
    macro_df=macro_data,
    test_percentage=TEST_PERCENTAGE,
)
dataset.train()
dataloader = DataLoader(dataset, shuffle=True, batch_size=BATCH_SIZE)

# 5. Model evaluation (simplified example) - a test split must be prepared and the model scored on it
# ... (evaluation code, e.g. computing RMSE, MAE and similar metrics)

In [67]:
# 3. 模型定義 (Transformer 模型)

class StockPredictionTransformer(nn.Module):
    """Transformer encoder over embedded ETF and macro sequences with attention pooling.

    Both inputs are embedded to ``input_dim`` channels, batch-normalised,
    concatenated on the channel axis (giving ``2 * input_dim`` channels), passed
    through a Transformer encoder, pooled over the sequence axis with learned
    softmax weights, and mapped to ``output_dim`` values per sample.

    Args:
        etf_feature_dim: Size of the last axis of the ETF input.
        macro_feature_dim: Size of the last axis of the macro input.
        input_dim: Embedding width of each branch.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of encoder layers.
        batch_size: Stored on the instance; not used by ``forward``.
        output_dim: Number of values predicted per sample.
    """

    def __init__(self, etf_feature_dim, macro_feature_dim, input_dim, num_heads, num_layers, batch_size, output_dim):
        super().__init__()
        self.batch_size = batch_size
        fused_dim = 2 * input_dim  # ETF and macro embeddings side by side

        # Per-branch embedding and normalisation
        self.etf_embedding = nn.Linear(etf_feature_dim, input_dim)
        self.macro_embedding = nn.Linear(macro_feature_dim, input_dim)
        self.etf_norm = nn.BatchNorm1d(input_dim)
        self.macro_norm = nn.BatchNorm1d(input_dim)

        # Transformer encoder over the fused channels
        encoder_layers = nn.TransformerEncoderLayer(d_model=fused_dim, nhead=num_heads)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)

        # One attention score per time step
        self.attention_score_layer = nn.Linear(fused_dim, 1)

        # Projection of the pooled vector back to input_dim
        self.fusion_layer = nn.Linear(fused_dim, input_dim)
        self.relu = nn.ReLU()

        # Regression head
        self.output_layer = nn.Linear(input_dim, output_dim)

    def forward(self, etf_features, macro_features):
        """Return predictions for a batch.

        Both inputs must be 3-D with the batch on axis 0 and the feature axis
        last (the layout the transposes below rely on). The result has the
        batch on axis 0; axis 1 is squeezed away when ``output_dim`` is 1.
        """
        etf_hidden = self.relu(self.etf_embedding(etf_features))
        macro_hidden = self.relu(self.macro_embedding(macro_features))

        # BatchNorm1d normalises axis 1, so the channel axis is moved there first.
        etf_hidden = self.etf_norm(etf_hidden.transpose(1, 2))
        macro_hidden = self.macro_norm(macro_hidden.transpose(1, 2))

        # The encoder was built without batch_first, so reorder each branch to
        # [seq, batch, channels] and join the two on the channel axis.
        sequence = torch.cat(
            (etf_hidden.permute(2, 0, 1), macro_hidden.permute(2, 0, 1)),
            dim=2,
        )
        encoded = self.transformer_encoder(sequence)  # [seq, batch, 2 * input_dim]

        # Learned attention pooling: softmax over the sequence axis, then a
        # batched weighted sum [batch, 1, seq] @ [batch, seq, channels].
        weights = torch.softmax(self.attention_score_layer(encoded), dim=0)
        pooled = torch.bmm(weights.permute(1, 2, 0), encoded.permute(1, 0, 2)).squeeze(1)

        hidden = self.relu(self.fusion_layer(pooled))  # [batch, input_dim]
        return self.output_layer(hidden).squeeze(1)
    
class customLoss():
    """Mean squared error scaled by the fraction of wrongly signed predictions.

    The returned value is ``MSE(pred, y) * (1 - share of elements whose sign
    matches)``, so it is 0 whenever every prediction has the same sign as its
    target. ``mae_loss`` is created but not used by ``__call__``.
    """

    def __init__(self):
        self.mse_loss = nn.MSELoss()
        self.mae_loss = nn.L1Loss()

    def __call__(self, pred, y):
        same_sign = torch.sign(pred) == torch.sign(y)
        direction_loss = 1 - same_sign.float().mean()
        squared_error = self.mse_loss(pred, y)
        return squared_error * direction_loss

# Build the model, loss function and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = StockPredictionTransformer(dataset.num_etf_features, dataset.num_macros, TRANSFORMER_DIM, NUM_HEADS, NUM_LAYERS, BATCH_SIZE, OUTPUT_DIM).to(device)
model.device = device  # remembered on the model so later cells can move batches to it
criterion = customLoss()  # MSE scaled by the share of wrongly signed predictions
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# Training loop (simplified example)
for epoch in range(NUM_EPOCHS):
    model.train()  # training mode
    total_loss = 0
    for batch in dataloader:
        etf_features = batch['etf_features'].to(model.device)      # [batch, seq_len, etf_feature_dim]
        macro_features = batch['macro_features'].to(model.device)  # [batch, seq_len, macro_feature_dim]
        targets = batch['targets'].to(model.device)                # [batch, 1]

        # Forward pass
        outputs = model(etf_features, macro_features)  # [batch]

        # Drop only the trailing axis of the targets. A bare squeeze() would
        # also remove the batch axis when a batch holds a single sample, leaving
        # a 0-d target to be broadcast against a 1-element prediction.
        loss = criterion(outputs, targets.squeeze(-1))

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Average Loss: {avg_loss:.2e}")

print("Training finished!")


enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.self_attn.batch_first was not True(use batch_first for better inference performance)



Epoch [1/100], Average Loss: 1.76e-03
Epoch [2/100], Average Loss: 7.84e-04
Epoch [3/100], Average Loss: 6.75e-04
Epoch [4/100], Average Loss: 6.32e-04
Epoch [5/100], Average Loss: 6.04e-04
Epoch [6/100], Average Loss: 5.64e-04
Epoch [7/100], Average Loss: 5.78e-04
Epoch [8/100], Average Loss: 5.81e-04
Epoch [9/100], Average Loss: 5.12e-04
Epoch [10/100], Average Loss: 5.32e-04
Epoch [11/100], Average Loss: 5.39e-04
Epoch [12/100], Average Loss: 5.03e-04
Epoch [13/100], Average Loss: 5.28e-04
Epoch [14/100], Average Loss: 5.02e-04
Epoch [15/100], Average Loss: 4.93e-04
Epoch [16/100], Average Loss: 4.81e-04
Epoch [17/100], Average Loss: 5.05e-04
Epoch [18/100], Average Loss: 4.94e-04
Epoch [19/100], Average Loss: 5.01e-04
Epoch [20/100], Average Loss: 4.82e-04
Epoch [21/100], Average Loss: 4.53e-04
Epoch [22/100], Average Loss: 4.63e-04
Epoch [23/100], Average Loss: 4.61e-04
Epoch [24/100], Average Loss: 4.76e-04
Epoch [25/100], Average Loss: 4.62e-04
Epoch [26/100], Average Loss: 4.73

In [72]:
# Switch the model to evaluation mode
model.eval()

# Build an ordered (unshuffled) loader for inference.
# NOTE(review): dataset.train() exposes the TRAIN split even though the loader is
# named test_dataloader — switch to dataset.test() if hold-out evaluation is intended.
dataset.train()
test_dataloader = DataLoader(dataset, shuffle=False, batch_size=32)

# Per-ETF accumulators, keyed by symbol
predictions = {}      # model outputs
actual_targets = {}   # ground-truth targets
date_of_inputs = {}   # last date of each input window

with torch.no_grad():  # no gradients needed for inference
    for batch in test_dataloader:
        etf_features = batch['etf_features'].to(model.device)
        macro_features = batch['macro_features'].to(model.device)
        targets = batch['targets'].to(model.device)
        dates = batch['dates']
        etf_symbol = batch['etf_symbol']

        # Forward pass
        outputs = model(etf_features, macro_features)

        # Move results to NumPy and file each sample under its ETF symbol
        samples = zip(etf_symbol, dates, outputs.cpu().numpy(), targets.cpu().numpy())
        for etf, window_dates, pred, real in samples:
            predictions.setdefault(etf, []).append(pred)
            actual_targets.setdefault(etf, []).append(real)
            # 'dates' is a comma-joined string; its last entry is the window's final date
            date_of_inputs.setdefault(etf, []).append(window_dates.split(',')[-1])

# Convert the per-ETF lists to NumPy arrays
for etf in predictions:
    predictions[etf] = np.array(predictions[etf])
    actual_targets[etf] = np.array(actual_targets[etf]).flatten()
    date_of_inputs[etf] = np.array(date_of_inputs[etf])

predictions.keys()

dict_keys(['XLK', 'XLF', 'XLV', 'XLE', 'XLI', 'XLY', 'XLP', 'XLRE', 'XLU', 'XLB', 'XLC', 'SOXX', 'SH', 'DOG', 'RWM', 'ITA'])

In [73]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Plot predicted vs. real values for one ETF, ordered by window end date.
etf = 'XLK'
sorted_idx = date_of_inputs[etf].argsort()
ordered_dates = date_of_inputs[etf][sorted_idx]

fig = make_subplots(rows=1, cols=1, shared_xaxes=True)
for label, series in (('Predictions', predictions[etf]), ('Real', actual_targets[etf])):
    fig.add_trace(go.Scatter(x=ordered_dates, y=series[sorted_idx], name=label), row=1, col=1)
fig.update_layout(title=f'{etf}')

#### Tensorflow ver. (not tested)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import tf_keras as keras
from tf_keras import layers

# 1. 模擬輸入數據 (使用隨機數據，實際應用中需要讀取真實數據)
def generate_dummy_data(num_samples, num_etfs, num_news_features):
    """Create random stand-in ETF, macro and news tables for experimentation.

    Args:
        num_samples: Number of daily dates (starting 2023-01-01) per table.
        num_etfs: Number of synthetic ETFs, named ``ETF_0`` .. ``ETF_{n-1}``.
        num_news_features: Number of ``News_Impact_<i>`` columns.

    Returns:
        Tuple ``(etf_data, macro_data_df, news_data_df)``: the ETF table stacks
        ``num_samples`` rows per ETF; the macro table has one column per key of
        the module-level ``macro_tickers``; both the macro and news tables are
        indexed by date. All values come from NumPy's global random state.
    """
    dates = pd.date_range('2023-01-01', periods=num_samples, freq='D')
    macro_feature_names = list(macro_tickers.keys())

    def gaussian():
        return np.random.randn(num_samples)

    def integers_below(upper):
        return np.random.randint(0, upper, num_samples)

    # The draws below happen in column order, ETF by ETF.
    per_etf_frames = []
    for etf_index in range(num_etfs):
        per_etf_frames.append(pd.DataFrame({
            'Date': dates,
            'ETF': f'ETF_{etf_index}',
            'Return_1w': gaussian(),
            'Return_3w': gaussian(),
            'Return_6w': gaussian(),
            'Streak_Up': integers_below(10),
            'LogVolumeNorm': gaussian(),
            'Shock_Amplify': gaussian(),
            'Shock_Amplify_1w': gaussian(),
            'Shock_Amplify_3w': gaussian(),
            'Shock_Delta': gaussian(),
            'Vol_Flag': integers_below(2),
            'KD_Signal': gaussian(),
            'MACD': gaussian(),
            'MACD_Slope': gaussian(),
            'ROC_5w': gaussian(),
            'Momentum_2w': gaussian(),
            'KD_Signal_x_Shock3w': gaussian(),
            'Streak_x_Return6w': gaussian(),
            'Target_Next_Week_Return': gaussian(),  # simulated target
            'Direction': integers_below(2),  # simulated direction (optional, for classification)
        }))
    etf_data = pd.concat(per_etf_frames)

    macro_data_df = pd.DataFrame(index=dates)
    for feature_name in macro_feature_names:
        macro_data_df[feature_name] = gaussian()

    news_data_df = pd.DataFrame(index=dates)
    for news_index in range(num_news_features):
        news_data_df[f'News_Impact_{news_index}'] = gaussian()

    return etf_data, macro_data_df, news_data_df

# 2. 資料預處理函數
def preprocess_etf_data(etf_df):
    """Fill gaps with column means and min-max scale the ETF feature columns.

    Args:
        etf_df: ETF table; it is copied, not modified.

    Returns:
        Tuple ``(processed_frame, scaler)``. Every column except ``Date``,
        ``ETF``, ``Target_Next_Week_Return`` and ``Direction`` is scaled; the
        fitted ``MinMaxScaler`` is returned so it can be reused later.
    """
    non_feature_cols = ('Date', 'ETF', 'Target_Next_Week_Return', 'Direction')

    prepared = etf_df.copy()
    # Missing values: simple example, fill with each numeric column's mean
    column_means = prepared.mean(numeric_only=True)
    prepared = prepared.fillna(column_means)

    # Columns to scale, in their original order
    numerical_features = [name for name in prepared.columns if name not in non_feature_cols]

    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(prepared[numerical_features])
    prepared[numerical_features] = scaled_values
    return prepared, scaler

def preprocess_macro_data(macro_df):
    """Fill gaps with column means and min-max scale the macro table.

    Args:
        macro_df: Macro indicator table; it is copied, not modified.

    Returns:
        Tuple ``(scaled, scaler)`` where ``scaled`` is the output of
        ``MinMaxScaler.fit_transform`` on the filled copy and ``scaler`` is the
        fitted scaler.
    """
    filled = macro_df.copy()
    # Missing values: simple example, fill with each numeric column's mean
    filled = filled.fillna(filled.mean(numeric_only=True))

    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(filled)
    return scaled, scaler

def preprocess_news_data(news_df):
    """Fill gaps with column means and min-max scale the news table.

    Args:
        news_df: News feature table; it is copied, not modified.

    Returns:
        Tuple ``(scaled, scaler)`` where ``scaled`` is the output of
        ``MinMaxScaler.fit_transform`` on the filled copy and ``scaler`` is the
        fitted scaler.
    """
    filled = news_df.copy()
    # Missing values: simple example, fill with each numeric column's mean
    filled = filled.fillna(filled.mean(numeric_only=True))

    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(filled)
    return scaled, scaler

# 3. 創建 Transformer Encoder 模型
def _build_encoder_branch(input_shape, name, num_transformer_layers, num_heads, ff_dim, dropout_rate=0.1):
    """Build one input branch: Input -> Dense embedding -> encoder stack -> pooled vector.

    Args:
        input_shape: Per-sample shape, either ``(features,)`` for flat input
            or ``(timesteps, features)`` for sequence input.
        name: Name given to the Keras ``Input`` layer.
        num_transformer_layers: Number of encoder blocks to stack.
        num_heads: Number of attention heads per block.
        ff_dim: Width of the embedding and of the feed-forward layers.
        dropout_rate: Dropout applied after attention and after the
            feed-forward sub-layer.

    Returns:
        ``(input_tensor, pooled_tensor)`` where ``pooled_tensor`` has shape
        ``(batch, ff_dim)``.

    Raises:
        ValueError: If ``input_shape`` is neither rank 1 nor rank 2.
    """
    input_shape = tuple(input_shape)
    if len(input_shape) not in (1, 2):
        raise ValueError(
            f"input shape for '{name}' must be (features,) or (timesteps, features), got {input_shape}"
        )

    branch_input = layers.Input(shape=input_shape, name=name)

    x = branch_input
    if len(input_shape) == 1:
        # MultiHeadAttention and GlobalAveragePooling1D both work on
        # (batch, steps, dim) tensors, so a flat feature vector is presented
        # as a sequence of length 1. Attention over a single step has a
        # trivial weight of 1, so for flat input each block reduces to
        # stacked residual projections.
        x = layers.Reshape((1, input_shape[0]))(x)

    # Simple linear embedding followed by normalisation. No positional
    # information is injected here; add a real positional encoding if the
    # order of the time steps should matter.
    x = layers.Dense(ff_dim)(x)
    x = layers.LayerNormalization(epsilon=1e-6)(x)

    for _ in range(num_transformer_layers):
        # Self-attention sub-layer with residual connection + layer norm.
        attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=ff_dim)(x, x)
        attention = layers.Dropout(dropout_rate)(attention)
        x = layers.LayerNormalization(epsilon=1e-6)(layers.Add()([x, attention]))

        # Position-wise feed-forward sub-layer with residual connection + layer norm.
        feed_forward = layers.Dense(ff_dim, activation="relu")(x)
        feed_forward = layers.Dense(ff_dim)(feed_forward)
        feed_forward = layers.Dropout(dropout_rate)(feed_forward)
        x = layers.LayerNormalization(epsilon=1e-6)(layers.Add()([x, feed_forward]))

    # Average over the step axis (the default channels_last layout) so every
    # branch yields a fixed-size (batch, ff_dim) vector whatever the sequence
    # length. Pooling with data_format="channels_first" would instead average
    # over the embedding axis right after LayerNormalization, discarding most
    # of the signal.
    pooled = layers.GlobalAveragePooling1D()(x)
    return branch_input, pooled


def create_transformer_encoder_model(input_shape_etf, input_shape_macro, input_shape_news, num_transformer_layers=2, num_heads=4, ff_dim=32, output_dim=1):
    """Create a three-branch Transformer-encoder regression model.

    Each of the ETF, macro-indicator and news-impact inputs goes through its
    own embedding + encoder stack and is pooled to a ``(batch, ff_dim)``
    vector. The three vectors are concatenated and fed to a linear output
    layer.

    Args:
        input_shape_etf: Per-sample shape of the ETF input, ``(features,)``
            or ``(timesteps, features)``.
        input_shape_macro: Per-sample shape of the macro-indicator input.
        input_shape_news: Per-sample shape of the news-impact input.
        num_transformer_layers: Encoder blocks per branch.
        num_heads: Attention heads per encoder block.
        ff_dim: Embedding / feed-forward width used in every branch.
        output_dim: Number of regression outputs.

    Returns:
        An uncompiled ``keras.Model`` with inputs named ``"etf_input"``,
        ``"macro_input"`` and ``"news_input"`` (in that order) and one output
        layer named ``"output"``.

    Raises:
        ValueError: If any input shape is neither rank 1 nor rank 2.
    """
    branch_specs = (
        ("etf_input", input_shape_etf),
        ("macro_input", input_shape_macro),
        ("news_input", input_shape_news),  # news-impact branch
    )

    model_inputs = []
    pooled_branches = []
    for branch_name, branch_shape in branch_specs:
        branch_input, branch_output = _build_encoder_branch(
            branch_shape, branch_name, num_transformer_layers, num_heads, ff_dim
        )
        model_inputs.append(branch_input)
        pooled_branches.append(branch_output)

    # Feature fusion: join the pooled branch vectors side by side.
    fused_features = layers.concatenate(pooled_branches)

    # Output layer for the regression task; linear activation keeps the
    # predicted return unbounded.
    output_layer = layers.Dense(output_dim, activation='linear', name="output")(fused_features)

    # Multi-input model.
    return keras.Model(inputs=model_inputs, outputs=output_layer)

# 4. Macro indicator tickers: feature name -> Yahoo Finance symbol
macro_tickers = {
    "VIX": "^VIX",            # CBOE Volatility Index
    "10Y_Yield": "^TNX",      # 10-Year U.S. Treasury yield
    "USD_Index": "DX-Y.NYB",  # U.S. dollar index
    "WTI_Crude": "CL=F",      # WTI crude oil futures
}

# 5. Generate dummy data
num_samples = 1000
num_etfs = 5
num_macro_features = len(macro_tickers)
num_news_features = 3 # assume there are 3 news-impact features
etf_data, macro_data_df, news_data_df = generate_dummy_data(num_samples, num_etfs, num_macro_features, num_news_features)

# 6. Data preprocessing
# Each call returns the processed data together with the fitted scaler.
etf_data_processed, etf_scaler = preprocess_etf_data(etf_data)
macro_data_processed, macro_scaler = preprocess_macro_data(macro_data_df)
news_data_processed, news_scaler = preprocess_news_data(news_data_df)

# 7. Prepare the model inputs (ETF_0 is used as the example; time-series handling is
#    simplified - the rows for every time point are fed directly as the input)
etf_symbol_to_predict = 'ETF_0'
# Keep only the rows of the chosen ETF and drop the identifier / label columns,
# leaving just the scaled feature columns.
etf_input_data = etf_data_processed[etf_data_processed['ETF'] == etf_symbol_to_predict].drop(['Date', 'ETF', 'Target_Next_Week_Return', 'Direction'], axis=1).values
# NOTE(review): the macro and news inputs are used as-is, without any join on 'Date';
# this assumes they have one row per ETF_0 row, in the same order - confirm.
macro_input_data = macro_data_processed
news_input_data = news_data_processed

# 8. Prepare the target variable (next-week return of ETF_0)
# This column is excluded from scaling in preprocess_etf_data, so it is in raw units.
target_data = etf_data_processed[etf_data_processed['ETF'] == etf_symbol_to_predict]['Target_Next_Week_Return'].values

# 9. Set the input shapes (inputs are assumed to be 2-D here; to handle time series they
#    would have to be reshaped to 3-D, e.g. (samples, timesteps, features))
input_shape_etf = (etf_input_data.shape[1],) # (features)
input_shape_macro = (macro_input_data.shape[1],) # (features)
input_shape_news = (news_input_data.shape[1],) # (features)

# NOTE(review): this unconditional raise stops execution here, so steps 10-15 below never
# run. It looks like a deliberate debugging stop (possibly because model construction
# does not yet work with the flat shapes above) - confirm before removing.
raise KeyboardInterrupt

# 10. Create the Transformer model
model = create_transformer_encoder_model(input_shape_etf, input_shape_macro, input_shape_news)

# 11. Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae', 'mse']) # regression task: use MSE and MAE

# 12. Model summary
model.summary()

# 13. Train the model (simplified example; a real application needs separate
#     training, validation and test sets)
history = model.fit(
    x = {"etf_input": etf_input_data, "macro_input": macro_input_data, "news_input": news_input_data},
    y = target_data,
    epochs=10, # real training needs more epochs
    batch_size=32,
    validation_split=0.2 # simplified validation setup
)

# 14. Evaluate the model (should be done on a test set; simplified here to reuse the validation data)
# The three unpacked values are the loss followed by the two compiled metrics ('mae', 'mse').
# NOTE(review): the hard-coded 200 equals validation_split * num_samples (0.2 * 1000) only
# for the current settings; update it if either value changes.
loss, mae, mse = model.evaluate(
    x = {"etf_input": etf_input_data[-200:], "macro_input": macro_input_data[-200:], "news_input": news_input_data[-200:]}, # use the last 200 rows as a simplified validation set
    y = target_data[-200:]
)
print(f"Validation Loss: {loss}, MAE: {mae}, MSE: {mse}")

# 15. Predict with the model (simplified example: predict from the last row only)
# The [-1:] slices keep the leading batch dimension (one sample each).
sample_input_etf = etf_input_data[-1:]
sample_input_macro = macro_input_data[-1:]
sample_input_news = news_input_data[-1:]

prediction = model.predict({"etf_input": sample_input_etf, "macro_input": sample_input_macro, "news_input": sample_input_news})
print(f"預測的下週漲跌幅度: {prediction[0][0]}")