In [12]:
import os
import pandas as pd
import yfinance as yf
from datetime import datetime, timedelta

# === ETF list ===
etf_list = [
    'XLK', 'XLF', 'XLV', 'XLE', 'XLI', 'XLY', 'XLP', 'XLRE', 'XLU', 'XLB', 'XLC',
    'SOXX', 'SH', 'DOG', 'RWM', 'ITA', 'JETS', 'PSQ', 'VNQ'
]

# === Date range ===
today = datetime.today()
start_date = (today - timedelta(weeks=52 * 10)).strftime('%Y-%m-%d')
end_date = today.strftime('%Y-%m-%d')
print(f"🗕️ Downloading weekly data from {start_date} to {end_date} (until last friday)")

# === Output folder ===
dataset_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'dataset'))
os.makedirs(dataset_path, exist_ok=True)

# === Containers ===
adjclose_data, volume_data, high_data, low_data = {}, {}, {}, {}

# === Shift logic ===
def shift_to_next_friday(date):
    weekday = date.weekday()
    days_until_friday = (4 - weekday + 7) % 7
    return date + timedelta(days=days_until_friday)

# === Download each ETF ===
for symbol in etf_list:
    print(f"⬇️ Downloading {symbol}...")
    data = yf.download(
        symbol,
        start=start_date,
        end=end_date,
        interval='1wk',
        auto_adjust=False,
        progress=False
    )
    if not data.empty:
        data = data[~data.index.duplicated(keep='first')].sort_index()
        data.index = data.index.to_series().apply(shift_to_next_friday)

        # ❌ Filter out future dates after shifting
        data = data[data.index <= pd.Timestamp(today.date())]

        # ✅ Store
        adjclose_data[symbol] = data[['Adj Close']].rename(columns={'Adj Close': symbol})
        volume_data[symbol] = data[['Volume']].rename(columns={'Volume': symbol})
        high_data[symbol] = data[['High']].rename(columns={'High': symbol})
        low_data[symbol] = data[['Low']].rename(columns={'Low': symbol})

# === Merge and Save in clean format ===
def combine_and_save(data_dict, filename):
    df = pd.concat(data_dict.values(), axis=1)
    df.index = pd.to_datetime(df.index, errors='coerce')
    df = df[df.index < pd.Timestamp(today.date())]  # 🚫 Final future filtering
    df = df[~df.index.duplicated(keep='first')].sort_index()
    df.dropna(axis=0, how='all', inplace=True)
    df.insert(0, 'Date', df.index.strftime('%Y-%m-%d'))
    df.to_csv(os.path.join(dataset_path, filename), index=False)
    print(f"✅ Saved: {filename}")
    return df

# === Save all ===
price_df = combine_and_save(adjclose_data, 'etf_prices_weekly.csv')
volume_df = combine_and_save(volume_data, 'etf_volume_weekly.csv')
high_df = combine_and_save(high_data, 'etf_high_weekly.csv')
low_df = combine_and_save(low_data, 'etf_low_weekly.csv')

# === Preview ===
price_df.head()


🗕️ Downloading weekly data from 2015-05-08 to 2025-04-25
⬇️ Downloading XLK...
⬇️ Downloading XLF...
⬇️ Downloading XLV...
⬇️ Downloading XLE...
⬇️ Downloading XLI...
⬇️ Downloading XLY...
⬇️ Downloading XLP...
⬇️ Downloading XLRE...
⬇️ Downloading XLU...
⬇️ Downloading XLB...
⬇️ Downloading XLC...
⬇️ Downloading SOXX...
⬇️ Downloading SH...
⬇️ Downloading DOG...
⬇️ Downloading RWM...
⬇️ Downloading ITA...
⬇️ Downloading JETS...
⬇️ Downloading PSQ...
⬇️ Downloading VNQ...
✅ Saved: etf_prices_weekly.csv
✅ Saved: etf_volume_weekly.csv
✅ Saved: etf_high_weekly.csv
✅ Saved: etf_low_weekly.csv


Price,Date,XLK,XLF,XLV,XLE,XLI,XLY,XLP,XLRE,XLU,XLB,XLC,SOXX,SH,DOG,RWM,ITA,JETS,PSQ,VNQ
Ticker,Unnamed: 1_level_1,XLK,XLF,XLV,XLE,XLI,XLY,XLP,XLRE,XLU,XLB,XLC,SOXX,SH,DOG,RWM,ITA,JETS,PSQ,VNQ
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
2015-05-08,2015-05-08,38.081741,16.607914,62.328041,54.583641,47.107685,68.427338,37.711849,,31.769207,42.11253,,28.441837,153.447144,78.749908,51.72715,54.787785,23.647642,237.650406,53.803043
2015-05-15,2015-05-15,38.32954,16.567669,63.015366,53.855934,47.548946,68.418404,38.156696,,31.978456,42.063541,,28.762247,152.713654,78.259262,51.239155,55.6563,23.714334,235.520844,54.2108
2015-05-22,2015-05-22,38.586185,16.674992,63.626339,53.542175,47.374115,68.731079,37.773212,,32.158836,41.745052,,29.208433,152.273544,78.329346,50.925442,55.633801,22.027943,233.476547,53.515572
2015-05-29,2015-05-29,38.382652,16.500593,63.5924,52.333775,46.46661,68.15934,37.420418,,32.108334,41.328587,,30.265497,153.520508,79.205505,51.12587,54.70229,22.008886,234.243179,52.907204
2015-06-05,2015-06-05,38.010941,16.628035,63.083271,51.853123,46.508221,68.355865,36.500038,,30.823999,40.838623,,29.474941,154.547379,79.906471,50.472309,54.378307,22.018414,235.776413,51.616924


In [15]:
import os 
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta

# === Macro indicator tickers ===
macro_tickers = {
    'VIX': '^VIX',
    '10Y_Yield': '^TNX',   # Scale * 0.1
    '2Y_Yield': '^IRX',    # Scale * 0.01
    'USD_Index': 'DX-Y.NYB',
    'WTI_Crude': 'CL=F',
    'SPY_Price': 'SPY',
    'QQQ_Price': 'QQQ'
}

# === Date range ===
today = datetime.today()
back_time = today - timedelta(weeks=52 * 10)
start_date = back_time.strftime('%Y-%m-%d')
end_date = today.strftime('%Y-%m-%d')

print(f"📊 Downloading macro indicators from {start_date} to {end_date}")

# === Output path ===
macro_save_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'dataset', 'macro_indicators_weekly.csv'))

# === Shift helper ===
def shift_to_next_friday(date):
    days_ahead = (4 - date.weekday() + 7) % 7
    return date + timedelta(days=days_ahead)

# === Download each macro ticker ===
macro_data = {}
for name, ticker in macro_tickers.items():
    print(f"⬇️ Downloading {name} ({ticker})...")
    data = yf.download(ticker, start=start_date, end=end_date, interval='1wk', auto_adjust=False, progress=False)
    if not data.empty:
        data = data[~data.index.duplicated(keep='first')].sort_index()
        data.index = pd.to_datetime([shift_to_next_friday(d) for d in data.index])
        #filter futrue data
        data = data[data.index < pd.Timestamp(today.date())]
        macro_data[name] = data[['Close']].rename(columns={'Close': name})

# === Combine into single DataFrame ===
macro_df = pd.concat(macro_data.values(), axis=1)
macro_df.index = pd.to_datetime(macro_df.index)
macro_df = macro_df[~macro_df.index.duplicated(keep='first')].sort_index()

# === Apply scaling to yields ===
if '10Y_Yield' in macro_df.columns:
    macro_df['10Y_Yield'] *= 0.1
if '2Y_Yield' in macro_df.columns:
    macro_df['2Y_Yield'] *= 0.01

# === Final cleanup ===
macro_df.dropna(axis=0, how='all', inplace=True)
macro_df.insert(0, 'Date', macro_df.index.strftime('%Y-%m-%d'))
macro_df.to_csv(macro_save_path, index=False)

print(f"✅ Macro indicators saved to: {macro_save_path}")


📊 Downloading macro indicators from 2015-05-08 to 2025-04-25
⬇️ Downloading VIX (^VIX)...
⬇️ Downloading 10Y_Yield (^TNX)...
⬇️ Downloading 2Y_Yield (^IRX)...
⬇️ Downloading USD_Index (DX-Y.NYB)...
⬇️ Downloading WTI_Crude (CL=F)...
⬇️ Downloading SPY_Price (SPY)...
⬇️ Downloading QQQ_Price (QQQ)...
✅ Macro indicators saved to: D:\CodingWorks\Weekly_Swing_TransformerQT\dataset\macro_indicators_weekly.csv


In [3]:
# calculate TA signals


import os
import pandas as pd
from ta import momentum, trend, volume

# === Paths ===
price_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'dataset', 'etf_prices_weekly.csv'))
volume_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'dataset', 'etf_volume_weekly.csv'))

# === Helper to load ETF CSVs ===
def load_etf_csv(path, name='[unknown]'):
    try:
        header_row = pd.read_csv(path, header=None, nrows=2)
        columns = header_row.iloc[1].tolist()[1:]
        df = pd.read_csv(path, skiprows=3, header=None)
        df = df.iloc[:, :len(columns) + 1]
        df.columns = ['Date'] + columns
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
        df = df.set_index('Date')
        df = df.apply(pd.to_numeric, errors='coerce')
        print(f"✅ Loaded {name} with {len(columns)} tickers")
        return df
    except Exception as e:
        print(f"❌ Failed to load {name}: {e}")
        raise

# === Load data ===
price_df = load_etf_csv(price_path, name='ETF Prices')
volume_df = load_etf_csv(volume_path, name='ETF Volume')

# === Feature storage ===
features_all = []
skipped = []

# === Feature generation loop ===
for symbol in price_df.columns:
    if symbol not in volume_df.columns:
        print(f"⚠️ Skipping {symbol}: volume data missing.")
        skipped.append(symbol)
        continue

    df = pd.DataFrame(index=price_df.index)
    df['close'] = price_df[symbol]
    df['volume'] = volume_df[symbol]

    try:
        # === Return-based technical indicators ===
        df[f'{symbol}_ret_1w'] = df['close'].pct_change(1)
        df[f'{symbol}_ret_3w'] = df['close'].pct_change(3)
        df[f'{symbol}_ret_6w'] = df['close'].pct_change(6)

        high = df['close'].rolling(window=14).max()
        low = df['close'].rolling(window=14).min()
        df[f'{symbol}_stoch_k'] = 100 * (df['close'] - low) / (high - low)
        df[f'{symbol}_stoch_d'] = df[f'{symbol}_stoch_k'].rolling(window=3).mean()
        df[f'{symbol}_williams_r'] = -100 * (high - df['close']) / (high - low)

        df[f'{symbol}_cci'] = trend.cci(high=df['close'], low=df['close'], close=df['close'], window=20)
        df[f'{symbol}_rsi'] = momentum.rsi(df['close'], window=14)
        df[f'{symbol}_obv'] = volume.on_balance_volume(df['close'], df['volume'])

        df[f'{symbol}_macd'] = trend.macd(df['close'])
        df[f'{symbol}_macd_signal'] = trend.macd_signal(df['close'])
        df[f'{symbol}_macd_diff'] = trend.macd_diff(df['close'])

        # === Short-term KST (custom) ===
        roc1 = df['close'].pct_change(10)
        roc2 = df['close'].pct_change(15)
        roc3 = df['close'].pct_change(20)
        roc4 = df['close'].pct_change(30)
        df[f'{symbol}_kst_short'] = (
            roc1.rolling(10).mean() +
            roc2.rolling(10).mean() * 2 +
            roc3.rolling(10).mean() * 3 +
            roc4.rolling(15).mean() * 4
        )

        derived_cols = df.columns.difference(['close', 'volume'])
        feature_df = df[derived_cols].copy()

        features_all.append(feature_df)
        print(f"📈 {symbol}: {feature_df.dropna(how='all').shape[0]} valid rows")

    except Exception as e:
        print(f"❌ Error processing {symbol}: {e}")
        skipped.append(symbol)

# === Final merge ===
if len(features_all) == 0:
    raise ValueError("🛑 No valid ETF features generated.")

features_df = pd.concat(features_all, axis=1).sort_index()
features_df = features_df[~features_df.index.duplicated(keep='first')]

print("📀 Final feature shape:", features_df.shape)

# === Save to CSV ===
base_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'dataset'))
features_df.to_csv(os.path.join(base_dir, 'weekly_etf_tech_feature.csv'))
print(f"✅ Saved features to: weekly_etf_tech_feature.csv")


✅ Loaded ETF Prices with 19 tickers
✅ Loaded ETF Volume with 19 tickers
📈 XLK: 521 valid rows
📈 XLF: 521 valid rows
📈 XLV: 521 valid rows
📈 XLE: 521 valid rows
📈 XLI: 521 valid rows
📈 XLY: 521 valid rows
📈 XLP: 521 valid rows
📈 XLRE: 508 valid rows
📈 XLU: 521 valid rows
📈 XLB: 521 valid rows
📈 XLC: 508 valid rows
📈 SOXX: 521 valid rows
📈 SH: 521 valid rows
📈 DOG: 521 valid rows
📈 RWM: 521 valid rows
📈 ITA: 521 valid rows
📈 JETS: 521 valid rows
📈 PSQ: 521 valid rows
📈 VNQ: 521 valid rows
📀 Final feature shape: (521, 247)
✅ Saved features to: weekly_etf_tech_feature.csv


In [17]:
# normalizing matrix add masks


# Normalize and mask using consistent CSV structure
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler

# === Paths ===
base_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'dataset'))
feat_path = os.path.join(base_dir, 'weekly_etf_tech_feature.csv')
macro_path = os.path.join(base_dir, 'macro_indicators_weekly.csv')
price_path = os.path.join(base_dir, 'etf_prices_weekly.csv')

# === Load Data ===
feat_df = pd.read_csv(feat_path)
macro_df = pd.read_csv(macro_path)
price_df = pd.read_csv(price_path)

# === Set Index ===
feat_df.set_index('Date', inplace=True)
macro_df.set_index('Date', inplace=True)
price_df.set_index('Date', inplace=True)

# === Convert index to datetime
today = pd.Timestamp.today().normalize()
feat_df.index = pd.to_datetime(feat_df.index)
macro_df.index = pd.to_datetime(macro_df.index)
price_df.index = pd.to_datetime(price_df.index)

feat_df = feat_df[feat_df.index < today]
macro_df = macro_df[macro_df.index < today]
price_df = price_df[price_df.index < today]

# === Drop duplicated rows if any
feat_df = feat_df[~feat_df.index.duplicated()]
macro_df = macro_df[~macro_df.index.duplicated()]
price_df = price_df[~price_df.index.duplicated()]

# === Output Directory ===
norm_dir = os.path.join(base_dir, 'normalized_matrix')
os.makedirs(norm_dir, exist_ok=True)

# === Scaler
scaler = StandardScaler()

# === Detect unique tickers
all_cols = feat_df.columns
tickers = sorted(set(col.split('_')[0] for col in all_cols if '_' in col))
print(f"🧾 Tickers detected: {tickers}")

for ticker in tickers:
    feat_cols = [c for c in feat_df.columns if c.startswith(f"{ticker}_")]
    if ticker not in price_df.columns:
        print(f"⚠️ Skipped {ticker}: no matching price data.")
        continue

    # Build DataFrame with date index
    df = pd.DataFrame(index=feat_df.index)

    # Add technical indicators
    for col in feat_cols:
        clean_name = col.replace(f"{ticker}_", "")
        df[clean_name] = feat_df[col]

    # Add price
    df.insert(0, 'price', price_df[ticker])

    # Add macro indicators
    df = df.join(macro_df, how='left')

    # Create mask before filling NAs
    mask = df.isna().astype(float)

    # Custom masking logic
    if ticker == 'PSQ':
        mask_cols = [c for c in df.columns if 'QQQ' in c]
        df[mask_cols] = 0.0
        mask[mask_cols] = 1.0
    if ticker == 'SH':
        mask_cols = [c for c in df.columns if 'SPY' in c]
        df[mask_cols] = 0.0
        mask[mask_cols] = 1.0

    # Normalize technical (non-macro) features
    macro_cols = macro_df.columns
    norm_cols = df.columns.difference(macro_cols.union({'price'}))

    if df[norm_cols].dropna(how='all').empty:
        print(f"⚠️ Skipped {ticker}: no valid technical features.")
        continue

    df[norm_cols] = scaler.fit_transform(df[norm_cols].fillna(0.0))
    df[macro_cols] = df[macro_cols].fillna(0.0)

    # === Save
    df.to_csv(os.path.join(norm_dir, f'{ticker}_combined.csv'))
    mask.to_csv(os.path.join(norm_dir, f'{ticker}_mask.csv'))
    print(f"✅ Saved {ticker}: {df.shape[0]} rows, {df.shape[1]} features")


🧾 Tickers detected: ['DOG', 'ITA', 'JETS', 'PSQ', 'RWM', 'SH', 'SOXX', 'VNQ', 'XLB', 'XLC', 'XLE', 'XLF', 'XLI', 'XLK', 'XLP', 'XLRE', 'XLU', 'XLV', 'XLY']
✅ Saved DOG: 521 rows, 21 features
✅ Saved ITA: 521 rows, 21 features
✅ Saved JETS: 521 rows, 21 features
✅ Saved PSQ: 521 rows, 21 features
✅ Saved RWM: 521 rows, 21 features
✅ Saved SH: 521 rows, 21 features
✅ Saved SOXX: 521 rows, 21 features
✅ Saved VNQ: 521 rows, 21 features
✅ Saved XLB: 521 rows, 21 features
✅ Saved XLC: 521 rows, 21 features
✅ Saved XLE: 521 rows, 21 features
✅ Saved XLF: 521 rows, 21 features
✅ Saved XLI: 521 rows, 21 features
✅ Saved XLK: 521 rows, 21 features
✅ Saved XLP: 521 rows, 21 features
✅ Saved XLRE: 521 rows, 21 features
✅ Saved XLU: 521 rows, 21 features
✅ Saved XLV: 521 rows, 21 features
✅ Saved XLY: 521 rows, 21 features


# start training

In [5]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np

# === Paths ===
data_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'dataset', 'normalized_matrix'))

# === Dataset ===
class ETFDataset(Dataset):
    def __init__(self, combined_csv, mask_csv, etf, return_feature="price_change", seq_len=4):
        self.df = pd.read_csv(combined_csv, index_col=0)
        self.mask = pd.read_csv(mask_csv, index_col=0)
        self.seq_len = seq_len

        # Extract the correct target column
        target_col = f"{etf}_{return_feature}"
        if target_col not in self.df.columns:
            raise ValueError(f"❌ Target column {target_col} not found in {combined_csv}")

        print(f"📌 Predicting: {target_col}")

        self.target_index = self.df.columns.get_loc(target_col)
        self.raw_X = self.df.values.astype(np.float32)
        self.raw_M = self.mask.values.astype(np.float32)

        self.X, self.M, self.y = [], [], []
        for i in range(seq_len, len(self.raw_X)):
            self.X.append(self.raw_X[i-seq_len:i])
            self.M.append(self.raw_M[i-seq_len:i])
            self.y.append(self.raw_X[i, self.target_index])

        self.X = np.stack(self.X)
        self.M = np.stack(self.M)
        self.y = np.array(self.y)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return (
            torch.tensor(self.X[idx]),
            torch.tensor(self.M[idx]),
            torch.tensor(self.y[idx])
        )


# === Model with Attention Pooling ===
class DualTransformerModel(nn.Module):
    def __init__(self, num_features, hidden_dim=128, num_layers=2):
        super().__init__()

        # Choose the highest possible nhead divisor of num_features
        possible_heads = [h for h in [8, 4, 2, 1] if num_features % h == 0]
        nhead = possible_heads[0]

        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=num_features,
                nhead=nhead,
                dim_feedforward=hidden_dim,
                batch_first=True
            ),
            num_layers=num_layers
        )
        self.attn_query = nn.Parameter(torch.randn(1, 1, num_features))
        self.output = nn.Linear(num_features, 1)

    def forward(self, x, mask):
        x_masked = x * (1 - mask)
        enc = self.encoder(x_masked)

        # Apply learnable attention pooling
        attn_scores = torch.matmul(self.attn_query, enc.transpose(1, 2))  # (1, 1, F) x (B, F, T) = (B, 1, T)
        attn_weights = torch.softmax(attn_scores, dim=-1)  # (B, 1, T)
        pooled = torch.bmm(attn_weights, enc).squeeze(1)  # (B, 1, T) x (B, T, F) = (B, 1, F) → (B, F)

        return self.output(pooled).squeeze(-1)

# === Training Params ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lr = 0.001
max_epochs = 3000
seq_len = 4

# === Training Function ===
def train_one_etf(etf):
    print(f"\n🚀 Training {etf}")
    feat_path = os.path.join(data_dir, f"{etf}_combined.csv")
    mask_path = os.path.join(data_dir, f"{etf}_mask.csv")
    dataset = ETFDataset(feat_path, mask_path, etf=etf, return_feature="price_change", seq_len=seq_len)
    loader = DataLoader(dataset, batch_size=32, shuffle=False)

    X, _, _ = next(iter(loader))
    model = DualTransformerModel(num_features=X.shape[-1]).to(device)
    model.device = device

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    mse_loss = nn.MSELoss()
    mae_loss = nn.L1Loss()

    best_models = []

    for epoch in range(max_epochs):
        model.train()
        total_loss, total_mae, total_win = 0, 0, 0

        for X, M, y in loader:
            X, M, y = X.to(model.device), M.to(model.device), y.to(model.device)
            pred = model(X, M)

            mse = mse_loss(pred, y)
            mae = mae_loss(pred, y)
            direction_loss = 1 - (torch.sign(pred) == torch.sign(y)).float().mean()
            loss = 0.4*mse + 0.25 * mae + 0.35 * direction_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            total_mae += mae.item()
            total_win += (torch.sign(pred) == torch.sign(y)).float().mean().item()

        avg_loss = total_loss / len(loader)
        avg_mae = total_mae / len(loader)
        avg_win = total_win / len(loader)
        score = avg_win * 0.6 + (1 - avg_mae) * 0.4
        best_models.append((score, model.state_dict(), avg_win, avg_mae))

        if (epoch + 1) % 100 == 0:
            print(f"Epoch {epoch+1}: Loss={avg_loss:.4f}, MAE={avg_mae:.4f}, WinRate={avg_win:.4f}")

    # Save top 5 models
    top5 = sorted(best_models, key=lambda x: -x[0])[:5]
    weight_sum = sum(x[0] for x in top5)

    # Ensure model_weights folder exists
    save_dir = os.path.join("..", "model_weights")
    os.makedirs(save_dir, exist_ok=True)

    for i, (score, weights, win, mae) in enumerate(top5):
        out = {
            'weights': weights,
            'score': score,
            'win_rate': win,
            'mae': mae,
            'weight': score / weight_sum
        }
        save_path = os.path.join(save_dir, f"{etf}_top{i+1}.pt")
        torch.save(out, save_path)

# === Execute ===
for etf in etf_list:
    train_one_etf(etf)



🚀 Training XLK


ValueError: ❌ Target column XLK_price_change not found in D:\CodingWorks\Weekly_Swing_TransformerQT\dataset\normalized_matrix\XLK_combined.csv

In [None]:
import os
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.patches import Rectangle
from datetime import timedelta

# === Paths ===
data_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'dataset', 'normalized_matrix'))
model_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'model_weights'))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === Gather predictions
plot_data = []

for etf in etf_list:
    try:
        feat_path = os.path.join(data_dir, f"{etf}_combined.csv")
        mask_path = os.path.join(data_dir, f"{etf}_mask.csv")
        model_path = os.path.join(model_dir, f"{etf}_top1.pt")

        df_feat = pd.read_csv(feat_path, index_col=0).astype('float32')
        df_mask = pd.read_csv(mask_path, index_col=0).astype('float32')

        if len(df_feat) < 5:
            continue

        X_seq = []
        M_seq = []
        y_seq = []
        price_seq = df_feat['price_change'].copy()

        for i in range(4, len(df_feat)):
            X_seq.append(df_feat.iloc[i-4:i].values)
            M_seq.append(df_mask.iloc[i-4:i].values)
            return_col = "price_change"
            target_index = df_feat.columns.get_loc(return_col)
            y_seq.append(df_feat.iloc[i, target_index])

        X_seq = torch.tensor(np.array(X_seq), dtype=torch.float32).to(device)
        M_seq = torch.tensor(np.array(M_seq), dtype=torch.float32).to(device)
        y_seq = np.array(y_seq)

        model = DualTransformerModel(num_features=X_seq.shape[-1]).to(device)
        model.load_state_dict(torch.load(model_path, map_location=device)['weights'])
        model.eval()

        with torch.no_grad():
            pred = model(X_seq, M_seq).cpu().numpy()

        date_index = pd.to_datetime(df_feat.index[4:])
        price_base = df_feat['price_change'].shift(1).iloc[4:].reset_index(drop=True)
        actual_prices = (1 + y_seq).cumprod() * 100
        predicted_prices = (1 + pred).cumprod() * 100


        df_plot = pd.DataFrame({
            'Date': date_index,
            'Predicted': predicted_prices,
            'Actual': actual_prices
        }).set_index('Date')
        plot_data.append((etf, df_plot.iloc[-50:]))

    except Exception as e:
        print(f"⚠️ Error loading/predicting {etf}: {e}")

# === Plotting
fig = plt.figure(figsize=(20, 4 * ((len(plot_data) + 3) // 4)))
gs = gridspec.GridSpec((len(plot_data) + 3) // 4, 4, figure=fig)

for i, (etf, df) in enumerate(plot_data):
    ax = fig.add_subplot(gs[i])
    ax.axhline(0, color='black', linestyle='--', linewidth=1)
    ax.plot(df.index, df['Actual'], label='Actual (from price)', color='blue')
    ax.plot(df.index, df['Predicted'], label='Predicted (simulated price)', color='red')
    ax.set_title(etf)
    ax.legend()
    ax.tick_params(axis='x', rotation=45)

    ymin, ymax = ax.get_ylim()
    for j, (pred, actual) in enumerate(zip(df['Predicted'], df['Actual'])):
        color = 'lightgreen' if np.sign(pred - actual) == 0 else 'lightgrey'
        start = df.index[j]
        end = start + timedelta(days=7)
        ax.axvspan(start, end, color=color, alpha=0.2)

fig.suptitle("📈 ETF Price Trajectory Simulated from Prediction vs. Actual", fontsize=18)
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


In [None]:
import os
import pandas as pd
import torch
import numpy as np
from datetime import datetime, timedelta

# === Prediction Class ===
class WeeklyETFPredictor:
    def __init__(self, model_dir, data_dir, record_dir, price_path, device=None):
        self.model_dir = model_dir
        self.data_dir = data_dir
        self.record_dir = record_dir
        self.price_path = price_path
        os.makedirs(self.record_dir, exist_ok=True)
        self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def predict(self):
        # Get the nearest past Monday — this represents the start of the prediction week
        today = datetime.today()
        days_back = today.weekday() % 7
        monday = today - timedelta(days=days_back)
        monday_dt = pd.to_datetime(monday.strftime("%Y-%m-%d"))
        date_str = monday_dt.strftime("%Y-%m-%d")

        # Load ETF price data for reference
        price_df = pd.read_csv(
            self.price_path,
            index_col=0,
            parse_dates=True,
            date_parser=lambda x: pd.to_datetime(x, format="%Y-%m-%d", errors='coerce')
        )
        price_df = price_df[price_df.index.notna()]
        price_df = price_df[~price_df.index.duplicated()].sort_index()

        print(f"🔍 Checking last available price dates: {price_df.index[-5:].to_list()}")
        print(f"📅 Using feature/price data for week starting on: {monday_dt.date()}")

        summary = []

        for fname in os.listdir(self.data_dir):
            if not fname.endswith("_combined.csv"):
                continue

            etf = fname.replace("_combined.csv", "")
            feat_path = os.path.join(self.data_dir, f"{etf}_combined.csv")
            mask_path = os.path.join(self.data_dir, f"{etf}_mask.csv")

            try:
                df_feat = pd.read_csv(feat_path, index_col=0, parse_dates=True)
                df_mask = pd.read_csv(mask_path, index_col=0, parse_dates=True)
            except Exception as e:
                print(f"❌ Error loading data for {etf}: {e}")
                continue

            if monday_dt not in df_feat.index:
                print(f"⚠️ No feature data for {etf} on {monday_dt.date()}")
                continue

            if etf not in price_df.columns or monday_dt not in price_df.index:
                print(f"⚠️ Price info missing for {etf} on {monday_dt.date()}")
                continue

            try:
                # Collect last 4 weeks (or pad if at beginning)
                idx_pos = df_feat.index.get_loc(monday_dt)
                if idx_pos < 3:
                    # Pad beginning if not enough data
                    x_real = df_feat.iloc[[0]* (4 - idx_pos) + list(range(idx_pos + 1))].values
                    x_mask = df_mask.iloc[[0]* (4 - idx_pos) + list(range(idx_pos + 1))].values
                else:
                    x_real = df_feat.iloc[idx_pos-3:idx_pos+1].values
                    x_mask = df_mask.iloc[idx_pos-3:idx_pos+1].values
                
                # Final tensor format: [1, 4, num_features]
                x_real = torch.tensor(x_real.astype(np.float32)).unsqueeze(0).to(self.device)
                x_mask = torch.tensor(x_mask.astype(np.float32)).unsqueeze(0).to(self.device)

                price = float(price_df.loc[monday_dt, etf])
            except Exception as e:
                print(f"❌ Tensor prep or price parse failed for {etf}: {e}")
                continue

            scores, preds, maes, winrates = [], [], [], []
            for i in range(1, 6):
                path = os.path.join(self.model_dir, f"{etf}_top{i}.pt")
                if not os.path.exists(path): continue

                try:
                    checkpoint = torch.load(path, map_location=self.device)
                    model = DualTransformerModel(num_features=x_real.shape[-1]).to(self.device)
                    model.load_state_dict(checkpoint['weights'])
                    model.eval()
                    with torch.no_grad():
                        pred = model(x_real, x_mask).item()
                    preds.append(pred)
                    maes.append(checkpoint['mae'])
                    winrates.append(checkpoint['win_rate'])
                    scores.append(checkpoint['weight'])
                except Exception as e:
                    print(f"❌ Model predict error for {etf} top{i}: {e}")

            if len(preds) == 0:
                continue

            pred_return = sum(p * w for p, w in zip(preds, scores))
            avg_mae = sum(maes) / len(maes)
            avg_win = sum(winrates) / len(winrates)

            if pred_return >= 0:
                target_up = round(price * (1 + pred_return / 100 + pred_return * avg_mae / 100), 2)
                stop_down = round(price * (1 - pred_return * avg_mae / 100), 2)
                buy_price = round(price, 2)
            else:
                target_up = stop_down = buy_price = "X"

            summary.append({
                'ETF': etf,
                'PredictedReturn': round(pred_return, 4),
                'MAE': round(avg_mae, 4),
                'WinRate': round(avg_win, 4),
                'BuyPrice': buy_price,
                'Target↑': target_up,
                'Stop↓': stop_down
            })

        summary_df = pd.DataFrame(summary)
        if summary_df.empty:
            print(f"⚠️ No valid predictions generated for {date_str}")
            return summary_df

        summary_df = summary_df.sort_values(by='PredictedReturn', ascending=False)

        # === Output
        print(f"\n📊 Weekly ETF Prediction Summary for {date_str}:")
        print(summary_df.to_string(index=False))

        output_path = os.path.join(self.record_dir, f"{date_str}_predict_record.csv")
        summary_df.to_csv(output_path, index=False)
        print(f"📁 Saved prediction to: {output_path}")

        return summary_df

# === Usage
predictor = WeeklyETFPredictor(
    model_dir="../model_weights",
    data_dir="../dataset/normalized_matrix",
    record_dir="../dataset/predict_record",
    price_path="../dataset/etf_prices_weekly.csv"
)
predictor.predict()


In [None]:
import os
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# === Config ===
weights_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'model_weights'))
feature_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'dataset', 'normalized_matrix'))
os.makedirs(weights_dir, exist_ok=True)

# === Load feature names from any example ETF
first_etf = etf_list[0]
example_path = os.path.join(feature_dir, f'{first_etf}_combined.csv')
example_features = pd.read_csv(example_path, index_col=0)
factor_names = example_features.columns.tolist()

# === Collect factor importance across ETFs
factor_weights = {}

for etf in etf_list:
    weights = []
    for i in range(1, 6):
        model_path = os.path.join(weights_dir, f"{etf}_top{i}.pt")
        if not os.path.exists(model_path):
            continue

        data = torch.load(model_path, map_location='cpu')
        linear_weights = data['weights'].get('output.weight', None)
        if linear_weights is None:
            print(f"⚠️ {etf}_top{i}.pt missing output.weight")
            continue

        linear_weights = linear_weights.squeeze().numpy()
        if len(linear_weights) != len(factor_names):
            print(f"⚠️ {etf}_top{i}.pt: weight dim {len(linear_weights)} != {len(factor_names)} features")
            continue

        weights.append(linear_weights * data.get('weight', 1.0))

    if weights:
        factor_weights[etf] = sum(weights)

# === Create heatmap DataFrame
weight_df = pd.DataFrame(factor_weights, index=factor_names).T

# === Normalize (optional)
weight_df = weight_df.div(weight_df.abs().max(axis=1), axis=0)

# === Plot
plt.figure(figsize=(14, 8))
sns.heatmap(weight_df, cmap="coolwarm", center=0, annot=False)
plt.title("🎯 Factor Importance per ETF (Weighted Top 5 Models)", fontsize=14)
plt.xlabel("Factors")
plt.ylabel("ETFs")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()


In [None]:
import os
import pandas as pd
import torch
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import warnings

warnings.filterwarnings("ignore", message=".*enable_nested_tensor.*")
warnings.filterwarnings("ignore", message=".*weights_only=False.*")

class ETFBacktester:
    def __init__(self, model_dir, data_dir, price_path, initial_cash=2000, benchmark_symbol='SPY', seq_len=4):
        self.model_dir = model_dir
        self.data_dir = data_dir
        self.price_path = price_path
        self.initial_cash = initial_cash
        self.benchmark_symbol = benchmark_symbol
        self.seq_len = seq_len
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def run(self):
        print("🔁 Loading price data...")
        price_df = pd.read_csv(self.price_path, index_col=0)
        price_df.index = pd.to_datetime(price_df.index, errors='coerce')
        price_df = price_df[price_df.index.notna()]
        price_df = price_df.apply(pd.to_numeric, errors='coerce')
        price_df = price_df[~price_df.index.duplicated()].sort_index()
        print(f"📈 Loaded price data with {len(price_df)} entries.")

        etf_list = [fname.replace('_combined.csv', '') for fname in os.listdir(self.data_dir) if fname.endswith('_combined.csv')]
        print(f"🔍 ETF list found: {etf_list}")

        cash = self.initial_cash
        holdings = {}
        portfolio_values = []
        benchmark_values = []
        dates = []
        trade_log = []
        value_log = []
        portfolio_changes = []

        for i in range(self.seq_len, len(price_df) - 1):
            date = price_df.index[i]
            prev_dates = price_df.index[i - self.seq_len:i]
            next_date = price_df.index[i + 1]
            print(f"\n📅 Processing week: {date.date()} -> {next_date.date()}")

            for etf, info in list(holdings.items()):
                if etf not in price_df.columns or next_date not in price_df.index:
                    continue
                try:
                    sell_price = float(price_df.loc[next_date, etf])
                except Exception:
                    continue
                shares = info['Shares']
                buy_price = info['BuyPrice']
                sell_value = shares * sell_price
                actual_return = (sell_price - buy_price) / buy_price
                predicted_return = info.get('PredictedReturn', None)
                cash += sell_value
                trade_log.append({
                    'Date': next_date.strftime('%Y-%m-%d'),
                    'ETF': etf,
                    'Action': 'Sell',
                    'Price': sell_price,
                    'Shares': shares,
                    'Value': sell_value,
                    'BuyPrice': buy_price,
                    'PredictedReturn': predicted_return,
                    'ActualReturn': actual_return * 100
                })
            holdings.clear()

            week_predictions = []
            for etf in etf_list:
                feat_path = os.path.join(self.data_dir, f"{etf}_combined.csv")
                mask_path = os.path.join(self.data_dir, f"{etf}_mask.csv")
                if not os.path.exists(feat_path) or not os.path.exists(mask_path):
                    continue

                feat_df = pd.read_csv(feat_path, index_col=0, parse_dates=True)
                mask_df = pd.read_csv(mask_path, index_col=0, parse_dates=True)

                if not all(d in feat_df.index for d in prev_dates):
                    continue

                x_seq = torch.tensor(feat_df.loc[prev_dates].values.astype(np.float32)).unsqueeze(0).to(self.device)
                x_mask = torch.tensor(mask_df.loc[prev_dates].values.astype(np.float32)).unsqueeze(0).to(self.device)

                scores, preds = [], []
                for j in range(1, 6):
                    model_path = os.path.join(self.model_dir, f"{etf}_top{j}.pt")
                    if not os.path.exists(model_path):
                        continue
                    checkpoint = torch.load(model_path, map_location=self.device)
                    model = DualTransformerModel(num_features=x_seq.shape[-1]).to(self.device)
                    model.load_state_dict(checkpoint['weights'])
                    model.eval()
                    with torch.no_grad():
                        pred = model(x_seq, x_mask).item()
                    preds.append(pred)
                    scores.append(checkpoint['weight'])

                if preds:
                    pred_return = sum(p * w for p, w in zip(preds, scores))
                    try:
                        price = float(price_df.loc[date, etf])
                    except Exception:
                        continue
                    if not np.isnan(price):
                        week_predictions.append({
                            'ETF': etf,
                            'BuyPrice': price,
                            'PredictedReturn': float(pred_return)
                        })

            week_predictions = sorted(week_predictions, key=lambda x: -x['PredictedReturn'])[:2]

            budget_per_etf = cash / 2
            for item in week_predictions:
                etf = item['ETF']
                price = item['BuyPrice']
                shares = int(budget_per_etf // price)
                cost = shares * price
                if shares <= 0 or cost > cash:
                    continue
                cash -= cost
                holdings[etf] = {'Shares': shares, 'BuyPrice': price, 'PredictedReturn': item['PredictedReturn']}
                trade_log.append({
                    'Date': date.strftime('%Y-%m-%d'),
                    'ETF': etf,
                    'Action': 'Buy',
                    'Price': price,
                    'Shares': shares,
                    'Value': cost,
                    'PredictedReturn': item['PredictedReturn']
                })

            value = cash
            for etf, info in holdings.items():
                try:
                    price = float(price_df.loc[next_date, etf])
                except Exception:
                    price = 0
                value += info['Shares'] * price

            prev_value = portfolio_values[-1] if portfolio_values else self.initial_cash
            change_pct = ((value - prev_value) / prev_value) * 100

            portfolio_values.append(value)
            portfolio_changes.append(change_pct)
            dates.append(next_date.strftime('%Y-%m-%d'))
            value_log.append({
                'Date': next_date.strftime('%Y-%m-%d'),
                'Value': value,
                'Cash': cash,
                'Change%': change_pct,
                'Holdings': {k: dict(v) for k, v in holdings.items()}
            })

            if self.benchmark_symbol in price_df.columns:
                base_price = price_df[self.benchmark_symbol].iloc[0]
                current_price = price_df.loc[next_date, self.benchmark_symbol]
                benchmark_values.append(self.initial_cash * (current_price / base_price))
            else:
                benchmark_values.append(value)

        result_df = pd.DataFrame({
            'Date': dates,
            'PortfolioValue': portfolio_values,
            'BenchmarkValue': benchmark_values,
            'PortfolioChange%': portfolio_changes
        })
        result_df['Date'] = pd.to_datetime(result_df['Date'], errors='coerce')
        result_df.set_index('Date', inplace=True)

        pd.DataFrame(trade_log).to_csv("../dataset/backtest_trade_log.csv", index=False)
        pd.DataFrame(value_log).to_csv("../dataset/backtest_value_log.csv", index=False)

        plt.figure(figsize=(12, 6))
        plt.plot(result_df['PortfolioValue'], label='Strategy Portfolio')
        plt.plot(result_df['BenchmarkValue'], label=f'{self.benchmark_symbol} Benchmark')
        plt.title("Backtest Result: Strategy vs. Benchmark")
        plt.ylabel("Portfolio Value (USD)")
        plt.xlabel("Date")
        xticks_idx = result_df.index[::52]
        xticks_labels = [d.strftime('%Y') for d in xticks_idx]
        plt.xticks(xticks_idx, xticks_labels)
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()

        print(f"\n💰 Final Portfolio Value: ${result_df['PortfolioValue'].iloc[-1]:,.2f}")
        print(f"📈 Final Benchmark Value: ${result_df['BenchmarkValue'].iloc[-1]:,.2f}")
        print(f"🎯 Strategy Return: {((result_df['PortfolioValue'].iloc[-1] / self.initial_cash - 1) * 100):.2f}%")
        print(f"📊 Benchmark Return: {((result_df['BenchmarkValue'].iloc[-1] / self.initial_cash - 1) * 100):.2f}%")

        return result_df

# === Execute
backtester = ETFBacktester(
    model_dir="../model_weights",
    data_dir="../dataset/normalized_matrix",
    price_path="../dataset/etf_prices_weekly.csv",
    initial_cash=2000,
    benchmark_symbol="SPY",
    seq_len=4
)

result_df = backtester.run()  # ⬅️ THIS must be included!
