<a href="https://colab.research.google.com/github/liyi0206/BostonHousingPrice/blob/master/v1_0_custom_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install torch transformers scikit-learn

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [5]:
# Prepare price sequence data and news data
import yfinance as yf
import pandas as pd
import pprint
import numpy as np
import math
from scipy.signal import argrelextrema
import datetime
import requests
import pytz # import pytz module

ticker = 'APP'
interval = '4h'
interval_obj = datetime.timedelta(hours=4)
period = '30d'
sample_len = 10
forward_window = 10   # how far ahead to look for return labeling
atr_lookback = 14         # ATR lookback period (# of bars)

# Define news API parameters
url = "https://newsapi.org/v2/everything"
params = {
    "q": "AppLovin",
    "from": "2025-03-10",  # Start date (optional)
    # "to": "2025-01-05",    # End date (optional)
    "language": "en",      # Language of articles
    "sortBy": "publishedAt",  # Sort by recency
    "apiKey": "1a3007037b7a45ffb48cdc6514d8d794",
}

def ATR(df, lookback):
  # Compute ATR over the last 'lookback' periods
  tr_list = []
  for j in range(i - lookback + 1, i + 1):
      current_high = df['High'].iloc[j].item()
      current_low = df['Low'].iloc[j].item()
      previous_close = df['Close'].iloc[j-1].item()
      true_range = max(current_high - current_low, abs(current_high - previous_close), abs(current_low - previous_close))
      tr_list.append(true_range)
  return np.mean(tr_list)

# ---------------------------
# Step 1: Get raw datasets
# ---------------------------
# Get pricing data
df = yf.download(ticker, interval=interval, period=period)

# Get news data
response = requests.get(url, params=params)
news_data = response.json()
print('news_data:', news_data)

news_samples = []
for d in news_data['articles']:
    ts = datetime.datetime.strptime(d['publishedAt'], '%Y-%m-%dT%H:%M:%SZ')
    ts = ts.replace(tzinfo=pytz.UTC)  # Replace tz_localize with replace
    news_samples.append((ts, d['content']))
print('News samples:', len(news_samples))

# Merge into pricing_data
matched_news = []
for i in range(len(df)):
    matching_news = [news for t1, news in news_samples if abs(t1 - df.index[i]) <= interval_obj]
    matched_news.append(matching_news)
df['News'] = matched_news

# ---------------------------
# Step 2: Add features
# ---------------------------
df['return'] = df['Close'].pct_change()
df['vol_mean'] = df['Volume'].rolling(window=30).mean()
df['vol_mean'] = df['vol_mean'].apply(lambda x: float('inf') if math.isnan(x) else x)
vol_spike = []
for i in range(len(df)):
    spike = df['Volume'].iloc[i].item() > 1.5 * df['vol_mean'].iloc[i].item()
    vol_spike.append(spike)
df['vol_spike'] = vol_spike

# ---------------------------
# Step 3: Detect local extrema
# ---------------------------
order = 5  # number of surrounding points to compare for a local extremum
df['is_local_max'] = 0
df['is_local_min'] = 0
local_max = argrelextrema(df['Close'].values, np.greater, order=order)[0]
local_min = argrelextrema(df['Close'].values, np.less, order=order)[0]

df.loc[df.index[local_max], 'is_local_max'] = 1
df.loc[df.index[local_min], 'is_local_min'] = 1

# ---------------------------
# Step 4: Combine turning points
# ---------------------------
df['is_turning_point'] = df['is_local_max'] | df['is_local_min'] | df['vol_spike']

# ---------------------------
# Step 5: Sample T-length sequences before turning points
# ---------------------------
training_data = []
for i in range(sample_len, len(df) - forward_window):
    if df['is_turning_point'].iloc[i]:
        # Extract price sequence and news
        cp = df.iloc[i - sample_len:i][['Open', 'High', 'Low', 'Close', 'Volume']].copy()
        price_seq = [r for r in cp.to_numpy()]
        cp = df.iloc[i - sample_len:i]['News'].copy()
        news_seq = [r for r in cp.to_numpy()]

        # Label: based on first significant move in forward window
        baseline = df['Close'].iloc[i].item()
        future_window = df.iloc[i+1:i+1+forward_window].copy()
        atr = ATR(df, atr_lookback)
        threshold = 2 * (atr / baseline)

        # Calculate return at each step
        future_window['up_return'] = (future_window['High'] - baseline) / baseline
        future_window['down_return'] = (future_window['Low'] - baseline) / baseline

        # Find first index where up_return or down_return exceeds threshold
        first_up = future_window[future_window['up_return'] > threshold]
        first_down = future_window[future_window['down_return'] < -threshold]

        first_up_time = first_up.index[0] if not first_up.empty else None
        first_down_time = first_down.index[0] if not first_down.empty else None

        # Decision based on whichever move happened first
        if first_up_time and (not first_down_time or first_up_time < first_down_time):
            label = 2  # up
        elif first_down_time:
            label = 0  # down
        else:
            label = 1  # neutral

        training_data.append({'time': df.index[i], 'price_seq': price_seq, 'news_seq': news_seq, 'label': label})

# ---------------------------
# Step 6: Show extracted labels and turning points
# ---------------------------
# for d in training_data:
#   print(f"Sample {d['time']}: label={['down','neutral','up'][d['label']]}, price_volume=\n{str(d['price_seq'])}")
#   print(f"News {d['news_seq']}")

training_df = pd.DataFrame(training_data)
print("training_df ", str(training_df))

[*********************100%***********************]  1 of 1 completed


news_data: {'status': 'ok', 'totalResults': 131, 'articles': [{'source': {'id': None, 'name': 'GlobeNewswire'}, 'author': 'Hagens Berman Sobol Shapiro LLP', 'title': 'AppLovin Corporation (APP) Market Value Craters $20 Billion On March 27 After Muddy Waters Takes Aim, Class Action Pending – Hagens Berman', 'description': 'SAN FRANCISCO, March 31, 2025 (GLOBE NEWSWIRE) -- On March 27, 2025, investors in AppLovin (NASDAQ: APP) saw the price of their shares drop $65.92, wiping out about $20 billion of the company’s market value, after prominent short seller Muddy Waters Research …', 'url': 'https://www.globenewswire.com/news-release/2025/03/31/3052905/32716/en/AppLovin-Corporation-APP-Market-Value-Craters-20-Billion-On-March-27-After-Muddy-Waters-Takes-Aim-Class-Action-Pending-Hagens-Berman.html', 'urlToImage': 'https://ml.globenewswire.com/Resource/Download/51670c91-d369-4a8c-8102-d393de9f6d19', 'publishedAt': '2025-03-31T23:28:00Z', 'content': 'SAN FRANCISCO, March 31, 2025 (GLOBE NEWSW

In [10]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Load FinBERT or any transformer for financial news
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
text_model = AutoModel.from_pretrained("yiyanghkust/finbert-tone")

class NewsPriceDataset(Dataset):
    def __init__(self, df, device='cpu', max_len=128):
        self.df = df
        self.max_len = max_len
        self.scaler = StandardScaler()
        for i in range(len(df)):
            row = df.iloc[i]
            res = self.scaler.fit_transform(row['price_seq'])  # output ndarray (10, 5), while the input is array (len=10) of array (len=5)
            for j, r in enumerate(res):
                self.df.loc[i, 'price_seq'][j] = r.tolist()

    def __len__(self):
        return len(self.df)

    def get_text_embedding(self, news_list):  # num_news x 1
        if not news_list:
            return torch.zeros(text_model.config.hidden_size)
        encoding = tokenizer(news_list, truncation=True, padding='max_length',
                                max_length=self.max_len, return_tensors="pt")
        # print('encoding ', encoding.shape)
        text_embedding = text_model(**{k: v.to(device) for k, v in encoding.items()}).last_hidden_state.mean(dim=1).squeeze()
        if text_embedding.ndim == 1:  # when the shape is (768)
            text_embedding = text_embedding.unsqueeze(0)
        return text_embedding.mean(dim=0)

    def process_news_seq(self, news_articles):  # sample_len x num_news x 1
        with torch.no_grad():
            # return torch.mean(torch.stack([self.get_text_embedding(t) for t in news_articles]), dim=0)
            return torch.stack([self.get_text_embedding(t) for t in news_articles])

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        label = torch.tensor(row['label'], dtype=torch.float)
        price_seq = torch.tensor(row['price_seq'], dtype=torch.float)
        # Encode text, data structure example https://screen-snap.com/WeqixVuHDWZTt1mM9RJG7
        news_embeddings = self.process_news_seq(row['news_seq'])
        return news_embeddings, price_seq, label


In [16]:
import torch.nn as nn

# --- 1. Positional Encoding ---
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, seq_len=5000, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(seq_len, d_model)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1) # Shape: (seq_len, 1, d_model)
        self.register_buffer('pe', pe) # Register as buffer so it's not a model parameter

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [batch_size, seq_len, d_model]
        """
        xt = x.permute(1, 0, 2)  # transpose to [seq_len, batch_size, d_model]
        xt = xt + self.pe[:xt.size(0), :]
        x = xt.permute(1, 0, 2)  # transpose back to [batch_size, seq_len, d_model]
        return self.dropout(x)

# --- 2. Transformer Model for Classification ---
# It's suggested to have 3 approaches to combine ts data and news data, https://screen-snap.com/eCw4sQFOK5aPHV30yb_dL
# This implements the approach 1.
class TransformerModel(nn.Module):
    def __init__(self, seq_len, text_dim=256, ts_dim=2, hidden_dim=128, ts_weight_proportion=0.5, dim_feedforward=256, dropout=0.1):
        super().__init__()

        # --- Calculate weighted embedding dimensions ---
        assert 0 < ts_weight_proportion < 1, "ts_weight_proportion must be between 0 and 1 (exclusive)"
        # Round the first dimension, derive the second to ensure sum is exact
        self.embed_ts = round(hidden_dim * ts_weight_proportion)
        self.embed_text = hidden_dim - self.embed_ts
        assert self.embed_ts * self.embed_text > 0

        self.ts_proj = nn.Linear(ts_dim, self.embed_ts)
        self.text_proj = nn.Linear(text_dim, self.embed_text)
        # Optional: Layer Normalization with calculated dimensions
        self.layer_norm_ts = nn.LayerNorm(self.embed_ts)
        self.layer_norm_text = nn.LayerNorm(self.embed_text)
        self.concat_layer_norm = nn.LayerNorm(hidden_dim)

        # Positional Encoding
        # Note: Standard PyTorch Transformer layers expect (seq_len, batch_size, features)
        # if batch_first=False (default). We'll use batch_first=True for convenience.
        self.pos_encoder = PositionalEncoding(hidden_dim, seq_len, dropout)
        # Or use learned positional embeddings:
        # self.pos_encoder = nn.Embedding(max_seq_len, d_model)
        # self.register_buffer('position_ids', torch.arange(max_seq_len).expand((1, -1)))

        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=4, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True),
            num_layers=2
        )
        self.fc = nn.Linear(hidden_dim, 1)

        self.d_model = hidden_dim

    def forward(self, text_emb, ts_data):
        # text_emb: [7, 10, 768], ts_data: [7, 10, 5]
        x_text = self.text_proj(text_emb)  # [7, 10, 64]
        x_text = self.layer_norm_text(x_text)
        x_ts = self.ts_proj(ts_data)  # [7, 10, 64]
        x_ts = self.layer_norm_ts(x_ts)
        combined_emb = torch.cat([x_text, x_ts], dim=-1)  # [7, 10, 128]
        combined_emb = self.concat_layer_norm(combined_emb)
        # Apply scaling factor (common practice, helps relate embedding scale to attention scale)
        combined_emb = combined_emb * math.sqrt(self.d_model)

        combined_emb = self.pos_encoder(combined_emb)  # [7, 10, 128]
        x_out = self.transformer(combined_emb)
        out = self.fc(x_out.mean(dim=1))
        return out.squeeze()


In [17]:
# Initialized Transformer model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerModel(seq_len=10, text_dim=768, ts_dim=5).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.MSELoss()
max_news_len = 128 # truncation length of each news content for text encoding

dl = DataLoader(NewsPriceDataset(training_df, device, max_news_len), batch_size=16, shuffle=True)
# Train model
for epoch in range(5):
    for text_emb, price_seq, label in dl:
        text_emb, price_seq, label = text_emb.to(device), price_seq.to(device), label.to(device)
        pred = model(text_emb, price_seq)
        loss = loss_fn(pred, label)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


Epoch 1, Loss: 1.8697
Epoch 2, Loss: 1.1049
Epoch 3, Loss: 0.8889
Epoch 4, Loss: 0.7843
Epoch 5, Loss: 0.5002
