In [6]:
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, f1_score

In [7]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [8]:
from sklearn.model_selection import train_test_split

TRAIN_PATH = "/content/df_top_cleaned.csv"
VAL_PATH   = "/content/df_mid_cleaned.csv"
TEST_PATH  = "/content/df_bot_cleaned.csv"

def slice_date(df, start="2018-07-11", end="2018-10-02"):
    df = df.copy()
    df["DATE"] = pd.to_datetime(df["DATE"])
    mask = (df["DATE"] >= start) & (df["DATE"] <= end)
    return df.loc[mask].sort_values("DATE").reset_index(drop=True)

MODEL_NAME = "bert-base-uncased"
BATCH_SIZE = 16
MAX_LENGTH = 128
LR         = 2e-5
EPOCHS     = 5
DEVICE     = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
df_top = slice_date(pd.read_csv(TRAIN_PATH).dropna())
df_mid = slice_date(pd.read_csv(VAL_PATH).dropna())
df_bot = slice_date(pd.read_csv(TEST_PATH).dropna())

# combine all three into one big DataFrame
df_all = pd.concat([df_top, df_mid, df_bot]).reset_index(drop=True)

# sort by stock & date
df_all["DATE"] = pd.to_datetime(df_all["DATE"])
df_all = df_all.sort_values(["STOCK","DATE"])

# example numeric features
df_all["lag1_return"] = (
    df_all.groupby("STOCK")["1_DAY_RETURN"]
          .shift(1)
          .fillna(0)
)
df_all["ma5_return"] = (
    df_all.groupby("STOCK")["1_DAY_RETURN"]
          .rolling(5).mean()
          .reset_index(level=0, drop=True)
          .fillna(0)
)
df_all["vol5_return"] = (
    df_all.groupby("STOCK")["1_DAY_RETURN"]
          .rolling(5).std()
          .reset_index(level=0, drop=True)
          .fillna(0)
)

# If you want count of tweets per day:
tweet_counts = df_all.groupby(["STOCK","DATE"]).size().rename("tweet_count")
df_all = df_all.join(tweet_counts, on=["STOCK","DATE"]).fillna(0)

  df["DATE"] = pd.to_datetime(df["DATE"])
  df["DATE"] = pd.to_datetime(df["DATE"])
  df["DATE"] = pd.to_datetime(df["DATE"])


In [10]:
train_df, temp_df = train_test_split(
    df_all,
    test_size=0.30,
    random_state=42,
    shuffle=True
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    random_state=42,
    shuffle=True
)

print(f"Sizes → train: {len(train_df)}, val: {len(val_df)}, test: {len(test_df)}")

Sizes → train: 14145, val: 3031, test: 3032


In [11]:
from sklearn.preprocessing import StandardScaler

numeric_cols = ["lag1_return", "ma5_return", "vol5_return", "tweet_count"]

scaler = StandardScaler()
# fit on train only
train_df[numeric_cols] = scaler.fit_transform(train_df[numeric_cols])

# apply to val and test
val_df[numeric_cols]   = scaler.transform(val_df[numeric_cols])
test_df[numeric_cols]  = scaler.transform(test_df[numeric_cols])

In [12]:
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

class StockTweetDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=128, numeric_cols=None):
        self.texts    = df["TWEET"].tolist()
        self.targets  = torch.tensor(
                          df[["1_DAY_RETURN","2_DAY_RETURN",
                              "3_DAY_RETURN","7_DAY_RETURN"]].values,
                          dtype=torch.float32
                        )
        # grab the numeric features as a float32 tensor
        self.numerics = torch.tensor(
                          df[numeric_cols].values,
                          dtype=torch.float32
                        )
        self.tokenizer  = tokenizer
        self.max_length = max_length
        self.stocks = df["STOCK"].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return (
            enc["input_ids"].squeeze(0),
            enc["attention_mask"].squeeze(0),
            self.numerics[idx],
            self.targets[idx],
            self.stocks[idx]
        )

numeric_cols = ["lag1_return","ma5_return","vol5_return","tweet_count"]
train_loader = DataLoader(
    StockTweetDataset(train_df, tokenizer, numeric_cols=numeric_cols),
    batch_size=16, shuffle=True
)
val_loader = DataLoader(
    StockTweetDataset(val_df, tokenizer, numeric_cols=numeric_cols),
    batch_size=16,
    shuffle=False
)

test_loader = DataLoader(
    StockTweetDataset(test_df, tokenizer, numeric_cols=numeric_cols),
    batch_size=16,
    shuffle=False
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [13]:
class BertNumericHybrid(nn.Module):
    def __init__(self, numeric_dim, output_size=4, dropout_rate=0.3, bert_name=MODEL_NAME):
        super().__init__()
        # 1) Text encoder
        self.bert     = BertModel.from_pretrained(bert_name)
        hidden_size   = self.bert.config.hidden_size

        # 2) Numeric‐feature tower
        self.num_fc1  = nn.Linear(numeric_dim, 32)
        self.num_fc2  = nn.Linear(32, 16)
        self.num_act  = nn.ReLU()

        # 3) Fusion + output head
        self.dropout  = nn.Dropout(dropout_rate)
        self.fuse_fc  = nn.Linear(hidden_size + 16, output_size)

    def forward(self, input_ids, attention_mask, numeric_feats):
        # Text path
        bert_out   = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        txt_emb    = bert_out.pooler_output   # (batch_size, hidden_size)

        # Numeric path
        x          = self.num_act(self.num_fc1(numeric_feats))
        num_emb    = self.num_act(self.num_fc2(x))   # (batch_size, 16)

        # Fuse & predict
        combined   = torch.cat([txt_emb, num_emb], dim=1)
        return self.fuse_fc(self.dropout(combined))

numeric_dim = len(numeric_cols)

model     = BertNumericHybrid(numeric_dim, output_size=4, dropout_rate=0.3).to(DEVICE)
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [14]:
for epoch in range(1, EPOCHS+1):
    model.train()
    total_loss = 0.0
    for input_ids, masks, numeric_feats, targets, _ in train_loader:
        # move only the tensors you care about
        input_ids = input_ids.to(DEVICE)
        masks     = masks.to(DEVICE)
        numeric_feats = numeric_feats.to(DEVICE)
        targets   = targets.to(DEVICE)

        preds  = model(input_ids, masks, numeric_feats)
        loss  = criterion(preds, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch}/{EPOCHS} — Train Loss: {avg_train_loss:.4f}")


Epoch 1/5 — Train Loss: 0.0025
Epoch 2/5 — Train Loss: 0.0006
Epoch 3/5 — Train Loss: 0.0005
Epoch 4/5 — Train Loss: 0.0004
Epoch 5/5 — Train Loss: 0.0004


In [15]:
def evaluate(model, dataloader):
    model.eval()
    all_preds, all_targets = [], []

    with torch.no_grad():
        # Unpack numeric_feats as the third tensor in each batch
        for input_ids, masks, numeric_feats, targets, *_ in dataloader:
            input_ids     = input_ids.to(DEVICE)
            masks         = masks.to(DEVICE)
            numeric_feats = numeric_feats.to(DEVICE)
            targets       = targets.to(DEVICE)

            # Pass numeric_feats into the model
            preds = (
                model(input_ids, masks, numeric_feats)
                .cpu()
                .numpy()
            )

            all_preds.append(preds)
            all_targets.append(targets.cpu().numpy())

    preds = np.vstack(all_preds)
    targs = np.vstack(all_targets)

    # regression metrics
    maes = [mean_absolute_error(targs[:,i], preds[:,i]) for i in range(preds.shape[1])]
    r2s  = [r2_score(targs[:,i], preds[:,i])              for i in range(preds.shape[1])]

    # classification metrics (direction)
    bin_pred = (preds >= 0).astype(int)
    bin_targ = (targs >= 0).astype(int)
    accs = [accuracy_score(bin_targ[:,i], bin_pred[:,i]) for i in range(preds.shape[1])]
    f1s  = [f1_score   (bin_targ[:,i], bin_pred[:,i])     for i in range(preds.shape[1])]

    return {"MAE": maes, "R2": r2s, "Acc": accs, "F1": f1s}

In [16]:
val_metrics  = evaluate(model, val_loader)
test_metrics = evaluate(model, test_loader)

print("\nValidation Metrics:")
for name, vals in val_metrics.items():
    print(f" {name}: {[f'{v:.4f}' for v in vals]}")

print("\nTest Metrics:")
for name, vals in test_metrics.items():
    print(f" {name}: {[f'{v:.4f}' for v in vals]}")


Validation Metrics:
 MAE: ['0.0053', '0.0075', '0.0089', '0.0159']
 R2: ['0.7548', '0.5401', '0.5006', '0.3977']
 Acc: ['0.8199', '0.7516', '0.7598', '0.7443']
 F1: ['0.8620', '0.7569', '0.7531', '0.7881']

Test Metrics:
 MAE: ['0.0054', '0.0078', '0.0089', '0.0157']
 R2: ['0.7417', '0.5205', '0.5137', '0.4189']
 Acc: ['0.8107', '0.7312', '0.7596', '0.7523']
 F1: ['0.8521', '0.7238', '0.7445', '0.7905']


In [17]:
def evaluate_with_stock(model, dataloader):
    model.eval()
    all_preds, all_targs, all_stocks = [], [], []

    with torch.no_grad():
        for input_ids, masks, numeric_feats, targets, stocks in dataloader:
            # Move tensors
            input_ids     = input_ids.to(DEVICE)
            masks         = masks.to(DEVICE)
            numeric_feats = numeric_feats.to(DEVICE)
            targets       = targets.to(DEVICE)

            # Forward pass through your hybrid model
            preds = (
                model(input_ids, masks, numeric_feats)
                .cpu()
                .numpy()
            )

            all_preds.append(preds)
            all_targs.append(targets.cpu().numpy())
            all_stocks.extend(stocks)

    # Stack into arrays and return along with stock list
    return (
        np.vstack(all_preds),
        np.vstack(all_targs),
        all_stocks
    )

In [18]:
import warnings
import pandas as pd
from sklearn.exceptions import UndefinedMetricWarning

# 1) SILENCE the specific DeprecationWarning from pandas.groupby.apply
warnings.filterwarnings(
    "ignore",
    message="DataFrameGroupBy.apply operated on the grouping columns"
)

# 2) SILENCE the UndefinedMetricWarning from sklearn.metrics
warnings.filterwarnings(
    "ignore",
    category=UndefinedMetricWarning
)

# 1) GLOBAL ACCURACY & F1
preds, targs, stocks = evaluate_with_stock(model, test_loader)
df = pd.DataFrame({
    "STOCK": stocks,
    **{f"{h}_pred": preds[:,i] for i,h in enumerate(["1-Day","2-Day","3-Day","7-Day"])},
    **{f"{h}_targ": targs[:,i] for i,h in enumerate(["1-Day","2-Day","3-Day","7-Day"])}
})

print("\n=== Global Accuracy and F1-score ===")
for h in ["1-Day","2-Day","3-Day","7-Day"]:
    pred_col = f"{h}_pred"
    targ_col = f"{h}_targ"
    # binarize direction
    df[f"{h}_pred_dir"] = (df[pred_col] >= 0).astype(int)
    df[f"{h}_true_dir"] = (df[targ_col] >= 0).astype(int)

    acc = accuracy_score(df[f"{h}_true_dir"], df[f"{h}_pred_dir"])
    f1  = f1_score   (df[f"{h}_true_dir"], df[f"{h}_pred_dir"])
    print(f"{h} Return → Accuracy: {acc:.4f} | F1-Score: {f1:.4f}")

# 2) TOP 5 COMPANIES BY F1
print("\n=== Top 5 Companies by F1-score for Each Return Period ===")
for h in ["1-Day","2-Day","3-Day","7-Day"]:
    # 1) Filter out any ticker with fewer than 20 test samples
    df_filtered = (
        df
        .groupby("STOCK")
        .filter(lambda d: len(d) >= 20)
    )

    # 2) Now compute metrics only on that filtered set
    grp = (
        df_filtered
        .groupby("STOCK")
        .apply(lambda d: pd.Series({
            "Accuracy": accuracy_score(d[f"{h}_true_dir"], d[f"{h}_pred_dir"]),
            "F1_Score":   f1_score(d[f"{h}_true_dir"], d[f"{h}_pred_dir"], zero_division=0),
            "Samples":    len(d)
        }))
    )

    top5 = grp.sort_values("F1_Score", ascending=False).head(5)

    print(f"\nTop 5 STOCK - {h} Return:")
    print(top5.to_string())



=== Global Accuracy and F1-score ===
1-Day Return → Accuracy: 0.8107 | F1-Score: 0.8521
2-Day Return → Accuracy: 0.7312 | F1-Score: 0.7238
3-Day Return → Accuracy: 0.7596 | F1-Score: 0.7445
7-Day Return → Accuracy: 0.7523 | F1-Score: 0.7905

=== Top 5 Companies by F1-score for Each Return Period ===

Top 5 STOCK - 1-Day Return:
            Accuracy  F1_Score  Samples
STOCK                                  
Next        0.984848  0.992248     66.0
Netflix     0.987421  0.988506    159.0
Facebook    0.949640  0.972763    139.0
Intel       0.937500  0.962963     32.0
Mastercard  0.931034  0.960000     29.0

Top 5 STOCK - 2-Day Return:
          Accuracy  F1_Score  Samples
STOCK                                
Ford      0.987374  0.971429    396.0
Next      0.924242  0.960000     66.0
Facebook  0.892086  0.939271    139.0
adidas    0.833333  0.892857     36.0
Nike      0.733333  0.837587    525.0

Top 5 STOCK - 3-Day Return:
          Accuracy  F1_Score  Samples
STOCK                      