In [None]:
# =========================
# 0. Imports & config
# =========================
import pandas as pd
from datetime import datetime
from gnews import GNews

TICKER = "AAPL"
START_YEAR = 2016
END_YEAR   = 2024

# =========================
# 1. Yahoo Finance (recent only)
# =========================
import requests
from datetime import datetime

def fetch_yahoo_news_api(ticker, count=100):
    url = (
        f"https://query1.finance.yahoo.com/v1/finance/search?"
        f"q={ticker}&newsCount={count}&quotesCount=0"
    )
    headers = {"User-Agent": "Mozilla/5.0"}
    r = requests.get(url, headers=headers).json()

    if "news" not in r:
        return pd.DataFrame()

    rows = []
    for item in r["news"]:
        try:
            rows.append({
                "ticker": ticker,
                "title": item.get("title"),
                "publisher": item.get("publisher"),
                "datetime": datetime.utcfromtimestamp(item.get("providerPublishTime")),
                "url": item.get("link"),
                "source": "yahoo",
            })
        except:
            continue

    return pd.DataFrame(rows)

df_yahoo = fetch_yahoo_news_api(TICKER, count=100)

print("Yahoo articles:", len(df_yahoo))

# =========================
# 2. Google News (MONTHLY)
# =========================
def fetch_gnews_monthly(ticker, year, month):
    google = GNews(
        language="en",
        country="US",
        max_results=100
    )

    start_date = datetime(year, month, 1)
    if month == 12:
        end_date = datetime(year + 1, 1, 1)
    else:
        end_date = datetime(year, month + 1, 1)

    google.start_date = start_date
    google.end_date = end_date

    query = f"{ticker} stock".replace(" ", "+")
    articles = google.get_news(query)

    rows = []
    for a in articles:
        try:
            dt = pd.to_datetime(a["published date"])
        except:
            dt = None

        rows.append({
            "ticker": ticker,
            "title": a.get("title"),
            "publisher": a.get("publisher", {}).get("title", ""),
            "datetime": dt,
            "url": a.get("url"),
            "source": "google",
        })

    return pd.DataFrame(rows)

dfs_google = []

for year in range(START_YEAR, END_YEAR + 1):
    for month in range(1, 13):
        print(f"Fetching Google News: {year}-{month:02d}")
        df_m = fetch_gnews_monthly(TICKER, year, month)
        dfs_google.append(df_m)

df_google = pd.concat(dfs_google, ignore_index=True)
print("Google articles (raw):", len(df_google))

# =========================
# 3. Combine Yahoo + Google
# =========================
df_all = pd.concat([df_google, df_yahoo], ignore_index=True)

# Basic cleaning
df_all["datetime"] = pd.to_datetime(df_all["datetime"], errors="coerce")
df_all = df_all.dropna(subset=["datetime", "title"])

# Daily key for LSTM alignment
df_all["date"] = df_all["datetime"].dt.date
df_all["date"] = pd.to_datetime(df_all["date"])

# =========================
# 4. Light deduplication
# =========================
df_all["title_norm"] = (
    df_all["title"]
    .str.lower()
    .str.replace(r"[^a-z0-9 ]", "", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

df_all = df_all.drop_duplicates(subset=["title_norm"])
df_all = df_all.drop(columns=["title_norm"])



Yahoo articles: 10
Fetching Google News: 2016-01
Fetching Google News: 2016-02
Fetching Google News: 2016-03
Fetching Google News: 2016-04
Fetching Google News: 2016-05
Fetching Google News: 2016-06
Fetching Google News: 2016-07
Fetching Google News: 2016-08
Fetching Google News: 2016-09
Fetching Google News: 2016-10
Fetching Google News: 2016-11
Fetching Google News: 2016-12
Fetching Google News: 2017-01
Fetching Google News: 2017-02
Fetching Google News: 2017-03
Fetching Google News: 2017-04
Fetching Google News: 2017-05
Fetching Google News: 2017-06
Fetching Google News: 2017-07
Fetching Google News: 2017-08
Fetching Google News: 2017-09
Fetching Google News: 2017-10
Fetching Google News: 2017-11
Fetching Google News: 2017-12
Fetching Google News: 2018-01
Fetching Google News: 2018-02
Fetching Google News: 2018-03
Fetching Google News: 2018-04
Fetching Google News: 2018-05
Fetching Google News: 2018-06
Fetching Google News: 2018-07
Fetching Google News: 2018-08
Fetching Google News:

FileNotFoundError: [Errno 2] No such file or directory: 'sp500_features.csv'

In [None]:
# =========================
# 5. Align to AAPL trading days
# =========================
market_df = pd.read_csv("../data/sp500_historical_clean.csv", parse_dates=["Date"])

aapl_trading_days = set(
    market_df[market_df["Ticker"] == TICKER]["Date"].unique()
)

df_all = df_all[df_all["date"].isin(aapl_trading_days)]

# =========================
# 6. Final sort & save
# =========================
df_all = df_all.sort_values(["date", "datetime"]).reset_index(drop=True)

df_all = df_all[[
    "ticker",
    "date",
    "datetime",
    "title",
    "publisher",
    "url",
    "source"
]]

df_all.to_csv("../data/aapl_news_raw.csv", index=False)

print("=================================")
print("FINAL RESULT")
print("Total articles:", len(df_all))
print(df_all.head())


FINAL RESULT
Total articles: 1849
  ticker       date            datetime  \
0   AAPL 2016-01-06 2016-01-06 08:00:00   
1   AAPL 2016-01-06 2016-01-06 08:00:00   
2   AAPL 2016-01-08 2016-01-08 08:00:00   
3   AAPL 2016-01-11 2016-01-11 08:00:00   
4   AAPL 2016-01-12 2016-01-12 08:00:00   

                                               title  \
0  Apple stock crash nears another scary level - ...   
1  Apple Inc. ($AAPL) Stock | Crushed Premarket O...   
2  Best Big Tech Stocks Of 2015 Can Ring In 2016 ...   
3  Survey Suggests iPhone Sales Grew 33% in China...   
4  How Tesla and Nissan’s Self-Parking Cars Fores...   

                   publisher  \
0                  USA Today   
1     warriortradingnews.com   
2  Investor's Business Daily   
3                    Fortune   
4                    Fortune   

                                                 url  source  
0  https://news.google.com/rss/articles/CBMiigFBV...  google  
1  https://news.google.com/rss/articles/CBMisgFBV..

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

df = pd.read_csv("../data/aapl_news_raw.csv", parse_dates=["date", "datetime"])

sbert = SentenceTransformer("all-MiniLM-L6-v2")

df["sbert_emb"] = list(
    sbert.encode(df["title"].tolist(), show_progress_bar=True)
)


  from .autonotebook import tqdm as notebook_tqdm
12/14/2025 04:58:25 PM - Use pytorch device_name: cuda:0
12/14/2025 04:58:25 PM - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Batches: 100%|██████████| 58/58 [00:07<00:00,  8.13it/s]


In [None]:
import faiss
from tqdm import tqdm

TICKER = "AAPL"
TOP_K = 10
QUERY_TEXT = "AAPL financial news"

# Encode query ONCE
query_emb = sbert.encode([QUERY_TEXT], convert_to_numpy=True).astype("float32")
EMB_DIM = query_emb.shape[1]

results = []

# Ensure sorted by date
df = df.sort_values("date").reset_index(drop=True)

all_dates = df["date"].unique()

for d in tqdm(all_dates, desc="Daily FAISS retrieval"):
    # Use only articles up to day d
    df_past = df[df["date"] <= d]

    if len(df_past) == 0:
        continue

    X = np.vstack(df_past["sbert_emb"].values).astype("float32")

    index = faiss.IndexFlatL2(EMB_DIM)
    index.add(X)

    k = min(TOP_K, len(df_past))
    distances, indices = index.search(query_emb, k)

    for rank, (idx, dist) in enumerate(zip(indices[0], distances[0]), start=1):
        row = df_past.iloc[idx]

        results.append({
            "ticker": TICKER,
            "date": d,
            "article_datetime": row["datetime"],
            "title": row["title"],
            "publisher": row.get("publisher", ""),
            "url": row.get("url", ""),
            "source": row.get("source", ""),
            "faiss_rank": rank,
            "faiss_distance": float(dist),
        })

# Save retrieval-only dataset
retrieved_df = pd.DataFrame(results)
retrieved_df = retrieved_df.sort_values(["date", "faiss_rank"]).reset_index(drop=True)

retrieved_df.to_csv("../data/aapl_daily_retrieved_articles.csv", index=False)

print("Saved aapl_daily_retrieved_articles.csv")
print("Total retrieved rows:", len(retrieved_df))
print(retrieved_df.head())


12/14/2025 05:01:36 PM - Loading faiss with AVX512 support.
12/14/2025 05:01:38 PM - Successfully loaded faiss with AVX512 support.
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.76it/s]
Daily FAISS retrieval: 100%|██████████| 1010/1010 [00:02<00:00, 361.15it/s]


Saved aapl_daily_retrieved_articles.csv
Total retrieved rows: 10065
  ticker       date    article_datetime  \
0   AAPL 2016-01-06 2016-01-06 08:00:00   
1   AAPL 2016-01-06 2016-01-06 08:00:00   
2   AAPL 2016-01-08 2016-01-06 08:00:00   
3   AAPL 2016-01-08 2016-01-06 08:00:00   
4   AAPL 2016-01-08 2016-01-08 08:00:00   

                                               title  \
0  Apple Inc. ($AAPL) Stock | Crushed Premarket O...   
1  Apple stock crash nears another scary level - ...   
2  Apple Inc. ($AAPL) Stock | Crushed Premarket O...   
3  Apple stock crash nears another scary level - ...   
4  Best Big Tech Stocks Of 2015 Can Ring In 2016 ...   

                   publisher  \
0     warriortradingnews.com   
1                  USA Today   
2     warriortradingnews.com   
3                  USA Today   
4  Investor's Business Daily   

                                                 url  source  faiss_rank  \
0  https://news.google.com/rss/articles/CBMisgFBV...  google       

In [7]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

DATA_DIR = "../data"
os.makedirs(DATA_DIR, exist_ok=True)

FEATURE_NAMES = [
    "polarity", "intensity", "relevance", "short_term",
    "long_term", "volatility", "novelty", "credibility"
]


In [8]:
def mean_pool(vectors):
    return np.mean(vectors, axis=0)

def weighted_pool(vectors, distances=None, eps=1e-6):
    vectors = np.array(vectors)

    if distances is None:
        weights = np.ones(len(vectors))
    else:
        similarities = 1.0 / (np.array(distances) + eps)
        intensities = vectors[:, 1]  # intensity dimension
        weights = similarities * (1.0 + intensities)

    weights = weights / (weights.sum() + eps)
    return (vectors * weights[:, None]).sum(axis=0)


In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import json
import re

model_name = "Qwen/Qwen2.5-1.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

EMBED_SYSTEM_PROMPT = """
You are a financial news embedding model. Your task is to convert financial text
into a JSON object with EXACTLY 8 numeric features. Each value must be a float
in the range [-1, 1].

The 8 features are:
1. polarity       (negative → positive sentiment)
2. intensity      (strength of sentiment)
3. relevance      (how directly it relates to TARGET_TICKER)
4. short_term     (expected next-day price impact)
5. long_term      (expected multi-week impact)
6. volatility     (uncertainty implied by the text)
7. novelty        (new vs repeated information)
8. credibility    (rumor → fact-based)

You MUST output only a JSON object with these 8 fields.

--------------------------------------------------------
FEW-SHOT EXAMPLES
--------------------------------------------------------

Example A:
TEXT: "Netflix subscriber growth slows as competition intensifies across streaming platforms."
TARGET_TICKER: NFLX

OUTPUT:
{
  "polarity": -0.50,
  "intensity": 0.70,
  "relevance": 0.95,
  "short_term": -0.40,
  "long_term": -0.20,
  "volatility": 0.60,
  "novelty": 0.55,
  "credibility": 0.90
}

Example B:
TEXT: "Coca-Cola reports strong international sales and raises full-year outlook."
TARGET_TICKER: KO

OUTPUT:
{
  "polarity": 0.60,
  "intensity": 0.55,
  "relevance": 0.90,
  "short_term": 0.35,
  "long_term": 0.50,
  "volatility": 0.20,
  "novelty": 0.30,
  "credibility": 0.95
}

Example C:
TEXT: "Boeing receives a major multi-billion dollar aircraft order from a Middle Eastern airline."
TARGET_TICKER: BA

OUTPUT:
{
  "polarity": 0.80,
  "intensity": 0.85,
  "relevance": 1.00,
  "short_term": 0.70,
  "long_term": 0.75,
  "volatility": 0.40,
  "novelty": 0.70,
  "credibility": 0.95
}

Example D:
TEXT: "Meta Faces a global outage across Instagram and Facebook services."
TARGET_TICKER: META

OUTPUT:
{
  "polarity": -0.75,
  "intensity": 0.80,
  "relevance": 1.00,
  "short_term": -0.60,
  "long_term": -0.35,
  "volatility": 0.85,
  "novelty": 0.65,
  "credibility": 0.90
}

Example E:
TEXT: "Oil prices rise as OPEC announces unexpected production cuts."
TARGET_TICKER: XLE

OUTPUT:
{
  "polarity": 0.30,
  "intensity": 0.65,
  "relevance": 0.85,
  "short_term": 0.40,
  "long_term": 0.25,
  "volatility": 0.70,
  "novelty": 0.50,
  "credibility": 0.95
}

--------------------------------------------------------

NOW PROCESS THE NEW INPUT.
Return ONLY the JSON object.
"""





def generate_embedding(text, ticker="AAPL"):
    user_prompt = f"""
TARGET_TICKER: {ticker}

TEXT:
{text}

Return ONLY the JSON object with the 8 fields.
"""

    prompt = f"<|system|>\n{EMBED_SYSTEM_PROMPT}\n<|user|>\n{user_prompt}\n<|assistant|>\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    output = model.generate(
        **inputs,
        max_new_tokens=300,
        temperature=0.0,
        do_sample=False
    )

    decoded = tokenizer.decode(output[0], skip_special_tokens=True)

    # extract ALL JSON objects
    matches = re.findall(r"\{.*?\}", decoded, flags=re.DOTALL)

    if not matches:
        print("No JSON found:\n", decoded)
        return None

    # use the LAST JSON (assistant output)
    json_str = matches[-1]

    try:
        obj = json.loads(json_str)

        vector = [
            obj["polarity"],
            obj["intensity"],
            obj["relevance"],
            obj["short_term"],
            obj["long_term"],
            obj["volatility"],
            obj["novelty"],
            obj["credibility"]
        ]

        return vector

    except Exception as e:
        print("JSON parse error:", e)
        print("Raw JSON:", json_str)
        print("Full Output:", decoded)
        return None


`torch_dtype` is deprecated! Use `dtype` instead!
12/14/2025 05:07:57 PM - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


In [10]:
df_raw = pd.read_csv("../data/aapl_news_raw.csv", parse_dates=["date"])
df_raw = df_raw.sort_values("date")


In [11]:
rows_mean = []
rows_weighted = []

for d, group in tqdm(df_raw.groupby("date"), desc="RAW aggregation"):
    vectors = []

    for title in group["title"]:
        v = generate_embedding(title, ticker="AAPL")
        if v is not None:
            vectors.append(v)

    if len(vectors) == 0:
        agg_mean = np.zeros(8)
        agg_weighted = np.zeros(8)
    else:
        agg_mean = mean_pool(vectors)
        agg_weighted = weighted_pool(vectors)

    rows_mean.append(["AAPL", d] + agg_mean.tolist())
    rows_weighted.append(["AAPL", d] + agg_weighted.tolist())


RAW aggregation:   0%|          | 0/1010 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
RAW aggregation: 100%|██████████| 1010/1010 [1:34:08<00:00,  5.59s/it] 


In [12]:
cols = ["ticker", "date"] + FEATURE_NAMES

pd.DataFrame(rows_mean, columns=cols).to_csv(
    f"{DATA_DIR}/aapl_daily_sentiment_raw_mean.csv", index=False
)

pd.DataFrame(rows_weighted, columns=cols).to_csv(
    f"{DATA_DIR}/aapl_daily_sentiment_raw_weighted.csv", index=False
)


In [14]:
# =========================================
# Article-level Sentiment Extraction (ONCE)
# =========================================

import pandas as pd
from tqdm import tqdm
import numpy as np

# Load raw news
df_raw = pd.read_csv("../data/aapl_news_raw.csv")

FEATURES = [
    "polarity", "intensity", "relevance", "short_term",
    "long_term", "volatility", "novelty", "credibility"
]

embedding_cache = {}
rows = []

unique_titles = df_raw["title"].dropna().unique()
print(f"Encoding {len(unique_titles)} unique articles")

for title in tqdm(unique_titles, desc="Qwen sentiment encoding"):
    try:
        v = generate_embedding(title, ticker="AAPL")
        if v is None:
            continue

        embedding_cache[title] = v
        rows.append(["AAPL", title] + v)

    except Exception as e:
        print("Error on title:", title)
        print(e)

# Save article-level sentiment
cols = ["ticker", "title"] + FEATURES
df_article_sent = pd.DataFrame(rows, columns=cols)

df_article_sent.to_csv(
    "../data/aapl_article_sentiment.csv",
    index=False
)

print("=================================")
print("Saved ../data/aapl_article_sentiment.csv")
print("Total encoded articles:", len(df_article_sent))
print(df_article_sent.head())


Encoding 1849 unique articles


Qwen sentiment encoding: 100%|██████████| 1849/1849 [1:30:57<00:00,  2.95s/it]

Saved ../data/aapl_article_sentiment.csv
Total encoded articles: 1849
  ticker                                              title  polarity  \
0   AAPL  Apple stock crash nears another scary level - ...      -0.8   
1   AAPL  Apple Inc. ($AAPL) Stock | Crushed Premarket O...      -0.9   
2   AAPL  Best Big Tech Stocks Of 2015 Can Ring In 2016 ...       0.5   
3   AAPL  Survey Suggests iPhone Sales Grew 33% in China...       0.5   
4   AAPL  How Tesla and Nissan’s Self-Parking Cars Fores...       0.5   

   intensity  relevance  short_term  long_term  volatility  novelty  \
0       0.85       0.95        -0.7       -0.5        0.65     0.55   
1       0.85       0.95        -0.7       -0.5        0.65     0.80   
2       0.70       0.95         0.3        0.2        0.60     0.55   
3       0.70       0.95         0.3        0.2        0.60     0.55   
4       0.70       0.95         0.3        0.2        0.60     0.55   

   credibility  
0         0.90  
1         0.95  
2         0.9




In [15]:
# =========================================
# FAST FAISS Sentiment Aggregation (POST-QWEN)
# =========================================

import pandas as pd
import numpy as np
from tqdm import tqdm

FEATURES = [
    "polarity", "intensity", "relevance", "short_term",
    "long_term", "volatility", "novelty", "credibility"
]

# Load FAISS retrieval
df_faiss = pd.read_csv(
    "../data/aapl_daily_retrieved_articles.csv",
    parse_dates=["date", "article_datetime"]
)

# Load article-level sentiment (generated by Qwen)
df_sent = pd.read_csv("../data/aapl_article_sentiment.csv")

# Merge sentiment onto FAISS rows
df_faiss = df_faiss.merge(df_sent, on="title", how="left")

def mean_pool(v):
    return v.mean(axis=0)

def weighted_pool(v, d, eps=1e-6):
    w = (1.0 / (d + eps)) * (1.0 + v[:, 1])  # intensity-aware
    w = w / w.sum()
    return (v * w[:, None]).sum(axis=0)

rows_mean = []
rows_weighted = []

for d, group in tqdm(df_faiss.groupby("date"), desc="FAISS aggregation"):
    vecs = group[FEATURES].values
    dists = group["faiss_distance"].values

    if len(vecs) == 0:
        agg_mean = np.zeros(8)
        agg_weighted = np.zeros(8)
    else:
        agg_mean = mean_pool(vecs)
        agg_weighted = weighted_pool(vecs, dists)

    rows_mean.append(["AAPL", d] + agg_mean.tolist())
    rows_weighted.append(["AAPL", d] + agg_weighted.tolist())

cols = ["ticker", "date"] + FEATURES

pd.DataFrame(rows_mean, columns=cols).to_csv(
    "../data/aapl_daily_sentiment_faiss_mean.csv",
    index=False
)

pd.DataFrame(rows_weighted, columns=cols).to_csv(
    "../data/aapl_daily_sentiment_faiss_weighted.csv",
    index=False
)

print("FAISS sentiment aggregation DONE.")


FAISS aggregation: 100%|██████████| 1010/1010 [00:00<00:00, 2622.24it/s]


FAISS sentiment aggregation DONE.
