In [None]:
# Mount Google Drive at /content/drive; a later cell reads the holdings CSV
# from a path under this mount point. Colab-only import.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install required packages
!pip install -q transformers bs4 requests sentencepiece

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datetime import datetime, timedelta
import time
import re

In [None]:
# Normalize title
# =========================
def normalize_title(text):
    """Return a canonical form of a headline for duplicate detection.

    The title is lower-cased, punctuation is removed, and every run of
    whitespace is collapsed to one space with no leading/trailing space.

    Args:
        text: Raw headline string.

    Returns:
        The normalized string. Headlines that differ only in case,
        punctuation or spacing map to the same value.
    """
    text = text.lower()
    # Strip punctuation *before* collapsing whitespace: removing a
    # free-standing symbol ("a - b", "! a") otherwise leaves a double or
    # leading space behind, so such titles were not recognised as duplicates
    # of their plain counterparts.
    text = re.sub(r"[^\w\s]", "", text)
    return re.sub(r"\s+", " ", text).strip()

In [None]:
# Load FinBERT
# =========================
# Tokenizer and classifier are loaded from the same Hugging Face checkpoint id.
# The runtime warning recorded further down in this notebook reports that this
# tokenizer has no predefined maximum length, so truncation needs an explicit
# max_length wherever it is requested.
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
finbert_model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
# NOTE(review): sample scores in this notebook suggest class index 0 is the
# neutral class for this checkpoint - confirm via finbert_model.config.id2label
# before relying on fixed logit positions.
# Run inference on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
finbert_model.to(device)

def analyze_sentiment(texts):
    """Score each text with FinBERT as P(positive) - P(negative).

    Args:
        texts: List of strings (e.g. headlines) to score in one batch.

    Returns:
        A list of Python floats in [-1, 1], one per input text and in the
        same order. An empty input yields an empty list.
    """
    if not texts:
        return []

    # Resolve the class positions from the checkpoint's own label map instead
    # of hard-coding them. finbert-tone orders its classes
    # Neutral / Positive / Negative, so the previous `p[2] - p[0]` computed
    # negative - neutral (clearly neutral headlines scored close to -1).
    label_index = {
        str(name).lower(): int(idx)
        for idx, name in finbert_model.config.id2label.items()
    }
    # Fall back to the documented finbert-tone order if labels are generic.
    pos_idx = label_index.get("positive", 1)
    neg_idx = label_index.get("negative", 2)

    # This tokenizer carries no predefined maximum length, so
    # `truncation=True` alone does nothing; BERT accepts at most 512 tokens.
    inputs = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    ).to(device)
    with torch.no_grad():
        logits = finbert_model(**inputs).logits
    probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy()
    # Plain floats keep the downstream DataFrame/CSV free of numpy scalars.
    return [float(p[pos_idx] - p[neg_idx]) for p in probs]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [None]:
def fetch_bing_news_titles(date, ticker, max_results=10):
    """
    Fetch up to max_results unique headlines for a ticker and date from Bing News.
    Use normalization to deduplicate headlines.

    Args:
        date: Object with ``strftime`` (e.g. ``datetime``); formatted as
            YYYY-MM-DD and appended to the search query.
        ticker: Ticker symbol; searched as an exact (quoted) phrase.
        max_results: Maximum number of headlines to return. Values <= 0
            return an empty list without issuing a request.

    Returns:
        List of headline strings (original casing), unique after
        ``normalize_title``; empty when nothing usable was found.

    Raises:
        requests.HTTPError: If Bing answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeout.

    Note:
        The date is only a search term, not a publication-date filter, so the
        result page can contain headlines from other days.
    """
    if max_results <= 0:
        # Previously one headline slipped through because the limit was only
        # checked after appending.
        return []

    query_date = date.strftime("%Y-%m-%d")
    query = f'"{ticker}" {query_date}'

    url = "https://www.bing.com/news/search"
    params = {
        "q": query,
        "qft": 'sortbydate="1"',
        "form": "QBNH"
    }
    headers = {"User-Agent": "Mozilla/5.0"}

    res = requests.get(url, headers=headers, params=params, timeout=15)
    # Surface blocks / rate limiting instead of parsing an error page and
    # reporting "no headlines" as if the search had succeeded.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")

    # Several overlapping selectors are tried; the same anchor may match more
    # than once, which the de-duplication below absorbs.
    title_tags = (
        soup.select("a.title") +
        soup.select("a[class*='title']") +
        soup.select("div.news-card h2 a") +
        soup.select("a[href*='/news/']")
    )

    seen_titles = set()
    headlines = []
    for tag in title_tags:
        text = tag.get_text(strip=True)
        # Fewer than four words: skip (this also drops empty link texts).
        if len(text.split()) < 4:
            continue
        norm_text = normalize_title(text)
        if norm_text in seen_titles:
            continue
        seen_titles.add(norm_text)
        headlines.append(text)
        if len(headlines) >= max_results:
            break

    if not headlines:
        print(f"  ⚠️ {ticker}: No headlines found for {query_date}")
    return headlines


In [None]:
def generate_sentiment_dataset(tickers, start_date, end_date, max_results=10, sleep_sec=1.0):
    """
    Loop through dates and tickers to fetch deduplicated headlines,
    run sentiment analysis, and build the result DataFrame.
    Returns headline-level sentiment scores.

    Args:
        tickers: Iterable of ticker symbols to query on every date.
        start_date: First date to process (inclusive).
        end_date: Last date to process (inclusive).
        max_results: Maximum headlines requested per ticker and date.
        sleep_sec: Pause in seconds between two consecutive dates (not
            between individual ticker requests).

    Returns:
        DataFrame with columns ``date`` (YYYY-MM-DD string), ``ticker``,
        ``title`` and ``sentiment``, one row per headline. The columns are
        present even when no headline was collected.

    Errors raised while processing a single ticker are printed and that
    ticker is skipped for the date; the loop continues.
    """
    columns = ["date", "ticker", "title", "sentiment"]
    rows = []
    current = start_date
    while current <= end_date:
        date_str = current.strftime("%Y-%m-%d")
        print(f"\n🗓️ {date_str}")
        for ticker in tickers:
            try:
                headlines = fetch_bing_news_titles(current, ticker, max_results=max_results)
                if not headlines:
                    continue
                # Get a sentiment score for each headline
                scores = analyze_sentiment(headlines)
                # zip() would silently drop rows on a length mismatch.
                if len(scores) != len(headlines):
                    raise ValueError(
                        f"expected {len(headlines)} scores, got {len(scores)}"
                    )
                # Append one row per headline (headline-level score)
                rows.extend(
                    {
                        "date": date_str,
                        "ticker": ticker,
                        "title": title,
                        "sentiment": score,
                    }
                    for title, score in zip(headlines, scores)
                )
                print(f" {ticker}: {len(headlines)} unique headlines")
            except Exception as e:
                # Deliberate best-effort: one failing ticker must not abort
                # the whole crawl.
                print(f" Error for {ticker}: {e}")
        current += timedelta(days=1)
        # No need to wait once the last date has been processed.
        if current <= end_date:
            time.sleep(sleep_sec)
    # Explicit columns keep the schema stable for an empty result, so
    # downstream drop_duplicates(subset=[...]) does not raise KeyError.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Load the ARKK top-20 holdings CSV from the mounted Drive folder.
csv_path = "/content/drive/My Drive/Fintech/Dataset/Holding_data/ARKK_top20_holdings.csv"
df_holdings = pd.read_csv(csv_path)
# Distinct values of the "ticker" column as a plain Python list; this is the
# ticker universe the sentiment pipeline iterates over.
tickers = df_holdings["ticker"].unique().tolist()
print("Loaded ARKK tickers:", tickers)

Loaded ARKK tickers: ['TSLA', 'ROKU', 'COIN', 'RBLX', 'PLTR', 'CRSP', 'TEM', 'SHOP', 'HOOD', 'SQ', 'CRCL', 'BMNR', 'ACHR', 'AMD', 'TWST', 'BEAM', 'XYZ', 'META', 'AMZN', 'TER']


In [None]:
# Step 2: Date window to scan; both endpoints are processed.
start = datetime(year=2025, month=1, day=1)
end = datetime(year=2025, month=10, day=31)

# Step 3: Build the headline-level sentiment table for every ticker and date.
# This issues one web request per ticker per day, so expect a long run.
df_sentiment = generate_sentiment_dataset(
    tickers=tickers,
    start_date=start,
    end_date=end,
    max_results=10,
    sleep_sec=1.0,
)


🗓️ 2025-01-01


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Streaming output truncated to the last 5000 lines.
 TWST: 10 unique headlines
 BEAM: 10 unique headlines
 XYZ: 10 unique headlines
 META: 10 unique headlines
 AMZN: 8 unique headlines
 TER: 10 unique headlines

🗓️ 2025-03-19
 TSLA: 8 unique headlines
 ROKU: 10 unique headlines
 COIN: 10 unique headlines
 RBLX: 9 unique headlines
 PLTR: 10 unique headlines
 CRSP: 10 unique headlines
 TEM: 10 unique headlines
 SHOP: 10 unique headlines
 HOOD: 10 unique headlines
 SQ: 10 unique headlines
 CRCL: 10 unique headlines
 BMNR: 10 unique headlines
 ACHR: 9 unique headlines
 AMD: 10 unique headlines
 TWST: 9 unique headlines
 BEAM: 10 unique headlines
 XYZ: 10 unique headlines
 META: 10 unique headlines
 AMZN: 8 unique headlines
 TER: 10 unique headlines

🗓️ 2025-03-20
 TSLA: 10 unique headlines
 ROKU: 10 unique headlines
 COIN: 10 unique headlines
 RBLX: 10 unique headlines
 PLTR: 10 unique headlines
 CRSP: 9 unique headlines
 TEM: 10 unique headlines
 SHOP: 10 unique hea

In [None]:
# Keep one row per (date, ticker, title) and renumber the index from zero.
df_sentiment = (
    df_sentiment
    .drop_duplicates(subset=["date", "ticker", "title"])
    .reset_index(drop=True)
)

# Write the cleaned table to CSV (without the index) and preview the first rows.
df_sentiment.to_csv(
    "ARKK_companynews_sentiment_202501_202510.csv",
    index=False,
)
df_sentiment.head()

Unnamed: 0,date,ticker,title,sentiment
0,2025-01-01,TSLA,"Earnings live: Home Depot, Klarna stocks fall ...",0.090468
1,2025-01-01,TSLA,Tesla Zone Drilling Further Defines and Expand...,-0.001249
2,2025-01-01,TSLA,Tesla's 2025 CEO Performance Award: Aligning V...,-7e-06
3,2025-01-01,TSLA,Elon Musk vs Sam Altman: Tesla CEO accuses Ope...,-0.994335
4,2025-01-01,TSLA,"Daily Horoscope for November 01, 2025",-0.999984


In [None]:
# Download
# Hand the CSV written by the save step to the Colab download helper; the file
# name must match the one passed to to_csv above. Colab-only import.
from google.colab import files
files.download("ARKK_companynews_sentiment_202501_202510.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>