In [None]:
import os
from datetime import datetime, timedelta
from io import BytesIO

import joblib
import pandas as pd
import requests
from azure.storage.filedatalake import DataLakeServiceClient

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 2) DAILY NEWS FROM GNEWS.IO
# ─────────────────────────────────────────────────────────────────────────────
# Fetch Bitcoin-related articles from the GNews search endpoint for the window
# start_str..end_str and load them into `df_news`, one row per article.
# NOTE(review): start_str / end_str are not defined in this cell — presumably
# an earlier cell sets them; confirm that cell runs first on a fresh kernel.

# Read the token from the environment instead of embedding it in the notebook.
# The previous hardcoded literal was always truthy, so the guard below could
# never fire and the request was sent with a placeholder token.
GNEWS_API_KEY = os.environ.get("GNEWS_API_KEY")
if not GNEWS_API_KEY:
    raise ValueError("Set GNEWS_API_KEY env var first")

gnews_url = (
    "https://gnews.io/api/v4/search"
    "?q=bitcoin"
    f"&from={start_str}"
    f"&to={end_str}"
    "&lang=en"
    "&sortby=publishedAt"
    "&max=100"
    f"&token={GNEWS_API_KEY}"
)

# requests has no default timeout; bound the call so a stalled connection
# cannot hang the notebook forever.
resp = requests.get(gnews_url, timeout=30)
resp.raise_for_status()
articles = resp.json().get("articles", [])

# Passing `columns` explicitly keeps the schema stable when the API returns no
# articles; without it the empty frame has no "date" column and the `year`
# derivation below raises KeyError.
df_news = pd.DataFrame(
    [
        {
            # Calendar date (UTC) of publication, used later as the grouping key.
            "date":   pd.to_datetime(a["publishedAt"], utc=True).date(),
            "title":  a["title"],
            # Prefer the description; fall back to content, then empty string.
            "text":   (a.get("description") or a.get("content") or ""),
            "url":    a["url"],
            "source": a["source"]["name"],
        }
        for a in articles
    ],
    columns=["date", "title", "text", "url", "source"],
)
df_news["year"] = pd.to_datetime(df_news["date"]).dt.year

In [None]:
# Collapse article-level rows into one row per calendar date:
#   news_sentiment – mean of sentiment_score for that date
#   news_count     – number of article rows for that date
# NOTE(review): "sentiment_score" is not added by the fetch cell shown above;
# presumably a scoring cell elsewhere in the notebook creates it — confirm it
# runs before this one.
articles_by_day = df_news.groupby("date")
news_agg = articles_by_day.agg(
    news_sentiment=pd.NamedAgg(column="sentiment_score", aggfunc="mean"),
    news_count=pd.NamedAgg(column="sentiment_score", aggfunc="size"),
)
news_agg = news_agg.reset_index()