# Notebook 1 — Prices (AAPL, XOM) + News → FinBERT → Daily sentiment index `I_t` + Features

Что делает ноутбук:

1) скачивает дневные OHLCV за последние **5 лет** (можно поменять `YEARS_BACK`);  
2) скачивает новости (по умолчанию **Alpha Vantage News & Sentiment**; нужен API key; есть fallback на GDELT, но у него ограничение по истории);  
3) прогоняет тексты новостей через **FinBERT (ProsusAI/finbert)**, считает `s(x)` и дневной индекс `I_t`;  
4) считает `returns`, `RSI`, `MACD`;  
5) собирает финальную таблицу `date, returns, RSI, MACD, I_t` (и сохраняет в файлы).


In [None]:
import sys, subprocess, importlib

def pip_install(pkgs):
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q"] + pkgs)

required = [
    "pandas>=2.0", "numpy>=1.24", "requests>=2.31", "tqdm>=4.66",
    "yfinance>=0.2.30",
    "transformers>=4.40", "torch",  # torch может быть уже установлен
    "pyarrow>=14.0",  # для parquet
]

missing = []
for pkg in ["pandas","numpy","requests","tqdm","yfinance","transformers","torch","pyarrow"]:
    try:
        importlib.import_module(pkg)
    except Exception:
        missing.append(pkg)

if missing:
    print("Installing missing:", missing)
    pip_install(required)
else:
    print("All dependencies look installed.")

  from .autonotebook import tqdm as notebook_tqdm


All dependencies look installed.


In [2]:
import os, math, time
from datetime import datetime, timedelta, timezone
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from tqdm.auto import tqdm

DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

TICKERS = ["AAPL", "XOM"]
YEARS_BACK = 5  # поменяйте на 3..5 по задаче

END_DATE = pd.Timestamp.utcnow().normalize()
START_DATE = END_DATE - pd.DateOffset(years=YEARS_BACK)

print("Date range:", START_DATE.date(), "→", END_DATE.date())

Date range: 2020-12-28 → 2025-12-28


## 1) Данные по ценам (daily OHLCV)

In [None]:
import pandas as pd
import yfinance as yf

# --- 0) DATA_DIR (на всякий случай) ---
DATA_DIR.mkdir(parents=True, exist_ok=True)

# --- 1) Выбор engine: fastparquet -> предпочтительно ---
_PARQUET_ENGINE = None
try:
    import fastparquet  # noqa: F401
    _PARQUET_ENGINE = "fastparquet"
    print("Parquet engine:", _PARQUET_ENGINE)
except Exception:
    _PARQUET_ENGINE = "pyarrow"
    print("Parquet engine:", _PARQUET_ENGINE, "(fastparquet not found)")

# --- 2) Safe Parquet writer ---
def _reset_pyarrow_pandas_ext_types():
    # безопасно: если pyarrow нет/другая версия — просто пропустим
    try:
        import pyarrow as pa
    except Exception:
        return

    # точечные самые частые
    for name in ("pandas.period", "pandas.interval"):
        try:
            pa.unregister_extension_type(name)
        except Exception:
            pass

    # попытка подчистить всё pandas.* если API доступен
    for attr in ("registered_extension_types", "get_registered_extension_types", "list_registered_extension_types"):
        try:
            fn = getattr(pa, attr)
        except Exception:
            continue
        try:
            reg = fn()
            # pyarrow может вернуть dict или list
            if isinstance(reg, dict):
                names = list(reg.keys())
            else:
                names = list(reg)
            for n in names:
                n = str(n)
                if n.startswith("pandas."):
                    try:
                        pa.unregister_extension_type(n)
                    except Exception:
                        pass
            break
        except Exception:
            pass


def safe_to_parquet(df: pd.DataFrame, path, engine: str):
    """
    Пишем Parquet устойчиво.
    - Если ловим ArrowKeyError про pandas.* already defined — чистим registry и повторяем 1 раз.
    """
    try:
        df.to_parquet(path, index=False, engine=engine)
    except Exception as e:
        msg = str(e)
        if ("type extension with name pandas." in msg and "already defined" in msg) or "ArrowKeyError" in msg:
            _reset_pyarrow_pandas_ext_types()
            df.to_parquet(path, index=False, engine=engine)  # retry once
        else:
            raise

def download_prices(ticker: str, start: pd.Timestamp, end: pd.Timestamp) -> pd.DataFrame:
    df = yf.download(
        ticker, start=start.date(), end=(end + pd.Timedelta(days=1)).date(),
        interval="1d", auto_adjust=False, progress=False
    )
    if df.empty:
        raise RuntimeError(f"No price data for {ticker}")
    df = df.rename(columns=str.lower)
    df.index = pd.to_datetime(df.index).tz_localize(None)
    df = df.reset_index().rename(columns={"Date":"date", "index":"date"})
    df["ticker"] = ticker
    return df[["date","open","high","low","close","adj close","volume","ticker"]]

# --- 3) Скачивание и сохранение ---
prices_list = []

for t in TICKERS:
    dft = download_prices(t, START_DATE, END_DATE)  # ваша функция из предыдущей ячейки
    prices_list.append(dft)

    out_path = DATA_DIR / f"prices_{t}.parquet"
    safe_to_parquet(dft, out_path, engine=_PARQUET_ENGINE)

    print(
        t,
        dft.shape,
        dft["date"].min().date(),
        dft["date"].max().date(),
        "->",
        out_path.name,
    )

prices = pd.concat(prices_list, ignore_index=True)

all_path = DATA_DIR / "prices_all.parquet"
safe_to_parquet(prices, all_path, engine=_PARQUET_ENGINE)
print("Saved:", all_path)

prices.head()


Parquet engine: fastparquet
AAPL (1256, 8) 2020-12-28 2025-12-26 -> prices_AAPL.parquet
XOM (1256, 8) 2020-12-28 2025-12-26 -> prices_XOM.parquet
Saved: data/prices_all.parquet


Price,date,open,high,low,close,adj close,volume,ticker,open,high,low,close,adj close,volume
Ticker,Unnamed: 1_level_1,aapl,aapl,aapl,aapl,aapl,aapl,Unnamed: 8_level_1,xom,xom,xom,xom,xom,xom
0,2020-12-28,133.990005,137.339996,133.509995,136.690002,133.061218,124486200.0,AAPL,,,,,,
1,2020-12-29,138.050003,138.789993,134.339996,134.869995,131.28952,121047300.0,AAPL,,,,,,
2,2020-12-30,135.580002,135.990005,133.399994,133.720001,130.170029,96452100.0,AAPL,,,,,,
3,2020-12-31,134.080002,134.740005,131.720001,132.690002,129.167374,99116600.0,AAPL,,,,,,
4,2021-01-04,133.520004,133.610001,126.760002,129.410004,125.97448,143301900.0,AAPL,,,,,,


## 2) Новости + маппинг по датам

### Источник новостей
- **Основной (рекомендуется): Alpha Vantage `NEWS_SENTIMENT`** — поддерживает `time_from/time_to` (можно брать 3–5 лет), но нужен API key. citeturn3view0  
- **Fallback: GDELT DOC API** — без ключа, но по сути ограничен коротким окном истории (не подойдет для 3–5 лет). citeturn1view0

В коде ниже:
- если `ALPHAVANTAGE_API_KEY` задан, используем Alpha Vantage;
- иначе используем GDELT на коротком промежутке (чтобы ноутбук всё равно работал).

In [4]:
# TB228JYIOYDWU5Q9

ALPHAVANTAGE_API_KEY = os.getenv("ALPHAVANTAGE_API_KEY", "").strip()
ALPHAVANTAGE_API_KEY="TB228JYIOYDWU5Q9"
USE_ALPHA_VANTAGE = bool(ALPHAVANTAGE_API_KEY)


print("USE_ALPHA_VANTAGE =", USE_ALPHA_VANTAGE)
if not USE_ALPHA_VANTAGE:
    print("⚠️  Нет ALPHAVANTAGE_API_KEY. Будет fallback на GDELT (ограниченная история).")



USE_ALPHA_VANTAGE = True


In [5]:
def yyyymmddThhmm(ts: pd.Timestamp) -> str:
    # Alpha Vantage: YYYYMMDDTHHMM (UTC)
    ts = ts.tz_localize(timezone.utc) if ts.tzinfo is None else ts.tz_convert(timezone.utc)
    return ts.strftime("%Y%m%dT%H%M")

def alpha_vantage_news_window(ticker: str, time_from: pd.Timestamp, time_to: pd.Timestamp, limit: int = 1000, sort: str="EARLIEST") -> pd.DataFrame:
    url = "https://www.alphavantage.co/query"
    params = {
        "function": "NEWS_SENTIMENT",
        "tickers": ticker,
        "time_from": yyyymmddThhmm(time_from),
        "time_to": yyyymmddThhmm(time_to),
        "limit": limit,
        "sort": sort,
        "apikey": ALPHAVANTAGE_API_KEY,
    }
    r = requests.get(url, params=params, timeout=60)
    r.raise_for_status()
    data = r.json()

    # возможные ошибки: {'Information': '...'} / {'Note': '...'}
    if "feed" not in data:
        raise RuntimeError(f"Alpha Vantage response without 'feed': {list(data.keys())}")

    rows = []
    for it in data["feed"]:
        rows.append({
            "ticker": ticker,
            "time_published": it.get("time_published"),
            "title": it.get("title"),
            "summary": it.get("summary"),
            "url": it.get("url"),
            "source": it.get("source"),
        })
    df = pd.DataFrame(rows)
    return df

def fetch_alpha_vantage_news(ticker: str, start: pd.Timestamp, end: pd.Timestamp, window_days: int = 30) -> pd.DataFrame:
    # chunk по окнам, чтобы не упираться в limit и проще переживать rate-limit
    all_parts = []
    cur = start
    pbar = tqdm(total=int((end-start).days/window_days)+1, desc=f"AV news {ticker}")
    while cur < end:
        nxt = min(cur + pd.Timedelta(days=window_days), end)
        try:
            part = alpha_vantage_news_window(ticker, cur, nxt, limit=1000, sort="EARLIEST")
            all_parts.append(part)
        except Exception as e:
            print("Window failed:", cur.date(), "→", nxt.date(), ":", repr(e))
        # free-tier rate limit обычно 5 запросов/мин → пауза; при необходимости уменьшите
        time.sleep(12)
        cur = nxt
        pbar.update(1)
    pbar.close()

    if not all_parts:
        return pd.DataFrame(columns=["ticker","time_published","title","summary","url","source"])

    df = pd.concat(all_parts, ignore_index=True).drop_duplicates(subset=["url"])
    return df

# GDELT fallback (короткое окно истории)
def fetch_gdelt_artlist(query: str, timespan: str = "3m", maxrecords: int = 250) -> pd.DataFrame:
    url = "https://api.gdeltproject.org/api/v2/doc/doc"
    params = {
        "query": query,
        "mode": "artlist",
        "format": "json",
        "timespan": timespan,
        "maxrecords": maxrecords,
        "sort": "datedesc",
    }
    r = requests.get(url, params=params, timeout=60)
    r.raise_for_status()
    data = r.json()
    arts = data.get("articles", [])
    rows = []
    for a in arts:
        rows.append({
            "time_published": a.get("seendate"),
            "title": a.get("title"),
            "summary": a.get("snippet"),
            "url": a.get("url"),
            "source": a.get("sourceCountry"),
        })
    return pd.DataFrame(rows)

In [None]:
import pandas as pd

DATA_DIR.mkdir(parents=True, exist_ok=True)

# --- 1) Выбор движка Parquet (fastparquet -> надёжнее, иначе pyarrow + reset) ---
_PARQUET_ENGINE = None
try:
    import fastparquet  # noqa: F401
    _PARQUET_ENGINE = "fastparquet"
except Exception:
    _PARQUET_ENGINE = "pyarrow"

def _reset_pyarrow_pandas_ext_types():
    try:
        import pyarrow as pa
    except Exception:
        return

    # Частые конфликтующие типы
    for name in ("pandas.period", "pandas.interval"):
        try:
            pa.unregister_extension_type(name)
        except Exception:
            pass

    # Попытка удалить всё pandas.* если API доступен
    for attr in ("registered_extension_types", "get_registered_extension_types", "list_registered_extension_types"):
        try:
            fn = getattr(pa, attr)
        except Exception:
            continue
        try:
            reg = fn()
            names = list(reg.keys()) if isinstance(reg, dict) else list(reg)
            for n in names:
                n = str(n)
                if n.startswith("pandas."):
                    try:
                        pa.unregister_extension_type(n)
                    except Exception:
                        pass
            break
        except Exception:
            pass

def safe_to_parquet(df: pd.DataFrame, path):
    try:
        df.to_parquet(path, index=False, engine=_PARQUET_ENGINE)
    except Exception as e:
        msg = str(e)
        if ("type extension with name pandas." in msg and "already defined" in msg) or "ArrowKeyError" in msg:
            _reset_pyarrow_pandas_ext_types()
            df.to_parquet(path, index=False, engine=_PARQUET_ENGINE)  # retry once
        else:
            raise

def safe_read_parquet(path) -> pd.DataFrame:
    # чтение обычно не падает, но пусть будет симметрично
    return pd.read_parquet(path, engine=_PARQUET_ENGINE)

print("Parquet engine:", _PARQUET_ENGINE)


# --- 2) Нормальный парсер time_published ---
def to_date_from_time_published(s) -> pd.Timestamp:
    # Alpha Vantage: YYYYMMDDTHHMMSS ; GDELT often: YYYYMMDDHHMMSS or variations
    if s is None or (isinstance(s, float) and pd.isna(s)):
        return pd.NaT
    s = str(s).strip()
    if not s:
        return pd.NaT

    try:
        if "T" in s and len(s) >= 15:  # 20240131T235959
            dt = pd.to_datetime(s[:15], format="%Y%m%dT%H%M%S", utc=True, errors="coerce")
        elif len(s) >= 14:  # 20240131235959
            dt = pd.to_datetime(s[:14], format="%Y%m%d%H%M%S", utc=True, errors="coerce")
        elif len(s) >= 12:  # 202401312359
            dt = pd.to_datetime(s[:12], format="%Y%m%d%H%M", utc=True, errors="coerce")
        else:
            dt = pd.to_datetime(s, utc=True, errors="coerce")
    except Exception:
        dt = pd.NaT

    if pd.isna(dt):
        return pd.NaT
    return dt.tz_convert(None).normalize()


# --- 3) Загрузка/кеширование новостей ---
news_all = []

for t in TICKERS:
    out_path = DATA_DIR / f"news_raw_{t}.parquet"

    if out_path.exists():
        df = safe_read_parquet(out_path)
        print("Loaded cached:", out_path.name, df.shape)
    else:
        if USE_ALPHA_VANTAGE:
            df = fetch_alpha_vantage_news(t, START_DATE, END_DATE, window_days=30)
        else:
            # fallback: запрос по тикеру как по ключевому слову (ограничение истории)
            df = fetch_gdelt_artlist(query=t, timespan="3m", maxrecords=250)

        # гарантируем ticker
        df["ticker"] = t

        # гарантируем expected columns (на случай разной схемы источников)
        for col in ("time_published", "title", "url"):
            if col not in df.columns:
                df[col] = pd.NA

        df["date"] = df["time_published"].apply(to_date_from_time_published)

        # чистка
        df = df.dropna(subset=["date", "title"]).copy()
        if "url" in df.columns and df["url"].notna().any():
            df = df.drop_duplicates(subset=["url"])
        else:
            df = df.drop_duplicates(subset=["title", "date", "ticker"])

        safe_to_parquet(df, out_path)
        print("Saved:", out_path.name, df.shape)

    news_all.append(df)

news_all = pd.concat(news_all, ignore_index=True)
news_all.head()


Parquet engine: fastparquet


AV news AAPL: 100%|██████████| 61/61 [13:18<00:00, 13.09s/it]


Saved: news_raw_AAPL.parquet (3503, 7)


AV news XOM: 100%|██████████| 61/61 [13:07<00:00, 12.91s/it]

Saved: news_raw_XOM.parquet (2652, 7)





Unnamed: 0,ticker,time_published,title,summary,url,source,date
0,AAPL,20201229T125600,Intel shares rise after Third Point urges chip...,"Hedge fund Third Point, which recently acquire...",https://www.cnbc.com/2020/12/29/third-point-ur...,CNBC,2020-12-29
1,AAPL,20201230T022400,Exclusive: Hedge fund Third Point urges Intel ...,"Activist hedge fund Third Point LLC, which has...",https://www.reuters.com/business/retail-consum...,Reuters,2020-12-30
2,AAPL,20201230T052800,Apple loses copyright claims in lawsuit agains...,A federal judge in Florida dismissed Apple Inc...,https://www.reuters.com/business/apple-loses-c...,Reuters,2020-12-30
3,AAPL,20210104T120000,Apple Veterans’ Lidar Startup Adds $200 Millio...,"Aeva, a lidar startup founded by former Apple ...",https://www.bloomberg.com/news/articles/2021-0...,Bloomberg.com,2021-01-04
4,AAPL,20210104T212202,The TDVG ETF Is a Stellar Choice for Dividend ...,The TDVG ETF is highlighted as a strong option...,https://etfdb.com/active-etf-channel/tdvg-etf-...,ETF Database,2021-01-04


## 3) FinBERT → `s(x)` и дневной индекс `I_t`

Модель: **ProsusAI/finbert** (3 класса: positive/negative/neutral). citeturn0search1  

Считаем для каждой новости:

- `p_pos, p_neg, p_neu` — softmax вероятности
- `s(x) = p_pos - p_neg` (в диапазоне [-1, 1]) — удобная непрерывная метрика citeturn0search13

Далее агрегируем по дню и тикеру:

- `I_t = mean(s(x))` по всем новостям в этот день.

In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_NAME = "ProsusAI/finbert"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("device:", device)
print("labels:", model.config.id2label)

device: cpu
labels: {0: 'positive', 1: 'negative', 2: 'neutral'}


In [8]:
def finbert_scores(texts, batch_size: int = 16, max_length: int = 256):
    # returns np.array shape [n,3] aligned with model.config.id2label
    probs_all = []
    for i in tqdm(range(0, len(texts), batch_size), desc="FinBERT", leave=False):
        batch = texts[i:i+batch_size]
        enc = tokenizer(
            batch, truncation=True, padding=True, max_length=max_length,
            return_tensors="pt"
        )
        enc = {k: v.to(device) for k, v in enc.items()}
        with torch.no_grad():
            logits = model(**enc).logits
            probs = torch.softmax(logits, dim=-1).detach().cpu().numpy()
        probs_all.append(probs)
    return np.vstack(probs_all) if probs_all else np.zeros((0,3), dtype=float)

def build_text(row) -> str:
    title = row.get("title") or ""
    summary = row.get("summary") or ""
    txt = (title + ". " + summary).strip()
    return txt[:5000]  # safety cap

news = news_all.copy()
news["text"] = news.apply(build_text, axis=1)

# ограничим пустые/короткие тексты
news = news[news["text"].str.len() >= 5].copy()
news = news.sort_values(["ticker","date","url"]).reset_index(drop=True)

print("News rows:", len(news))
news.head()

News rows: 6155


Unnamed: 0,ticker,time_published,title,summary,url,source,date,text
0,AAPL,20201229T125600,Intel shares rise after Third Point urges chip...,"Hedge fund Third Point, which recently acquire...",https://www.cnbc.com/2020/12/29/third-point-ur...,CNBC,2020-12-29,Intel shares rise after Third Point urges chip...
1,AAPL,20201230T052800,Apple loses copyright claims in lawsuit agains...,A federal judge in Florida dismissed Apple Inc...,https://www.reuters.com/business/apple-loses-c...,Reuters,2020-12-30,Apple loses copyright claims in lawsuit agains...
2,AAPL,20201230T022400,Exclusive: Hedge fund Third Point urges Intel ...,"Activist hedge fund Third Point LLC, which has...",https://www.reuters.com/business/retail-consum...,Reuters,2020-12-30,Exclusive: Hedge fund Third Point urges Intel ...
3,AAPL,20210104T212202,The TDVG ETF Is a Stellar Choice for Dividend ...,The TDVG ETF is highlighted as a strong option...,https://etfdb.com/active-etf-channel/tdvg-etf-...,ETF Database,2021-01-04,The TDVG ETF Is a Stellar Choice for Dividend ...
4,AAPL,20210104T120000,Apple Veterans’ Lidar Startup Adds $200 Millio...,"Aeva, a lidar startup founded by former Apple ...",https://www.bloomberg.com/news/articles/2021-0...,Bloomberg.com,2021-01-04,Apple Veterans’ Lidar Startup Adds $200 Millio...


In [None]:
import numpy as np
import pandas as pd

DATA_DIR.mkdir(parents=True, exist_ok=True)

# --- 1) Parquet safe writer (самодостаточно) ---
_PARQUET_ENGINE = None
try:
    import fastparquet  # noqa: F401
    _PARQUET_ENGINE = "fastparquet"
except Exception:
    _PARQUET_ENGINE = "pyarrow"

def _reset_pyarrow_pandas_ext_types():
    try:
        import pyarrow as pa
    except Exception:
        return
    for name in ("pandas.period", "pandas.interval"):
        try:
            pa.unregister_extension_type(name)
        except Exception:
            pass
    for attr in ("registered_extension_types", "get_registered_extension_types", "list_registered_extension_types"):
        try:
            fn = getattr(pa, attr)
        except Exception:
            continue
        try:
            reg = fn()
            names = list(reg.keys()) if isinstance(reg, dict) else list(reg)
            for n in names:
                n = str(n)
                if n.startswith("pandas."):
                    try:
                        pa.unregister_extension_type(n)
                    except Exception:
                        pass
            break
        except Exception:
            pass

def safe_to_parquet(df: pd.DataFrame, path):
    try:
        df.to_parquet(path, index=False, engine=_PARQUET_ENGINE)
    except Exception as e:
        msg = str(e)
        if ("type extension with name pandas." in msg and "already defined" in msg) or "ArrowKeyError" in msg:
            _reset_pyarrow_pandas_ext_types()
            df.to_parquet(path, index=False, engine=_PARQUET_ENGINE)  # retry once
        else:
            raise

print("Parquet engine:", _PARQUET_ENGINE)

# --- 2) FinBERT label mapping (robust к разным вариантам id2label) ---
id2label = {int(k): str(v).lower() for k, v in model.config.id2label.items()}

pos_id = next((i for i, lab in id2label.items() if "pos" in lab), None)
neg_id = next((i for i, lab in id2label.items() if "neg" in lab), None)
neu_id = next((i for i, lab in id2label.items() if "neu" in lab), None)

if pos_id is None or neg_id is None:
    raise RuntimeError(f"Unexpected FinBERT labels: {id2label}")

# --- 3) Готовим тексты: гарантируем str и без NaN ---
if "text" not in news.columns:
    raise RuntimeError("news must have a 'text' column")

texts = news["text"].fillna("").astype(str).tolist()

# --- 4) Скоринг ---
probs = finbert_scores(texts, batch_size=16, max_length=256)

# probs может быть list -> в ndarray
probs = np.asarray(probs, dtype="float32")

news["p_pos"] = probs[:, pos_id]
news["p_neg"] = probs[:, neg_id]
news["p_neu"] = probs[:, neu_id] if neu_id is not None else np.nan
news["s_x"] = news["p_pos"] - news["p_neg"]

# --- 5) Сохранение ---
out_path = DATA_DIR / "news_scored_all.parquet"
safe_to_parquet(news, out_path)
print("Saved:", out_path)

news.head()


Parquet engine: fastparquet


                                                          

Saved: data/news_scored_all.parquet




Unnamed: 0,ticker,time_published,title,summary,url,source,date,text,p_pos,p_neg,p_neu,s_x
0,AAPL,20201229T125600,Intel shares rise after Third Point urges chip...,"Hedge fund Third Point, which recently acquire...",https://www.cnbc.com/2020/12/29/third-point-ur...,CNBC,2020-12-29,Intel shares rise after Third Point urges chip...,0.375198,0.590296,0.034506,-0.215098
1,AAPL,20201230T052800,Apple loses copyright claims in lawsuit agains...,A federal judge in Florida dismissed Apple Inc...,https://www.reuters.com/business/apple-loses-c...,Reuters,2020-12-30,Apple loses copyright claims in lawsuit agains...,0.047186,0.759848,0.192967,-0.712662
2,AAPL,20201230T022400,Exclusive: Hedge fund Third Point urges Intel ...,"Activist hedge fund Third Point LLC, which has...",https://www.reuters.com/business/retail-consum...,Reuters,2020-12-30,Exclusive: Hedge fund Third Point urges Intel ...,0.026362,0.936882,0.036756,-0.91052
3,AAPL,20210104T212202,The TDVG ETF Is a Stellar Choice for Dividend ...,The TDVG ETF is highlighted as a strong option...,https://etfdb.com/active-etf-channel/tdvg-etf-...,ETF Database,2021-01-04,The TDVG ETF Is a Stellar Choice for Dividend ...,0.931475,0.010039,0.058486,0.921437
4,AAPL,20210104T120000,Apple Veterans’ Lidar Startup Adds $200 Millio...,"Aeva, a lidar startup founded by former Apple ...",https://www.bloomberg.com/news/articles/2021-0...,Bloomberg.com,2021-01-04,Apple Veterans’ Lidar Startup Adds $200 Millio...,0.949373,0.015201,0.035426,0.934172


In [10]:
# Дневной индекс I_t: средний sentiment по дню (и счётчик новостей)
daily_I = (
    news.groupby(["ticker","date"], as_index=False)
        .agg(I_t=("s_x","mean"), n_news=("s_x","size"))
)

safe_to_parquet(daily_I, DATA_DIR / "daily_sentiment_I_t.parquet")
daily_I.head()

Unnamed: 0,ticker,date,I_t,n_news
0,AAPL,2020-12-29,-0.215098,1
1,AAPL,2020-12-30,-0.811591,2
2,AAPL,2021-01-04,0.927804,2
3,AAPL,2021-01-05,0.254013,3
4,AAPL,2021-01-06,0.147563,3


## 4) Returns + RSI + MACD

In [11]:
import pandas as pd
import numpy as np

def yf_multiindex_to_long(prices: pd.DataFrame) -> pd.DataFrame:
    if not isinstance(prices.columns, pd.MultiIndex):
        raise TypeError(f"Expected MultiIndex columns, got {type(prices.columns)}")

    p = prices.copy()

    def norm(x) -> str:
        return str(x).strip().lower().replace(" ", "_")

    # 1) нормализуем имена полей (level 0) и тикеры (level 1)
    p.columns = pd.MultiIndex.from_tuples([
        (norm(a), str(b).strip().upper() if b is not None else "")
        for a, b in p.columns
    ])

    # 2) дата: у тебя она колонкой ('date','') -> делаем индексом
    if ("date", "") in p.columns:
        p = p.set_index(("date", ""))
        p.index.name = "date"
    else:
        # если даты нет колонкой, считаем что индекс и есть дата
        p.index.name = p.index.name or "date"

    # 3) ВАЖНО: выкидываем служебный ('ticker',''), чтобы не было конфликта при reset_index()
    if ("ticker", "") in p.columns:
        p = p.drop(columns=[("ticker", "")])

    # 4) stack по тикеру (level 1)
    out = (
        p.stack(level=1)
         .rename_axis(index=["date", "ticker"])
         .reset_index()
    )

    # 5) финальная чистка
    out.columns = [norm(c) for c in out.columns]
    out["date"] = pd.to_datetime(out["date"], errors="coerce")
    out["ticker"] = out["ticker"].astype("string").str.strip().str.upper()

    # оставим основные поля, если есть
    keep = [c for c in ["date","ticker","open","high","low","close","adj_close","volume"] if c in out.columns]
    out = (
        out[keep]
        .dropna(subset=["date","ticker"])
        .sort_values(["ticker","date"])
        .reset_index(drop=True)
    )
    return out

# --- usage ---
prices = yf_multiindex_to_long(prices)

# тикеры в том же виде, что в prices
TICKERS = [str(t).strip().upper() for t in TICKERS]

print(prices.columns)
print(prices.head())
print(prices["ticker"].value_counts().head())


Index(['date', 'ticker', 'open', 'high', 'low', 'close', 'adj_close',
       'volume'],
      dtype='object')
        date ticker        open        high         low       close  \
0 2020-12-28   AAPL  133.990005  137.339996  133.509995  136.690002   
1 2020-12-29   AAPL  138.050003  138.789993  134.339996  134.869995   
2 2020-12-30   AAPL  135.580002  135.990005  133.399994  133.720001   
3 2020-12-31   AAPL  134.080002  134.740005  131.720001  132.690002   
4 2021-01-04   AAPL  133.520004  133.610001  126.760002  129.410004   

    adj_close       volume  
0  133.061218  124486200.0  
1  131.289520  121047300.0  
2  130.170029   96452100.0  
3  129.167374   99116600.0  
4  125.974480  143301900.0  
ticker
AAPL    1256
XOM     1256
Name: count, dtype: Int64


  p.stack(level=1)


In [None]:
import numpy as np
import pandas as pd

# --- 0) sanity: нормализуем названия колонок (если вдруг где-то иначе) ---
prices = prices.copy()
prices.columns = [str(c).strip().lower().replace(" ", "_") for c in prices.columns]

# --- 1) выбираем правильную колонку цены для расчётов ---
# приоритет: adj_close -> close
price_col = "adj_close" if "adj_close" in prices.columns else ("close" if "close" in prices.columns else None)
if price_col is None:
    raise RuntimeError(f"prices must contain adj_close or close. Got columns: {list(prices.columns)}")

# --- 2) индикаторы ---
def ema(series: pd.Series, span: int) -> pd.Series:
    return series.ewm(span=span, adjust=False).mean()

def rsi(close: pd.Series, period: int = 14) -> pd.Series:
    delta = close.diff()
    gain = delta.clip(lower=0)
    loss = (-delta).clip(lower=0)
    avg_gain = gain.ewm(alpha=1/period, adjust=False).mean()
    avg_loss = loss.ewm(alpha=1/period, adjust=False).mean()
    rs = avg_gain / avg_loss.replace(0, np.nan)
    return 100 - (100 / (1 + rs))

def macd(close: pd.Series, fast: int = 12, slow: int = 26, signal: int = 9):
    macd_line = ema(close, fast) - ema(close, slow)
    signal_line = ema(macd_line, signal)
    hist = macd_line - signal_line
    return macd_line, signal_line, hist

# --- 3) Фичи по каждому тикеру ---
feat_parts = []

for t in TICKERS:
    p = prices.loc[prices["ticker"] == t].copy()

    if p.empty:
        print(f"WARNING: no rows for {t}")
        continue

    p = p.sort_values("date")

    px = pd.to_numeric(p[price_col], errors="coerce")
    p["returns"] = px.pct_change()  # close-to-close
    p["RSI"] = rsi(px, period=14)
    macd_line, signal_line, hist = macd(px)
    p["MACD"] = macd_line
    p["MACD_signal"] = signal_line
    p["MACD_hist"] = hist

    feat_parts.append(p)

feat_prices = pd.concat(feat_parts, ignore_index=True)

# (опционально) оставим только ключевые колонки + индикаторы
keep_cols = [c for c in ["date", "ticker", price_col, "returns", "RSI", "MACD", "MACD_signal", "MACD_hist"] if c in feat_prices.columns]
feat_prices_out = feat_prices[keep_cols].copy()

safe_to_parquet(feat_prices_out, DATA_DIR / "price_features.parquet")
feat_prices_out.head()


Unnamed: 0,date,ticker,adj_close,returns,RSI,MACD,MACD_signal,MACD_hist
0,2020-12-28,AAPL,133.061218,,,0.0,0.0,0.0
1,2020-12-29,AAPL,131.28952,-0.013315,0.0,-0.141332,-0.028266,-0.113066
2,2020-12-30,AAPL,130.170029,-0.008527,0.0,-0.339756,-0.090564,-0.249192
3,2020-12-31,AAPL,129.167374,-0.007703,0.0,-0.571328,-0.186717,-0.384611
4,2021-01-04,AAPL,125.97448,-0.024719,0.0,-1.000952,-0.349564,-0.651388


## 5) Финальная таблица `date, returns, RSI, MACD, I_t`

Склеиваем price-features и `I_t` по (`ticker`, `date`). Если в какой-то день новостей нет — `I_t` будет NaN.

In [13]:
final = feat_prices.merge(daily_I[["ticker","date","I_t"]], on=["ticker","date"], how="left")

final_small = final[["ticker","date","returns","RSI","MACD","I_t"]].copy()
safe_to_parquet(final_small, DATA_DIR / "final_features_all.parquet")

# также отдельные файлы по тикерам
for t in TICKERS:
    df = final_small[final_small["ticker"] == t].copy()
    safe_to_parquet(df, DATA_DIR / f"final_features_{t}.parquet")

final_small.tail()

Unnamed: 0,ticker,date,returns,RSI,MACD,I_t
2507,XOM,2025-12-19,0.001287,50.088714,0.382271,0.438691
2508,XOM,2025-12-22,0.012512,54.44325,0.438767,0.520959
2509,XOM,2025-12-23,0.010749,57.885258,0.579341,0.630784
2510,XOM,2025-12-24,-0.001675,57.152934,0.66692,0.534321
2511,XOM,2025-12-26,-0.000923,56.727837,0.719162,0.52468
