In [1]:
import duckdb

import os
import time
import math
import requests
import pandas as pd
from typing import List, Dict, Optional, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed

# ================== USER SETTINGS ==================
# FMP API key. Prefer supplying it through the FMP_API_KEY environment variable.
# SECURITY: the literal fallback below is a credential stored in the notebook; it
# is kept only so existing runs keep working. Rotate the key and delete the
# fallback once FMP_API_KEY is configured.
API_KEY = os.environ.get("FMP_API_KEY") or "c5PobUQjaaMTHySILWqmWi9uyIDqYJBi"

SINGLE_DAY_MODE = True        # True -> from=to=TARGET_DATE; False -> use DATE_FROM/DATE_TO
TARGET_DATE = "2025-10-13"    # YYYY-MM-DD

# If SINGLE_DAY_MODE is False, set these:
#DATE_FROM = "2025-09-01"
#DATE_TO   = "2025-09-25"
# ===================================================

In [2]:
# Location of the DuckDB file that holds dim_ticker.
db_path = "/Users/martingobbo/stock-dashboard/data/serving/analytics.duckdb"

# This cell only SELECTs, so open the database read-only, and always release the
# handle (even if the query raises) so a failed run cannot keep the file open.
con = duckdb.connect(db_path, read_only=True)
try:
    tickers_duck = con.execute("SELECT ticker FROM dim_ticker").fetchall()
finally:
    con.close()

# fetchall() returns one tuple per row; keep just the ticker symbol.
tickers = [row[0] for row in tickers_duck]
TICKERS: List[str] = tickers

print(len(tickers))

"""
Daily OHLCV downloader (FMP) — concurrent, batched, with retries
- Reads the ticker list (`TICKERS`) loaded from the `dim_ticker` table in DuckDB
- Fetches daily bars for a single date (from=to=TARGET_DATE) or a range
- Concurrency capped to ~4 in-flight requests (safe for FMP Starter)
- Short pauses between batches
- Retries network errors and HTTP 429/502/503/504 with exponential backoff (uses Retry-After when present)
- Prints a summary of tickers with no data for the requested day
- Produces a DataFrame `data` (no files written)
"""


# ---- Tuning (fast but generally safe for FMP Starter) -------------------------
MAX_WORKERS = 4              # parallel workers; FMP commonly allows ~4 parallel requests safely
BATCH_SIZE = 100             # how many symbols are scheduled per wave
SLEEP_BETWEEN_BATCHES = 2.0  # pause between batches (seconds); keep small for speed
REQUEST_TIMEOUT = 20         # per-request connect/read timeout
MAX_RETRIES = 3              # attempts per symbol for network errors and 429/5xx responses
BACKOFF_BASE = 1.5           # backoff base (seconds); the delay is BACKOFF_BASE ** attempt
# -----------------------------------------------------------------------------

def _daterange() -> Tuple[str, str]:
    """
    Resolve the request window from the USER SETTINGS globals.

    Returns:
        (date_from, date_to) as "YYYY-MM-DD" strings. In single-day mode both
        are TARGET_DATE; otherwise they are DATE_FROM and DATE_TO.

    Raises:
        NameError: SINGLE_DAY_MODE is False but DATE_FROM / DATE_TO were never
            defined (they are commented out by default in USER SETTINGS).
    """
    def _iso(day) -> str:
        # TARGET_DATE is rebound to a pandas Timestamp by the analytics cell, so
        # normalise here to keep the API query parameters as plain ISO dates
        # when this cell is re-run afterwards.
        return pd.Timestamp(day).strftime("%Y-%m-%d")

    if SINGLE_DAY_MODE:
        day = _iso(TARGET_DATE)
        return day, day

    try:
        start, end = DATE_FROM, DATE_TO
    except NameError as exc:
        raise NameError(
            "SINGLE_DAY_MODE is False but DATE_FROM/DATE_TO are not defined; "
            "uncomment and set them in USER SETTINGS."
        ) from exc
    return _iso(start), _iso(end)

def fetch_daily(symbol: str, date_from: str, date_to: str) -> Optional[List[Dict]]:
    """
    Fetch daily bars for a symbol over [date_from, date_to].

    Network errors and HTTP 429/502/503/504 are retried up to MAX_RETRIES
    attempts in total, sleeping BACKOFF_BASE ** attempt seconds between
    attempts (or the server's numeric Retry-After value when it sends one).

    Args:
        symbol: Ticker symbol placed in the request path.
        date_from: Start of the window, passed as the `from` query parameter.
        date_to: End of the window, passed as the `to` query parameter.

    Returns:
        The list found under the response's "historical" key (an empty list
        when that key is absent or not a list), or None on hard failure:
        retries exhausted, any other non-200 status, a body that is not JSON,
        or a JSON body that is not an object.
    """
    url = f"https://financialmodelingprep.com/api/v3/historical-price-full/{symbol}"
    params = {"from": date_from, "to": date_to, "apikey": API_KEY}

    for attempt in range(1, MAX_RETRIES + 1):
        backoff = BACKOFF_BASE ** attempt
        is_last_attempt = attempt >= MAX_RETRIES

        try:
            r = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            # transient network error — backoff & retry
            if is_last_attempt:
                return None
            time.sleep(backoff)
            continue

        # Handle rate limiting / transient server errors
        if r.status_code in (429, 502, 503, 504):
            if is_last_attempt:
                return None
            # Retry-After may be a number of seconds or an HTTP-date; only the
            # numeric form is honoured. Anything else falls back to the
            # exponential delay instead of raising inside the worker thread.
            retry_after = r.headers.get("Retry-After")
            try:
                delay = float(retry_after) if retry_after else backoff
            except ValueError:
                delay = backoff
            time.sleep(max(delay, 0.0))  # time.sleep rejects negative values
            continue

        if r.status_code != 200:
            # Hard failure; don't retry further
            return None

        try:
            js = r.json()
        except ValueError:
            return None

        # A 200 response whose JSON body is not an object has no "historical"
        # key to read; report it as a failure rather than raising.
        if not isinstance(js, dict):
            return None

        bars = js.get("historical", [])
        return bars if isinstance(bars, list) else []

    return None  # unreachable, but explicit

def chunked(lst: List[str], n: int):
    """Yield consecutive slices of `lst`, each holding at most `n` items."""
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

# Normalise the window to "YYYY-MM-DD" strings. The API rows carry the date as a
# string, and TARGET_DATE is rebound to a Timestamp by the analytics cell, so
# comparing against the raw global would silently match nothing on a re-run.
date_from, date_to = (pd.Timestamp(d).strftime("%Y-%m-%d") for d in _daterange())

all_rows: List[Dict] = []
no_data: List[str] = []       # symbols that returned zero rows in the requested window
hard_fail: List[str] = []     # symbols that errored out after retries

total = len(TICKERS)
batches = list(chunked(TICKERS, BATCH_SIZE))

for bi, batch in enumerate(batches, start=1):
    print(f"Batch {bi}/{len(batches)}: {len(batch)} symbols")
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
        futures = {ex.submit(fetch_daily, sym, date_from, date_to): sym for sym in batch}

        for fut in as_completed(futures):
            sym = futures[fut]
            try:
                hist = fut.result()
            except Exception as exc:
                # An unexpected error in one worker must not abort the whole
                # download; record the symbol as failed and keep going.
                print(f"[WARN] {sym}: unexpected error while fetching: {exc!r}")
                hist = None

            if hist is None:
                hard_fail.append(sym)
                continue

            # Keep only rows that match the exact day in single-day mode
            # (date_from == date_to == the target day in that mode)
            if SINGLE_DAY_MODE:
                hist = [h for h in hist if h.get("date") == date_from]

            if not hist:
                no_data.append(sym)
                continue

            all_rows.extend(
                {
                    "date": h.get("date"),
                    "ticker": sym,
                    "open": h.get("open"),
                    "high": h.get("high"),
                    "low": h.get("low"),
                    "close": h.get("close"),
                    "adjClose": h.get("adjClose"),
                    "volume": h.get("volume"),
                }
                for h in hist
            )

    # Brief pause between waves; not needed after the final batch
    if bi < len(batches):
        time.sleep(SLEEP_BETWEEN_BATCHES)

# Build final DataFrame
data = pd.DataFrame(all_rows)
if not data.empty:
    data = data.sort_values(["date", "ticker"]).reset_index(drop=True)

# Progress / diagnostics
fetched = data["ticker"].nunique() if not data.empty else 0
print(f"\nDone. Tickers requested: {total}")
print(f"Tickers with rows returned: {fetched}")
print(f"Rows fetched: {len(data)}")

# Show at most 20 symbols per category, then a count of the remainder
for header, symbols in (
    ("\nNo rows returned for the requested date/window:", no_data),
    ("\nFailed after retries (HTTP/network errors):", hard_fail),
):
    if symbols:
        print(header)
        preview = ", ".join(symbols[:20])
        more = f" ... (+{len(symbols)-20} more)" if len(symbols) > 20 else ""
        print(preview + more)

# Show head for quick inspection
print("\nHead:")
print(data.head())

# XLB, ZTS, XLC, XLE, XLI, XLK, XLP, XLF, XLV, XLRE, XLU, XLY


519
Batch 1/6: 100 symbols
Batch 2/6: 100 symbols
Batch 3/6: 100 symbols
Batch 4/6: 100 symbols
Batch 5/6: 100 symbols
Batch 6/6: 19 symbols

Done. Tickers requested: 519
Tickers with rows returned: 507
Rows fetched: 507

No rows returned for the requested date/window:
WBA

Failed after retries (HTTP/network errors):
MET, MGM, MHK, META, MKTX, MLM, MKC, MMC, MMM, MO, MOH

Head:
         date ticker    open    high     low   close  adjClose    volume
0  2025-10-13      A  137.85  139.84  136.76  138.23    138.23   1481500
1  2025-10-13   AAPL  249.38  249.69  245.56  247.66    247.66  38142942
2  2025-10-13   ABBV  230.00  233.81  229.22  230.30    230.30   5411890
3  2025-10-13   ABNB  118.91  119.80  118.25  118.86    118.86   3344832
4  2025-10-13    ABT  132.00  132.76  130.87  131.38    131.38   4008306


In [3]:
# --- Config ---
# READING NECESSARY HISTORICAL INFORMATION TO DO ANALYSIS CALCS

import duckdb
import pandas as pd
import numpy as np
import math

# Absolute path to the DuckDB file that holds fact_price_daily and dim_ticker.
DB_PATH = "/Users/martingobbo/stock-dashboard/data/serving/analytics.duckdb"
ROWS_PER_TICKER = 750  # most recent N rows per ticker

# Connect read-only (avoids lock conflicts)
con = duckdb.connect(DB_PATH, read_only=True)

# Pull prices + ticker_type so our analytics can toggle volume usage.
# The CTE numbers each ticker's rows newest-first (rn = 1 is the latest dt); the
# outer query keeps rn <= ROWS_PER_TICKER and returns them ordered by ticker, dt.
sql = f"""
WITH ranked AS (
  SELECT
    f.ticker_id,
    t.ticker,
    t.ticker_type,
    f.dt,
    f.open, f.high, f.low, f.close, f.adj_close, f.volume,
    ROW_NUMBER() OVER (PARTITION BY f.ticker_id ORDER BY f.dt DESC) AS rn
  FROM fact_price_daily AS f
  JOIN dim_ticker       AS t USING (ticker_id)
)
SELECT
  ticker, ticker_type, dt, open, high, low, close, adj_close, volume
FROM ranked
WHERE rn <= {ROWS_PER_TICKER}
ORDER BY ticker, dt
"""

# Materialize the query result as a pandas DataFrame and show its last rows.
prices = con.execute(sql).df()
print("\n", prices.tail())

# COMBINES TODAY'S PRICE DATA TO HISTORICAL DATABASE STOCK PRICE DATA

# Step 1: Align column names in 'data'
data_renamed = data.rename(columns={
    "date": "dt",
    "adjClose": "adj_close"
})

# Attach ticker_type to today's data rows using dim_ticker (so analytics knows whether to use volume)
dim_t = con.execute("SELECT ticker, ticker_type FROM dim_ticker").df()
ticker_to_type = dict(zip(dim_t["ticker"], dim_t["ticker_type"]))
data_renamed["ticker_type"] = data_renamed["ticker"].map(ticker_to_type)

# Step 2: Ensure same column order as 'prices' (ticker_type comes along as well)
# prices has: ticker, ticker_type, dt, open, high, low, close, adj_close, volume
data_renamed = data_renamed[["ticker", "ticker_type", "dt", "open", "high", "low", "close", "adj_close", "volume"]]

# Step 2b: Give both sides the same datetime dtype for 'dt'. The fetched rows
# carry 'dt' as a "YYYY-MM-DD" string while 'prices' carries whatever type the
# database returned; concatenating them unconverted can leave a mixed-type
# column that cannot be sorted.
data_renamed = data_renamed.assign(dt=pd.to_datetime(data_renamed["dt"]))
prices_typed = prices.assign(dt=pd.to_datetime(prices["dt"]))

# Step 3: Combine with prices. If the target day is already in the database
# (e.g. this cell is re-run after the upsert), keep the freshly fetched row:
# it comes later in the concatenation, hence keep="last".
combined = (
    pd.concat([prices_typed, data_renamed], ignore_index=True)
      .drop_duplicates(subset=["ticker", "dt"], keep="last")
)

# Step 4: Sort by ticker, then date
combined = combined.sort_values(["ticker", "dt"]).reset_index(drop=True)

# (Optional) quick check
print(combined.head(), "\n", combined.tail())


# ANALYSIS ON OHLCV DATA. SPICY STUFF.

# TARGET_DATE is expected to be defined already; this rebinds it as a pandas
# Timestamp. NOTE: from here on the name no longer holds the original string.
TARGET_DATE = pd.to_datetime(TARGET_DATE)

# ----------------------- CONFIG -----------------------
TRADING_DAYS_PER_YEAR = 252

# Rolling-window lengths, expressed in trading days
WIN_05, WIN_10, WIN_15, WIN_20 = 5, 10, 15, 20
WIN_50, WIN_60 = 50, 60
WIN_100, WIN_200 = 100, 200
WIN_252, WIN_300, WIN_750 = 252, 300, 750

# Smoothing length applied to the range-position indicator
SMA_POS_LEN = 3

# kept for reproducibility, not used now
OBV_BASELINE_DATE = pd.Timestamp("2004-01-01")

# -------------------- Helper functions --------------------
def ols_slope_window(arr: np.ndarray) -> float:
    """
    Least-squares slope of the finite values in `arr` against 0, 1, 2, ...

    Non-finite entries are removed first and the remaining values are treated
    as evenly spaced. Returns NaN when fewer than two finite values remain or
    when the x-variance is zero.
    """
    finite_vals = arr[np.isfinite(arr)]
    count = finite_vals.size
    if count < 2:
        return np.nan

    positions = np.arange(count, dtype=float)
    x_dev = positions - positions.mean()
    sxx = np.sum(x_dev ** 2)
    if sxx == 0:
        return np.nan

    y_dev = finite_vals - finite_vals.mean()
    return float(np.sum(x_dev * y_dev) / sxx)

def max_drawdown_only(arr: np.ndarray) -> float:
    """
    Largest peak-to-trough decline in `arr`, as a non-positive fraction.

    Only finite, strictly positive entries take part; everything else is
    skipped. Returns NaN when the array has no finite value or no usable
    (finite, positive) value, and 0.0 when the series never falls below its
    running peak.
    """
    prices = np.asarray(arr, dtype=float)
    if not np.isfinite(prices).any():
        return np.nan

    usable = [p for p in prices if math.isfinite(p) and p > 0.0]
    if not usable:
        return np.nan

    running_peak = usable[0]
    worst = 0.0
    for price in usable[1:]:
        running_peak = max(running_peak, price)
        worst = min(worst, price / running_peak - 1.0)
    return float(worst)

def max_drawdown_duration_only(arr: np.ndarray) -> float:
    """
    Number of array positions between the peak and the trough of the deepest
    drawdown in `arr`.

    Only finite, strictly positive entries take part, but the distance is
    measured in original array positions. Returns NaN when the array has no
    finite value or no usable value, and 0.0 when there is no drawdown.
    """
    prices = np.asarray(arr, dtype=float)
    if not np.isfinite(prices).any():
        return np.nan

    usable = [(pos, p) for pos, p in enumerate(prices) if math.isfinite(p) and p > 0.0]
    if not usable:
        return np.nan

    peak_pos, peak_price = usable[0]
    worst = 0.0
    duration = 0
    for pos, price in usable[1:]:
        if price > peak_price:
            peak_price, peak_pos = price, pos
        decline = price / peak_price - 1.0
        if decline < worst:
            worst = decline
            duration = pos - peak_pos
    return float(duration)

def per_ticker_metrics_all_rows(g: pd.DataFrame) -> pd.DataFrame:
    """
    Compute price/volume metrics for every row of a single ticker.

    Args:
        g: Rows for ONE ticker. Needs `ticker`, `open`, `high`, `low`,
           `adj_close` and either `dt` or `date`. `close`, `volume` and
           `ticker_type` are used when present.

    Returns:
        One row per distinct calendar date (ascending) with `date`, `ticker`,
        `ticker_type`, the original OHLCV fields (raw `close` included when the
        input has it) and all derived metric columns.

    Behavior:
    - All price metrics are computed on `adj_close`; the raw `close` is only
      passed through so the price upsert receives a complete OHLCV record.
    - Rows are de-duplicated by date, keeping the LAST occurrence in input
      order. A stable sort is used so that choice is deterministic.
    - If ticker_type != 'Stock' (case-insensitive), volume-based metrics are NaN;
      the original volume is still returned in `volume`.
    - +/-inf values are replaced by NaN, and a few structural indicators are
      zero-filled (see the end of the function).
    """
    # ---- Normalize, sort, and de-duplicate by date ---------------------------
    g = g.copy()
    if "date" not in g.columns and "dt" in g.columns:
        g["date"] = pd.to_datetime(g["dt"])
    else:
        g["date"] = pd.to_datetime(g["date"])
    # mergesort is stable: rows sharing a date keep their input order, so
    # keep="last" reliably retains the most recently appended row.
    g = (
        g.sort_values("date", kind="mergesort")
         .drop_duplicates(subset="date", keep="last")
         .reset_index(drop=True)
    )

    # Determine whether to use volume
    ttype = (str(g["ticker_type"].iloc[0]) if "ticker_type" in g.columns else "Stock").strip().lower()
    uses_volume = (ttype == "stock")

    # Base series ('close' is the ADJUSTED close and drives every metric)
    close = pd.to_numeric(g["adj_close"], errors="coerce").astype(float)
    open_ = pd.to_numeric(g["open"],      errors="coerce").astype(float)
    high  = pd.to_numeric(g["high"],      errors="coerce").astype(float)
    low   = pd.to_numeric(g["low"],       errors="coerce").astype(float)

    # Volume series for storage and for calculations (NaN if not Stock)
    if "volume" in g.columns:
        vol_original = pd.to_numeric(g["volume"], errors="coerce").astype(float)
    else:
        vol_original = pd.Series(np.nan, index=g.index, dtype=float)
    vol_calc = vol_original if uses_volume else pd.Series(np.nan, index=g.index, dtype=float)

    # Safe variants for logs/divisions
    close_safe = close.replace(0, np.nan)

    # ---------- Core logs & returns ----------
    prev_close = close_safe.shift(1)
    with np.errstate(divide='ignore', invalid='ignore'):
        log_price = np.log(close_safe)
        log_returns = np.where((close_safe > 0) & (prev_close > 0),
                               np.log(close_safe / prev_close), np.nan)
    log_returns = pd.Series(log_returns, index=g.index)
    # The first row has no previous close; its return is defined as 0.0
    if len(g) > 0 and pd.isna(log_returns.iloc[0]):
        log_returns.iloc[0] = 0.0

    # ---------- Moving averages / VWAP ----------
    # Window lengths are defined locally so the function is self-contained.
    WIN_05, WIN_10, WIN_15, WIN_20, WIN_50 = 5, 10, 15, 20, 50
    WIN_60, WIN_100, WIN_200, WIN_252, WIN_300, WIN_750 = 60, 100, 200, 252, 300, 750
    TRADING_DAYS_PER_YEAR = 252
    SMA_POS_LEN = 3

    ma20   = close.rolling(WIN_20,  min_periods=1).mean()
    ma50   = close.rolling(WIN_50,  min_periods=1).mean()
    ma100  = close.rolling(WIN_100, min_periods=1).mean()
    ma200  = close.rolling(WIN_200, min_periods=1).mean()

    # All dollar-volume things use vol_calc (becomes NaN for non-Stock)
    dv = close * vol_calc

    # vwap20 (NaN if not using volume)
    sum_px_vol_20 = dv.rolling(WIN_20, min_periods=1).sum()
    sum_vol_20    = vol_calc.rolling(WIN_20, min_periods=1).sum()
    with np.errstate(invalid='ignore', divide='ignore'):
        vwap20 = np.where(sum_vol_20 > 0, sum_px_vol_20 / sum_vol_20, np.nan)

    # Dollar-volume accelerations
    with np.errstate(divide='ignore', invalid='ignore'):
        ln_dv = np.where((vol_calc > 0) & (close > 0), np.log(vol_calc * close), np.nan)
    ln_dv = pd.Series(ln_dv, index=g.index)
    vol_accel_5d  = ln_dv - ln_dv.shift(5)
    vol_accel_10d = ln_dv - ln_dv.shift(10)

    # 10v60 abnormal volume
    avg10_dv = dv.rolling(WIN_10, min_periods=1).mean()
    avg60_dv = dv.rolling(WIN_60, min_periods=1).mean()
    std60_dv = dv.rolling(WIN_60, min_periods=2).std(ddof=1)
    with np.errstate(invalid='ignore', divide='ignore'):
        abn_vol_60d = np.where(std60_dv > 0, (avg10_dv - avg60_dv) / std60_dv, np.nan)

    # Volatility (annualized) & mean return 100d
    vol20_ann  = log_returns.rolling(WIN_20,  min_periods=2).std(ddof=1) * np.sqrt(TRADING_DAYS_PER_YEAR)
    vol100_ann = log_returns.rolling(WIN_100, min_periods=2).std(ddof=1) * np.sqrt(TRADING_DAYS_PER_YEAR)
    mean100    = log_returns.rolling(WIN_100, min_periods=1).mean()

    # Range/position block
    low_10  = low.rolling(WIN_10, min_periods=1).min()
    high_10 = high.rolling(WIN_10, min_periods=1).max()
    rng_10  = high_10 - low_10
    with np.errstate(invalid='ignore', divide='ignore'):
        pos_10d = np.where(rng_10 != 0, (close - low_10) / rng_10, 0.0)
    pos_10d = pd.Series(pos_10d, index=g.index)
    # NOTE: despite the "5_day" label this is a SMA_POS_LEN-row mean of the
    # 10-day range position.
    five_day_range_pos = pos_10d.rolling(SMA_POS_LEN, min_periods=1).mean()

    daily_range = (high - low)
    avg_rng_10  = daily_range.rolling(WIN_10, min_periods=1).mean()
    avg_rng_60  = daily_range.rolling(WIN_60, min_periods=1).mean()
    std_rng_60  = daily_range.rolling(WIN_60, min_periods=2).std(ddof=1)
    with np.errstate(invalid='ignore', divide='ignore'):
        z_60_10_highlowrange = np.where(std_rng_60 > 0, (avg_rng_10 - avg_rng_60) / std_rng_60, 0.0)

    # Multi-horizon log returns (use 0.0 when lag missing)
    def safe_lr(curr, lagged):
        # np.where evaluates the log for every row, including the ones it then
        # discards, so silence the warnings those rows would trigger.
        with np.errstate(divide='ignore', invalid='ignore'):
            return np.where((curr > 0) & (lagged > 0), np.log(curr / lagged), 0.0)

    ret5   = safe_lr(close, close.shift(5))
    ret10  = safe_lr(close, close.shift(10))
    ret20  = safe_lr(close, close.shift(20))
    ret40  = safe_lr(close, close.shift(40))
    ret60  = safe_lr(close, close.shift(60))
    ret200 = safe_lr(close, close.shift(200))
    ret300 = safe_lr(close, close.shift(300))

    # Median return over last 100 trading days
    median_return_100d = log_returns.rolling(WIN_100, min_periods=1).median()

    # ---------- Robust rolling drawdown (position-based) ----------
    # Indexed by date so the duration helper can report calendar days.
    closes_by_date = pd.Series(close.values, index=g["date"])

    def _dd_percent(w: pd.Series) -> float:
        """Decline from the window's highest value to the lowest value at or after it."""
        v = pd.to_numeric(w, errors="coerce").to_numpy(dtype="float64")
        if v.size <= 1 or not np.isfinite(v).any():
            return 0.0
        with np.errstate(invalid='ignore'):
            try:
                peak_pos = np.nanargmax(v)
            except ValueError:
                return 0.0
            peak = v[peak_pos]
            if not np.isfinite(peak):
                return 0.0
            suffix = v[peak_pos:]
            if suffix.size == 0 or not np.isfinite(suffix).any():
                return 0.0
            trough = np.nanmin(suffix)
            return float(trough / peak - 1.0)

    def _dd_duration(w: pd.Series) -> float:
        """Calendar days from the window's highest value to the lowest value at or after it."""
        v = pd.to_numeric(w, errors="coerce").to_numpy(dtype="float64")
        idx = w.index
        if v.size <= 1 or not np.isfinite(v).any():
            return 0.0
        with np.errstate(invalid='ignore'):
            try:
                peak_pos = np.nanargmax(v)
            except ValueError:
                return 0.0
            suffix = v[peak_pos:]
            if suffix.size == 0 or not np.isfinite(suffix).any():
                return 0.0
            trough_rel = np.nanargmin(suffix)
            trough_pos = peak_pos + trough_rel
            # calendar-day duration
            return float((idx[trough_pos] - idx[peak_pos]).days)

    drawdown_percent_100 = closes_by_date.rolling(WIN_100, min_periods=1).apply(_dd_percent, raw=False).values
    drawdown_days_100    = closes_by_date.rolling(WIN_100, min_periods=1).apply(_dd_duration, raw=False).values

    # ---------- Extended analytics ----------
    lr = pd.Series(log_returns, index=g.index)
    vol_5   = lr.rolling(WIN_05,  min_periods=WIN_05).std(ddof=1)
    vol_15  = lr.rolling(WIN_15,  min_periods=WIN_15).std(ddof=1)
    vol_60  = lr.rolling(WIN_60,  min_periods=WIN_60).std(ddof=1)
    vol_252 = lr.rolling(WIN_252, min_periods=WIN_252).std(ddof=1)

    # Downside / upside deviation: root-mean-square of the negative / positive
    # part of the returns over a full window
    neg = np.minimum(lr, 0.0)
    pos = np.maximum(lr, 0.0)
    dd_15  = (pd.Series(neg).pow(2).rolling(WIN_15,  min_periods=WIN_15).mean()) ** 0.5
    dd_60  = (pd.Series(neg).pow(2).rolling(WIN_60,  min_periods=WIN_60).mean()) ** 0.5
    dd_252 = (pd.Series(neg).pow(2).rolling(WIN_252, min_periods=WIN_252).mean()) ** 0.5
    ud_15  = (pd.Series(pos).pow(2).rolling(WIN_15,  min_periods=WIN_15).mean()) ** 0.5
    ud_60  = (pd.Series(pos).pow(2).rolling(WIN_60,  min_periods=WIN_60).mean()) ** 0.5
    ud_252 = (pd.Series(pos).pow(2).rolling(WIN_252, min_periods=WIN_252).mean()) ** 0.5

    # Parkinson HL volatility (20d), non-annualized
    with np.errstate(divide='ignore', invalid='ignore'):
        hl_log = np.log((high.replace(0, np.nan)) / (low.replace(0, np.nan)))
    k = 1.0 / (4.0 * math.log(2.0))
    pk20 = np.sqrt(k * (hl_log.pow(2).rolling(WIN_20, min_periods=WIN_20).mean()))

    # Change in 10-day cumulative log returns
    sum10 = lr.rolling(WIN_10, min_periods=WIN_10).sum()
    change_10dayret = sum10 - sum10.shift(WIN_10)

    # 60d return acceleration via slopes of log_price
    # (nested helpers shadow the module-level functions of the same name so
    # this function has no dependency on other cells)
    def ols_slope_window(arr: np.ndarray) -> float:
        """Least-squares slope of the finite values of `arr` against 0..n-1 (NaN if < 2 values)."""
        mask = np.isfinite(arr)
        y = arr[mask]
        n = y.size
        if n < 2:
            return np.nan
        x = np.arange(n, dtype=float)
        xm, ym = x.mean(), y.mean()
        denom = np.sum((x - xm) ** 2)
        if denom == 0:
            return np.nan
        num = np.sum((x - xm) * (y - ym))
        return float(num / denom)

    slope_lp_recent60 = log_price.rolling(WIN_60, min_periods=WIN_60).apply(ols_slope_window, raw=True)
    slope_lp_prev60   = slope_lp_recent60.shift(WIN_60)
    ret_accel_60      = slope_lp_recent60 - slope_lp_prev60

    # Vol slopes
    slope_vol60_over20  = vol_60.rolling(WIN_20, min_periods=WIN_20).apply(ols_slope_window, raw=True)
    slope_vol252_over60 = vol_252.rolling(WIN_60, min_periods=WIN_60).apply(ols_slope_window, raw=True)

    # Dollar volume long windows & correlation (NaN for non-Stock)
    dv_sma_252 = dv.rolling(WIN_252, min_periods=WIN_252).mean()
    dv_sma_60  = dv.rolling(WIN_60,  min_periods=WIN_60).mean()
    dv252_accel_60 = dv_sma_252.rolling(WIN_60, min_periods=WIN_60).apply(ols_slope_window, raw=True)
    corr_px_dv_60  = close.rolling(WIN_60, min_periods=WIN_60).corr(dv)

    # EMA(5) of 15d volatility
    ema5_of_vol15 = vol_15.ewm(span=5, adjust=False).mean()

    # Rolling 750-day max drawdown & duration (in trading days)
    def max_drawdown_only(arr: np.ndarray) -> float:
        """Deepest decline from a running peak, over finite positive values only."""
        a = np.asarray(arr, dtype=float)
        if not np.isfinite(a).any():
            return np.nan
        start_idx = -1
        peak = np.nan
        for i in range(a.size):
            if math.isfinite(a[i]) and a[i] > 0.0:
                peak = a[i]
                start_idx = i
                break
        if start_idx < 0:
            return np.nan
        best_dd = 0.0
        for j in range(start_idx + 1, a.size):
            pj = a[j]
            if not math.isfinite(pj) or pj <= 0.0:
                continue
            if pj > peak:
                peak = pj
            dd = pj / peak - 1.0
            if dd < best_dd:
                best_dd = dd
        return float(best_dd)

    def max_drawdown_duration_only(arr: np.ndarray) -> float:
        """Rows between the peak and the trough of the deepest decline."""
        a = np.asarray(arr, dtype=float)
        if not np.isfinite(a).any():
            return np.nan
        max_price = np.nan
        max_idx = -1
        for i in range(a.size):
            if math.isfinite(a[i]) and a[i] > 0.0:
                max_price = a[i]
                max_idx = i
                break
        if max_idx < 0:
            return np.nan
        best_dd = 0.0
        best_dur = 0
        for j in range(max_idx + 1, a.size):
            pj = a[j]
            if not math.isfinite(pj) or pj <= 0.0:
                continue
            if pj > max_price:
                max_price = pj
                max_idx = j
            dd = pj / max_price - 1.0
            if dd < best_dd:
                best_dd = dd
                best_dur = j - max_idx
        return float(best_dur)

    mdd_750     = close.rolling(WIN_750, min_periods=2).apply(max_drawdown_only, raw=True)
    mdd_dur_750 = close.rolling(WIN_750, min_periods=2).apply(max_drawdown_duration_only, raw=True)

    # ---------- Assemble ----------
    out = pd.DataFrame({
        "date": g["date"].values,
        "ticker": g["ticker"].values,
        "ticker_type": g.get("ticker_type", pd.Series(["Stock"]*len(g), index=g.index)).values,

        # Original OHLCV for price upsert (volume preserved even if not used in calcs)
        "open": open_.values,
        "high": high.values,
        "low": low.values,
        "adj_close": close.values,
        "volume": vol_original.values,

        # SQL-parity / price-volume set (momentum labels removed)
        "log_returns": log_returns.values,
        "volatility_20d": vol20_ann.values,
        "volatility_100d": vol100_ann.values,
        "mean_return_100d": mean100.values,
        "moving_avg_20d": ma20.values,
        "moving_avg_50d": ma50.values,
        "moving_avg_100d": ma100.values,
        "moving_avg_200d": ma200.values,
        "vwap_20d": vwap20,                         # NaN for non-Stock
        "vol_accel_5d": vol_accel_5d.values,        # NaN for non-Stock
        "vol_accel_10d": vol_accel_10d.values,      # NaN for non-Stock
        "abn_vol_60d": abn_vol_60d,                 # NaN for non-Stock
        "5_day_range_pos": five_day_range_pos.values,
        "60_10_highlowrange_zscore": z_60_10_highlowrange,
        "5_day_ret": ret5,
        "10_day_ret": ret10,
        "20_day_ret": ret20,
        "40_day_ret": ret40,
        "60_day_ret": ret60,
        "200_day_ret": ret200,
        "300_day_ret": ret300,
        "median_return_100d": median_return_100d.values,
        "drawdown_percent": drawdown_percent_100,
        "drawdown_duration_days": drawdown_days_100,

        # Extended analytics (no OBV fields)
        "log_prices": log_price.values,
        "change_10dayret": change_10dayret.values,
        "slope_over60_of_logprice": slope_lp_recent60.values,
        "prior_slope_over60_of_logprice": slope_lp_prev60.values,
        "60d_return_accel": ret_accel_60.values,

        "750d_drawdown": mdd_750.values,
        "750d_drawdownduration": mdd_dur_750.values,

        "15d_downsidedeviation": dd_15.values,
        "60d_downsidedeviation": dd_60.values,
        "252d_downsidedeviation": dd_252.values,

        "15d_upsidevolatility": ud_15.values,
        "60d_upsidevolatility": ud_60.values,
        "252d_upsidevolatility": ud_252.values,

        "5d_volatility": vol_5.values,
        "15d_volatility": vol_15.values,
        "60d_volatility": vol_60.values,
        "252d_volatility": vol_252.values,

        "20d_parkinson_HL_volatility": pk20.values,
        "5d_EMA_15dayvolatility": ema5_of_vol15.values,

        "slope_over20_of_60d_volatility": slope_vol60_over20.values,
        "slope_over60_of_252d_volatility": slope_vol252_over60.values,

        # Dollar-volume family (all NaN for non-Stock)
        "252d_dollar_volume_SMA": dv_sma_252.values,
        "60d_dollar_volume_SMA":  dv_sma_60.values,
        "252d_dollar_volume_accel": dv252_accel_60.values,
        "60d_price_dollarVolume_correlation": corr_px_dv_60.values,
    }, index=g.index)

    # Pass the raw (unadjusted) close through so the OHLCV record is complete.
    # It is only added when the input has it, so an absent column never turns
    # into an all-NaN 'close' downstream.
    if "close" in g.columns:
        raw_close = pd.to_numeric(g["close"], errors="coerce").astype(float)
        out.insert(out.columns.get_loc("low") + 1, "close", raw_close.values)

    # Clean edge cases
    out = out.replace([np.inf, -np.inf], np.nan)

    # Zero-fill only structural indicators where needed for SQL parity
    zero_fill_cols = [
        "5_day_range_pos", "60_10_highlowrange_zscore",
        "drawdown_percent", "drawdown_duration_days",
    ]
    out[zero_fill_cols] = out[zero_fill_cols].fillna(0.0)

    return out


# -------------------- Run for TARGET_DATE (from `combined`) --------------------
# Work on a normalized copy of `combined`. A descriptive name is used instead of
# `_`, which IPython reserves for the previous cell output.
panel = combined.copy()
panel.columns = [c.strip().lower() for c in panel.columns]

# Build a proper datetime column named 'date'
if "dt" in panel.columns and "date" not in panel.columns:
    panel["date"] = pd.to_datetime(panel["dt"])
else:
    panel["date"] = pd.to_datetime(panel["date"])

for c in ["open","high","low","close","adj_close"]:
    panel[c] = pd.to_numeric(panel[c], errors="coerce")
if "volume" in panel.columns:
    panel["volume"] = pd.to_numeric(panel["volume"], errors="coerce")

# Ensure ticker_type exists (fallback Stock if missing)
if "ticker_type" not in panel.columns:
    dim_t = con.execute("SELECT ticker, ticker_type FROM dim_ticker").df()
    ticker_to_type = dict(zip(dim_t["ticker"], dim_t["ticker_type"]))
    panel["ticker_type"] = panel["ticker"].map(ticker_to_type).fillna("Stock")

panel = panel.sort_values(["ticker","date"]).reset_index(drop=True)

# Compare on PURE DATE to avoid dtype/timezone issues
T_DATE = pd.Timestamp(TARGET_DATE).date()

daily_chunks = []
for tkr, g in panel.groupby("ticker", sort=False):
    # Skip tickers that have no row on the target calendar date
    if not (g["date"].dt.date == T_DATE).any():
        continue
    # Metrics are computed over the ticker's whole history (the rolling windows
    # need it), then only the target day's row is kept. The result gets its own
    # name so the `all_rows` list from the download step is not overwritten.
    ticker_metrics = per_ticker_metrics_all_rows(g)
    day_row = ticker_metrics[ticker_metrics["date"].dt.date == T_DATE]
    if not day_row.empty:
        daily_chunks.append(day_row)

daily_metrics = (
    pd.concat(daily_chunks, ignore_index=True)
      .sort_values(["ticker","date"])
      .reset_index(drop=True)
    if daily_chunks else pd.DataFrame()
)

print(f"Daily metrics rows for {T_DATE}: {len(daily_metrics):,}")
if daily_metrics.empty:
    print(f"[WARN] No ticker has a row for {T_DATE}; the steps below have nothing to load.")

num_rows = len(data)
print(num_rows)

# CHANGING DATA TO DATABASE FORMAT

# You already have daily_metrics as a pandas DataFrame
# Assumes it has at least: 'ticker' column, plus metric columns named by metric_code, and possibly 'date' etc.

db_path = "/Users/martingobbo/stock-dashboard/data/serving/analytics.duckdb"

# --- Connect & load dimensions ---
# Reopen read-only for the two lookups and always release the handle, even if a
# query raises.
con.close()
con = duckdb.connect(db_path, read_only=True)
try:
    dim_ticker = con.execute("""
    SELECT ticker_id, ticker
    FROM dim_ticker
""").df()

    dim_metric = con.execute("""
    SELECT metric_id, metric_code
    FROM dim_metric
""").df()
finally:
    con.close()

# --- Map ticker -> ticker_id ---
ticker_map = dict(zip(dim_ticker["ticker"], dim_ticker["ticker_id"]))
daily_metrics["ticker_id"] = daily_metrics["ticker"].map(ticker_map)

# Sanity check for unmapped tickers. Rows without a ticker_id cannot be keyed in
# the fact tables, so they are reported and dropped here rather than sent to the
# upsert with a missing key.
unmapped_mask = daily_metrics["ticker_id"].isna()
unmapped_tickers = daily_metrics.loc[unmapped_mask, "ticker"].unique()
if len(unmapped_tickers):
    print(f"[WARN] {len(unmapped_tickers)} tickers not found in dim_ticker:", unmapped_tickers[:10], "..." if len(unmapped_tickers) > 10 else "")
    print(f"[WARN] Dropping {int(unmapped_mask.sum())} rows without a ticker_id.")
    daily_metrics = daily_metrics.loc[~unmapped_mask].copy()
    # .map() yields floats when any key is missing; restore the dimension's dtype
    daily_metrics["ticker_id"] = daily_metrics["ticker_id"].astype(dim_ticker["ticker_id"].dtype)

# Drop original ticker (keep ticker_type only if you want; it's not written to fact tables)
daily_metrics = daily_metrics.drop(columns=["ticker"])

# --- Rename metric_code columns -> metric_id ---
metric_map = dict(zip(dim_metric["metric_code"], dim_metric["metric_id"]))

# Build rename dict only for columns that match known metric_codes
rename_dict = {col: metric_map[col] for col in daily_metrics.columns if col in metric_map}

# Apply rename
daily_metrics = daily_metrics.rename(columns=rename_dict)

# Optional: report which metric columns could not be mapped
mapped_ids = set(metric_map.values())  # after rename, mapped columns are named by metric_id
metric_like_cols = [c for c in daily_metrics.columns if c not in {"date", "ticker_id", "ticker_type"}]
unmapped_metrics = [c for c in metric_like_cols if c not in mapped_ids]
if unmapped_metrics:
    print(f"[INFO] {len(unmapped_metrics)} columns did not match any metric_code -> metric_id mapping (kept as-is):")
    print(unmapped_metrics[:20], "..." if len(unmapped_metrics) > 20 else "")

# --- Show result ---
print(daily_metrics.head())

# SENDING THE DATA TO THE DUCK DATABASE

# --- Assumes you already have a pandas DataFrame named `daily_metrics` in memory ---
PRICE_COLS_ORDERED = ["open", "high", "low", "close", "adj_close", "volume"]

# 1) Normalize date column -> 'dt' (date type)
#    All validation runs BEFORE the writable connection is opened, so a bad
#    input frame cannot leave a connection behind.
df = daily_metrics.copy()
if "dt" not in df.columns and "date" in df.columns:
    df = df.rename(columns={"date": "dt"})
if "dt" not in df.columns:
    raise ValueError("daily_metrics must have a 'dt' or 'date' column.")
df["dt"] = pd.to_datetime(df["dt"]).dt.date

present_price_cols = [c for c in PRICE_COLS_ORDERED if c in df.columns]
missing_id_cols = [c for c in ["ticker_id"] if c not in df.columns]
if missing_id_cols:
    raise ValueError(f"Missing required ID columns in daily_metrics: {missing_id_cols}")

# Keep ticker_type for debugging if present (not written to fact tables)
if "ticker_type" in df.columns:
    df_price_debug = df[["ticker_id","dt","ticker_type"] + present_price_cols].copy()

# 2) Build the price frame for fact_price_daily
if present_price_cols:
    price_insert_cols = ["ticker_id", "dt"] + present_price_cols
    df_price = df[price_insert_cols].copy()
else:
    df_price = pd.DataFrame(columns=["ticker_id","dt"])  # for counts later

# 3) Drop price columns from the working DataFrame
df_no_price = df.drop(columns=present_price_cols, errors="ignore")

# 4) Unpivot metrics for fact_metric_daily
#    Only columns that were renamed to a known metric_id are loaded. A column
#    that never matched a metric_code still carries its text name, which has no
#    metric_id to key the row on.
NON_METRIC = set(["ticker_id", "dt", "ticker_type"] + PRICE_COLS_ORDERED)
known_metric_ids = set(dim_metric["metric_id"].tolist())
candidate_cols = [c for c in df_no_price.columns if c not in NON_METRIC]
metric_cols = [c for c in candidate_cols if c in known_metric_ids]
skipped_cols = [c for c in candidate_cols if c not in known_metric_ids]
if skipped_cols:
    print(f"[INFO] {len(skipped_cols)} columns have no metric_id and are not written to fact_metric_daily:")
    print(skipped_cols[:20], "..." if len(skipped_cols) > 20 else "")

if metric_cols:
    long_metrics = (
        df_no_price.melt(
            id_vars=["ticker_id", "dt"],
            value_vars=metric_cols,
            var_name="metric_id",
            value_name="value"
        )
        .dropna(subset=["value"])
        # melt stores the former column labels as generic objects; restore the
        # dimension's own dtype so the join key has a well-defined type
        .assign(metric_id=lambda d: d["metric_id"].astype(dim_metric["metric_id"].dtype))
    )
else:
    long_metrics = pd.DataFrame(columns=["ticker_id","dt","metric_id","value"])

# 5) Write both fact tables in ONE transaction: either the whole day's load is
#    committed, or nothing is. On any error the transaction is rolled back so the
#    connection is not left with an open transaction.
con = duckdb.connect(db_path)
con.begin()
try:
    if present_price_cols:
        con.register("df_price", df_price)

        set_clauses = ", ".join([f"{c} = s.{c}" for c in present_price_cols])  # no qualifier on left
        insert_cols = ", ".join(price_insert_cols)
        insert_vals = ", ".join([f"s.{c}" for c in price_insert_cols])

        merge_sql_price = f"""
        MERGE INTO fact_price_daily t
        USING df_price s
        ON t.ticker_id = s.ticker_id AND t.dt = s.dt
        WHEN MATCHED THEN UPDATE SET {set_clauses}
        WHEN NOT MATCHED THEN INSERT ({insert_cols}) VALUES ({insert_vals});
        """
        con.execute(merge_sql_price)

    if metric_cols:
        con.register("df_metrics_long", long_metrics)

        merge_sql_metrics = """
        MERGE INTO fact_metric_daily t
        USING df_metrics_long s
        ON t.ticker_id = s.ticker_id AND t.dt = s.dt AND t.metric_id = s.metric_id
        WHEN MATCHED THEN UPDATE SET value = s.value
        WHEN NOT MATCHED THEN INSERT (metric_id, dt, ticker_id, value)
        VALUES (s.metric_id, s.dt, s.ticker_id, s.value);
        """
        con.execute(merge_sql_metrics)

    con.commit()
except Exception:
    con.rollback()
    raise

# 6) Update your in-session daily_metrics without price fields
daily_metrics = df_no_price

# Quick peek
print("Inserted/updated prices for rows:", len(df_price))
print("Inserted/updated metric rows:", len(long_metrics))
print("daily_metrics columns after dropping price fields:", list(daily_metrics.columns))

# 1) Ensure snapshot table exists (idempotent)
con.execute("""
CREATE TABLE IF NOT EXISTS snapshot_metric_latest (
  ticker_id INTEGER NOT NULL,
  metric_id INTEGER NOT NULL,
  dt        DATE    NOT NULL,
  value     DOUBLE,
  PRIMARY KEY (ticker_id, metric_id)
);
""")

# 2) Materialize latest rows WITHOUT window functions (no QUALIFY)
con.execute("""
CREATE OR REPLACE TEMP TABLE _latest_rows AS
WITH mx AS (
  SELECT ticker_id, metric_id, MAX(dt) AS dt
  FROM fact_metric_daily
  GROUP BY 1,2
)
SELECT
  f.ticker_id,
  f.metric_id,
  CAST(f.dt AS DATE) AS dt,
  f.value
FROM fact_metric_daily f
JOIN mx
  ON f.ticker_id = mx.ticker_id
 AND f.metric_id = mx.metric_id
 AND f.dt = mx.dt;
""")

# 3) Upsert into snapshot via MERGE
con.execute("""
MERGE INTO snapshot_metric_latest AS t
USING _latest_rows AS s
ON t.ticker_id = s.ticker_id AND t.metric_id = s.metric_id
WHEN MATCHED AND (t.dt <> s.dt OR (t.value IS DISTINCT FROM s.value)) THEN
  UPDATE SET dt = s.dt, value = s.value
WHEN NOT MATCHED THEN
  INSERT (ticker_id, metric_id, dt, value)
  VALUES (s.ticker_id, s.metric_id, s.dt, s.value);
""")

# 4) Optional: prune keys that no longer exist in fact_metric_daily
con.execute("""
DELETE FROM snapshot_metric_latest t
WHERE NOT EXISTS (
  SELECT 1 FROM (
    SELECT DISTINCT ticker_id, metric_id FROM fact_metric_daily
  ) k
  WHERE k.ticker_id = t.ticker_id AND k.metric_id = t.metric_id
);
""")

# 5) Sanity check
print("Rows in snapshot:", con.execute("SELECT COUNT(*) FROM snapshot_metric_latest").fetchone()[0])
print(con.execute("""
  SELECT * FROM snapshot_metric_latest
  ORDER BY ticker_id, metric_id
  LIMIT 10
""").fetchdf())

con.close()



        ticker ticker_type         dt    open    high     low  close  \
387614    ZTS       Stock 2025-10-06  146.43  147.04  144.85    NaN   
387615    ZTS       Stock 2025-10-07  146.20  146.20  142.15    NaN   
387616    ZTS       Stock 2025-10-08  142.46  144.87  142.28    NaN   
387617    ZTS       Stock 2025-10-09  143.82  145.22  143.25    NaN   
387618    ZTS       Stock 2025-10-10  143.39  143.49  140.83    NaN   

        adj_close   volume  
387614     145.36  3114876  
387615     142.77  2745916  
387616     143.49  3016455  
387617     143.39  3505222  
387618     141.11  2958395  
  ticker ticker_type                   dt    open    high     low   close  \
0      A       Stock  2022-10-14 00:00:00  129.00  130.22  125.47  125.70   
1      A       Stock  2022-10-17 00:00:00  127.38  131.09  127.38  130.56   
2      A       Stock  2022-10-18 00:00:00  133.92  134.68  131.20  132.30   
3      A       Stock  2022-10-19 00:00:00  130.11  130.27  127.24  128.96   
4      A    

In [3]:
# --- Config ---
# READING NECESSARY HISTORICAL INFORMATION TO DO ANALYSIS CALCS
#
# Pulls the most recent ROWS_PER_TICKER daily bars per ticker from
# fact_price_daily (joined to dim_ticker for the symbol) into `prices`.

DB_PATH = "/Users/martingobbo/stock-dashboard/data/serving/analytics.duckdb"
ROWS_PER_TICKER = 750  # most recent N rows per ticker

# ROW_NUMBER() ranks each ticker's rows newest-first; rn <= N keeps the latest N,
# and the outer ORDER BY returns them oldest-first per ticker.
sql = f"""
WITH ranked AS (
  SELECT
    f.ticker_id,
    t.ticker,
    f.dt,
    f.open, f.high, f.low, f.close, f.adj_close, f.volume,
    ROW_NUMBER() OVER (PARTITION BY f.ticker_id ORDER BY f.dt DESC) AS rn
  FROM fact_price_daily AS f
  JOIN dim_ticker       AS t USING (ticker_id)
)
SELECT
  ticker, dt, open, high, low, close, adj_close, volume
FROM ranked
WHERE rn <= {ROWS_PER_TICKER}
ORDER BY ticker, dt
"""

# Connect read-only (avoids lock conflicts) and release the handle as soon as
# the frame is materialized, so it cannot linger while later cells reconnect.
con = duckdb.connect(DB_PATH, read_only=True)
try:
    prices = con.execute(sql).df()
finally:
    con.close()

print( "\n", prices.tail())

# COMBINES TODAYS PRICE DATA TO HISTORICAL DATABASE STOCK PRICE DATA
#
# Builds `combined`: the database history (`prices`) plus the freshly
# downloaded bars (`data`), one row per (ticker, dt), sorted by ticker and date.

# Step 1: Align column names in 'data'
data_renamed = data.rename(columns={
    "date": "dt",
    "adjClose": "adj_close"
})

# Step 2: Ensure same column order as 'prices' (copy so `data` is never written to)
data_renamed = data_renamed[["ticker", "dt", "open", "high", "low", "close", "adj_close", "volume"]].copy()

# Step 3: Put both date columns on the same midnight-normalized datetime dtype,
# so sorting and de-duplication compare like with like.
data_renamed["dt"] = pd.to_datetime(data_renamed["dt"]).dt.normalize()
prices_hist = prices.assign(dt=pd.to_datetime(prices["dt"]).dt.normalize())

# Step 4: Combine with prices. The fresh download goes last and wins on a
# (ticker, dt) collision; otherwise re-running after the upsert would count
# the target day twice in every rolling window.
combined = pd.concat([prices_hist, data_renamed], ignore_index=True)
combined = combined.drop_duplicates(subset=["ticker", "dt"], keep="last")

# Step 5: Sort by ticker, then date
combined = combined.sort_values(["ticker", "dt"]).reset_index(drop=True)

# (Optional) quick check
print(combined.head(), "\n", combined.tail())


# ANALYSIS ON OHLCV DATA. SPICY STUFF.


import math
import numpy as np
import pandas as pd

# TARGET_DATE must already be defined upstream; coerce it to a pandas datetime here
TARGET_DATE = pd.to_datetime(TARGET_DATE)

# ----------------------- CONFIG -----------------------
TRADING_DAYS_PER_YEAR = 252

# Rolling-window lengths, counted in trading days
WIN_05, WIN_10, WIN_15, WIN_20 = 5, 10, 15, 20
WIN_50, WIN_60, WIN_100 = 50, 60, 100
WIN_200, WIN_252, WIN_300, WIN_750 = 200, 252, 300, 750

# Length of the short moving average applied to the range-position series
SMA_POS_LEN = 3

# kept for reproducibility, not used now
OBV_BASELINE_DATE = pd.Timestamp("2004-01-01")

# -------------------- Helper functions --------------------
def ols_slope_window(arr: np.ndarray) -> float:
    """Least-squares slope of the finite values in `arr` against 0, 1, 2, ...

    Non-finite entries are removed first and the remaining values are
    regressed on their position in the filtered array, not their original
    position. Returns NaN when fewer than two finite values remain.
    """
    finite_vals = arr[np.isfinite(arr)]
    count = finite_vals.size
    if count < 2:
        return np.nan

    positions = np.arange(count, dtype=float)
    pos_mean = positions.mean()
    val_mean = finite_vals.mean()

    spread = np.sum((positions - pos_mean) ** 2)
    if spread == 0:
        return np.nan

    cross = np.sum((positions - pos_mean) * (finite_vals - val_mean))
    return float(cross / spread)

def max_drawdown_only(arr: np.ndarray) -> float:
    """Largest peak-to-trough decline in `arr`, as a non-positive fraction.

    Entries that are non-finite or <= 0 are ignored. The first usable entry
    seeds the running peak; each later usable entry either raises the peak or
    is measured against it. Returns NaN when the array has no finite value or
    no positive finite value, and 0.0 when prices never fall below the peak.
    """
    prices = np.asarray(arr, dtype=float)
    if not np.isfinite(prices).any():
        return np.nan

    running_peak = None
    worst_drop = 0.0
    for px in prices:
        if not (math.isfinite(px) and px > 0.0):
            continue
        if running_peak is None:
            # First usable price only establishes the starting peak
            running_peak = px
            continue
        if px > running_peak:
            running_peak = px
        drop = px / running_peak - 1.0
        if drop < worst_drop:
            worst_drop = drop

    if running_peak is None:
        return np.nan
    return float(worst_drop)

def max_drawdown_duration_only(arr: np.ndarray) -> float:
    """Array-position distance from peak to trough of the deepest drawdown.

    Uses the same running-peak scan as the drawdown-depth helper: entries that
    are non-finite or <= 0 are ignored, but they still occupy a position, so
    the distance is measured in array slots. Returns NaN when there is no
    positive finite value and 0.0 when no drawdown occurs.
    """
    prices = np.asarray(arr, dtype=float)
    if not np.isfinite(prices).any():
        return np.nan

    peak_val = None
    peak_pos = -1
    worst_drop = 0.0
    worst_span = 0
    for pos, px in enumerate(prices):
        if not (math.isfinite(px) and px > 0.0):
            continue
        if peak_val is None or px > peak_val:
            # New running peak (the first usable price seeds it)
            peak_val = px
            peak_pos = pos
        drop = px / peak_val - 1.0
        if drop < worst_drop:
            worst_drop = drop
            worst_span = pos - peak_pos

    if peak_val is None:
        return np.nan
    return float(worst_span)

def per_ticker_metrics_all_rows(g: pd.DataFrame) -> pd.DataFrame:
    """Compute the full metric panel for one ticker, one output row per input row.

    Parameters
    ----------
    g : pd.DataFrame
        Rows of a single ticker. Must contain ``date`` (datetime-like),
        ``ticker``, ``open``, ``high``, ``low`` and ``adj_close``.
        ``volume`` and ``close`` are optional.

    Returns
    -------
    pd.DataFrame
        Indexed like the date-sorted input. Holds ``date``, ``ticker``, the
        OHLCV inputs (plus the raw ``close`` when the input has one, so it can
        be persisted next to ``adj_close``) and every derived metric column.
        All metrics use ``adj_close`` as the working close. Infinite values
        are replaced by NaN, and the range-position, range z-score and 100-day
        drawdown columns are zero-filled.

    Notes
    -----
    Window lengths come from the module-level ``WIN_*`` constants, and the
    annualization factor from ``TRADING_DAYS_PER_YEAR``.
    """
    g = g.sort_values("date").copy()

    # Base series
    close = g["adj_close"].astype(float)
    open_ = g["open"].astype(float)
    high  = g["high"].astype(float)
    low   = g["low"].astype(float)
    if "volume" in g.columns:
        vol = pd.to_numeric(g["volume"], errors="coerce").astype(float)
    else:
        vol = pd.Series(np.nan, index=g.index, dtype=float)

    # Safe variants for logs/divisions
    close_safe = close.replace(0, np.nan)

    # ---------- Core logs & returns ----------
    with np.errstate(divide='ignore', invalid='ignore'):
        log_price = np.log(close_safe)
    prev_close = close_safe.shift(1)
    with np.errstate(divide='ignore', invalid='ignore'):
        log_returns = np.where((close_safe > 0) & (prev_close > 0),
                               np.log(close_safe / prev_close), np.nan)
    log_returns = pd.Series(log_returns, index=g.index)
    # The first row has no previous close; its return is defined as 0.0
    if len(g) > 0 and pd.isna(log_returns.iloc[0]):
        log_returns.iloc[0] = 0.0

    # ---------- Moving averages / VWAP (momentum labels removed) ----------
    ma20   = close.rolling(WIN_20,  min_periods=1).mean()
    ma50   = close.rolling(WIN_50,  min_periods=1).mean()
    ma100  = close.rolling(WIN_100, min_periods=1).mean()
    ma200  = close.rolling(WIN_200, min_periods=1).mean()

    dv = close * vol
    sum_px_vol_20 = dv.rolling(WIN_20, min_periods=1).sum()
    sum_vol_20    = vol.rolling(WIN_20, min_periods=1).sum()
    with np.errstate(invalid='ignore', divide='ignore'):
        vwap20 = np.where(sum_vol_20 > 0, sum_px_vol_20 / sum_vol_20, np.nan)

    # Dollar-volume accelerations
    with np.errstate(divide='ignore', invalid='ignore'):
        ln_dv = np.where((vol > 0) & (close > 0), np.log(vol * close), np.nan)
    ln_dv = pd.Series(ln_dv, index=g.index)
    vol_accel_5d  = ln_dv - ln_dv.shift(5)
    vol_accel_10d = ln_dv - ln_dv.shift(10)

    # 10v60 abnormal volume
    avg10_dv = dv.rolling(WIN_10, min_periods=1).mean()
    avg60_dv = dv.rolling(WIN_60, min_periods=1).mean()
    std60_dv = dv.rolling(WIN_60, min_periods=2).std(ddof=1)
    with np.errstate(invalid='ignore', divide='ignore'):
        abn_vol_60d = np.where(std60_dv > 0, (avg10_dv - avg60_dv) / std60_dv, np.nan)

    # Volatility (annualized) & mean return 100d
    vol20_ann  = log_returns.rolling(WIN_20,  min_periods=2).std(ddof=1) * np.sqrt(TRADING_DAYS_PER_YEAR)
    vol100_ann = log_returns.rolling(WIN_100, min_periods=2).std(ddof=1) * np.sqrt(TRADING_DAYS_PER_YEAR)
    mean100    = log_returns.rolling(WIN_100, min_periods=1).mean()

    # Range/position block
    low_10  = low.rolling(WIN_10, min_periods=1).min()
    high_10 = high.rolling(WIN_10, min_periods=1).max()
    rng_10  = high_10 - low_10
    with np.errstate(invalid='ignore', divide='ignore'):
        pos_10d = np.where(rng_10 != 0, (close - low_10) / rng_10, 0.0)
    pos_10d = pd.Series(pos_10d, index=g.index)
    five_day_range_pos = pos_10d.rolling(SMA_POS_LEN, min_periods=1).mean()

    daily_range = (high - low)
    avg_rng_10  = daily_range.rolling(WIN_10, min_periods=1).mean()
    avg_rng_60  = daily_range.rolling(WIN_60, min_periods=1).mean()
    std_rng_60  = daily_range.rolling(WIN_60, min_periods=2).std(ddof=1)
    with np.errstate(invalid='ignore', divide='ignore'):
        z_60_10_highlowrange = np.where(std_rng_60 > 0, (avg_rng_10 - avg_rng_60) / std_rng_60, 0.0)

    # Multi-horizon log returns (use 0.0 when lag missing)
    def safe_lr(curr, lagged):
        # np.where evaluates the log for every row, including masked ones;
        # silence the warnings those masked rows would raise.
        with np.errstate(divide='ignore', invalid='ignore'):
            return np.where((curr > 0) & (lagged > 0), np.log(curr / lagged), 0.0)

    ret5   = safe_lr(close, close.shift(5))
    ret10  = safe_lr(close, close.shift(10))
    ret20  = safe_lr(close, close.shift(20))
    ret40  = safe_lr(close, close.shift(40))
    ret60  = safe_lr(close, close.shift(60))
    ret200 = safe_lr(close, close.shift(200))
    ret300 = safe_lr(close, close.shift(300))

    # Median return over last 100 trading days
    median_return_100d = log_returns.rolling(WIN_100, min_periods=1).median()

    # 100-day rolling drawdown percent & duration (calendar-day duration)
    closes_by_date = pd.Series(close.values, index=g["date"])
    def _dd_percent(window: pd.Series) -> float:
        # Decline from the window's highest close to the lowest close after it
        w = window.dropna()
        if len(w) <= 1:
            return 0.0
        dmax = w.idxmax()
        max_close = w.loc[dmax]
        suffix = w.loc[dmax:]
        if suffix.empty:
            return 0.0
        return float(suffix.min() / max_close - 1.0)

    def _dd_duration(window: pd.Series) -> float:
        # Calendar days between the window's highest close and the later low
        w = window.dropna()
        if len(w) <= 1:
            return 0.0
        dmax = w.idxmax()
        dmin = w.loc[dmax:].idxmin()
        return float((dmin - dmax).days)

    drawdown_percent_100 = closes_by_date.rolling(WIN_100, min_periods=1).apply(_dd_percent, raw=False).values
    drawdown_days_100    = closes_by_date.rolling(WIN_100, min_periods=1).apply(_dd_duration, raw=False).values

    # ---------- Extended analytics (non-annualized vols, deviations, slopes, etc.) ----------
    lr = pd.Series(log_returns, index=g.index)
    vol_5   = lr.rolling(WIN_05,  min_periods=WIN_05).std(ddof=1)
    vol_15  = lr.rolling(WIN_15,  min_periods=WIN_15).std(ddof=1)
    vol_60  = lr.rolling(WIN_60,  min_periods=WIN_60).std(ddof=1)
    vol_252 = lr.rolling(WIN_252, min_periods=WIN_252).std(ddof=1)

    # Root-mean-square of the negative / positive parts of the return series
    neg = np.minimum(lr, 0.0)
    pos = np.maximum(lr, 0.0)
    dd_15  = (pd.Series(neg).pow(2).rolling(WIN_15,  min_periods=WIN_15).mean()) ** 0.5
    dd_60  = (pd.Series(neg).pow(2).rolling(WIN_60,  min_periods=WIN_60).mean()) ** 0.5
    dd_252 = (pd.Series(neg).pow(2).rolling(WIN_252, min_periods=WIN_252).mean()) ** 0.5
    ud_15  = (pd.Series(pos).pow(2).rolling(WIN_15,  min_periods=WIN_15).mean()) ** 0.5
    ud_60  = (pd.Series(pos).pow(2).rolling(WIN_60,  min_periods=WIN_60).mean()) ** 0.5
    ud_252 = (pd.Series(pos).pow(2).rolling(WIN_252, min_periods=WIN_252).mean()) ** 0.5

    # Parkinson HL volatility (20d), non-annualized
    with np.errstate(divide='ignore', invalid='ignore'):
        hl_log = np.log((high.replace(0, np.nan)) / (low.replace(0, np.nan)))
    k = 1.0 / (4.0 * math.log(2.0))
    pk20 = np.sqrt(k * (hl_log.pow(2).rolling(WIN_20, min_periods=WIN_20).mean()))

    # Change in 10-day cumulative log returns
    sum10 = lr.rolling(WIN_10, min_periods=WIN_10).sum()
    change_10dayret = sum10 - sum10.shift(WIN_10)

    # 60d return acceleration via slopes of log_price
    with np.errstate(divide='ignore', invalid='ignore'):
        log_close = pd.Series(np.log(close.replace(0, np.nan)))
    slope_lp_recent60 = log_close.rolling(WIN_60, min_periods=WIN_60).apply(ols_slope_window, raw=True)
    slope_lp_prev60   = slope_lp_recent60.shift(WIN_60)
    ret_accel_60      = slope_lp_recent60 - slope_lp_prev60

    # Vol slopes
    slope_vol60_over20  = vol_60.rolling(WIN_20, min_periods=WIN_20).apply(ols_slope_window, raw=True)
    slope_vol252_over60 = vol_252.rolling(WIN_60, min_periods=WIN_60).apply(ols_slope_window, raw=True)

    # Dollar volume long windows & correlation
    dv_sma_252 = dv.rolling(WIN_252, min_periods=WIN_252).mean()
    dv_sma_60  = dv.rolling(WIN_60,  min_periods=WIN_60).mean()
    dv252_accel_60 = dv_sma_252.rolling(WIN_60, min_periods=WIN_60).apply(ols_slope_window, raw=True)
    corr_px_dv_60  = close.rolling(WIN_60, min_periods=WIN_60).corr(dv)

    # EMA(5) of 15d volatility
    ema5_of_vol15 = vol_15.ewm(span=5, adjust=False).mean()

    # Rolling 750-day max drawdown & duration (in trading days)
    mdd_750     = close.rolling(WIN_750, min_periods=2).apply(max_drawdown_only, raw=True)
    mdd_dur_750 = close.rolling(WIN_750, min_periods=2).apply(max_drawdown_duration_only, raw=True)

    # ---------- Assemble ----------
    # Original OHLCV (with "adj_close" as the working close). The raw close is
    # passed through when the input has it; without it the downstream price
    # upsert has no `close` column to write.
    price_fields = {
        "open": open_.values,
        "high": high.values,
        "low": low.values,
    }
    if "close" in g.columns:
        price_fields["close"] = pd.to_numeric(g["close"], errors="coerce").astype(float).values
    price_fields["adj_close"] = close.values
    price_fields["volume"] = vol.values

    out = pd.DataFrame({
        "date": g["date"].values,
        "ticker": g["ticker"].values,

        **price_fields,

        # SQL-parity / price-volume set (momentum labels removed)
        "log_returns": lr.values,
        "volatility_20d": vol20_ann.values,
        "volatility_100d": vol100_ann.values,
        "mean_return_100d": mean100.values,
        "moving_avg_20d": ma20.values,
        "moving_avg_50d": ma50.values,
        "moving_avg_100d": ma100.values,
        "moving_avg_200d": ma200.values,
        "vwap_20d": vwap20,
        "vol_accel_5d": vol_accel_5d.values,
        "vol_accel_10d": vol_accel_10d.values,
        "abn_vol_60d": abn_vol_60d,
        "5_day_range_pos": five_day_range_pos.values,
        "60_10_highlowrange_zscore": z_60_10_highlowrange,
        "5_day_ret": ret5,
        "10_day_ret": ret10,
        "20_day_ret": ret20,
        "40_day_ret": ret40,
        "60_day_ret": ret60,
        "200_day_ret": ret200,
        "300_day_ret": ret300,
        "median_return_100d": median_return_100d.values,
        "drawdown_percent": drawdown_percent_100,
        "drawdown_duration_days": drawdown_days_100,

        # Extended analytics (no OBV fields)
        "log_prices": log_price.values,
        "change_10dayret": change_10dayret.values,
        "slope_over60_of_logprice": slope_lp_recent60.values,
        "prior_slope_over60_of_logprice": slope_lp_prev60.values,
        "60d_return_accel": ret_accel_60.values,

        "750d_drawdown": mdd_750.values,
        "750d_drawdownduration": mdd_dur_750.values,

        "15d_downsidedeviation": dd_15.values,
        "60d_downsidedeviation": dd_60.values,
        "252d_downsidedeviation": dd_252.values,

        "15d_upsidevolatility": ud_15.values,
        "60d_upsidevolatility": ud_60.values,
        "252d_upsidevolatility": ud_252.values,

        "5d_volatility": vol_5.values,
        "15d_volatility": vol_15.values,
        "60d_volatility": vol_60.values,
        "252d_volatility": vol_252.values,

        "20d_parkinson_HL_volatility": pk20.values,
        "5d_EMA_15dayvolatility": ema5_of_vol15.values,

        "slope_over20_of_60d_volatility": slope_vol60_over20.values,
        "slope_over60_of_252d_volatility": slope_vol252_over60.values,

        "252d_dollar_volume_SMA": dv_sma_252.values,
        "60d_dollar_volume_SMA":  dv_sma_60.values,
        "252d_dollar_volume_accel": dv252_accel_60.values,
        "60d_price_dollarVolume_correlation": corr_px_dv_60.values,
    }, index=g.index)

    # Clean edge cases
    out.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Previously we were filling core fields with 0.0 (including returns & vol).
    # Now we *only* zero-fill the handful of structural indicators
    # where SQL compatibility requires it (like range positions, z-scores, drawdowns).
    zero_fill_cols = [
        "5_day_range_pos","60_10_highlowrange_zscore",
        "drawdown_percent","drawdown_duration_days"
    ]
    out[zero_fill_cols] = out[zero_fill_cols].fillna(0.0)

    return out

# -------------------- Run for TARGET_DATE (from `combined`) --------------------
# Builds `daily_metrics`: one metric row per ticker for TARGET_DATE, computed
# over each ticker's full history in `combined`.
# The working frame gets a real name; binding it to `_` shadows IPython's
# last-output variable.
ohlcv_panel = combined.copy()
ohlcv_panel.columns = [c.strip().lower() for c in ohlcv_panel.columns]

# Build a proper datetime column named 'date'
if "dt" in ohlcv_panel.columns and "date" not in ohlcv_panel.columns:
    ohlcv_panel["date"] = pd.to_datetime(ohlcv_panel["dt"])
else:
    ohlcv_panel["date"] = pd.to_datetime(ohlcv_panel["date"])

for c in ["open","high","low","close","adj_close"]:
    ohlcv_panel[c] = pd.to_numeric(ohlcv_panel[c], errors="coerce")
if "volume" in ohlcv_panel.columns:
    ohlcv_panel["volume"] = pd.to_numeric(ohlcv_panel["volume"], errors="coerce")

ohlcv_panel = ohlcv_panel.sort_values(["ticker","date"]).reset_index(drop=True)

# Compare on PURE DATE to avoid dtype/timezone issues
T_DATE = pd.Timestamp(TARGET_DATE).date()

daily_chunks = []
for tkr, g in ohlcv_panel.groupby("ticker", sort=False):
    # Skip tickers with no bar on the target calendar date before doing the
    # expensive per-ticker metric computation
    if not (g["date"].dt.date == T_DATE).any():
        continue
    all_rows = per_ticker_metrics_all_rows(g)
    day_row = all_rows[all_rows["date"].dt.date == T_DATE]
    if not day_row.empty:
        daily_chunks.append(day_row)

daily_metrics = (
    pd.concat(daily_chunks, ignore_index=True)
      .sort_values(["ticker","date"])
      .reset_index(drop=True)
    if daily_chunks else pd.DataFrame()
)

print(f"Daily metrics rows for {T_DATE}: {len(daily_metrics):,}")
# A bare `daily_metrics.head()` in the middle of a cell displays nothing; print it
print(daily_metrics.head())

num_rows = len(data)
print(num_rows)

#CHANGING DATA TO DATABASE FORMAT
#
# Replaces the `ticker` column of `daily_metrics` with `ticker_id` and renames
# every metric column whose name is a known metric_code to its metric_id.

# You already have daily_metrics as a pandas DataFrame
# Assumes it has at least: 'ticker' column, plus metric columns named by metric_code, and possibly 'date' etc.

db_path = "/Users/martingobbo/stock-dashboard/data/serving/analytics.duckdb"

# --- Connect & load dimensions (connection released even if a query fails) ---
con = duckdb.connect(db_path, read_only=True)
try:
    dim_ticker = con.execute("""
    SELECT ticker_id, ticker
    FROM dim_ticker
""").df()

    dim_metric = con.execute("""
    SELECT metric_id, metric_code
    FROM dim_metric
""").df()
finally:
    con.close()

# --- Map ticker -> ticker_id ---
ticker_map = dict(zip(dim_ticker["ticker"], dim_ticker["ticker_id"]))
daily_metrics["ticker_id"] = daily_metrics["ticker"].map(ticker_map)

# Sanity check for unmapped tickers: warn, then drop them, because a row with a
# missing ticker_id cannot be keyed in the fact tables.
unmapped_mask = daily_metrics["ticker_id"].isna()
unmapped_tickers = daily_metrics.loc[unmapped_mask, "ticker"].unique()
if len(unmapped_tickers):
    print(f"[WARN] {len(unmapped_tickers)} tickers not found in dim_ticker:", unmapped_tickers[:10], "..." if len(unmapped_tickers) > 10 else "")
    print(f"[WARN] Dropping {int(unmapped_mask.sum())} rows with no ticker_id.")
    daily_metrics = daily_metrics.loc[~unmapped_mask].copy()
    # The NaNs forced the mapped column to float; restore the dimension's dtype
    daily_metrics["ticker_id"] = daily_metrics["ticker_id"].astype(dim_ticker["ticker_id"].dtype)

# Drop original ticker
daily_metrics = daily_metrics.drop(columns=["ticker"])

# --- Rename metric_code columns -> metric_id ---
metric_map = dict(zip(dim_metric["metric_code"], dim_metric["metric_id"]))

# Build rename dict only for columns that match known metric_codes
rename_dict = {col: metric_map[col] for col in daily_metrics.columns if col in metric_map}

# Apply rename
daily_metrics = daily_metrics.rename(columns=rename_dict)

# Optional: report which metric columns could not be mapped
metric_like_cols = [c for c in daily_metrics.columns if c not in {"date", "ticker_id"}]
unmapped_metrics = [c for c in metric_like_cols if c not in metric_map.values()]  # after rename, mapped ones are metric_ids
if unmapped_metrics:
    print(f"[INFO] {len(unmapped_metrics)} columns did not match any metric_code -> metric_id mapping (kept as-is):")
    print(unmapped_metrics[:20], "..." if len(unmapped_metrics) > 20 else "")

# --- Show result ---
print(daily_metrics.head())

# SENDING THE DATA TO THE DUCK DATA BASE
#
# Upserts the price columns of `daily_metrics` into fact_price_daily, unpivots
# the mapped metric columns into fact_metric_daily, then refreshes
# snapshot_metric_latest from fact_metric_daily.
# Reads `daily_metrics` and `metric_map` from earlier in this cell.

db_path = "/Users/martingobbo/stock-dashboard/data/serving/analytics.duckdb"

# --- Assumes you already have a pandas DataFrame named `daily_metrics` in memory ---
PRICE_COLS_ORDERED = ["open", "high", "low", "close", "adj_close", "volume"]

# --- Read-write connection; the try/finally releases it even when a statement raises ---
con = duckdb.connect(db_path)
try:
    # 1) Normalize date column -> 'dt' (date type)
    df = daily_metrics.copy()
    if "dt" not in df.columns and "date" in df.columns:
        df = df.rename(columns={"date": "dt"})
    if "dt" not in df.columns:
        raise ValueError("daily_metrics must have a 'dt' or 'date' column.")
    df["dt"] = pd.to_datetime(df["dt"]).dt.date

    # Validate the key column before anything indexes it
    missing_id_cols = [c for c in ["ticker_id"] if c not in df.columns]
    if missing_id_cols:
        raise ValueError(f"Missing required ID columns in daily_metrics: {missing_id_cols}")

    # Rows whose ticker was not found in dim_ticker carry a NaN ticker_id; they
    # can never match the MERGE key, so drop them instead of sending them on.
    n_unkeyed = int(df["ticker_id"].isna().sum())
    if n_unkeyed:
        print(f"[WARN] Dropping {n_unkeyed} rows with no ticker_id before upsert.")
        df = df.dropna(subset=["ticker_id"])

    # 2) Insert/Upsert into fact_price_daily
    present_price_cols = [c for c in PRICE_COLS_ORDERED if c in df.columns]

    if present_price_cols:
        price_insert_cols = ["ticker_id", "dt"] + present_price_cols
        df_price = df[price_insert_cols].copy()
        con.register("df_price", df_price)

        set_clauses = ", ".join([f"{c} = s.{c}" for c in present_price_cols])  # no qualifier on left
        insert_cols = ", ".join(price_insert_cols)
        insert_vals = ", ".join([f"s.{c}" for c in price_insert_cols])

        merge_sql_price = f"""
    BEGIN TRANSACTION;

    MERGE INTO fact_price_daily t
    USING df_price s
    ON t.ticker_id = s.ticker_id AND t.dt = s.dt
    WHEN MATCHED THEN UPDATE SET {set_clauses}
    WHEN NOT MATCHED THEN INSERT ({insert_cols}) VALUES ({insert_vals});

    COMMIT;
    """
        con.execute(merge_sql_price)
    else:
        df_price = pd.DataFrame(columns=["ticker_id","dt"])  # for counts later

    # 3) Drop price columns from the working DataFrame
    df_no_price = df.drop(columns=[c for c in PRICE_COLS_ORDERED if c in df.columns])

    # 4) Unpivot metrics and upsert into fact_metric_daily
    NON_METRIC = set(["ticker_id", "dt"] + PRICE_COLS_ORDERED)
    candidate_cols = [c for c in df_no_price.columns if c not in NON_METRIC]

    # Only columns that were renamed to a metric_id can be keyed in the fact
    # table; columns still carrying their original name are skipped.
    known_metric_ids = set(metric_map.values())
    metric_cols = [c for c in candidate_cols if c in known_metric_ids]
    skipped_cols = [c for c in candidate_cols if c not in known_metric_ids]
    if skipped_cols:
        print(f"[WARN] Skipping {len(skipped_cols)} columns with no metric_id:",
              skipped_cols[:20], "..." if len(skipped_cols) > 20 else "")

    if metric_cols:
        long_metrics = df_no_price.melt(
            id_vars=["ticker_id", "dt"],
            value_vars=metric_cols,
            var_name="metric_id",
            value_name="value"
        ).dropna(subset=["value"])

        # Optionally coerce numeric values
        # long_metrics["value"] = pd.to_numeric(long_metrics["value"], errors="coerce").dropna()

        con.register("df_metrics_long", long_metrics)

        merge_sql_metrics = """
    BEGIN TRANSACTION;

    MERGE INTO fact_metric_daily t
    USING df_metrics_long s
    ON t.ticker_id = s.ticker_id AND t.dt = s.dt AND t.metric_id = s.metric_id
    WHEN MATCHED THEN UPDATE SET value = s.value
    WHEN NOT MATCHED THEN INSERT (metric_id, dt, ticker_id, value)
    VALUES (s.metric_id, s.dt, s.ticker_id, s.value);

    COMMIT;
    """
        con.execute(merge_sql_metrics)
    else:
        long_metrics = pd.DataFrame(columns=["ticker_id","dt","metric_id","value"])

    # 5) Update your in-session daily_metrics without price columns
    daily_metrics = df_no_price

    # Quick peek
    print("Inserted/updated prices for rows:", len(df_price))
    print("Inserted/updated metric rows:", len(long_metrics))
    print("daily_metrics columns after dropping price fields:", list(daily_metrics.columns))

    # 6) Ensure snapshot table exists (idempotent)
    con.execute("""
CREATE TABLE IF NOT EXISTS snapshot_metric_latest (
  ticker_id INTEGER NOT NULL,
  metric_id INTEGER NOT NULL,
  dt        DATE    NOT NULL,
  value     DOUBLE,
  PRIMARY KEY (ticker_id, metric_id)
);
""")

    # 7) Materialize latest rows WITHOUT window functions (no QUALIFY)
    #    We join fact_metric_daily to its per-key MAX(dt)
    con.execute("""
CREATE OR REPLACE TEMP TABLE _latest_rows AS
WITH mx AS (
  SELECT ticker_id, metric_id, MAX(dt) AS dt
  FROM fact_metric_daily
  GROUP BY 1,2
)
SELECT
  f.ticker_id,
  f.metric_id,
  CAST(f.dt AS DATE) AS dt,
  f.value
FROM fact_metric_daily f
JOIN mx
  ON f.ticker_id = mx.ticker_id
 AND f.metric_id = mx.metric_id
 AND f.dt = mx.dt;
""")

    # 8) Upsert into snapshot via MERGE (only rewrites rows whose dt/value changed)
    con.execute("""
MERGE INTO snapshot_metric_latest AS t
USING _latest_rows AS s
ON t.ticker_id = s.ticker_id AND t.metric_id = s.metric_id
WHEN MATCHED AND (t.dt <> s.dt OR (t.value IS DISTINCT FROM s.value)) THEN
  UPDATE SET dt = s.dt, value = s.value
WHEN NOT MATCHED THEN
  INSERT (ticker_id, metric_id, dt, value)
  VALUES (s.ticker_id, s.metric_id, s.dt, s.value);
""")

    # 9) Optional: prune keys that no longer exist in fact_metric_daily
    con.execute("""
DELETE FROM snapshot_metric_latest t
WHERE NOT EXISTS (
  SELECT 1 FROM (
    SELECT DISTINCT ticker_id, metric_id FROM fact_metric_daily
  ) k
  WHERE k.ticker_id = t.ticker_id AND k.metric_id = t.metric_id
);
""")

    # 10) Sanity check
    print("Rows in snapshot:", con.execute("SELECT COUNT(*) FROM snapshot_metric_latest").fetchone()[0])
    print(con.execute("""
  SELECT * FROM snapshot_metric_latest
  ORDER BY ticker_id, metric_id
  LIMIT 10
""").fetchdf())
finally:
    # Always release the database file, including after a failed MERGE
    con.close()



        ticker         dt    open    high     low  close  adj_close   volume
376081    ZTS 2025-09-26  141.86  143.79  141.27    NaN     143.50  2524344
376082    ZTS 2025-09-29  143.86  144.15  142.50    NaN     143.06  2854165
376083    ZTS 2025-09-30  142.89  147.00  142.40    NaN     146.32  3699138
376084    ZTS 2025-10-01  146.32  147.38  145.00    NaN     146.95  3669613
376085    ZTS 2025-10-06  146.43  147.04  144.85    NaN     145.36  3114876
  ticker                   dt    open    high     low   close  adj_close  \
0      A  2022-10-10 00:00:00  128.05  128.05  124.31  125.95     123.47   
1      A  2022-10-11 00:00:00  125.00  127.89  124.16  125.64     123.17   
2      A  2022-10-12 00:00:00  126.20  127.10  125.27  125.69     123.22   
3      A  2022-10-13 00:00:00  123.00  128.83  122.35  127.90     125.38   
4      A  2022-10-14 00:00:00  129.00  130.22  125.47  125.70     123.23   

    volume  
0  1246005  
1  1985348  
2  1069633  
3  1553600  
4  1217213   
      

In [5]:
import sys
!{sys.executable} -m pip install duckdb


Collecting duckdb
  Using cached duckdb-1.4.0-cp311-cp311-macosx_10_9_x86_64.whl.metadata (14 kB)
Using cached duckdb-1.4.0-cp311-cp311-macosx_10_9_x86_64.whl (17.3 MB)
Installing collected packages: duckdb
Successfully installed duckdb-1.4.0


In [None]:
!pip install duckdb