In [4]:
#!/usr/bin/env python3
import asyncio, aiohttp
import duckdb
import pandas as pd
from datetime import datetime, timezone



# ====== USER SETTINGS ======
DB_PATH = "/Users/martingobbo/stock-dashboard/data/serving/analytics.duckdb"
MAX_WORKERS = 6          # ~5–6 is safe for FMP Starter (~300/min)
TIMEOUT_SEC = 10
RETRIES = 2
API_KEY = "c5PobUQjaaMTHySILWqmWi9uyIDqYJBi"

BASE = "https://financialmodelingprep.com"

# ====== DB CONNECT ======
con = duckdb.connect(DB_PATH, read_only=True)
tickers = [t[0] for t in con.execute("SELECT ticker FROM dim_ticker").fetchall()]
con.close()

# ====== HTTP HELPERS ======
sem = asyncio.Semaphore(MAX_WORKERS)

async def fetch_json(session: aiohttp.ClientSession, url: str):
    for attempt in range(RETRIES):
        try:
            async with sem:
                async with session.get(url, timeout=TIMEOUT_SEC) as r:
                    if r.status == 429:
                        await asyncio.sleep(0.8)  # short pause
                        continue
                    if r.status == 200:
                        return await r.json()
                    return None
        except Exception:
            await asyncio.sleep(0.3 * (attempt + 1))
    return None

def parse_shares_outstanding(payload):
    if isinstance(payload, list) and payload:
        rec = payload[0]
        so = rec.get("sharesOutstanding")
        if so and so > 0:
            return float(so)
    return None

def parse_float(payload):
    if isinstance(payload, list) and payload:
        rec = payload[0]
        for k in ("floatShares", "sharesFloat", "freeFloat", "float"):
            v = rec.get(k)
            if v and v > 0:
                return float(v)
    return None

# ====== PER-TICKER WORK ======
async def fetch_one(session, sym: str):
    # shares outstanding (prefer TTM, then latest KM)
    km_ttm = await fetch_json(session, f"{BASE}/stable/key-metrics-ttm?symbol={sym}&apikey={API_KEY}")
    shares_out = parse_shares_outstanding(km_ttm)
    if shares_out is None:
        km = await fetch_json(session, f"{BASE}/stable/key-metrics?symbol={sym}&limit=1&apikey={API_KEY}")
        shares_out = parse_shares_outstanding(km)

    # float shares (prefer stable, then v4)
    f1 = await fetch_json(session, f"{BASE}/stable/shares-float?symbol={sym}&apikey={API_KEY}")
    float_shares = parse_float(f1)
    if float_shares is None:
        f2 = await fetch_json(session, f"{BASE}/api/v4/shares_float?symbol={sym}&apikey={API_KEY}")
        float_shares = parse_float(f2)

    float_pct = (float_shares / shares_out) if (float_shares and shares_out and shares_out > 0) else None

    return {
        "ticker": sym,
        "shares_outstanding": shares_out,
        "float_shares": float_shares,
        "float_pct": float_pct,
        "dt": datetime.now(timezone.utc).strftime("%Y-%m-%d")
    }

# ====== MAIN ======
async def main() -> pd.DataFrame:
    conn = aiohttp.TCPConnector(limit=None, ssl=False)
    async with aiohttp.ClientSession(connector=conn) as session:
        rows = []
        for i in range(0, len(tickers), 30):  # batch 30 tickers at a time
            batch = tickers[i:i+30]
            batch_rows = await asyncio.gather(*[fetch_one(session, sym) for sym in batch])
            rows.extend(batch_rows)
            await asyncio.sleep(5)  # ≈300/minute pacing for Starter
    df = pd.DataFrame(rows)
    print(df.head())
    print(f"Completed {len(df)} tickers")
    return df  # ✅ return the DataFrame

# ====== SAFE RUNNER ======
if __name__ == "__main__":
    try:
        import nest_asyncio
        nest_asyncio.apply()
        loop = asyncio.get_running_loop()
        df = loop.run_until_complete(main())  # ✅ df available after run
    except RuntimeError:
        df = asyncio.run(main())              # ✅ df available after run

    # df is now defined in memory; optionally persist if you want:
    # df.to_parquet("/Users/martingobbo/stock-dashboard/data/serving/shares_info.parquet", index=False)
    # df.to_csv("/Users/martingobbo/stock-dashboard/data/serving/shares_info.csv", index=False)


  ticker shares_outstanding  float_shares float_pct          dt
0      A               None  2.824940e+08      None  2025-10-08
1   AAPL               None  1.481427e+10      None  2025-10-08
2   ABBV               None  1.762954e+09      None  2025-10-08
3   ABNB               None  5.995028e+08      None  2025-10-08
4    ABT               None  1.730225e+09      None  2025-10-08
Completed 514 tickers


In [3]:
import duckdb
import pandas as pd
from datetime import date

# ADD MARKET CAP TO THE ABOVE DOWNLOADED FLOAR_SHARES MARKET CAP

DB_PATH = "/Users/martingobbo/stock-dashboard/data/serving/analytics.duckdb"

# Ensure types are sane in the in-memory df
# (tolerates 'None' and strings for dt; keeps only today's rows if multiple dates)
df_norm = df.copy()
df_norm["ticker"] = df_norm["ticker"].str.upper()
df_norm["dt"] = pd.to_datetime(df_norm["dt"]).dt.date

# Coerce numeric fields (handles None, 'None', '', etc.)
for col in ["shares_outstanding", "float_shares"]:
    df_norm[col] = pd.to_numeric(df_norm[col].replace({"None": None, "": None}), errors="coerce")

target_dt = max(df_norm["dt"])
df_norm = df_norm[df_norm["dt"] == target_dt]

con = duckdb.connect(DB_PATH)
con.register("df_source", df_norm)

sql = """
WITH src AS (
  SELECT
    UPPER(ticker) AS ticker,
    CAST(dt AS DATE) AS dt,
    shares_outstanding::DOUBLE AS shares_outstanding,
    float_shares::DOUBLE AS shares_float
  FROM df_source
),
joined AS (
  SELECT
    dtk.ticker_id,
    s.dt,
    s.shares_outstanding,
    s.shares_float,
    fpd.adj_close,
    fpd.adj_close * s.shares_float AS market_cap
  FROM src s
  JOIN dim_ticker dtk ON dtk.ticker = s.ticker
  JOIN fact_price_daily fpd ON fpd.ticker_id = dtk.ticker_id AND fpd.dt = s.dt
  WHERE s.shares_float IS NOT NULL
)
MERGE INTO fact_marketcap_daily AS t
USING joined AS s
ON t.ticker_id = s.ticker_id AND t.dt = s.dt
WHEN MATCHED THEN UPDATE SET
  shares_outstanding = s.shares_outstanding,
  shares_float      = s.shares_float,
  adj_close         = s.adj_close,
  market_cap        = s.market_cap
WHEN NOT MATCHED THEN INSERT (ticker_id, dt, shares_outstanding, shares_float, adj_close, market_cap)
VALUES (s.ticker_id, s.dt, s.shares_outstanding, s.shares_float, s.adj_close, s.market_cap);
"""
con.execute(sql)


# Optional: quick sanity check for what was written today
preview = con.execute(f"""
  SELECT t.ticker_id, d.ticker, t.dt, t.shares_float, t.adj_close, t.market_cap
  FROM fact_marketcap_daily t
  JOIN dim_ticker d ON d.ticker_id = t.ticker_id
  WHERE t.dt = DATE '{target_dt}'
  ORDER BY d.ticker
""").df()

con.close()

preview


Unnamed: 0,ticker_id,ticker,dt,shares_float,adj_close,market_cap


In [3]:
# I WILL USE THE ABOVE FLOAT SHARES TO POPULATE TO DUCKDB TABLE

Unnamed: 0,ticker,shares_outstanding,float_shares,float_pct,dt
0,A,,282494000.0,,2025-10-07
1,AAPL,,14814270000.0,,2025-10-07
2,ABBV,,1762954000.0,,2025-10-07
3,ABNB,,599502800.0,,2025-10-07
4,ABT,,1730225000.0,,2025-10-07


In [7]:
#!/usr/bin/env python3
"""
GICS Sector ETFs — Daily OHLCV downloader (FMP)

- Edit API_KEY, START_DATE, END_DATE below (YYYY-MM-DD)
- Safe for FMP Starter: small concurrency, short timeouts, retries
- Returns a single pandas DataFrame `df` with columns:
  ['symbol','date','open','high','low','close','adjClose','volume']

Optionally set SAVE_CSV or SAVE_PARQUET paths to persist results.
"""

import asyncio, aiohttp, time
from typing import Dict, List, Any
import pandas as pd

# ========= USER INPUTS =========
API_KEY     = "c5PobUQjaaMTHySILWqmWi9uyIDqYJBi"     # <-- put your key here
START_DATE  = "2025-10-07"                # e.g., "2025-09-01"
END_DATE    = START_DATE                # e.g., "2025-10-07" (or same as start for single day)

# Optional outputs (set to a filepath or leave as None)
SAVE_CSV     = None                       # e.g., "/path/sector_etf_prices.csv"
SAVE_PARQUET = None                       # e.g., "/path/sector_etf_prices.parquet"

# ========= SETTINGS (Starter-safe) =========
MAX_WORKERS  = 4          # keep modest for FMP Starter
TIMEOUT_SEC  = 20
RETRIES      = 3
RETRY_BACKOFF_BASE = 1.6  # exponential backoff base

# ========= TICKERS =========
SECTOR_ETFS: Dict[str, str] = {
    "XLK":  "Technology Select Sector SPDR Fund",
    "XLF":  "Financials Select Sector SPDR Fund",
    "XLI":  "Industrials Select Sector SPDR Fund",
    "XLY":  "Consumer Discretionary Select Sector SPDR Fund",
    "XLP":  "Consumer Staples Select Sector SPDR Fund",
    "XLV":  "Health Care Select Sector SPDR Fund",
    "XLE":  "Energy Select Sector SPDR Fund",
    "XLU":  "Utilities Select Sector SPDR Fund",
    "XLB":  "Materials Select Sector SPDR Fund",
    "XLC":  "Communication Services Select Sector SPDR Fund",
    "XLRE": "Real Estate Select Sector SPDR Fund",
}

BASE = "https://financialmodelingprep.com/api/v3/historical-price-full"

# ========= HTTP HELPERS =========
sem = asyncio.Semaphore(MAX_WORKERS)

async def fetch_json(session: aiohttp.ClientSession, url: str) -> Any:
    last_err = None
    for attempt in range(1, RETRIES + 1):
        try:
            async with sem:
                async with session.get(url, timeout=TIMEOUT_SEC) as r:
                    if r.status == 200:
                        return await r.json()
                    # For rate limits / server hiccups, backoff and retry
                    if r.status in {429, 500, 502, 503, 504}:
                        last_err = RuntimeError(f"HTTP {r.status} on {url}")
                    else:
                        text = await r.text()
                        raise RuntimeError(f"HTTP {r.status} on {url}: {text[:200]}")
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            last_err = e

        # backoff before next try
        if attempt < RETRIES:
            sleep_for = (RETRY_BACKOFF_BASE ** (attempt - 1))
            await asyncio.sleep(sleep_for)
    # If we’re here, all retries failed
    raise last_err or RuntimeError(f"Failed to fetch {url}")

async def fetch_symbol(session: aiohttp.ClientSession, symbol: str, start: str, end: str) -> pd.DataFrame:
    # Note: no serietype param so we get full OHLCV, not close-only
    url = f"{BASE}/{symbol}?from={start}&to={end}&apikey={API_KEY}"
    data = await fetch_json(session, url)
    hist = data.get("historical", []) or []

    # Normalize rows
    rows = []
    for h in hist:
        rows.append({
            "symbol":   symbol,
            "date":     h.get("date"),
            "open":     h.get("open"),
            "high":     h.get("high"),
            "low":      h.get("low"),
            "close":    h.get("close"),
            "adjClose": h.get("adjClose"),
            "volume":   h.get("volume"),
        })
    if not rows:
        return pd.DataFrame(columns=["symbol","date","open","high","low","close","adjClose","volume"])
    df = pd.DataFrame(rows)
    # Types & ordering
    df["date"] = pd.to_datetime(df["date"])
    df = df.sort_values(["symbol","date"]).reset_index(drop=True)
    return df

async def main() -> pd.DataFrame:
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_symbol(session, s, START_DATE, END_DATE) for s in SECTOR_ETFS.keys()]
        dfs = await asyncio.gather(*tasks)
    df = pd.concat(dfs, ignore_index=True)
    # Drop exact dupes just in case
    df = df.drop_duplicates(subset=["symbol","date"]).reset_index(drop=True)
    # Optional saves
    if SAVE_CSV:
        df.to_csv(SAVE_CSV, index=False)
    if SAVE_PARQUET:
        df.to_parquet(SAVE_PARQUET, index=False)
    return df

if __name__ == "__main__":
    # Run the async downloader and show a quick preview
    start_t = time.time()
    df = asyncio.run(main())
    dur = time.time() - start_t
    print(f"Downloaded {df['symbol'].nunique()} ETFs, {len(df):,} rows in {dur:.1f}s "
          f"for {START_DATE} → {END_DATE}.")
    print(df.head(10))


Downloaded 11 ETFs, 11 rows in 0.3s for 2025-10-07 → 2025-10-07.
  symbol       date    open    high     low   close  adjClose    volume
0    XLK 2025-10-07  289.08  289.50  284.17  285.68    285.68   7790563
1    XLF 2025-10-07   53.88   54.05   53.52   53.77     53.77  31121468
2    XLI 2025-10-07  155.36  155.77  153.90  154.25    154.25  11197585
3    XLY 2025-10-07  239.27  239.28  235.62  235.70    235.70   7312014
4    XLP 2025-10-07   77.56   78.30   77.16   78.18     78.18  12083115
5    XLV 2025-10-07  144.12  144.66  143.28  144.23    144.23  12478099
6    XLE 2025-10-07   89.27   89.56   87.91   89.49     89.49  12646147
7    XLU 2025-10-07   90.14   90.57   89.90   90.18     90.18  16449326
8    XLB 2025-10-07   89.92   90.28   89.10   89.42     89.42   5625613
9    XLC 2025-10-07  116.65  117.10  115.96  116.06    116.06   4220322


In [11]:
# DOWNLOAD SECTOR ETF PRICES AND SAVE TO DUCKDB

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# === USER SETTINGS ==========================================================
DB_PATH      = "/Users/martingobbo/stock-dashboard/data/serving/analytics.duckdb"
FMP_API_KEY  = "c5PobUQjaaMTHySILWqmWi9uyIDqYJBi"
TICKERS      = ["XLK","XLF","XLI","XLY","XLP","XLV","XLE","XLU","XLB","XLC","XLRE"]  # <-- fixed XLU
BATCH_SIZE   = 100
SLEEP_BETWEEN_CALLS   = 0.25
SLEEP_BETWEEN_BATCHES = 3.0
MAX_RETRIES  = 5

# === IMPORTS ================================================================
import sys, time, datetime as dt
import duckdb
import requests
import pandas as pd
from typing import List, Dict, Any

API_BASE = "https://financialmodelingprep.com/api/v3/historical-price-full"

# === HELPERS ================================================================
def fmp_symbol(symbol: str) -> str:
    s = symbol.strip().upper()
    return s.replace("/", "-").replace(".", "-")

def date_range_last_5y():
    today = dt.date.today()
    start = today - dt.timedelta(days=365*5 + 3)
    return start.isoformat(), today.isoformat()

def request_with_retries(url: str, params: Dict[str, Any], max_retries: int = MAX_RETRIES):
    wait = 1.0
    for attempt in range(1, max_retries + 1):
        try:
            r = requests.get(url, params=params, timeout=30)
            if r.status_code in (429, 500, 502, 503, 504):
                ra = r.headers.get("Retry-After")
                time.sleep(int(ra) if ra and ra.isdigit() else wait)
                wait = min(wait * 2, 30)
                continue
            r.raise_for_status()
            return r.json()
        except requests.RequestException:
            if attempt == max_retries:
                raise
            time.sleep(wait)
            wait = min(wait * 2, 30)

def to_float(x):
    try:
        return float(x) if x is not None else None
    except (TypeError, ValueError):
        return None

def fetch_ohlcv_last_5y(symbol: str) -> List[Dict[str, Any]]:
    fsym = fmp_symbol(symbol)
    start, end = date_range_last_5y()
    params = {"from": start, "to": end, "apikey": FMP_API_KEY}
    url = f"{API_BASE}/{fsym}"
    data = request_with_retries(url, params)
    hist = data.get("historical") or []
    rows = []
    for h in hist:
        rows.append({
            "ticker": symbol.upper(),
            "dt": h.get("date"),  # ISO yyyy-mm-dd
            "open": to_float(h.get("open")),
            "high": to_float(h.get("high")),
            "low":  to_float(h.get("low")),
            "close": to_float(h.get("close")),
            "adj_close": to_float(h.get("adjClose", h.get("close"))),
            "volume": int(h.get("volume")) if h.get("volume") not in (None, "", "null") else None,
        })
    rows.sort(key=lambda r: r["dt"] or "")
    return rows

def chunked(lst: List[Any], n: int):
    for i in range(0, len(lst), n):
        yield lst[i:i+n]

# === DUCKDB IO ==============================================================
def load_ticker_ids(con: duckdb.DuckDBPyConnection, symbols: List[str]) -> Dict[str, int]:
    if not symbols:
        return {}
    placeholders = ",".join(["?"] * len(symbols))
    sql = f"""
        SELECT UPPER(ticker) AS ticker, ticker_id
        FROM dim_ticker
        WHERE UPPER(ticker) IN ({placeholders})
    """
    df = con.execute(sql, [s.upper() for s in symbols]).fetchdf()
    return dict(zip(df["ticker"], df["ticker_id"]))

def delete_existing_range(con: duckdb.DuckDBPyConnection, ticker_ids: List[int], dt_from: str, dt_to: str):
    if not ticker_ids:
        return
    placeholders = ",".join(["?"] * len(ticker_ids))
    sql = f"""
        DELETE FROM fact_price_daily
        WHERE ticker_id IN ({placeholders})
          AND dt BETWEEN CAST(? AS DATE) AND CAST(? AS DATE)
    """
    con.execute(sql, [*ticker_ids, dt_from, dt_to])  # positional params

def insert_price_rows(con: duckdb.DuckDBPyConnection, df: pd.DataFrame):
    if df.empty:
        return
    df = df.copy()
    df["dt"] = pd.to_datetime(df["dt"]).dt.date  # ensure DATE
    con.register("to_insert", df)
    con.execute("""
        INSERT INTO fact_price_daily (ticker_id, dt, open, high, low, close, adj_close, volume)
        SELECT ticker_id, dt, open, high, low, close, adj_close, volume
        FROM to_insert
    """)
    con.unregister("to_insert")

# === MAIN ===================================================================
def main():
    if not FMP_API_KEY or FMP_API_KEY == "YOUR_FMP_API_KEY":
        raise SystemExit("Set FMP_API_KEY first.")

    con = duckdb.connect(DB_PATH, read_only=False)

    # 1) Map tickers -> ticker_id
    ticker_map = load_ticker_ids(con, TICKERS)
    missing = sorted(set(t.upper() for t in TICKERS) - set(ticker_map.keys()))
    if missing:
        raise SystemExit(f"These tickers are missing in dim_ticker (add them first): {', '.join(missing)}")

    dt_from, dt_to = date_range_last_5y()
    print(f"Fetching daily OHLCV for {len(TICKERS)} ETFs from {dt_from} to {dt_to}")

    # 2) Idempotency: pre-delete existing rows in range
    ids = [ticker_map[t.upper()] for t in TICKERS]
    delete_existing_range(con, ids, dt_from, dt_to)

    # 3) Download in batches and insert
    total = len(TICKERS)
    done = 0
    buffer_rows: List[Dict[str, Any]] = []

    for batch_i, batch in enumerate(chunked(TICKERS, BATCH_SIZE), start=1):
        print(f"\nBatch {batch_i}: {len(batch)} tickers")
        for t in batch:
            sym = t.strip().upper()
            if not sym:
                continue
            try:
                rows = fetch_ohlcv_last_5y(sym)
                tid = ticker_map[sym]
                for r in rows:
                    r["ticker_id"] = tid
                buffer_rows.extend(rows)
                done += 1
                print(f"  [{done}/{total}] {sym}: {len(rows)} rows")
            except Exception as e:
                print(f"  [{done}/{total}] {sym}: ERROR -> {e}", file=sys.stderr)
            time.sleep(SLEEP_BETWEEN_CALLS)

        if buffer_rows:
            df = pd.DataFrame(buffer_rows, columns=[
                "ticker_id","dt","open","high","low","close","adj_close","volume"
            ])
            insert_price_rows(con, df[["ticker_id","dt","open","high","low","close","adj_close","volume"]])
            buffer_rows.clear()

        time.sleep(SLEEP_BETWEEN_BATCHES)

    # final flush (should be empty)
    if buffer_rows:
        df = pd.DataFrame(buffer_rows, columns=[
            "ticker_id","dt","open","high","low","close","adj_close","volume"
        ])
        insert_price_rows(con, df[["ticker_id","dt","open","high","low","close","adj_close","volume"]])

    # 4) Verify count
    sql = f"""
        SELECT COUNT(*) AS n_rows
        FROM fact_price_daily
        WHERE ticker_id IN ({",".join(["?"]*len(ids))})
          AND dt BETWEEN CAST(? AS DATE) AND CAST(? AS DATE)
    """
    n_rows = con.execute(sql, [*ids, dt_from, dt_to]).fetchone()[0]
    print(f"\nInserted/kept rows in range: {n_rows:,}")

    con.close()
    print("Done.")

if __name__ == "__main__":
    main()


Fetching daily OHLCV for 11 ETFs from 2020-10-05 to 2025-10-07

Batch 1: 11 tickers
  [1/11] XLK: 1258 rows
  [2/11] XLF: 1258 rows
  [3/11] XLI: 1258 rows
  [4/11] XLY: 1258 rows
  [5/11] XLP: 1258 rows
  [6/11] XLV: 1258 rows
  [7/11] XLE: 1258 rows
  [8/11] XLU: 1258 rows
  [9/11] XLB: 1258 rows
  [10/11] XLC: 1258 rows
  [11/11] XLRE: 1258 rows

Inserted/kept rows in range: 13,838
Done.


In [4]:
# ====== PORTFOLIO WEIGHTS BY SECTOR / SUBSECTOR (fixed: fact_price_daily) ======
import duckdb
import pandas as pd

# Re-use DB_PATH from above

con = duckdb.connect(DB_PATH, read_only=True)

# Detect the correct price table name
tbl_candidates = ("fact_price_daily", "fact_daily_price")
existing = {r[0] for r in con.execute(
    "SELECT table_name FROM information_schema.tables WHERE table_name IN (?,?)",
    tbl_candidates
).fetchall()}
if not existing:
    con.close()
    raise RuntimeError("Neither fact_price_daily nor fact_daily_price exists.")
PRICE_TABLE = next(iter(existing))  # pick the one that exists

sql_latest_prices = f"""
WITH latest AS (
  SELECT ticker_id, max(dt) AS dt
  FROM {PRICE_TABLE}
  GROUP BY 1
)
SELECT
  dt.ticker_id,
  dt.ticker,
  dt.name,
  dt.gics_sector,
  dt.gics_subsector,
  p.dt,
  p.adj_close
FROM (
  SELECT name, ticker, ticker_id, gics_sector, gics_subsector
  FROM dim_ticker
) dt
JOIN latest l USING (ticker_id)
JOIN {PRICE_TABLE} p
  ON p.ticker_id = l.ticker_id AND p.dt = l.dt
"""

px_df = con.execute(sql_latest_prices).df()
con.close()

# --- Float shares mapping from previous cell's DataFrame `df` ---
if "df" in globals() and isinstance(df, pd.DataFrame):
    floats_raw = df.copy()
else:
    floats_raw = pd.DataFrame(columns=["ticker", "float_shares", "shares_outstanding"])

use_cols = [c for c in ("ticker", "float_shares", "shares_outstanding") if c in floats_raw.columns]
floats = floats_raw[use_cols].copy()
for c in ("float_shares", "shares_outstanding"):
    if c in floats.columns:
        floats[c] = pd.to_numeric(floats[c], errors="coerce")

# --- Merge & market cap ---
master = px_df.merge(floats, on="ticker", how="left")
master["effective_shares"] = master.get("float_shares")
if "shares_outstanding" in master.columns:
    master["effective_shares"] = master["effective_shares"].fillna(master["shares_outstanding"])
master["market_cap"] = master["adj_close"] * master["effective_shares"]
master = master.dropna(subset=["market_cap"]).reset_index(drop=True)

def build_weights(frame: pd.DataFrame, group_col: str) -> pd.DataFrame:
    grp_cap = frame.groupby(group_col, dropna=False, as_index=False)["market_cap"].sum().rename(
        columns={"market_cap": "group_market_cap"}
    )
    out = frame.merge(grp_cap, on=group_col, how="left")
    out["weight"] = out["market_cap"] / out["group_market_cap"]
    out = out.sort_values([group_col, "weight"], ascending=[True, False])
    cols = [
        group_col, "ticker_id", "ticker", "name",
        "adj_close", "effective_shares", "market_cap",
        "group_market_cap", "weight"
    ]
    if group_col != "gics_sector" and "gics_sector" in out.columns:
        cols.insert(1, "gics_sector")
    if group_col != "gics_subsector" and "gics_subsector" in out.columns:
        cols.insert(1, "gics_subsector")
    return out[cols]

sector_weights = build_weights(master, "gics_sector")
subsector_weights = build_weights(master, "gics_subsector")

display(sector_weights.head(20))
display(subsector_weights.head(20))

sector_summaries = sector_weights.groupby("gics_sector", dropna=False, as_index=False)["group_market_cap"].first() \
                                 .sort_values("group_market_cap", ascending=False)
subsector_summaries = subsector_weights.groupby("gics_subsector", dropna=False, as_index=False)["group_market_cap"].first() \
                                       .sort_values("group_market_cap", ascending=False)

display(sector_summaries.head(20))
display(subsector_summaries.head(20))


Unnamed: 0,gics_sector,gics_subsector,ticker_id,ticker,name,adj_close,effective_shares,market_cap,group_market_cap,weight
234,Basic Materials,Chemicals - Specialty,280,LIN,Linde plc,470.37,467235500.0,219773600000.0,817181900000.0,0.268941
384,Basic Materials,Gold,332,NEM,Newmont Corporation,88.51,1095901000.0,96998230000.0,817181900000.0,0.118698
145,Basic Materials,Chemicals - Specialty,152,ECL,Ecolab Inc.,278.99,285402300.0,79624380000.0,817181900000.0,0.097438
132,Basic Materials,Chemicals - Specialty,406,SHW,The Sherwin-Williams Company,337.0,229916600.0,77481880000.0,817181900000.0,0.094816
0,Basic Materials,Chemicals - Specialty,37,APD,"Air Products and Chemicals, Inc.",270.89,221436400.0,59984910000.0,817181900000.0,0.073405
5,Basic Materials,Copper,182,FCX,Freeport-McMoRan Inc.,40.69,1427590000.0,58088650000.0,817181900000.0,0.071084
244,Basic Materials,Construction Materials,470,VMC,Vulcan Materials Company,300.9,131813700.0,39662730000.0,817181900000.0,0.048536
358,Basic Materials,Construction Materials,309,MLM,"Martin Marietta Materials, Inc.",636.11,59842250.0,38066250000.0,817181900000.0,0.046582
3,Basic Materials,Chemicals - Specialty,128,DD,"DuPont de Nemours, Inc.",78.88,417372600.0,32922350000.0,817181900000.0,0.040288
100,Basic Materials,Chemicals - Specialty,379,PPG,"PPG Industries, Inc.",101.7,224950700.0,22877480000.0,817181900000.0,0.027996


Unnamed: 0,gics_subsector,gics_sector,ticker_id,ticker,name,adj_close,effective_shares,market_cap,group_market_cap,weight
306,Advertising Agencies,Communication Services,245,IPG,"The Interpublic Group of Companies, Inc.",27.24,364215300.0,9921225000.0,9921225000.0,1.0
302,Aerospace & Defense,Industrials,199,GE,GE Aerospace,301.74,1056622000.0,318825100000.0,1088970000000.0,0.292777
54,Aerospace & Defense,Industrials,401,RTX,RTX Corporation,169.27,1223374000.0,207080500000.0,1088970000000.0,0.190162
111,Aerospace & Defense,Industrials,50,BA,The Boeing Company,221.82,724308300.0,160666100000.0,1088970000000.0,0.14754
74,Aerospace & Defense,Industrials,283,LMT,Lockheed Martin Corporation,511.07,233291500.0,119228300000.0,1088970000000.0,0.109487
122,Aerospace & Defense,Industrials,197,GD,General Dynamics Corporation,343.43,251748400.0,86457960000.0,1088970000000.0,0.079394
353,Aerospace & Defense,Industrials,430,TDG,TransDigm Group Incorporated,1284.38,56087130.0,72037190000.0,1088970000000.0,0.066152
73,Aerospace & Defense,Industrials,278,LHX,"L3Harris Technologies, Inc.",301.43,186288400.0,56152920000.0,1088970000000.0,0.051565
32,Aerospace & Defense,Industrials,47,AXON,"Axon Enterprise, Inc.",714.6,74817870.0,53464850000.0,1088970000000.0,0.049097
164,Aerospace & Defense,Industrials,454,TXT,Textron Inc.,85.03,177075500.0,15056730000.0,1088970000000.0,0.013827


Unnamed: 0,gics_sector,group_market_cap
9,Technology,13806700000000.0
1,Communication Services,7780416000000.0
5,Financial Services,5585706000000.0
2,Consumer Cyclical,5085236000000.0
6,Healthcare,4840871000000.0
7,Industrials,3309809000000.0
3,Consumer Defensive,1828586000000.0
10,Utilities,1190462000000.0
4,Energy,890875200000.0
8,Real Estate,839486500000.0


Unnamed: 0,gics_subsector,group_market_cap
62,Internet Content & Information,6999472000000.0
102,Software - Infrastructure,4699201000000.0
29,Consumer Electronics,3835258000000.0
100,Semiconductors,3327202000000.0
33,Drug Manufacturers - General,2406938000000.0
105,Specialty Retail,2317136000000.0
39,Financial - Credit Services,1465119000000.0
14,Banks - Diversified,1392193000000.0
12,Auto - Manufacturers,1280487000000.0
1,Aerospace & Defense,1088970000000.0


In [9]:
# ====== AGG TABLES: SECTOR TOTALS and SECTOR×SUBSECTOR TOTALS ======
import pandas as pd

# We assume `master` exists from the previous cell and has columns:
# ["gics_sector","gics_subsector","market_cap"]

# 1) Eleven sectors with total market cap (two columns)
sector_totals = (
    master.groupby("gics_sector", dropna=False, as_index=False)["market_cap"]
          .sum()
          .rename(columns={"market_cap": "total_market_cap"})
          .sort_values("total_market_cap", ascending=False)
    # optional: keep only the top 11 if you have extras/noise
    # .head(11)
)

display(sector_totals)

# 2) Sector × Subsector totals, ordered by sector (A→Z) then total cap (desc)
sector_subsector_totals = (
    master.groupby(["gics_sector", "gics_subsector"], dropna=False, as_index=False)["market_cap"]
          .sum()
          .rename(columns={"market_cap": "total_market_cap"})
          .sort_values(["gics_sector", "total_market_cap"], ascending=[True, False])
)

display(sector_subsector_totals)

# OPTIONAL: save to disk
# sector_totals.to_parquet("sector_totals.parquet", index=False)
# sector_subsector_totals.to_parquet("sector_subsector_totals.parquet", index=False)


Unnamed: 0,gics_sector,total_market_cap
9,Technology,19887020000000.0
1,Communication Services,8542241000000.0
5,Financial Services,5658439000000.0
2,Consumer Cyclical,5239337000000.0
6,Healthcare,4089869000000.0
7,Industrials,3414570000000.0
3,Consumer Defensive,2402576000000.0
4,Energy,1499526000000.0
10,Utilities,1166088000000.0
8,Real Estate,846759000000.0


Unnamed: 0,gics_sector,gics_subsector,total_market_cap
2,Basic Materials,Chemicals - Specialty,1.435190e+11
5,Basic Materials,Gold,9.699823e+10
3,Basic Materials,Construction Materials,7.822592e+10
0,Basic Materials,Agricultural Inputs,6.948176e+10
4,Basic Materials,Copper,5.756044e+10
...,...,...,...
108,Utilities,Independent Power Producers,9.494930e+10
110,Utilities,Regulated Gas,4.841579e+10
111,Utilities,Regulated Water,2.704411e+10
107,Utilities,General Utilities,2.562551e+10


In [10]:
# ====== SUBSECTOR PORTFOLIO: 60-DAY RETURN (uses dim_metric/fact_metric_daily if possible) ======
import duckdb
import pandas as pd

# Prereqs from previous cells:
# - DB_PATH
# - subsector_weights (with columns: gics_subsector, ticker_id, weight)
# - PRICE_TABLE (detected earlier) OR we will detect again if missing
# - master (optional; only used to sanity-check membership)

con = duckdb.connect(DB_PATH, read_only=True)

# --- Detect presence of ticker_id in fact_metric_daily ---
cols = con.execute("""
  SELECT column_name
  FROM information_schema.columns
  WHERE table_name = 'fact_metric_daily'
""").fetchall()
colset = {c[0].lower() for c in cols}

# --- Get the metric_id for 60_day_ret ---
metric_row = con.execute("""
  SELECT metric_id
  FROM dim_metric
  WHERE metric_code = '60_day_ret'
  LIMIT 1
""").fetchone()
metric_id_60 = metric_row[0] if metric_row else None

def compute_ret_from_metric():
    # Latest available 60d return per ticker_id using fact_metric_daily
    return con.execute(f"""
        WITH latest AS (
          SELECT
            ticker_id,
            value AS ret_60d,
            ROW_NUMBER() OVER (PARTITION BY ticker_id ORDER BY dt DESC) AS rn
          FROM fact_metric_daily
          WHERE metric_id = {metric_id_60}
        )
        SELECT ticker_id, ret_60d
        FROM latest
        WHERE rn = 1
    """).df()

def detect_price_table():
    existing = {
        r[0] for r in con.execute(
            "SELECT table_name FROM information_schema.tables WHERE table_name IN ('fact_price_daily','fact_daily_price')"
        ).fetchall()
    }
    if not existing:
        raise RuntimeError("Could not find a price table (fact_price_daily or fact_daily_price) for fallback.")
    return next(iter(existing))

def compute_ret_from_prices(price_table: str):
    # Compute 60-trading-day return from prices; take the latest row per ticker_id
    return con.execute(f"""
        WITH with_lag AS (
          SELECT
            ticker_id,
            dt,
            adj_close,
            LAG(adj_close, 60) OVER (PARTITION BY ticker_id ORDER BY dt) AS adj_close_lag60
          FROM {price_table}
        ),
        latest AS (
          SELECT
            ticker_id,
            dt,
            adj_close,
            adj_close_lag60,
            ROW_NUMBER() OVER (PARTITION BY ticker_id ORDER BY dt DESC) AS rn
          FROM with_lag
        )
        SELECT
          ticker_id,
          CASE
            WHEN adj_close_lag60 IS NULL OR adj_close_lag60 = 0 THEN NULL
            ELSE (adj_close / adj_close_lag60) - 1
          END AS ret_60d
        FROM latest
        WHERE rn = 1
    """).df()

# --- Choose source for 60d return ---
if metric_id_60 is not None and "ticker_id" in colset:
    ret_df = compute_ret_from_metric()
else:
    price_table = globals().get("PRICE_TABLE", None)
    if not price_table:
        price_table = detect_price_table()
    ret_df = compute_ret_from_prices(price_table)

con.close()

# --- Merge returns into subsector weights ---
need_cols = ["gics_subsector", "ticker_id", "weight"]
assert set(need_cols).issubset(subsector_weights.columns), "subsector_weights is missing required columns."

sw = subsector_weights[need_cols].copy()
ret_map = ret_df.rename(columns={"ret_60d": "ret_60d"})
merged = sw.merge(ret_map, on="ticker_id", how="left")

# Handle partial coverage: re-normalize weights on available returns per subsector
agg = (
    merged.dropna(subset=["ret_60d"])
          .groupby("gics_subsector", dropna=False)
          .apply(lambda g: pd.Series({
              "coverage_weight": g["weight"].sum(),
              "tickers_used": g["ticker_id"].nunique(),
              "subsector_60d_return": (g["weight"] * g["ret_60d"]).sum() / g["weight"].sum() if g["weight"].sum() > 0 else None
          }))
          .reset_index()
)

# Optional: include subsectors with zero coverage (no 60d return available)
all_subs = sw[["gics_subsector"]].drop_duplicates()
out = all_subs.merge(agg, on="gics_subsector", how="left") \
              .sort_values(["gics_subsector"], ascending=True)

# Display result
display(out[["gics_subsector", "subsector_60d_return", "coverage_weight", "tickers_used"]])

# If you prefer a minimal two-column view:
subsector_60d_table = out[["gics_subsector", "subsector_60d_return"]] \
    .sort_values("gics_subsector")
display(subsector_60d_table)

# OPTIONAL: save
# out.to_parquet("subsector_portfolio_60d_returns.parquet", index=False)


  .apply(lambda g: pd.Series({


Unnamed: 0,gics_subsector,subsector_60d_return,coverage_weight,tickers_used
0,Advertising Agencies,0.087693,1.0,2.0
1,Aerospace & Defense,0.081526,1.0,8.0
2,Agricultural - Machinery,0.083201,1.0,3.0
3,Agricultural Farm Products,0.089882,1.0,3.0
4,Agricultural Inputs,-0.104664,1.0,3.0
...,...,...,...,...
108,Tobacco,-0.077831,1.0,2.0
109,Travel Lodging,-0.063221,1.0,1.0
110,Travel Services,-0.063426,1.0,4.0
111,Trucking,-0.158921,1.0,1.0


Unnamed: 0,gics_subsector,subsector_60d_return
0,Advertising Agencies,0.087693
1,Aerospace & Defense,0.081526
2,Agricultural - Machinery,0.083201
3,Agricultural Farm Products,0.089882
4,Agricultural Inputs,-0.104664
...,...,...
108,Tobacco,-0.077831
109,Travel Lodging,-0.063221
110,Travel Services,-0.063426
111,Trucking,-0.158921


In [11]:
import json
from pathlib import Path
from datetime import datetime
import pandas as pd

# === CONFIG: edit if your project path differs ===
PROJECT = Path("/Users/martingobbo/stock-dashboard")
HIGHLIGHTS_PATH = PROJECT / "public" / "data" / "fundamentals_highlights.json"

# ---------- Helpers ----------
def parse_date_safe(s):
    try:
        return datetime.fromisoformat(s).timestamp()
    except Exception:
        return float("nan")

def sort_by_date_asc(rows):
    rows = [r for r in (rows or []) if r and r.get("date")]
    return sorted(rows, key=lambda r: parse_date_safe(r["date"]))

def latest_fy(rec):
    fy = sort_by_date_asc(rec.get("FY", []))
    return fy[-1] if fy else None

def latest_q(rec):
    q = sort_by_date_asc(rec.get("Q", []))
    return q[-1] if q else None

def get_number(obj, key_candidates=("revenue", "Revenue", "totalRevenue")):
    """Try several possible keys, return numeric or None."""
    for k in key_candidates:
        if k in obj and obj[k] is not None:
            try:
                return float(obj[k])
            except Exception:
                pass
    return None

# ---------- Load & Build ----------
with open(HIGHLIGHTS_PATH, "r", encoding="utf-8") as f:
    bundle = json.load(f)  # [{ symbol, FY: [...], Q: [...] }, ...]

rows = []
for rec in bundle:
    symbol = rec.get("symbol") or rec.get("ticker") or rec.get("Symbol")  # be tolerant
    if not symbol:
        continue

    fy_latest = latest_fy(rec)
    q_latest  = latest_q(rec)

    fy_rev = get_number(fy_latest) if fy_latest else None
    q_rev  = get_number(q_latest)  if q_latest  else None

    rows.append({
        "ticker": symbol,
        "annual_revenue": fy_rev,
        "recent_quarter_revenue": q_rev,
    })

# DataFrame named `revenue`
revenue = pd.DataFrame(rows).sort_values("ticker").reset_index(drop=True)

# Optional: show a few rows
revenue.head(20)


Unnamed: 0,ticker,annual_revenue,recent_quarter_revenue
0,A,6510000000.0,1738000000.0
1,AAPL,391035000000.0,94036000000.0
2,ABBV,56334000000.0,15423000000.0
3,ABNB,11102000000.0,3096000000.0
4,ABT,41950000000.0,11142000000.0
5,ACGL,16930000000.0,5213000000.0
6,ACN,69672980000.0,17596260000.0
7,ADBE,21505000000.0,5988000000.0
8,ADI,9427157000.0,2880348000.0
9,ADM,85530000000.0,21188000000.0


In [3]:
pip install aiohttp async-timeout


Collecting async-timeout
  Downloading async_timeout-5.0.1-py3-none-any.whl.metadata (5.1 kB)
Downloading async_timeout-5.0.1-py3-none-any.whl (6.2 kB)
Installing collected packages: async-timeout
Successfully installed async-timeout-5.0.1
Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install nest_asyncio


Note: you may need to restart the kernel to use updated packages.


In [6]:
print(df)

None
