In [1]:
#!/usr/bin/env python3
import asyncio, aiohttp, math, sys
from datetime import datetime, timedelta, timezone, date
import pandas as pd
import duckdb

# ==================== USER SETTINGS ====================
DB_PATH     = "/Users/martingobbo/stock-dashboard/data/serving/analytics.duckdb"
API_KEY     = "c5PobUQjaaMTHySILWqmWi9uyIDqYJBi"
BASE        = "https://financialmodelingprep.com"
# Concurrency/pacing tuned for Starter (~300 req/min). One request per symbol.
MAX_WORKERS = 6
TIMEOUT_SEC = 20
RETRIES     = 3
BATCH_SLEEP_SEC = 5.0   # small pause between batches to smooth bursts
BATCH_SIZE  = 50        # symbols per batch (<= MAX_WORKERS * ~8 is fine)

# ==================== PREP TICKERS (Stocks only) ====================
con = duckdb.connect(DB_PATH, read_only=True)
tickers = [t[0] for t in con.execute("""
  SELECT ticker
  FROM dim_ticker
  WHERE UPPER(COALESCE(ticker_type,'')) = 'STOCK'
""").fetchall()]
con.close()

if not tickers:
    print("No tickers with ticker_type='Stock' found.", file=sys.stderr)
    sys.exit(0)

# ==================== DATE WINDOW ====================
today_utc = datetime.now(timezone.utc).date()
from_dt   = today_utc - timedelta(days=365*3 + 5)   # pad a few days
from_iso  = from_dt.isoformat()

# ==================== HTTP HELPERS ====================
sem = asyncio.Semaphore(MAX_WORKERS)

async def fetch_json(session: aiohttp.ClientSession, url: str):
    for attempt in range(RETRIES):
        try:
            async with sem:
                async with session.get(url, timeout=TIMEOUT_SEC) as r:
                    if r.status == 429:
                        # brief backoff; progressive
                        await asyncio.sleep(0.6 * (attempt + 1))
                        continue
                    if r.status == 200:
                        return await r.json()
                    # soft-fail for non-200
                    return None
        except Exception:
            await asyncio.sleep(0.5 * (attempt + 1))
    return None

def parse_float_point(d: dict):
    """Robustly extract a float-shares value from a historical row."""
    for k in ("floatShares", "sharesFloat", "freeFloat", "float"):
        v = d.get(k)
        if v is not None:
            try:
                v = float(v)
                if v > 0:
                    return v
            except Exception:
                pass
    return None

# ==================== PER-SYMBOL WORK ====================
async def fetch_float_history(session: aiohttp.ClientSession, sym: str):
    """
    Pull ~3y historical float shares for a single symbol.
    Endpoint (historical): /api/v4/historical/shares_float?symbol=SYMB
    """
    url = f"{BASE}/api/v4/historical/shares_float?symbol={sym}&apikey={API_KEY}"
    payload = await fetch_json(session, url)
    rows = []
    if isinstance(payload, list) and payload:
        for rec in payload:
            dt_str = rec.get("date")
            if not dt_str:
                continue
            try:
                dt = datetime.fromisoformat(dt_str[:10]).date()
            except Exception:
                continue
            if dt < from_dt:
                continue
            fs = parse_float_point(rec)
            if fs is None:
                continue
            rows.append({"ticker": sym, "dt": dt, "shares_float": fs})
    return rows

# ==================== MAIN ASYNC ====================
async def main() -> pd.DataFrame:
    conn = aiohttp.TCPConnector(limit=None, ssl=False)
    all_rows = []
    async with aiohttp.ClientSession(connector=conn) as session:
        for i in range(0, len(tickers), BATCH_SIZE):
            batch = tickers[i:i+BATCH_SIZE]
            batch_rows_list = await asyncio.gather(*[fetch_float_history(session, t) for t in batch])
            for br in batch_rows_list:
                if br:
                    all_rows.extend(br)
            # gentle pacing between batches to stay well under ~300/min budget
            await asyncio.sleep(BATCH_SLEEP_SEC)
    df = pd.DataFrame(all_rows, columns=["ticker","dt","shares_float"])
    if not df.empty:
        df["ticker"] = df["ticker"].str.upper()
        df["dt"] = pd.to_datetime(df["dt"]).dt.date
        # De-dup in case endpoint returns dup dates
        df = df.drop_duplicates(subset=["ticker","dt"], keep="last")
    print(f"Float-shares history rows: {len(df)} for {df['ticker'].nunique() if not df.empty else 0} tickers")
    return df

# ==================== RUN ====================
if __name__ == "__main__":
    try:
        import nest_asyncio
        nest_asyncio.apply()
        loop = asyncio.get_running_loop()
        df_hist = loop.run_until_complete(main())
    except RuntimeError:
        df_hist = asyncio.run(main())

    if df_hist.empty:
        print("No historical float-shares rows retrieved.")
        sys.exit(0)

    # ==================== UPSERT INTO DUCKDB ====================
    # We compute free-float market cap per day when a matching price exists.
    con = duckdb.connect(DB_PATH)
    con.register("df_hist", df_hist)

    # 1) Create a temp prepared set joined to dim_ticker & price
    #    Note: if fact_price_daily lacks a row for a given date (holiday/weekend),
    #    that row won’t produce market_cap; we still store shares_float and adj_close (NULL).
    sql_merge = """
    WITH src AS (
      SELECT
        UPPER(ticker) AS ticker,
        CAST(dt AS DATE) AS dt,
        CAST(shares_float AS DOUBLE) AS shares_float
      FROM df_hist
    ),
    j AS (
      SELECT
        dtk.ticker_id,
        s.dt,
        /* Keep shares_outstanding NULL in this loader (we're loading float history).
           You can augment later with a separate historical shares outstanding job if desired. */
        NULL::DOUBLE AS shares_outstanding,
        s.shares_float,
        fpd.adj_close,
        CASE WHEN fpd.adj_close IS NOT NULL THEN fpd.adj_close * s.shares_float ELSE NULL END AS market_cap
      FROM src s
      JOIN dim_ticker dtk ON dtk.ticker = s.ticker
      LEFT JOIN fact_price_daily fpd
        ON fpd.ticker_id = dtk.ticker_id
       AND fpd.dt = s.dt
    )
    MERGE INTO fact_marketcap_daily AS t
    USING j AS s
    ON t.ticker_id = s.ticker_id AND t.dt = s.dt
    WHEN MATCHED THEN UPDATE SET
      shares_outstanding = COALESCE(s.shares_outstanding, t.shares_outstanding),
      shares_float      = s.shares_float,
      adj_close         = COALESCE(s.adj_close, t.adj_close),
      market_cap        = COALESCE(s.market_cap, t.market_cap)
    WHEN NOT MATCHED THEN INSERT (ticker_id, dt, shares_outstanding, shares_float, adj_close, market_cap)
    VALUES (s.ticker_id, s.dt, s.shares_outstanding, s.shares_float, s.adj_close, s.market_cap);
    """
    con.execute(sql_merge)

    # 2) Optional: quick preview of most recent few days written
    preview = con.execute("""
      SELECT d.ticker, t.dt, t.shares_float, t.adj_close, t.market_cap
      FROM fact_marketcap_daily t
      JOIN dim_ticker d ON d.ticker_id = t.ticker_id
      WHERE t.dt >= CURRENT_DATE - INTERVAL 7 DAY
        AND d.ticker IN (SELECT DISTINCT ticker FROM df_hist)
      ORDER BY d.ticker, t.dt DESC
      LIMIT 200
    """).df()

    con.close()

    # Print a tiny sample to console
    print(preview.head(10).to_string(index=False))


Float-shares history rows: 510454 for 492 tickers
ticker         dt  shares_float  adj_close   market_cap
     A 2025-10-08     282494000        NaN          NaN
     A 2025-10-07     282494000     138.56 3.914237e+10
     A 2025-10-06     282494000     141.61 4.000398e+10
     A 2025-10-05     282494000        NaN          NaN
     A 2025-10-04     282494000        NaN          NaN
     A 2025-10-03     282494000     141.64 4.001245e+10
     A 2025-10-02     282494000     138.70 3.918192e+10
     A 2025-10-01     282494000     138.58 3.914802e+10
  AAPL 2025-10-08   14814270914        NaN          NaN
  AAPL 2025-10-07   14814270914     256.48 3.799564e+12


In [2]:
import duckdb
import pandas as pd

DB_PATH = "/Users/martingobbo/stock-dashboard/data/serving/analytics.duckdb"

con = duckdb.connect(DB_PATH, read_only=True)

# --- First 10 rows (oldest) ---
first10 = con.execute("""
    SELECT *
    FROM fact_marketcap_daily
    ORDER BY dt ASC
    LIMIT 10
""").df()

# --- Last 10 rows (most recent) ---
last10 = con.execute("""
    SELECT *
    FROM fact_marketcap_daily
    ORDER BY dt DESC
    LIMIT 10
""").df()

con.close()

print("=== FIRST 10 ROWS (oldest) ===")
print(first10.to_string(index=False))
print("\n=== LAST 10 ROWS (most recent) ===")
print(last10.to_string(index=False))


=== FIRST 10 ROWS (oldest) ===
 ticker_id         dt  shares_outstanding  shares_float  adj_close   market_cap
       247 2022-10-04                <NA>     402130770      46.92 1.886798e+10
       377 2022-10-04                <NA>      69048256     244.27 1.686642e+10
       241 2022-10-04                <NA>    4101935060      26.35 1.080860e+11
       236 2022-10-04                <NA>     555249883      90.51 5.025567e+10
       239 2022-10-04                <NA>     229383541      88.46 2.029127e+10
       244 2022-10-04                <NA>     360398632      29.00 1.045156e+10
       245 2022-10-04                <NA>     389499326      23.99 9.344089e+09
       242 2022-10-04                <NA>     273923967     403.90 1.106379e+11
       410 2022-10-04                <NA>      52346888     198.23 1.037672e+10
       273 2022-10-04                <NA>    1698036534        NaN          NaN

=== LAST 10 ROWS (most recent) ===
 ticker_id         dt  shares_outstanding  shares_flo