In [2]:
# Jupyter-friendly: download "max" daily history for a messy ticker list and save per-ticker files.
# pip install yfinance pandas pyarrow

import re
import time
from pathlib import Path

import pandas as pd
import yfinance as yf


# Raw ticker listing, kept exactly as pasted. Entries look like "EXCHANGE:SYMBOL"
# and are separated by commas and/or line breaks; spacing is inconsistent, some
# entries have no trailing comma, and many symbols are repeated.
# parse_tickers() reduces this text to unique bare symbols.
RAW_TICKERS = r"""
NASDAQ:GOOGL,
NASDAQ:AMZN,
NASDAQ:AMGN,
NASDAQ:AAPL,
NASDAQ:AMAT,
NASDAQ:ASML,
NASDAQ:BIIB,
NASDAQ:AVGO,
NASDAQ:CSCO,
NASDAQ:CSX,
NASDAQ:GILD,
NASDAQ:INTC,
NASDAQ:ISRG
NASDAQ:META,
NASDAQ:MU,
NASDAQ:MSFT,
NASDAQ:NDAQ,
NASDAQ:NVDA,
NASDAQ:PLTR,
NASDAQ:PEP,
NASDAQ:QCOM,
 NASDAQ: PEP 
 NASDAQ: CSCO 
 NASDAQ:NVDA, 
 NASDAQ: ADP 
 NASDAQ: PYPL 
 NASDAQ: AAPL 
 NASDAQ:TXN 
 NASDAQ:QCOM, 
 NASDAQ: AMD 
 NASDAQ: ADBE 
 NASDAQ: MDLZ 
 NASDAQ:CHTR, 
 NASDAQ:AMD, 
 NASDAQ:AMAT, 
 NASDAQ: INTC 
 NASDAQ: CSX 
 NASDAQ: SBUX 
 NASDAQ:CMCSA 
 NASDAQ: NDAQ 
 NASDAQ:ADBE, 
 NASDAQ:AVGO, 
 NASDAQ:AMD, 
 NASDAQ:ADBE, 
 NASDAQ:PEP, 
 NASDAQ:AMD, 
 NASDAQ:AMAT, 
 NASDAQ: MDLZ 
 NASDAQ:QCOM, 
 NASDAQ:AMGN, 
 NASDAQ:CMCSA, 
 NASDAQ:CSX, 
 NASDAQ: SBUX 
 NASDAQ:TXN, 
 NASDAQ:AMAT, 
 NASDAQ:MSFT, 
 NASDAQ:TXN, 
 NASDAQ:BIDU, 
 NASDAQ:PYPL, 
 NASDAQ:QCOM, 
 NASDAQ:AVGO, 
 NASDAQ:NVDA, 
 NASDAQ:TXN, 
 NASDAQ:META, 
 NASDAQ:AAPL, 
 NASDAQ:BIDU, 
 NASDAQ:SBUX, 
 NYSE:FI, 
NASDAQ:MDLZ,
NASDAQ:PYPL,
 NASDAQ:CHTR, 
NASDAQ:TXN,
NASDAQ:AMAT,
NASDAQ:MDLZ,
NASDAQ:SBUX,
NASDAQ:CSX,
NASDAQ:PYPL,
NASDAQ:TXN,
NASDAQ:ADP,
NASDAQ:COST,
NASDAQ:TMUS,
NASDAQ:META,
NASDAQ:PEP,
NASDAQ:AMD,
NASDAQ:BIDU,
NASDAQ:NFLX,
NASDAQ:PLTR,
"""


def parse_tickers(raw: str) -> list[str]:
    """Extract a de-duplicated list of ticker symbols from free-form text.

    Entries are separated by commas and/or line breaks. An entry may carry an
    exchange prefix such as ``NASDAQ:`` or ``NYSE:``; everything up to and
    including the first ``:`` is discarded. All whitespace inside the symbol
    is removed and the symbol is upper-cased, so ``nasdaq: aapl`` and
    ``NASDAQ:AAPL`` collapse into the single entry ``AAPL``.

    Args:
        raw: Text containing the ticker entries.

    Returns:
        The unique symbols, in order of first appearance. Entries that are
        empty after cleaning (blank lines, a bare ``NASDAQ:``) are skipped.
    """
    symbols = []
    for entry in re.split(r"[,\n\r]+", raw):
        # Keep only what follows the first ":" when a prefix is present.
        _, colon, tail = entry.partition(":")
        symbol = tail if colon else entry
        # Upper-casing makes case variants of one symbol compare equal, so
        # they cannot slip through the de-duplication below as separate items.
        symbol = re.sub(r"\s+", "", symbol).upper()
        if symbol:
            symbols.append(symbol)
    # dict preserves insertion order, giving an order-stable de-dup.
    return list(dict.fromkeys(symbols))


# Clean the pasted listing once; the download loop works from this list.
tickers = parse_tickers(RAW_TICKERS)
print(f"Tickers ({len(tickers)}): {tickers}")

# Request pacing: symbols per download call and the pause between calls.
BATCH_SIZE = 15
SLEEP_BETWEEN_BATCHES_SEC = 1.0

# Destination for the per-ticker files and the combined file.
# NOTE: this path is relative, so it resolves against the current working
# directory; started from inside a folder that is itself named "books" the
# files end up under books/books/data/raw_yf.
OUT_DIR = Path("books", "data", "raw_yf")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# One frame per successfully downloaded ticker, gathered for the combined file.
all_frames = []

def _store_ticker_frame(frame, ticker, out_dir, collected):
    """Tag one ticker's history, queue it for the combined file and save it.

    Args:
        frame: Price rows for a single ticker, indexed by date. Rows in which
            every value is NaN are dropped before anything is stored.
        ticker: Symbol written into the ``Ticker`` column and used as the
            file stem.
        out_dir: Directory receiving ``<ticker>.parquet``, or
            ``<ticker>.csv`` when the Parquet write fails.
        collected: List that receives the frame with the date moved from the
            index into a regular ``Date`` column.
    """
    frame = frame.dropna(how="all")
    if frame.empty:
        print(f"  - empty: {ticker}")
        return
    # rename_axis/assign return new frames, so the batch frame this slice was
    # taken from is never modified (no shared index renamed, no
    # assignment-on-a-slice).
    frame = frame.rename_axis("Date").assign(Ticker=ticker)
    collected.append(frame.reset_index())
    try:
        frame.to_parquet(out_dir / f"{ticker}.parquet")
    except Exception as exc:
        # Deliberate best effort: any Parquet failure falls back to CSV, but
        # the reason is reported instead of being swallowed.
        print(f"  - parquet failed for {ticker} ({exc!r}); writing CSV instead")
        frame.to_csv(out_dir / f"{ticker}.csv")


# Download the full daily history batch by batch and store each ticker.
total_batches = (len(tickers) - 1) // BATCH_SIZE + 1

for i in range(0, len(tickers), BATCH_SIZE):
    batch = tickers[i : i + BATCH_SIZE]
    batch_no = i // BATCH_SIZE + 1
    print(f"\nDownloading batch {batch_no} / {total_batches}: {batch}")

    df = yf.download(
        tickers=batch,
        period="max",
        interval="1d",
        group_by="ticker",
        auto_adjust=False,
        threads=True,
        progress=True,
    )

    # Two result layouts are handled:
    # - MultiIndex columns: the first level is matched against the symbols
    # - flat columns: treated as the data of the batch's only ticker
    if isinstance(df.columns, pd.MultiIndex):
        returned = set(df.columns.get_level_values(0))
        for t in batch:
            if t not in returned:
                print(f"  - missing: {t}")
                continue
            _store_ticker_frame(df[t], t, OUT_DIR, all_frames)
    else:
        _store_ticker_frame(df, batch[0], OUT_DIR, all_frames)

    # Pause only between batches; nothing follows the last one.
    if batch_no < total_batches:
        time.sleep(SLEEP_BETWEEN_BATCHES_SEC)

# Combined file (handy for analysis): all tickers stacked into one long table,
# ordered by ticker and then by date.
if all_frames:
    combined = pd.concat(all_frames, ignore_index=True).sort_values(["Ticker", "Date"])
    try:
        combined.to_parquet(OUT_DIR / "_ALL.parquet", index=False)
    except Exception as exc:
        # Deliberate best effort: fall back to CSV, but say why Parquet failed.
        print(f"Parquet export of combined data failed ({exc!r}); writing CSV instead")
        combined.to_csv(OUT_DIR / "_ALL.csv", index=False)
else:
    # Make an empty run visible instead of printing only "Done".
    print("No data was downloaded; combined file not written.")

print(f"\nDone. Files in: {OUT_DIR.resolve()}")


Tickers (35): ['GOOGL', 'AMZN', 'AMGN', 'AAPL', 'AMAT', 'ASML', 'BIIB', 'AVGO', 'CSCO', 'CSX', 'GILD', 'INTC', 'ISRG', 'META', 'MU', 'MSFT', 'NDAQ', 'NVDA', 'PLTR', 'PEP', 'QCOM', 'ADP', 'PYPL', 'TXN', 'AMD', 'ADBE', 'MDLZ', 'CHTR', 'SBUX', 'CMCSA', 'BIDU', 'FI', 'COST', 'TMUS', 'NFLX']

Downloading batch 1 / 3: ['GOOGL', 'AMZN', 'AMGN', 'AAPL', 'AMAT', 'ASML', 'BIIB', 'AVGO', 'CSCO', 'CSX', 'GILD', 'INTC', 'ISRG', 'META', 'MU']


[*********************100%***********************]  15 of 15 completed
[                       0%                       ]


Downloading batch 2 / 3: ['MSFT', 'NDAQ', 'NVDA', 'PLTR', 'PEP', 'QCOM', 'ADP', 'PYPL', 'TXN', 'AMD', 'ADBE', 'MDLZ', 'CHTR', 'SBUX', 'CMCSA']


[*********************100%***********************]  15 of 15 completed



Downloading batch 3 / 3: ['BIDU', 'FI', 'COST', 'TMUS', 'NFLX']


[*********************100%***********************]  5 of 5 completed



Done. Files in: C:\Users\MaartenEnde\Repos\scanner\books\books\data\raw_yf
