In [1]:
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor
import pandas as pd, json, glob, tqdm, os

In [2]:
SRC = Path(os.path.expanduser("~/data/polygonio_data/day_aggs_v1"))           # root of raw csv.gz tree
CHUNK = 2_000_000                      # rows per pandas chunk
N_WORKERS = max(1, os.cpu_count() - 2)  # leave a couple cores free

In [3]:
def tickers_from_file(csv_gz: str) -> set[str]:
    """Return unique tickers in a single .csv.gz file (stream-read)."""
    tickers = set()
    for chunk in pd.read_csv(
            csv_gz,
            usecols=["ticker"],
            dtype={"ticker": "string"},
            compression="gzip",
            chunksize=CHUNK):
        tickers.update(chunk["ticker"].unique())
    return tickers

In [4]:
files = sorted(glob.glob(str(SRC / "**/*.csv.gz"), recursive=True))
global_set: set[str] = set()

with ProcessPoolExecutor(max_workers=N_WORKERS) as pool:
    for local_set in tqdm.tqdm(pool.map(tickers_from_file, files),
                                total=len(files),
                                desc="scanning"):
        global_set.update(local_set)

clean_tickers = [t for t in global_set if isinstance(t, str) and t]
bad = [t for t in global_set if not (isinstance(t, str) and t)]
if bad:
    print(f"⚠️  skipped {len(bad)} non-string tickers (e.g. {bad[:3]})")

# Path("ticker_catalog2.json").write_text(json.dumps(sorted(clean_tickers)))
print("unique tickers:", len(global_set))

scanning: 100%|██████████| 5517/5517 [00:07<00:00, 698.01it/s]


⚠️  skipped 1 non-string tickers (e.g. [<NA>])
unique tickers: 33833


In [5]:
from pathlib import Path
import json, gzip, pickle
import pandas as pd

def save_tickers(tickers: set[str], name: str, out_dir="ticker_lists"):
    """
    Save a set of tickers to disk in TXT, CSV, JSON and compressed Pickle.
    
    Parameters
    ----------
    tickers : set[str]
        Unique tickers (case-sensitive).
    name : str
        Base filename (e.g. "sp500", "nasdaq100").
    out_dir : str | Path
        Folder where files will be written.
    """
    out = Path(out_dir)
    out.mkdir(exist_ok=True)

    # 1 — Plain-text, one ticker per line
    (out / f"{name}.txt").write_text("\n".join(sorted(tickers)))

    # 2 — CSV (no header, no index)
    pd.Series(sorted(tickers)).to_csv(out / f"{name}.csv",
                                      index=False, header=False)

    # 3 — JSON list
    json.dump(sorted(tickers), open(out / f"{name}.json", "w"))

    # 4 — Compressed pickle (fastest for Python re-load)
    with gzip.open(out / f"{name}.pkl.gz", "wb") as f:
        pickle.dump(tickers, f, protocol=4)

    print(f"✔ saved {len(tickers):,} tickers to {out.resolve()}")

In [6]:
save_tickers(clean_tickers, "all_polygonio_tickers")

✔ saved 33,832 tickers to /home/mengren/projects/Polygon.io-data-ingestion-pipeline-main/ticker_lists
