## Data Sample Creation Notebook

In [1]:
import datetime
import json
import math
import os
import pathlib
import pickle
import random
from typing import Any, Iterable

import numpy as np
import pandas as pd

#### General Helpers

In [2]:
def safe_pickle_load(path: str):
    """
    Read the pickle file at `path` and return the object stored in it.

    WARNING: unpickling executes code embedded in the file, so this must
    only be pointed at files you trust.
    """
    with open(path, "rb") as fh:
        loaded = pickle.load(fh)
    return loaded

def ensure_dir(p: str):
    """
    Make sure the directory that will contain the file path `p` exists.

    Note that `p` is treated as a *file* path: its parent directory (and any
    missing ancestors) is created; nothing is created at `p` itself.
    """
    parent_dir = pathlib.Path(p).parent
    parent_dir.mkdir(parents=True, exist_ok=True)

def sizeof_bytes(obj: Any) -> int:
    """
    Return the length of `json.dumps(obj)`, or 0 when `obj` cannot be
    serialized to JSON.
    """
    try:
        encoded = json.dumps(obj)
    except Exception:
        # Best-effort size estimate: unserializable input counts as 0
        return 0
    return len(encoded)

def truncate_str(s: str, max_len=160):
    """Return `s` unchanged if it fits in `max_len` characters, else its first `max_len` characters plus an ellipsis."""
    if len(s) > max_len:
        return s[:max_len] + "…"
    return s

def to_jsonable(x, max_depth=10, max_list=50, max_dict=50, max_str=160, depth=0):
    """
    Convert an arbitrary Python object into a bounded, JSON-serializable sample.

    Parameters
    ----------
    x : object to convert.
    max_depth : containers nested at this depth or deeper are replaced by a
        placeholder string. Scalar leaves are always kept, whatever their depth.
    max_list : maximum number of items kept from a list/tuple/set; a trailing
        "<…N more…>" marker records how many were dropped.
    max_dict : maximum number of entries kept from a dict; the number dropped is
        stored under the "<…more_keys…>" key.
    max_str : maximum length of strings and of stringified dict keys.
    depth : current nesting level (used by the recursion; callers normally
        leave it at 0).

    Returns
    -------
    A structure made of dicts, lists, strings, numbers, booleans and None.
    Objects of unsupported types are replaced by a "<TypeName>" placeholder.
    """
    # --- scalars: handled before the depth check so that leaf values are never
    # replaced by a (longer and less useful) truncation placeholder ---
    if x is None or isinstance(x, (bool, int, float)):
        return x
    if isinstance(x, str):
        return truncate_str(x, max_str)
    if isinstance(x, (np.integer, np.floating, np.bool_)):
        return x.item()
    if x is pd.NaT:
        return None
    if isinstance(x, (datetime.date, datetime.time)):
        # Covers datetime.datetime and pd.Timestamp (both subclass datetime.date)
        return x.isoformat()
    if isinstance(x, (bytes, bytearray, memoryview)):
        return f"<{type(x).__name__} len={len(x)}>"

    # --- everything below is a container or an opaque object ---
    if depth >= max_depth:
        return f"<…depth={depth} truncated…>"

    if isinstance(x, (list, tuple)):
        out = [
            to_jsonable(v, max_depth, max_list, max_dict, max_str, depth + 1)
            for v in x[:max_list]
        ]
        if len(x) > max_list:
            out.append(f"<…{len(x)-max_list} more…>")
        return out

    if isinstance(x, (set, frozenset)):
        # Sort so that the elements surviving truncation do not depend on hash order
        try:
            ordered = sorted(x)
        except TypeError:
            ordered = sorted(x, key=repr)
        return to_jsonable(ordered, max_depth, max_list, max_dict, max_str, depth)

    if isinstance(x, dict):
        out = {}
        # Stop after max_dict entries instead of materializing every item first
        for i, (k, v) in enumerate(x.items()):
            if i >= max_dict:
                break
            out[truncate_str(str(k), max_str)] = to_jsonable(
                v, max_depth, max_list, max_dict, max_str, depth + 1
            )
        if len(x) > max_dict:
            out["<…more_keys…>"] = len(x) - max_dict
        return out

    # numpy arrays
    if isinstance(x, np.ndarray):
        # 1-D arrays keep up to 50 items; higher ranks keep up to 20 along EVERY axis
        cap = 50 if x.ndim == 1 else 20
        head = x if x.ndim == 0 else x[tuple(slice(0, cap) for _ in range(x.ndim))]
        preview = head.tolist()
        if x.dtype.kind in "OMm":
            # Object/datetime/timedelta arrays may yield values json cannot encode
            preview = to_jsonable(
                preview, max_depth, max(max_list, cap), max_dict, max_str, depth + 1
            )
        return {
            "__ndarray__": True,
            "dtype": str(x.dtype),
            "shape": list(x.shape),
            "preview": preview,
        }

    # pandas
    if isinstance(x, pd.DataFrame):
        return {
            "__dataframe__": True,
            "shape": list(x.shape),
            # str(c): column labels are not necessarily strings (ints, tuples, ...)
            "dtypes": {str(c): str(t) for c, t in x.dtypes.items()},
            "head": json.loads(x.head(10).to_json(orient="records")),
        }
    if isinstance(x, pd.Series):
        return {
            "__series__": True,
            "name": str(x.name),
            "dtype": str(x.dtype),
            "head": json.loads(x.head(20).to_json(orient="records")),
        }

    # fallback: only the type name is recorded
    return f"<{type(x).__name__}>"


#### DataFrame-specific sampling

In [3]:
def _first_low_cardinality_column(df: pd.DataFrame, max_unique: int = 50):
    """Return the first column label usable as a stratification key, or None if there is none."""
    for col in df.columns:
        try:
            series = df[col]
            if str(series.dtype) == "category" or series.nunique() <= max_unique:
                return col
        except (TypeError, AttributeError, ValueError):
            # Unhashable cell values (lists, dicts) or duplicated column labels:
            # such a column cannot serve as a grouping key, so skip it.
            continue
    return None


def sample_dataframe(df: pd.DataFrame,
                     target_bytes: int = 1_500_000,
                     max_rows: int = 3000,
                     random_state: int = 42) -> pd.DataFrame:
    """
    Return a reproducible row sample of `df` bounded by a size and a row budget.

    The number of rows kept is the smallest of `max_rows`, the number of rows
    estimated to fit in `target_bytes` (from the memory usage of the first
    2000 rows), and `len(df)`, but never less than 1 for a non-empty frame.

    When a categorical or low-cardinality (<= 50 distinct values) column exists,
    the first such column is used for a roughly stratified sample: every group
    contributes at least one row. If that yields more rows than the budget, the
    surplus is removed at random (not by cutting off the last groups). Otherwise,
    or if grouping fails, a plain random sample is returned.

    Parameters
    ----------
    df : frame to sample; it is not modified.
    target_bytes : approximate in-memory size budget for the sample.
    max_rows : hard cap on the number of rows.
    random_state : seed forwarded to pandas' sampling routines.

    Returns
    -------
    A new DataFrame (a copy of `df` when no reduction is needed).
    """
    if len(df) == 0:
        return df.copy()

    # Estimate bytes per row (fallback to the shallow estimate if deep fails)
    probe_rows = min(2000, len(df))
    try:
        approx = df.head(probe_rows).memory_usage(deep=True).sum()
        est_bpr = max(1, approx / probe_rows)
    except Exception:
        est_bpr = max(1, df.memory_usage().sum() / len(df))

    n_by_size = int(target_bytes / est_bpr)
    n = max(1, min(max_rows, n_by_size, len(df)))

    if n == len(df):
        return df.copy()

    key = _first_low_cardinality_column(df)
    if key is not None:
        frac = n / len(df)
        try:
            # dropna=False keeps rows whose key is missing as their own group;
            # observed=True skips categories that never occur (empty groups).
            parts = [
                g.sample(n=max(1, math.ceil(len(g) * frac)), random_state=random_state)
                for _, g in df.groupby(key, dropna=False, observed=True)
                if len(g)
            ]
        except Exception:
            # Stratification is best-effort: fall back to plain random sampling below
            parts = []
        if parts:
            stratified = pd.concat(parts)
            if len(stratified) > n:
                # Trim at random while keeping the group order of the rows that remain
                keep = (
                    pd.Series(np.arange(len(stratified)))
                    .sample(n=n, random_state=random_state)
                    .sort_values()
                    .to_numpy()
                )
                stratified = stratified.iloc[keep]
            return stratified

    return df.sample(n=n, random_state=random_state)


#### Write “mini-pickle” + “LLM pack” (JSONL + schema)

In [4]:
def write_llm_pack_from_df(df: pd.DataFrame, out_stem: str, string_max_len=200):
    """
    Writes two artifacts for a DataFrame:
      1) JSONL with normalized, truncated records: <out_stem>.jsonl
      2) Plain-text schema/preview (no code fences): <out_stem>.schema.md

    Parameters
    ----------
    df : frame to export (one JSON object per row).
    out_stem : output path without extension; its parent directory is created.
    string_max_len : maximum length of string cells in the exported records.

    Cell values are normalized so that serialization cannot fail: timestamps
    become ISO-8601 strings, NaT becomes null, containers are bounded through
    to_jsonable, and any remaining non-JSON type is written as str(value).
    """

    def normalize_value(v):
        if isinstance(v, str):
            return truncate_str(v, string_max_len)
        if isinstance(v, (np.integer, np.floating, np.bool_)):
            return v.item()
        if v is pd.NaT:
            return None
        if isinstance(v, pd.Timestamp):
            # Keep the actual instant instead of a "<Timestamp>" placeholder
            return v.isoformat()
        if isinstance(v, (list, tuple, dict, set, np.ndarray, pd.Series)):
            try:
                return to_jsonable(v, max_depth=10, max_list=10, max_dict=10, max_str=string_max_len)
            except Exception:
                return str(v)[:string_max_len]
        return v

    # Records -> JSONL (keys stringified: column labels may be ints or tuples)
    records = [
        {str(k): normalize_value(v) for k, v in rec.items()}
        for rec in df.to_dict(orient="records")
    ]

    jsonl_path = f"{out_stem}.jsonl"
    ensure_dir(jsonl_path)
    with open(jsonl_path, "w", encoding="utf-8") as f:
        for r in records:
            # default=str: last-resort encoding for dates, Decimals, Timedeltas, ...
            f.write(json.dumps(r, ensure_ascii=False, default=str) + "\n")

    # Build dtypes table (markdown-like but plain text; no fences)
    dtypes_df = pd.DataFrame({"column": df.columns, "dtype": df.dtypes.astype(str).values})
    try:
        dtypes_md = dtypes_df.to_markdown(index=False)
    except Exception:
        # Fallback simple table (e.g. when the optional markdown backend is unavailable)
        header = "column | dtype\n---|---\n"
        rows = "\n".join(f"{c} | {t}" for c, t in zip(dtypes_df["column"], dtypes_df["dtype"]))
        dtypes_md = header + rows

    example_rows_text = json.dumps(records[:5], ensure_ascii=False, indent=2, default=str)

    # Plain-text schema (no backticks, no code fences)
    schema_lines = [
        "Schema for: " + os.path.basename(out_stem),
        "",
        f"Rows: {len(df)}",
        f"Columns ({len(df.columns)}): " + ", ".join(map(str, df.columns)),
        "",
        "dtypes table:",
        dtypes_md,
        "",
        "Example rows (first 5) as JSON:",
        example_rows_text,
        "",
    ]

    md_path = f"{out_stem}.schema.md"
    with open(md_path, "w", encoding="utf-8") as f:
        f.write("\n".join(schema_lines))


def write_llm_pack_from_obj(obj: Any, out_stem: str):
    """
    Export a bounded preview of an arbitrary Python object.

    Two files are produced next to each other:
      1) <out_stem>.json      - to_jsonable(obj) limited to depth 5,
                                130 list items and 100 dict entries
      2) <out_stem>.schema.md - plain-text note (no code fences) with the
                                object's type name and the preview's size
    """
    preview = to_jsonable(obj, max_depth=5, max_list=130, max_dict=100)

    # JSON preview
    json_path = out_stem + ".json"
    ensure_dir(json_path)
    with open(json_path, "w", encoding="utf-8") as json_file:
        json.dump(preview, json_file, ensure_ascii=False, indent=2)

    # Structure note
    note = "\n".join([
        "Structure for: " + os.path.basename(out_stem),
        "Type: " + type(obj).__name__,
        "Approx JSON size (chars): ~" + str(sizeof_bytes(preview)),
        "",
    ])

    md_path = out_stem + ".schema.md"
    with open(md_path, "w", encoding="utf-8") as note_file:
        note_file.write(note)


#### Main sampler (works for DataFrames, lists, dicts, etc.)

In [5]:
def _dump_mini_pickle(obj: Any, pkl_path: str) -> None:
    """Pickle `obj` to `pkl_path` with the highest protocol, creating the parent directory first."""
    ensure_dir(pkl_path)
    with open(pkl_path, "wb") as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)


def _head_plus_random(seq, head_n: int, random_n: int, random_state: int) -> list:
    """Return the first `head_n` items of `seq` plus up to `random_n` items drawn at random from the rest."""
    head_n = max(0, head_n)
    picked = list(seq[:head_n])
    rest = seq[head_n:]
    take = min(len(rest), max(0, random_n))
    if take:
        # A private Random instance gives the same draw as seeding the module-level
        # generator, without disturbing the global random state of the notebook.
        picked += random.Random(random_state).sample(rest, take)
    return picked


def make_pickle_sample(
    input_path: str,
    output_dir: str = "samples",
    target_bytes: int = 1_500_000,       # ~1.5 MB preview target
    df_max_rows: int = 3000,
    mini_pickle_name: str | None = None,
    random_state: int = 42,
    list_head_items: int = 1000,
    list_random_items: int = 1000,
    dict_head_keys: int = 1,
    dict_random_keys: int = 0,
):
    """
    Load a pickle and write a small sample of it plus an "LLM pack" preview.

    WARNING: the input is unpickled, which executes code stored in the file.
    Only run this on pickles you trust.

    Parameters
    ----------
    input_path : path of the pickle to sample.
    output_dir : directory receiving all artifacts.
    target_bytes : size budget used for DataFrame sampling and, for objects
        that are neither DataFrame, list/tuple nor dict, the largest input file
        size that is still copied as a "mini" pickle.
    df_max_rows : hard cap on DataFrame rows.
    mini_pickle_name : optional file name (inside `output_dir`) for the mini
        pickle; defaults to "<input base name>.mini.pkl".
    random_state : seed for every random draw.
    list_head_items, list_random_items : for lists/tuples, how many leading
        items are kept and how many more are drawn at random from the rest.
    dict_head_keys, dict_random_keys : same, for dict entries. The defaults keep
        only the first entry.

    Returns
    -------
    dict describing what was written: "type", "mini_pickle" (None when no mini
    pickle was written), "schema", one of "llm_jsonl"/"llm_json", and a count
    ("rows", "items" or "keys") where applicable.
    """
    obj = safe_pickle_load(input_path)
    base = os.path.splitext(os.path.basename(input_path))[0]
    out_stem = os.path.join(output_dir, base)
    # The custom file name applies to every kind of object, not only DataFrames
    if mini_pickle_name is None:
        pkl_path = f"{out_stem}.mini.pkl"
    else:
        pkl_path = os.path.join(output_dir, mini_pickle_name)

    # Case A: pandas DataFrame
    if isinstance(obj, pd.DataFrame):
        sdf = sample_dataframe(obj, target_bytes=target_bytes, max_rows=df_max_rows, random_state=random_state)
        _dump_mini_pickle(sdf, pkl_path)
        write_llm_pack_from_df(sdf, out_stem)
        return {"type": "dataframe", "mini_pickle": pkl_path, "llm_jsonl": f"{out_stem}.jsonl", "schema": f"{out_stem}.schema.md", "rows": len(sdf)}

    # Case B: list/tuple -> leading slice + random draw from the remainder
    # (the sample is always stored as a list, even for tuple input)
    if isinstance(obj, (list, tuple)):
        mini = _head_plus_random(obj, list_head_items, list_random_items, random_state)
        _dump_mini_pickle(mini, pkl_path)
        write_llm_pack_from_obj(mini, out_stem)
        return {"type": type(obj).__name__, "mini_pickle": pkl_path, "llm_json": f"{out_stem}.json", "schema": f"{out_stem}.schema.md", "items": len(mini)}

    # Case C: dict -> leading entries + random draw from the remaining entries
    if isinstance(obj, dict):
        items = list(obj.items())
        mini = dict(_head_plus_random(items, dict_head_keys, dict_random_keys, random_state))
        _dump_mini_pickle(mini, pkl_path)
        write_llm_pack_from_obj(mini, out_stem)
        return {"type": "dict", "mini_pickle": pkl_path, "llm_json": f"{out_stem}.json", "schema": f"{out_stem}.schema.md", "keys": len(mini)}

    # Case D: anything else -> store original as mini if already small, else JSON preview only
    dumped = False
    if os.path.getsize(input_path) <= target_bytes:
        try:
            _dump_mini_pickle(obj, pkl_path)
            dumped = True
        except Exception:
            # Best-effort copy: do not leave a truncated pickle behind
            if os.path.exists(pkl_path):
                os.remove(pkl_path)

    write_llm_pack_from_obj(obj, out_stem)
    return {"type": type(obj).__name__, "mini_pickle": pkl_path if dumped else None, "llm_json": f"{out_stem}.json", "schema": f"{out_stem}.schema.md"}


#### Run sampler on processed files 

In [10]:
# Pickles to sample (sizes are approximate).
inputs = [
    "latest_traffic.pkl",         # ~730 MB
    "latest_volume_pickups.pkl",  # ~223 MB
    "train_airport.pkl",          # ~2 KB
]

# Options shared by every call in this cell.
sampler_options = dict(
    output_dir="samples",
    target_bytes=1_500_000,   # ~1.5 MB preview target (adjust up/down)
    df_max_rows=3000,         # hard cap on rows for DataFrames
    random_state=42,
)

# NOTE(review): the saved output of this cell reports up to 1000 keys per dict,
# which does not match a sampler that keeps a single dict entry; re-run the
# cell to refresh the output before relying on it.
results = {}
for p in inputs:
    results[p] = make_pickle_sample(p, **sampler_options)

results


{'latest_traffic.pkl': {'type': 'dict',
  'mini_pickle': 'samples/latest_traffic.mini.pkl',
  'llm_json': 'samples/latest_traffic.json',
  'schema': 'samples/latest_traffic.schema.md',
  'keys': 1000},
 'latest_volume_pickups.pkl': {'type': 'dict',
  'mini_pickle': 'samples/latest_volume_pickups.mini.pkl',
  'llm_json': 'samples/latest_volume_pickups.json',
  'schema': 'samples/latest_volume_pickups.schema.md',
  'keys': 1000},
 'train_airport.pkl': {'type': 'dict',
  'mini_pickle': 'samples/train_airport.mini.pkl',
  'llm_json': 'samples/train_airport.json',
  'schema': 'samples/train_airport.schema.md',
  'keys': 21}}

#### Run sampler on all_trajs.pkl

In [6]:
# Path of the trajectories pickle (a file name, not the loaded data).
all_trajs = "all_trajs.pkl"

# Same sampler settings as for the other processed files.
all_trajs_results = make_pickle_sample(
    input_path=all_trajs,
    output_dir="samples",
    target_bytes=1_500_000,
    df_max_rows=3000,
    random_state=42,
)

all_trajs_results

{'type': 'dict',
 'mini_pickle': 'samples/all_trajs.mini.pkl',
 'llm_json': 'samples/all_trajs.json',
 'schema': 'samples/all_trajs.schema.md',
 'keys': 1}