In [1]:
# ============================================
# Step 2 — Tokenize CPC CSVs and save HF dataset
# ============================================
from pathlib import Path
import pandas as pd
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import RobertaTokenizerFast
import numpy as np

# --- config ---
CSV_DIR   = Path("../data/cpc_cls")
OUT_DIR   = Path("../data/cpc_cls_encoded_vs8000_len128")  # change name if MAX_LEN changes
TOKENIZER = "../artifacts/patroberta-tokenizers/vs8000"
MAX_LEN   = 128   # start with 128; rerun with 256 if truncation is high

LABELS = list("ABCDEFGH")  # we filtered to single-section A–H
label2id = {l:i for i,l in enumerate(LABELS)}
id2label = {i:l for l,i in label2id.items()}

# --- load csvs -> HF datasets ---
def load_split(name):
    df = pd.read_csv(CSV_DIR / f"cpc_cls_{name}.csv")
    # keep only A–H just in case
    df = df[df["label"].isin(LABELS)].copy()
    df["labels"] = df["label"].map(label2id).astype(int)
    return Dataset.from_pandas(df[["text","labels"]], preserve_index=False)

ds = DatasetDict({
    "train": load_split("train"),
    "validation": load_split("val"),
    "test": load_split("test"),
})
print({k: len(v) for k,v in ds.items()})

# --- tokenize ---
tok = RobertaTokenizerFast.from_pretrained(TOKENIZER)
tok.model_max_length = MAX_LEN

def enc(batch):
    out = tok(batch["text"], truncation=True, max_length=MAX_LEN)
    # track truncation
    lengths = [len(ids) for ids in out["input_ids"]]
    trunc = [int(l > MAX_LEN) for l in lengths]  # will always be 0 because truncation=True
    out["len"] = lengths
    out["was_truncated"] = [
        1 if (len(tok(t)["input_ids"]) > MAX_LEN) else 0 for t in batch["text"]
    ]
    out["labels"] = batch["labels"]
    return out

enc_ds = ds.map(enc, batched=True, remove_columns=["text"])
enc_ds = enc_ds.with_format("torch")

# --- truncation stats ---
def stats(split):
    a = enc_ds[split]["len"]
    t = enc_ds[split]["was_truncated"]
    return {
        "avg_len_tokens": float(np.mean(a)),
        "p95_len_tokens": int(np.percentile(a,95)),
        "pct_truncated": float(100*np.mean(t)),
    }

print("Stats @ MAX_LEN=", MAX_LEN)
print("train:", stats("train"))
print("val  :", stats("validation"))
print("test :", stats("test"))

# --- save to disk ---
OUT_DIR.mkdir(parents=True, exist_ok=True)
enc_ds.save_to_disk(str(OUT_DIR))
print("Saved encoded dataset to", OUT_DIR)


{'train': 194656, 'validation': 10814, 'test': 10814}


Map:   0%|          | 0/194656 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (296 > 128). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/10814 [00:00<?, ? examples/s]

Map:   0%|          | 0/10814 [00:00<?, ? examples/s]

Stats @ MAX_LEN= 128
train: {'avg_len_tokens': 126.41225032878513, 'p95_len_tokens': 128, 'pct_truncated': 96.08488821305276}
val  : {'avg_len_tokens': 126.44451636767154, 'p95_len_tokens': 128, 'pct_truncated': 96.30109117810247}
test : {'avg_len_tokens': 126.34390604771592, 'p95_len_tokens': 128, 'pct_truncated': 95.79249121509154}


Saving the dataset (0/1 shards):   0%|          | 0/194656 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10814 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10814 [00:00<?, ? examples/s]

Saved encoded dataset to ../data/cpc_cls_encoded_vs8000_len128
