In [1]:
import re
from pathlib import Path

import polars as pl
import srsly
from datasets import load_from_disk

from commands.utilities import MODEL_EVAL_PATH, TOK_PATH

In [2]:
# Location of the run to analyse. Only the file stem is used (as the run name);
# the evaluations themselves are read from MODEL_EVAL_PATH further down.
path = "data/me-minipile-evals/me100M_finewebedu-20B_bpe32000minipile.parquet"

In [3]:
# Half-width, in token ids, of the window kept around the vocabulary cutoff
window_size = 5000

# The run name is split on "_" into "me<model>", "<trainset>" and "<tokenizer>",
# e.g. "me100M_finewebedu-20B_bpe32000minipile"
run_name = Path(path).stem
s = run_name.split("_")
model = s[0].removeprefix("me")
trainset = s[1]
tokenizer = s[2]
# The vocabulary size is the longest run of digits in the tokenizer name.
# The pattern must be a raw string: "\d" inside a plain literal is an invalid
# escape sequence (SyntaxWarning on Python >= 3.12, planned to become an error).
vocab_size = int(max(re.findall(r"\d+", tokenizer), key=len))

print(f"{model=}, {trainset=}, {tokenizer=}, {vocab_size=}, {window_size=}")

model='100M', trainset='finewebedu-20B', tokenizer='bpe32000minipile', vocab_size=32000, window_size=5000


In [4]:
# Inputs used by the rest of the notebook: the tokenizer folder, the tokenised
# test split, and the evaluations of this run.
tok_path, data_path, eval_path = (
    Path(f"{TOK_PATH}/{tokenizer}"),
    Path(f"data/minipile/{tokenizer}/test"),
    Path(f"{MODEL_EVAL_PATH}/{run_name}.parquet"),
)

# Fail early, naming the missing location, before any expensive work starts
assert tok_path.exists(), f"Tokenizer path {tok_path} does not exist."
assert data_path.exists(), f"Data path {data_path} does not exist."
assert eval_path.exists(), f"Eval file {eval_path} does not exist."

# Folder receiving the regression table; created (with parents) when missing
out_path = Path("data/me-minipile-regdata")
out_path.mkdir(parents=True, exist_ok=True)

### Load merges

In [5]:
# The tokenizer folder holds a text file pointing at the folder of the raw tokenizer.
# Strip surrounding whitespace: a trailing newline in that file would otherwise end up
# inside the path and the merges file below would not be found.
with (tok_path / "raw_tok_path.txt").open("r") as fl:
    raw_tok_path = Path(fl.read().strip())

# One row per implemented merge: the id of the new token (`tok`) and the ids of the
# two tokens it was built from (`tok_a`, `tok_b`)
merges_df = (
    pl.DataFrame(srsly.read_jsonl(raw_tok_path / "implemented_merges.jsonl"))
    .with_columns(tok_a=pl.col("pair").list.get(0), tok_b=pl.col("pair").list.get(1))
    .drop(["pair", "new_token", "part_a", "part_b"])
    .rename({"new_token_id": "tok"})
)

# Filter merges based on the window size and vocab size:
# keep ids in [vocab_size - window_size, vocab_size + window_size)
merges_df = (
    merges_df.filter((pl.col("tok") < vocab_size + window_size) & (pl.col("tok") >= vocab_size - window_size))
    .sort("tok")
    .drop("count")
)

# Find tokens (in-vocab) that got merged into others, either as first or second part of the token.
# Each row pairs an in-vocab token (`tok`) with a windowed token that contains it (`tok_new`).
to_drop = pl.concat(
    [
        merges_df.filter(pl.col("tok") < vocab_size).join(
            merges_df.select(["tok", col]), left_on="tok", right_on=col, how="inner", suffix="_new"
        )
        for col in ["tok_a", "tok_b"]
    ]
).select(["tok", "tok_new"])
# NOTE(review): len(to_drop) counts (token, containing token) rows, so a token that is
# part of several merges is counted more than once — confirm the reported number is the
# intended one.
print(f"{len(to_drop)} tokens dropped because are part of other tokens in the window (window size: {window_size} * 2)")
merges_df = merges_df.filter(pl.col("tok").is_in(to_drop["tok"].implode()).not_())

# We only need this to get the tokens composing the OOV tokens
# NOTE(review): to_drop only holds ids < vocab_size and this filter keeps only ids
# >= vocab_size, so the exclusion above does not change the final merges_df — confirm
# whether those in-vocab tokens were also meant to be excluded downstream.
merges_df = merges_df.filter(pl.col("tok") >= vocab_size)  # notice the '='

263 tokens dropped because are part of other tokens in the window (window size: 5000 * 2)


### Load dataset and align evaluations and dataset

From this section we get back the original dataset where documents are aligned with evaluations

In [6]:
# Pull the tokenised split into polars straight from its underlying Arrow table
data = pl.from_arrow(load_from_disk(data_path).data.table).shrink_to_fit()

# Number of tokens per document; the token lists themselves are not kept in this frame
n_tokens = pl.col("input_ids").list.len().cast(pl.Int64)
doc_len = data.with_columns(n_tokens.alias("len")).drop("input_ids")

print("Example of data\n", data.head())

Example of data
 shape: (5, 2)
┌──────────────────────┬─────────┐
│ input_ids            ┆ uid     │
│ ---                  ┆ ---     │
│ list[u16]            ┆ i64     │
╞══════════════════════╪═════════╡
│ [899, 1390, … 14]    ┆ 1000500 │
│ [19181, 3048, … 199] ┆ 1000501 │
│ [5318, 29501, … 14]  ┆ 1000502 │
│ [27064, 284, … 14]   ┆ 1000503 │
│ [2141, 11758, … 14]  ┆ 1000504 │
└──────────────────────┴─────────┘


In [7]:
# Only the step-0 evaluations are materialised here; they are used to work out
# how evaluations and documents line up.
is_step_zero = pl.col("step") == 0
df = pl.scan_parquet(eval_path).filter(is_step_zero).collect().shrink_to_fit()

print("Example of evaluations\n", df.head())

Example of evaluations
 shape: (5, 4)
┌─────────┬─────────────────────────────────┬─────────────────────┬──────┐
│ uid     ┆ token_logprob                   ┆ token_ids           ┆ step │
│ ---     ┆ ---                             ┆ ---                 ┆ ---  │
│ i64     ┆ list[f64]                       ┆ list[i64]           ┆ i64  │
╞═════════╪═════════════════════════════════╪═════════════════════╪══════╡
│ 1009373 ┆ [-10.568663, -12.305836, … -10… ┆ [199, 9389, … 199]  ┆ 0    │
│ 1000646 ┆ [-10.00154, -10.200759, … -10.… ┆ [199, 3, … 199]     ┆ 0    │
│ 1004697 ┆ [-10.943292, -10.172346, … -10… ┆ [199, 14672, … 199] ┆ 0    │
│ 1000604 ┆ [-10.00154, -10.669509, … -12.… ┆ [199, 4144, … 688]  ┆ 0    │
│ 1003502 ┆ [-10.655336, -10.646159, … -11… ┆ [199, 1507, … 221]  ┆ 0    │
└─────────┴─────────────────────────────────┴─────────────────────┴──────┘


In [8]:
# Length of the evaluated sequence of each row (evaluated wherever the expression is used)
n_evaluated = pl.col("token_logprob").list.len()
# A negative start index keeps the trailing `offset` elements of a list
from_tail = -pl.col("offset")

df = (
    df.join(doc_len, on="uid", how="left")
    # Evaluation and document can differ in length: with padding the first token is
    # itself part of the evaluations, without padding it only serves as context.
    # The shorter of the two lengths is the part they have in common.
    .with_columns(offset=pl.min_horizontal(n_evaluated, pl.col("len")))
    .with_columns(
        pl.col("token_logprob").list.slice(from_tail),
        pl.col("token_ids").list.slice(from_tail),
    )
    # Position of every remaining token within its (trimmed) sequence
    .with_columns(pos=pl.int_ranges(0, n_evaluated))
)

# Keep the per-document offset so the dataset can be trimmed the same way
doc_len_check = df.select(["uid", "offset"])

print("Example of evaluations with computed offset\n", df.head())

Example of evaluations with computed offset
 shape: (5, 7)
┌─────────┬──────────────────────┬─────────────────────┬──────┬────────┬────────┬──────────────────┐
│ uid     ┆ token_logprob        ┆ token_ids           ┆ step ┆ len    ┆ offset ┆ pos              │
│ ---     ┆ ---                  ┆ ---                 ┆ ---  ┆ ---    ┆ ---    ┆ ---              │
│ i64     ┆ list[f64]            ┆ list[i64]           ┆ i64  ┆ i64    ┆ i64    ┆ list[i64]        │
╞═════════╪══════════════════════╪═════════════════════╪══════╪════════╪════════╪══════════════════╡
│ 1009373 ┆ [-10.568663,         ┆ [199, 9389, … 199]  ┆ 0    ┆ 141790 ┆ 141789 ┆ [0, 1, … 141788] │
│         ┆ -12.305836, … -10…   ┆                     ┆      ┆        ┆        ┆                  │
│ 1000646 ┆ [-10.00154,          ┆ [199, 3, … 199]     ┆ 0    ┆ 136885 ┆ 136884 ┆ [0, 1, … 136883] │
│         ┆ -10.200759, … -10.…  ┆                     ┆      ┆        ┆        ┆                  │
│ 1004697 ┆ [-10.943292,        

In [9]:
# The offset can be shorter than the document, so the original token ids are trimmed
# exactly like the evaluations: only the trailing `offset` tokens are kept.
tail_start = -pl.col("offset")
data = (
    data.join(doc_len_check, on="uid", how="left")
    .with_columns(pl.col("input_ids").list.slice(tail_start))
    .drop("offset")
)

print("Example of aligned dataset\n", data.head())

Example of aligned dataset
 shape: (5, 2)
┌─────────────────────┬─────────┐
│ input_ids           ┆ uid     │
│ ---                 ┆ ---     │
│ list[u16]           ┆ i64     │
╞═════════════════════╪═════════╡
│ [899, 1390, … 14]   ┆ 1000500 │
│ [3048, 3660, … 199] ┆ 1000501 │
│ [5318, 29501, … 14] ┆ 1000502 │
│ [27064, 284, … 14]  ┆ 1000503 │
│ [2141, 11758, … 14] ┆ 1000504 │
└─────────────────────┴─────────┘


### Align un-merged tokens with the merges

For two rows of unmerged tokens, say 1 and 2, we will add another column where the values will be the token_id of the merged token that contains the two tokens. If the two tokens are not merged, the value will be the token_id of the token itself.

In [10]:
# Go from one row per document to one row per token, keeping each token's position
data = (
    data
    # Create position: since we aligned it in the previous section, this now works!
    .with_columns(pos=pl.int_ranges(0, pl.col("input_ids").list.len()))
    .explode(["input_ids", "pos"])
    # Rename for convenience
    .rename({"input_ids": "tok"})
    # Change data type for later (presumably so the ids compare equal to the merge
    # ids in the joins below — TODO confirm)
    .with_columns(pl.col("tok").cast(pl.Int64))
)

# ==== Find OOV tokens ====
# (i) Match the first token of the pair.
# Every token is paired with its successor in the same document and the pair is looked
# up among the merges. On a match the merged token id ends up in `tok_oov_a` (the
# right-hand `tok` column, renamed by the suffix); otherwise `tok_oov_a` is null.
data = data.with_columns(next_tok=pl.col("tok").shift(-1).over("uid")).join(
    merges_df, left_on=["tok", "next_tok"], right_on=["tok_a", "tok_b"], how="left", suffix="_oov_a", nulls_equal=False
)

# (ii) Match the second token of the pair and keep the position of the first token
# of the pair; in this way both rows of a pair are tagged with the merged token id.
# The rows matched in (i) are joined onto the row that follows them (same uid, token
# equal to their `next_tok`, position equal to their `pos` + 1). That row receives the
# merged id as `tok_oov_a_b` and the first token's position as `pos_b`.
data = data.join(
    (data.filter(pl.col("tok_oov_a").is_not_null()).with_columns(next_pos=pl.col("pos") + 1).drop(["tok"])),
    left_on=["uid", "tok", "pos"],
    right_on=["uid", "next_tok", "next_pos"],
    how="left",
    suffix="_b",
    nulls_equal=False,
)

# # Check it works
# data.filter((pl.col("uid") == 1000500) & (pl.col("pos") > 60) & (pl.col("pos") < 70))
# merges_df.filter(pl.col("tok") == 8434)

# Clean up.
# `tok` becomes the merged id when the row belongs to a matched pair (else the original
# token) and `pos` the position of the pair's first token (else the original position).
# The untouched values stay available as `og_tok` / `og_pos`.
# NOTE(review): when two matched pairs overlap (a, b, c with both (a, b) and (b, c) in
# merges_df), row b takes `tok` from the (b, c) match but `pos` from the (a, b) match —
# confirm this is acceptable for the aggregation downstream.
data = (
    data.rename({"tok": "og_tok", "pos": "og_pos"})
    .with_columns(tok=pl.coalesce(["tok_oov_a", "tok_oov_a_b", "og_tok"]), pos=pl.min_horizontal(["pos_b", "og_pos"]))
    .drop(["next_tok", "tok_oov_a", "tok_oov_a_b", "pos_b"])
    .sort(["uid", "pos"])
)

In [11]:
# Show the first rows of the per-token table (original and merge-aware columns side by side)
print("Example of dataset aligned with merges. 'og' columns are the original columns\n", data.head())

Example of dataset aligned with merges. 'og' columns are the original columns
 shape: (5, 5)
┌────────┬─────────┬────────┬───────┬─────┐
│ og_tok ┆ uid     ┆ og_pos ┆ tok   ┆ pos │
│ ---    ┆ ---     ┆ ---    ┆ ---   ┆ --- │
│ i64    ┆ i64     ┆ i64    ┆ i64   ┆ i64 │
╞════════╪═════════╪════════╪═══════╪═════╡
│ 899    ┆ 1000500 ┆ 0      ┆ 899   ┆ 0   │
│ 1390   ┆ 1000500 ┆ 1      ┆ 1390  ┆ 1   │
│ 298    ┆ 1000500 ┆ 2      ┆ 298   ┆ 2   │
│ 1195   ┆ 1000500 ┆ 3      ┆ 1195  ┆ 3   │
│ 20798  ┆ 1000500 ┆ 4      ┆ 20798 ┆ 4   │
└────────┴─────────┴────────┴───────┴─────┘


### Load all the evaluations

There is a bit of redundant code here, but it's not too bad

In [12]:
# Lazily scan the evaluations of ALL steps and trim them with the same rule used for
# the step-0 subset: keep the trailing min(#evaluated tokens, document length) elements.
n_evaluated = pl.col("token_logprob").list.len()
from_tail = -pl.col("offset")

df = (
    pl.scan_parquet(eval_path)
    .join(doc_len.lazy(), on="uid", how="left")
    .with_columns(offset=pl.min_horizontal(n_evaluated, pl.col("len")))
    .with_columns(
        pl.col("token_logprob").list.slice(from_tail),
        pl.col("token_ids").list.slice(from_tail),
    )
    # Position of every remaining token within its (trimmed) sequence
    .with_columns(pos=pl.int_ranges(0, n_evaluated))
    # The helper columns are only needed for the trimming
    .drop(["offset", "len"])
)

In [13]:
# # Quick check that the data and evaluations are aligned
# a = df.head(100).collect()
# a = a.explode(["token_logprob", "token_ids", "pos"])
# c = a.join(data, left_on=["uid", "token_ids", "pos"], right_on=["uid", "og_tok", "og_pos"], how="inner")
# assert len(a) == len(c)

In [14]:
# Only tokens whose merge-aware id falls in the half-open window
# [vocab_size - window_size, vocab_size + window_size) around the cutoff are kept
window_lo, window_hi = vocab_size - window_size, vocab_size + window_size
in_window = (pl.col("tok") >= window_lo) & (pl.col("tok") < window_hi)
subset_data = data.filter(in_window)

In [15]:
# One row per evaluated token, restricted to the tokens present in `subset_data`.
# Rows are matched on document, original token id and original position; the
# merge-aware position of `subset_data` comes through the join as `pos_right`.
eval_keys = ["uid", "token_ids", "pos"]
data_keys = ["uid", "og_tok", "og_pos"]
eval_df = (
    df.explode(["token_logprob", "token_ids", "pos"])
    .join(subset_data.lazy(), left_on=eval_keys, right_on=data_keys, how="inner")
    .collect()
)

# check that we match all tokens in subset_data: every step must have the same
# number of rows, and that number must be the size of subset_data
rows_per_step = eval_df.group_by("step").len()["len"].unique()
assert rows_per_step.item() == len(subset_data)

### Prepare data for regression

We sum the log-likelihoods of the tokens that make up each OOV token, so that we get its probability as if it were a single token. Then, we aggregate across documents (i.e., average across contexts).

In [16]:
logprob = pl.col("token_logprob")

# Step 1: one value per token occurrence. Grouping on `pos_right` as well keeps apart
# OOV tokens that occur at different positions of the same document; the log-probs of
# the rows sharing an occurrence are summed.
per_occurrence = eval_df.group_by(["step", "uid", "tok", "pos_right"]).agg(logprob.sum())

# Step 2: summarise each token over all of its occurrences (contexts), per step
rdf = (
    per_occurrence.group_by(["step", "tok"])
    .agg(
        median=logprob.quantile(0.5),
        q25=logprob.quantile(0.25),
        q75=logprob.quantile(0.75),
        mean=logprob.mean(),
        std=logprob.std(),
        num=pl.len(),
    )
    # `treat` flags the tokens whose id is below the vocabulary cutoff
    .with_columns(iqr=pl.col("q75") - pl.col("q25"), treat=pl.col("tok") < vocab_size)
    .drop(["q25", "q75"])
)

In [17]:
# Persist the regression table in the output folder, named after the run
rdf.write_parquet(out_path / f"{run_name}.parquet")