### Download data from Hub

In [2]:
from huggingface_hub import snapshot_download
# Dataset repository on the Hub (alternative: "pietrolesci/fineweb-edu-10BT").
dataset = "pietrolesci/minipile"

# Download only the files matching `bpe8064minipile/*` into the local `data` folder.
snapshot_download(
    repo_id=dataset,
    repo_type="dataset",
    local_dir="data",
    allow_patterns="bpe8064minipile/*",
)

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

(…)minipile/000_bpe8064minipile.ds.metadata:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

(…)8064minipile/bpe8064minipile.ds.metadata:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

000_bpe8064minipile.ds.index:   0%|          | 0.00/8.00M [00:00<?, ?B/s]

000_bpe8064minipile.ds:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

'/home/pl487/rdd/data'

In [1]:
from datatrove.utils.dataset import DatatroveFolderDataset
import polars as pl
# Open the tokenized `.ds` files as unshuffled sequences of `seq_len` tokens.
ds = DatatroveFolderDataset(
    folder_path="data/minipile-eval/bpe32000minipile",
    filename_pattern="data/minipile-eval/bpe32000minipile/*.ds",
    seq_len=2048,
    token_size=2,  # presumably bytes per stored token id — confirm against datatrove docs
    shuffle=False,
    seed=42,
)

In [1]:
import polars as pl

In [2]:
df = pl.read_parquet("/home/pl487/rdd/data/minipile-eval-bpe32000minipile.parquet")

In [6]:
df

uid,input_ids
u64,list[i32]
0,"[49, 26, … 199]"
1,"[14795, 199, … 199]"
2,"[2011, 1164, … 14]"
3,"[16472, 17611, … 14]"
4,"[4265, 258, … 199]"
…,…
10495,"[13723, 26311, … 13425]"
10496,"[15, 10, … 199]"
10497,"[14906, 415, … 199]"
10498,"[18019, 26, … 31]"


In [2]:
# One row per sample of `ds`; the first token id of every sequence is dropped.
data = pl.from_dicts(
    (
        {"doc_idx": doc_idx, "input_ids": sample["input_ids"].tolist()[1:]}
        for doc_idx, sample in enumerate(ds)
    ),
    schema={"doc_idx": pl.UInt64, "input_ids": pl.List(pl.UInt16)},
)

In [3]:
data

doc_idx,input_ids
u64,list[u16]
0,"[26, 199, … 93]"
1,"[91, 74, … 199]"
2,"[29968, 10717, … 4]"
3,"[472, 436, … 62]"
4,"[10, 75, … 60]"
…,…
8954,"[2373, 30557, … 290]"
8955,"[18828, 392, … 279]"
8956,"[8804, 22974, … 13]"
8957,"[13, 73, … 264]"


In [4]:
df

doc_idx,token_logprob
u64,list[f64]
0,"[-5.496313, -1.223472, … -3.696448]"
1,"[-4.776392, -5.827122, … -9.259767]"
2,"[-3.334471, -5.524037, … -9.588385]"
3,"[-2.233987, -2.724483, … -2.002547]"
4,"[-7.793006, -2.564259, … -7.019312]"
…,…
100703,"[-7.126338, -3.591892, -9.701439]"
100704,"[-9.839044, -0.144264, -5.004041]"
100705,"[-2.172212, -12.078355, -6.290206]"
100706,"[-7.815956, -2.294855, -1.295988]"


In [25]:
# Combine `df` with `data` on `doc_idx`; the inner join keeps only the
# doc_idx values present in both frames.
# NOTE: reassigns `df` in place, so this cell is not idempotent on re-run.
df = df.join(data, on="doc_idx", how="inner")

In [26]:
# Go from one row per document to one row per token by exploding both list
# columns together (assumes the two lists have equal length in every row).
df = df.explode(["token_logprob", "input_ids"])

In [27]:
# Add `tok_pos`: a 0-based row counter computed separately within each
# `doc_idx` group.
df = df.with_columns(
    tok_pos=pl.int_range(0, pl.len()).over(pl.col("doc_idx"))
)

In [19]:
# Largest `tok_pos` per document (quick check of the per-document sequence length).
df.group_by("doc_idx").agg(pl.col("tok_pos").max())

doc_idx,tok_pos
u64,i64
8685,2047
1325,2047
5389,2047
6571,2047
143,2047
…,…
3659,2047
8607,2047
3501,2047
5210,2047


In [28]:
df["token_logprob"].describe()

statistic,value
str,f64
"""count""",18348032.0
"""null_count""",0.0
"""mean""",-2.340418
"""std""",2.833773
"""min""",-27.031298
"""25%""",-3.854038
"""50%""",-1.142672
"""75%""",-0.047235
"""max""",0.0


In [30]:
# For every token position, count the rows whose log-probability is below -4,
# then order positions from the fewest to the most such rows.
df.filter(pl.col("token_logprob") < -4).group_by("tok_pos").agg(pl.len()).sort("len")

tok_pos,len
i64,u64
1460,1973
1557,1982
1695,1988
1444,1996
1937,1996
…,…
4,3673
3,3795
2,4137
1,4663


In [7]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM

In [8]:
# Load the step-0 checkpoint and switch it to evaluation mode in one go
# (`eval()` returns the module itself).
model = AutoModelForCausalLM.from_pretrained(
    "/home/pl487/rdd/outputs/model_train_pl/smol_llama-81M-tied_bpe32000minipile_2024-09-30T19-42-18/.checkpoints/step0"
).eval()

In [9]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 768)
    (layers): ModuleList(
      (0-5): 6 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=768, out_features=768, bias=False)
          (k_proj): Linear(in_features=768, out_features=768, bias=False)
          (v_proj): Linear(in_features=768, out_features=768, bias=False)
          (o_proj): Linear(in_features=768, out_features=768, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=768, out_features=3072, bias=False)
          (up_proj): Linear(in_features=768, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=768, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((768,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((768,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((

In [25]:
# Take the first batch of 5 samples from `ds`, in dataset order (no shuffling).
batch = next(iter(DataLoader(ds, batch_size=5, shuffle=False)))

In [28]:
# Reference loss: pass the full sequence as both inputs and labels and read the
# loss the model returns.
# NOTE(review): this relies on the model aligning labels with next-token
# predictions internally — confirm against the transformers docs.
with torch.inference_mode():
    input_ids = batch["input_ids"]
    labels = input_ids.clone()
    loss1 = model(input_ids=input_ids, labels=labels).loss

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)


In [34]:
# Same loss computed by hand, for comparison with `loss1`.
with torch.inference_mode():
    # Inputs are all tokens but the last; targets are the same tokens shifted by one.
    input_ids = batch["input_ids"][:, :-1]
    labels = batch["input_ids"][:, 1:]
    logits = model.forward(input_ids=input_ids).logits
    # cross_entropy takes the class dimension second, hence the permute of the last two axes.
    loss2 = torch.nn.functional.cross_entropy(logits.permute(0, 2, 1), labels)

In [1]:
from datasets import Dataset, load_from_disk

In [50]:
ds = Dataset.from_parquet("/home/pl487/rdd/data/slim-pajama-eval-bpe32000/data.parquet")

In [53]:
ds = ds.shuffle(seed=42)

In [54]:
ds = ds.take(20_000)

In [56]:
ds = ds.select_columns(["uid", "input_ids"])

In [63]:
ds.save_to_disk("/home/pl487/rdd/data/slim-pajama-eval-bpe32000/validation_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [11]:
ds = load_from_disk("/home/pl487/rdd/data/slim-pajama-eval-bpe32000/validation_dataset")

In [12]:
df = ds.to_polars()

In [13]:
import polars as pl

In [19]:
# Truncate every sequence to at most 2049 token ids, then order the rows from
# longest to shortest. The length is computed in a second `with_columns` step
# so that it reflects the truncated lists; the helper column is dropped afterwards.
df = (
    df.with_columns(pl.col("input_ids").list.slice(0, 2049))
    .with_columns(len=pl.col("input_ids").list.len())
    .sort("len", descending=True)
    .drop("len")
)

In [20]:
ds = Dataset.from_polars(df)

In [21]:
ds.save_to_disk("/home/pl487/rdd/data/slim-pajama-eval-bpe32000/validation_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [60]:
pds = ds.to_pandas()

In [62]:
pds["input_ids"].map(len).describe()

count     20000.000000
mean       1112.514700
std        7862.181971
min          24.000000
25%         205.000000
50%         482.000000
75%         983.000000
max      752280.000000
Name: input_ids, dtype: float64

### Check eval dataset

In [4]:
from pathlib import Path

import polars as pl
import srsly
import torch
from datasets import load_from_disk
from IPython.display import HTML, display
from transformers import AutoModelForCausalLM, AutoTokenizer


In [5]:
# Tokenizer used to decode the evaluation samples.
tok = AutoTokenizer.from_pretrained("outputs/tokenizers/bpe32000minipile/")

# Evaluation samples, viewed as a polars frame built from the dataset's Arrow table.
ds = load_from_disk("data/minipile-eval-bpe32000minipile/eval_samples/")
df = pl.from_arrow(ds.data.table)

# Merges recorded during tokenizer training, with `new_token_id` cast to Int32.
merges_df = pl.DataFrame(
    srsly.read_jsonl("outputs/tok_train/bpe_minipile_2024-09-22T17-58-54/implemented_merges.jsonl")
).with_columns(pl.col("new_token_id").cast(pl.Int32))

def decode_sequence(tok, input_ids: list[int], highlight_ids: list[int], highlight_color: str = "green") -> None:
    """Render a token sequence as HTML in the notebook, highlighting selected token ids.

    Args:
        tok: Tokenizer exposing `convert_ids_to_tokens` and `convert_tokens_to_string`.
        input_ids: Token ids of the sequence to render.
        highlight_ids: Ids whose occurrences in `input_ids` get a coloured background.
        highlight_color: CSS colour used for the highlight background.

    Returns:
        None. The rendered HTML is displayed as a side effect.
    """
    from html import escape  # local import so the function does not depend on another cell

    # Convert token IDs to tokens
    tokens = tok.convert_ids_to_tokens(input_ids, skip_special_tokens=False)

    # Escape each token before adding markup: the text comes from the dataset and
    # may contain `<`, `>` or `&`, which would otherwise be interpreted as HTML.
    # The loop variable is `token`, not `tok`, so the tokenizer argument is never shadowed.
    highlighted_tokens = []
    for token_id, token in zip(input_ids, tokens):
        safe_token = escape(token)
        if token_id in highlight_ids:
            safe_token = f"<span style='background-color:{highlight_color}'>{safe_token}</span>"
        highlighted_tokens.append(safe_token)

    # Convert tokens back to a single string
    # NOTE(review): the span markup goes through the tokenizer's decoder as well;
    # confirm the decoder leaves it (and the wrapped token) intact.
    decoded_string = tok.convert_tokens_to_string(highlighted_tokens)

    # Display the result in Jupyter notebook
    display(HTML(decoded_string))

# model = AutoModelForCausalLM.from_pretrained(
#     "/home/pl487/rdd/outputs/model_train/pythia-9M-bpe32000/checkpoints/checkpoint-50000"
# )

In [12]:
data_path = Path("/home/pl487/rdd/data/minipile-eval-bpe32000minipile/")

# Only the in-vocabulary samples are loaded; add "out_vocab_samples.parquet" to
# the tuple below to concatenate the out-of-vocabulary samples as well.
df = pl.concat(
    [pl.read_parquet(data_path / file_name) for file_name in ("in_vocab_samples.parquet",)]
)

In [14]:
df = df.sort("new_token_id")

In [39]:
# Take the `context` token ids of row 12 and render them, highlighting every
# occurrence of the sequence's last token id.
input_ids = df[12]["context"].to_list()[0]
decode_sequence(tok, input_ids, [input_ids[-1]])

In [50]:
df = df.sort("new_token_id")

In [48]:
df.group_by("new_token_id").agg(pl.len())

new_token_id,len
i32,u64
33322,13
32926,11
33188,15
32423,5
31738,3
…,…
30532,38
31300,14
33161,70
32625,37


In [52]:
df[-1]

new_token_id,uid,input_ids
i32,u64,list[i32]
33499,9133,"[49, 26, … 1135]"


In [3]:
# Build one LongTensor from the `input_ids` of the first three samples and show its shape.
batch = torch.tensor(ds[:3]["input_ids"], dtype=torch.long)
batch.shape

torch.Size([3, 2049])

In [4]:
raw_tok_path = Path("/home/pl487/rdd/outputs/tokenizers/bpe32000/")
tok = AutoTokenizer.from_pretrained(raw_tok_path)

# Lookup from each record's `prefix` to its `new_token_id`, one record per JSONL line.
prefix_map = {
    entry["prefix"]: entry["new_token_id"]
    for entry in srsly.read_jsonl(raw_tok_path / "prefix_map_bpe32000.jsonl")
}

In [5]:
# Earlier flat-list variant, kept for reference:
# batch_mapping = [
#     prefix_map.get(t, []) + [t] if (i + 1) % 2 == 0 else [t] for i, t in enumerate(batch[:, -2:].flatten().tolist())
# ]
# For every sequence in the batch, record which vocabulary ids count for its
# last two positions: (penultimate id, [last id, *ids from prefix_map]).
batch_mapping: list[tuple[int, list[int]]] = []
for penultimate_token, last_token in batch[:, -2:].unbind():
    # The penultimate token always maps to itself only.
    m_penultimate = penultimate_token.item()

    # The last token maps to itself plus whatever `prefix_map` lists for it
    # (presumably the ids of tokens it is a prefix of — confirm against the
    # prefix-map file).
    m_last = [last_token.item()] + prefix_map.get(last_token.item(), [])

    batch_mapping.append((m_penultimate, m_last))

[' I', ' said', ',', ' "', 'Why', ' would', ' you', ' leave', ' Ann', ' Arbor']

In [6]:
model.eval()

# Predict from every token but the last and turn the logits into probabilities.
with torch.inference_mode():
    logits = model.forward(input_ids=batch[:, :-1]).logits
    probs = logits.softmax(dim=-1)

# Keep the distributions at the final two positions of each sequence.
last_tokens_probs = probs[:, -2:]

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)


In [31]:
tok.batch_decode(batch.numpy()[0, -10:])

[' I', ' said', ',', ' "', 'Why', ' would', ' you', ' leave', ' Ann', ' Arbor']

In [32]:
tok.batch_decode(logits[0, -10:].argmax(-1).numpy())

[' He', "'m", ',', ' "', 'I', ' is', ' you', ' like', ' the', 'ie']

In [None]:
mask = torch.zeros_like(last_tokens_probs)
mask.shape

In [37]:
# 0/1 mask with the same shape as `last_tokens_probs`, set to 1 at the
# vocabulary ids recorded in `batch_mapping` for each sequence.
mask = torch.zeros_like(last_tokens_probs)
for idx, (m_penultimate, m_last) in enumerate(batch_mapping):
    # Penultimate position: only the token's own id.
    mask[idx, -2, m_penultimate] = 1.0

    # Last position: every id in the list (index assignment with a list of ids).
    mask[idx, -1, m_last] = 1.0

In [None]:
# Check that the mask selects the correct elements.
batch_mapping, (last_tokens_probs * mask).nonzero()

In [None]:
(last_tokens_probs * mask).sum(-1)

In [None]:
list(map(len, batch_mapping)), mask.sum(-1)

In [237]:
# Ids and log-prob rows for the final two positions of every sequence.
# NOTE(review): `t` and `logprobs` are not defined in any visible cell; this
# cell depends on leftover kernel state.
last_token_ids = t[:, -2:]
last_token_logprobs = logprobs[:, -2:, :]

In [None]:
# Flatten the selected token ids to a plain Python list and the matching
# log-probs to 2-D (one row per selected position, last dimension unchanged).
tokens = last_token_ids.clone().flatten().cpu().numpy().tolist()
it = last_token_logprobs.reshape(-1, last_token_logprobs.shape[-1])

out = []
# The loop variable is `token_id`, not `tok`: a top-level loop variable named
# `tok` would overwrite the notebook-wide tokenizer `tok` with an int.
for idx, token_id in enumerate(tokens):
    # Take other tokens where token_id is a prefix + itself
    ids = prefix_map.get(token_id, []) + [token_id]
    o = it[idx, ids]
    out.append(o)

In [None]:
tokens

In [None]:
last_token_logprobs.reshape(-1, last_token_logprobs.shape[-1])

In [None]:
# Zero mask with one slot per element of `last_token_logprobs`, flattened to 1-D.
# Alternative that keeps the first (batch) dimension:
# mask = torch.zeros_like(last_token_logprobs).reshape(last_token_logprobs.shape[0], -1)
mask = torch.zeros_like(last_token_logprobs).reshape(-1)
mask.shape

In [None]:
last_token_ids

In [271]:
tokens = last_token_ids.clone().flatten().cpu().numpy().tolist()

# For every selected token id, the vocabulary ids to read: the ids listed for
# it in `prefix_map` followed by the id itself.
# The comprehension variable stays local, so the notebook-wide tokenizer `tok`
# is no longer overwritten by a top-level loop variable of the same name.
pos = [prefix_map.get(token_id, []) + [token_id] for token_id in tokens]

In [None]:
pos

In [None]:
last_token_ids.flatten()

In [None]:
# batch, token_position_in_seq, token_position_in_vocab
logprobs[0, -2, 6106], logprobs[0, -1, 30681]

In [None]:
# batch_size, seq_len, added
# Ids of the final two positions with a trailing singleton dimension added,
# so they can serve as an index along the last axis.
last_token_ids = t[:, -2:].unsqueeze(-1)
print(last_token_ids.shape)
last_token_ids

In [None]:
last_token_logprobs = logprobs[:, -2:, :]
print(last_token_logprobs.shape)
last_token_logprobs

In [None]:
# For every selected position, pick the log-prob stored at that position's own
# token id (index taken along the last axis), then drop the singleton axis for display.
res = last_token_logprobs.take_along_dim(dim=-1, indices=last_token_ids)
print(res.shape)
res.squeeze(-1)

In [None]:
last_token_logprobs[0, 0]

In [111]:
p_map = {6016: [1, 2]}

In [None]:
t[..., -2:]

In [None]:
t[..., -2:]

In [None]:
# NOTE(review): the inner lists have different lengths (1 and 2); torch.tensor
# presumably rejects such ragged input, so this looks like a leftover experiment.
torch.tensor([[0], [2, 3]])

In [None]:
?torch.where

In [72]:
# Unreduced cross-entropy: one loss value per target element (reduction="none").
# NOTE(review): `shift_logits` and `shift_labels` are not defined in any visible cell.
other = torch.nn.functional.cross_entropy(shift_logits.permute(0, 2, 1), shift_labels, reduction="none")

In [None]:
other

In [2]:
import numpy as np

In [6]:
a = [np.pad([1, 2, 3], (2, 0), constant_values=0), np.pad([1, 2, 3], (2, 0), constant_values=0)]

In [None]:
np.vstack(a).shape

In [15]:
# Outputs of two evaluation runs (different timestamps, same file name), loaded
# so they can be compared.
# NOTE: `a` is also used for an unrelated numpy list in an earlier cell.
a = pl.read_parquet("/home/pl487/rdd/outputs/model_eval/2024-09-11T11-06-11/pythia-9M-bpe32000.parquet")
b = pl.read_parquet("/home/pl487/rdd/outputs/model_eval/2024-09-11T11-31-36/pythia-9M-bpe32000.parquet")

In [None]:
a.join(b, on=["uid", "new_token_id"])

In [None]:
b

In [3]:
import srsly
from datasets import load_dataset
from transformers import PreTrainedTokenizerFast

from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import ByteLevel as ByteLevelProcessor
from tokenizers.trainers import BpeTrainer

In [3]:
eos_token = "|endoftext|"

# Tokenizer: a BPE model with byte-level pre-tokenizer, post-processor and decoder.
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
tokenizer.post_processor = ByteLevelProcessor()
tokenizer.decoder = ByteLevelDecoder()

# Trainer: 2000-entry vocabulary, with the EOS string registered as a special token.
trainer = BpeTrainer(
    vocab_size=2000,
    min_frequency=2,
    special_tokens=[eos_token],
    show_progress=True,
)

In [4]:
# Read the small test corpus into a list with one entry per line (line endings kept).
with open("tokenizers/modified_bpe/data/test_small.txt") as fl:
    lines = list(fl)

In [None]:
num_docs = 10_000
batch_size = 1_000

# Stream the training split, keep the first `num_docs` documents and only their
# `text` column, and group them into batches of `batch_size`.
dataset = (
    load_dataset(
        "HuggingFaceFW/fineweb-edu",
        "sample-10BT",
        split="train",
        streaming=True,
        cache_dir=".data_cache",
    )
    .take(num_docs)
    .select_columns(["text"])
    .batch(batch_size=batch_size)
)

In [None]:
# Train on the streamed batches: every item yielded is the `text` list of one batch.
# The third argument is the number of batches (presumably only used for
# progress reporting — confirm in the tokenizers docs).
tokenizer.train_from_iterator(iter(x["text"] for x in dataset), trainer, int(num_docs / batch_size))

In [2]:
from pathlib import Path

In [42]:
import json

from tokenizers import Tokenizer

In [44]:
def load_tokenizer_with_vocab_size(path: str | Path, vocab_size: int) -> PreTrainedTokenizerFast:
    """Load a saved BPE tokenizer truncated to its first `vocab_size` vocabulary entries.

    Args:
        path: Folder containing `tokenizer.json` and `metadata.yaml`.
        vocab_size: Number of vocabulary entries to keep.

    Returns:
        A `PreTrainedTokenizerFast` wrapping the truncated tokenizer, with left
        padding and the EOS token read from `metadata.yaml`.
    """
    path = Path(path)

    # Edit conf to adapt to the new vocab_size
    conf = srsly.read_json(path / "tokenizer.json")
    model_conf = conf["model"]

    # Entries that are not produced by a merge (base alphabet and any other
    # non-merge entries): one vocabulary entry per merge is assumed.
    len_alphabet = len(model_conf["vocab"]) - len(model_conf["merges"])

    # Keep the first `vocab_size` entries in file order.
    # NOTE(review): assumes tokenizer.json lists the vocabulary in id order.
    model_conf["vocab"] = dict(list(model_conf["vocab"].items())[:vocab_size])

    # Clamp at zero: for vocab_size < len_alphabet the difference is negative,
    # and a negative slice bound would keep almost all merges instead of none.
    num_merges = max(0, vocab_size - len_alphabet)
    model_conf["merges"] = model_conf["merges"][:num_merges]

    # Instantiate tokenizer using tokenizers library
    backend_tok = Tokenizer.from_str(json.dumps(conf))
    eos_token = srsly.read_yaml(path / "metadata.yaml")["eos_token"]

    # Instantiate PreTrainedTokenizerFast from object
    # NOTE: we do not instantiate from file directly due to compatibility
    # https://github.com/huggingface/tokenizers/issues/1562#issuecomment-2315349846
    tok = PreTrainedTokenizerFast(tokenizer_object=backend_tok)
    tok.padding_side = "left"
    tok.eos_token = eos_token

    return tok

In [45]:
# Load the tokenizer saved under `path`, truncated to a 500-entry vocabulary.
path = Path("outputs/tokenizers/2024-08-28T16-34-11")
vocab_size = 500

tok = load_tokenizer_with_vocab_size(path, vocab_size)

In [None]:
tok.save_pretrained(path / f"tok-vocab{vocab_size}")

In [34]:
vocab_size = 500

# Read the serialized tokenizer and shrink its model section to `vocab_size` entries.
conf = srsly.read_json(path / "tokenizer.json")
# conf["padding"] = "left"

# Vocabulary entries that do not come from a merge.
len_alphabet = len(conf["model"]["vocab"]) - len(conf["model"]["merges"])

# Both replacement values are computed from the untouched config before being written back.
conf["model"].update(
    vocab=dict(list(conf["model"]["vocab"].items())[:vocab_size]),
    merges=conf["model"]["merges"][: vocab_size - len_alphabet],
)

In [39]:
t = Tokenizer.from_str(json.dumps(conf))

In [None]:
conf["model"].keys()

In [None]:
t.get_vocab_size()

----