In [21]:
from datasets import load_from_disk
from transformers import AutoModelForCausalLM
import torch

In [20]:
# Tokenized validation contexts, plus the trained checkpoint used to score them.
ds = load_from_disk(
    dataset_path="/home/pl487/rdd/data/slim-pajama-subset-validation-sample-bpe32000/contexts"
)
model = AutoModelForCausalLM.from_pretrained(
    "/home/pl487/rdd/outputs/model_train/pythia-9M-bpe32000/checkpoints/checkpoint-50000"
)

In [42]:
# First example's token ids as a batch of one: a leading batch dim is added.
t = torch.tensor(ds[0]["input_ids"], dtype=torch.long).unsqueeze(0)

In [64]:
# Run the model through `__call__` rather than `.forward` so that any
# registered module hooks are executed as well.
# NOTE: gradients are still tracked here; wrap the cell in `torch.no_grad()`
# if only the values are needed.
logits = model(input_ids=t).logits
logprobs = logits.log_softmax(-1)
labels = t.clone()

# Align predictions with targets: the output at position i is paired with the
# token at position i + 1, so drop the last prediction and the first label.
shift_logits = logits[..., :-1, :].contiguous()
shift_logprobs = logprobs[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()

# Get the log-probability of the true token.
# The trailing `None` gives the indices a size-1 last dim, i.e. (batch, seq, 1),
# so they line up with `shift_logprobs` along the vocabulary axis.
true_logprobs = shift_logprobs.take_along_dim(dim=-1, indices=shift_labels[..., None])
# Negative log-probability of each true token (presumably "surprisal"),
# with the size-1 last dim removed again.
sup = true_logprobs.squeeze(-1).neg()


In [59]:
# Per-token negative log-probabilities from the manual gather on `shift_logprobs`.
sup

tensor([[9.2836, 2.3509, 7.0118,  ..., 0.0324, 2.0413, 3.3009]],
       grad_fn=<NegBackward0>)

In [67]:
# Sanity check: expected layout is (batch, seq_len - 1, vocab).
shift_logits.shape

torch.Size([1, 2048, 32000])

In [66]:
# Sanity check: should match the leading (batch, seq_len - 1) dims of `shift_logits`.
shift_labels.shape

torch.Size([1, 2048])

In [72]:
# Reference computation with the built-in loss. `cross_entropy` wants the class
# dimension second, so the (batch, seq, vocab) logits are rearranged to
# (batch, vocab, seq). `reduction="none"` keeps one loss value per token.
other = torch.nn.functional.cross_entropy(
    shift_logits.transpose(1, 2),
    shift_labels,
    reduction="none",
)

In [78]:
# Should agree with `sup` up to floating-point error.
other

tensor([[9.2836, 2.3508, 7.0118,  ..., 0.0323, 2.0413, 3.3008]],
       grad_fn=<ViewBackward0>)

In [2]:
import numpy as np

In [6]:
# Two independent copies of [1, 2, 3] left-padded with two zeros -> [0, 0, 1, 2, 3].
a = [np.pad([1, 2, 3], pad_width=(2, 0), constant_values=0) for _ in range(2)]

In [13]:
# Stacking the two length-5 rows should give a (2, 5) array.
np.vstack(a).shape

(2, 5)

In [14]:
import polars as pl

In [15]:
# Evaluation outputs of the same model from two separate runs, loaded for comparison.
a = pl.read_parquet(
    source="/home/pl487/rdd/outputs/model_eval/2024-09-11T11-06-11/pythia-9M-bpe32000.parquet"
)
b = pl.read_parquet(
    source="/home/pl487/rdd/outputs/model_eval/2024-09-11T11-31-36/pythia-9M-bpe32000.parquet"
)


In [18]:
# Line up the two runs row by row on (uid, new_token_id); columns coming from
# `b` that clash with `a` are shown with the `_right` suffix.
a.join(b, on=["uid", "new_token_id"], how="inner")

sup,rank,entropy,uid,new_token_id,input_ids,sup_right,rank_right,entropy_right
list[f64],list[i64],list[f64],i64,i64,list[i64],list[f64],list[i64],list[f64]
"[1.383524, 3.333795]","[0, 3]","[4.181461, 5.62039]",84,31567,"[540, 31567]","[2.059909, 3.300911]","[1, 2]","[4.185226, 4.343543]"
"[1.213583, 10.371901]","[0, 2604]","[4.736505, 7.015377]",84,31881,"[262, 31881]","[1.272575, 8.572933]","[0, 738]","[5.03889, 6.998665]"
"[5.590575, 9.243664]","[33, 1059]","[4.292186, 6.798021]",127,31655,"[1817, 31655]","[4.976245, 10.881699]","[15, 1325]","[3.17473, 3.348009]"
"[4.397538, 9.070292]","[6, 569]","[1.681468, 3.974308]",157,31569,"[281, 31569]","[4.597424, 4.831842]","[5, 8]","[1.517665, 4.55687]"
"[1.967112, 6.330029]","[0, 68]","[4.861785, 5.456806]",157,31238,"[432, 31238]","[2.048149, 8.504574]","[0, 338]","[5.06059, 5.027475]"
…,…,…,…,…,…,…,…,…
"[1.740024, 6.415906]","[2, 92]","[1.653188, 6.302306]",103405,31932,"[364, 31932]","[1.693829, 6.449864]","[2, 94]","[1.680988, 6.278137]"
"[4.799245, 13.960602]","[10, 11265]","[4.09253, 4.861162]",107976,31671,"[1237, 31671]","[4.900066, 14.020318]","[11, 11435]","[4.061255, 4.795384]"
"[6.604518, 13.779301]","[73, 10834]","[5.865509, 4.507862]",108539,31070,"[5616, 31070]","[6.594501, 13.775612]","[72, 10883]","[5.804174, 4.513897]"
"[3.648093, 12.668388]","[1, 6216]","[7.356655, 6.21884]",113782,31658,"[13, 31658]","[3.665712, 12.639827]","[1, 6194]","[7.360681, 6.237061]"


In [17]:
# Inspect the second run's frame on its own.
b

new_token_id,uid,input_ids,sup,rank,entropy
i64,i64,list[i64],list[f64],list[i64],list[f64]
31567,84,"[540, 31567]","[2.059909, 3.300911]","[1, 2]","[4.185226, 4.343543]"
31881,84,"[262, 31881]","[1.272575, 8.572933]","[0, 738]","[5.03889, 6.998665]"
31655,127,"[1817, 31655]","[4.976245, 10.881699]","[15, 1325]","[3.17473, 3.348009]"
31569,157,"[281, 31569]","[4.597424, 4.831842]","[5, 8]","[1.517665, 4.55687]"
31238,157,"[432, 31238]","[2.048149, 8.504574]","[0, 338]","[5.06059, 5.027475]"
…,…,…,…,…,…
31932,103405,"[364, 31932]","[1.693829, 6.449864]","[2, 94]","[1.680988, 6.278137]"
31671,107976,"[1237, 31671]","[4.900066, 14.020318]","[11, 11435]","[4.061255, 4.795384]"
31070,108539,"[5616, 31070]","[6.594501, 13.775612]","[72, 10883]","[5.804174, 4.513897]"
31658,113782,"[13, 31658]","[3.665712, 12.639827]","[1, 6194]","[7.360681, 6.237061]"


In [3]:
import srsly
from datasets import load_dataset
from transformers import PreTrainedTokenizerFast

from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import ByteLevel as ByteLevelProcessor
from tokenizers.trainers import BpeTrainer

In [3]:
# End-of-sequence marker; it is the only special token handed to the trainer.
eos_token = "|endoftext|"

# Start from an empty BPE model and attach the byte-level components:
# pre-tokenizer, post-processor and decoder.
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
tokenizer.post_processor = ByteLevelProcessor()
tokenizer.decoder = ByteLevelDecoder()

# Trainer configuration: target vocabulary size, minimum pair frequency and
# the special tokens to reserve.
trainer = BpeTrainer(
    vocab_size=2000,
    min_frequency=2,
    special_tokens=[eos_token],
    show_progress=True,
)

In [4]:
# Read the small test corpus into a list of lines (`readlines` keeps the
# trailing newlines). The encoding is pinned so decoding does not depend on
# the platform's locale default.
with open("tokenizers/modified_bpe/data/test_small.txt", encoding="utf-8") as fl:
    lines = fl.readlines()

In [10]:
# How many documents to train on, and how many texts to group per batch.
num_docs = 10_000
batch_size = 1_000

# Stream the corpus instead of downloading it in full.
dataset = load_dataset(
    path="HuggingFaceFW/fineweb-edu",
    name="sample-10BT",
    split="train",
    streaming=True,
    cache_dir=".data_cache",
)
# Keep the first `num_docs` documents, only their text, grouped into batches.
dataset = dataset.take(num_docs)
dataset = dataset.select_columns(["text"])
dataset = dataset.batch(batch_size=batch_size)

Resolving data files:   0%|          | 0/1630 [00:00<?, ?it/s]

In [12]:
# Train on the streamed batches; each item yielded is one batch's "text" column.
# `length` is set to the number of batches.
tokenizer.train_from_iterator(
    (batch["text"] for batch in dataset),
    trainer=trainer,
    length=int(num_docs / batch_size),
)






In [2]:
from pathlib import Path

In [42]:
import json

from tokenizers import Tokenizer

In [44]:
def load_tokenizer_with_vocab_size(path: str | Path, vocab_size: int) -> PreTrainedTokenizerFast:
    """Load a trained BPE tokenizer truncated to its first `vocab_size` tokens.

    Reads `tokenizer.json` and `metadata.yaml` from `path`, keeps the
    `vocab_size` lowest-id vocabulary entries together with the merges that
    produce them, and wraps the result in a `PreTrainedTokenizerFast` that
    pads on the left and uses the EOS token stored in the metadata.

    Args:
        path: Directory containing `tokenizer.json` and `metadata.yaml`.
        vocab_size: Number of vocabulary entries to keep. Values larger than
            the stored vocabulary keep the full vocabulary and all merges.

    Returns:
        The truncated tokenizer.

    Raises:
        ValueError: If `vocab_size` is smaller than the number of vocabulary
            entries that are not produced by a merge. Those entries cannot be
            dropped, and a negative merge count would otherwise silently keep
            almost all merges instead of none.
    """
    path = Path(path)

    # Edit conf to adapt to the new vocab_size
    conf = srsly.read_json(path / "tokenizer.json")
    model_conf = conf["model"]
    vocab, merges = model_conf["vocab"], model_conf["merges"]

    # Assumes each merge contributes exactly one vocabulary entry, so whatever
    # remains is the base alphabet (plus any special tokens).
    len_alphabet = len(vocab) - len(merges)
    if vocab_size < len_alphabet:
        raise ValueError(
            f"vocab_size={vocab_size} is smaller than the {len_alphabet} base tokens "
            "(alphabet and special tokens) that cannot be removed"
        )

    # Order by token id explicitly rather than relying on the JSON key order.
    kept_tokens = sorted(vocab.items(), key=lambda item: item[1])[:vocab_size]
    model_conf["vocab"] = dict(kept_tokens)
    model_conf["merges"] = merges[: vocab_size - len_alphabet]

    # Instantiate tokenizer using tokenizers library
    backend_tok = Tokenizer.from_str(json.dumps(conf))
    eos_token = srsly.read_yaml(path / "metadata.yaml")["eos_token"]

    # Instantiate PreTrainedTokenizerFast from object
    # NOTE: we do not instantiate from file directly due to compatibility
    # https://github.com/huggingface/tokenizers/issues/1562#issuecomment-2315349846
    tok = PreTrainedTokenizerFast(tokenizer_object=backend_tok)
    tok.padding_side = "left"
    tok.eos_token = eos_token

    return tok

In [45]:
# Tokenizer run directory and the reduced vocabulary size to derive from it.
path = Path("outputs/tokenizers/2024-08-28T16-34-11")
vocab_size = 500

tok = load_tokenizer_with_vocab_size(path=path, vocab_size=vocab_size)

In [48]:
# Persist the truncated tokenizer in a sub-directory named after its vocab size.
tok.save_pretrained(save_directory=path / f"tok-vocab{vocab_size}")

('outputs/tokenizers/2024-08-28T16-34-11/tok-vocab500/tokenizer_config.json',
 'outputs/tokenizers/2024-08-28T16-34-11/tok-vocab500/special_tokens_map.json',
 'outputs/tokenizers/2024-08-28T16-34-11/tok-vocab500/tokenizer.json')

In [34]:
vocab_size = 500

# Edit conf to adapt to the new vocab_size
conf = srsly.read_json(path / "tokenizer.json")
# conf["padding"] = "left"
# Assumes each merge contributes exactly one vocabulary entry, so whatever
# remains is the base alphabet (plus any special tokens).
len_alphabet = len(conf["model"]["vocab"]) - len(conf["model"]["merges"])
if vocab_size < len_alphabet:
    # A negative merge count would silently keep almost all merges instead of none.
    raise ValueError(
        f"vocab_size={vocab_size} is smaller than the {len_alphabet} base tokens "
        "(alphabet and special tokens) that cannot be removed"
    )

# Keep the lowest-id tokens (ordered explicitly by id) and the merges that create them.
conf["model"]["vocab"] = dict(sorted(conf["model"]["vocab"].items(), key=lambda item: item[1])[:vocab_size])
conf["model"]["merges"] = conf["model"]["merges"][: vocab_size - len_alphabet]

In [39]:
# Rebuild a backend tokenizer from the edited config, serialized back to a JSON string.
# NOTE(review): `t` is also bound to the input-id tensor in an earlier cell;
# running this cell overwrites that binding.
t = Tokenizer.from_str(json.dumps(conf))

In [38]:
# List the fields available in the serialized model section.
conf["model"].keys()

dict_keys(['type', 'dropout', 'unk_token', 'continuing_subword_prefix', 'end_of_word_suffix', 'fuse_unk', 'byte_fallback', 'ignore_merges', 'vocab', 'merges'])

In [41]:
# Confirm the rebuilt tokenizer reports the requested vocabulary size.
t.get_vocab_size()

500

----