In [1]:
%%writefile util.py
from pathlib import Path
import os
def get_epoch_checkpoints(model_dir, checkpoints_per_epoch=6):
    """
    Return the step numbers of the checkpoints that close an epoch.

    Checkpoint directories are expected to be named ``checkpoint-<step>`` directly
    inside ``model_dir``. Steps are sorted numerically and every
    ``checkpoints_per_epoch``-th one is selected (the 6th, 12th, ... by default).

    If the newest checkpoint is not among the selected ones, it *replaces* the last
    selected entry (or becomes the only entry when nothing was selected).
    NOTE(review): replacing rather than appending is kept from the original
    implementation; confirm that this is the intended behaviour.

    Args:
        model_dir: Directory containing the ``checkpoint-<step>`` entries.
        checkpoints_per_epoch: Number of checkpoints written per epoch (must be >= 1).

    Returns:
        A list of integer step numbers; empty when ``model_dir`` holds no checkpoints.
    """
    steps = sorted(
        int(entry.name.split("-")[-1]) for entry in Path(model_dir).glob("checkpoint-*")
    )
    if not steps:
        # Previously this fell through to steps[-1] and raised IndexError.
        return []

    epoch_checkpoints = steps[checkpoints_per_epoch - 1::checkpoints_per_epoch]
    newest = steps[-1]
    if newest not in epoch_checkpoints:
        # With fewer than `checkpoints_per_epoch` checkpoints nothing was selected,
        # so there is nothing to pop (popping an empty list raised IndexError).
        if epoch_checkpoints:
            epoch_checkpoints.pop()
        epoch_checkpoints.append(newest)
    return epoch_checkpoints

def get_all_chunks(checkpoint_path, gradient_input_dir, gradients_per_file, num_examples=None):
    """
    Build the paths of all gradient chunk files belonging to one checkpoint.

    Each path has the form ``<gradient_input_dir>/<step>_<start>_<start + gradients_per_file>``
    where ``<step>`` is the text after the last ``-`` in ``checkpoint_path`` and
    ``<start>`` advances in steps of ``gradients_per_file`` from 0 up to (excluding)
    ``num_examples``. The end index of the last chunk is not clipped to ``num_examples``.

    Args:
        checkpoint_path: Checkpoint path or name ending in ``-<step>``.
        gradient_input_dir: Directory that holds the chunk files.
        gradients_per_file: Number of examples per chunk file.
        num_examples: Total number of training examples. When omitted, the legacy
            behaviour of reading ``len(dataset["train"])`` from a module-level
            ``dataset`` is used; that name is not defined in this module, so the
            call raises NameError unless a caller has injected it.

    Returns:
        A list of chunk file paths, in increasing start-index order.
    """
    if num_examples is None:
        num_examples = len(dataset["train"])  # noqa: F821 - legacy global, see docstring

    step = str(checkpoint_path).split("-")[-1]
    # Stride by the function's own argument; the original used the undefined
    # global `args.gradients_per_file` here.
    return [
        os.path.join(gradient_input_dir, step + "_" + str(start) + "_" + str(start + gradients_per_file))
        for start in range(0, num_examples, gradients_per_file)
    ]



import xxhash

# Module-level hasher that is reused for every call and reset afterwards.
h = xxhash.xxh64()
def get_seed_for_document(document, epoch):
    """
    Derive an integer seed from a document tensor and an epoch number.

    The document is moved to CPU, converted to a NumPy array and fed to the shared
    xxh64 hasher, followed by ``bytes(epoch)``.

    NOTE(review): for an int ``epoch``, ``bytes(epoch)`` is a run of ``epoch`` zero
    bytes (empty for epoch 0), not an encoding of the number; negative or float
    epochs raise. This is left unchanged so that existing seeds stay reproducible.

    Args:
        document: Tensor-like object supporting ``.cpu().numpy()``.
        epoch: Non-negative integer epoch.

    Returns:
        The hasher's integer digest.
    """
    try:
        h.update(document.cpu().numpy())
        h.update(bytes(epoch))
        return h.intdigest()
    finally:
        # Always clear the shared hasher, even when an update raised, so that a
        # failed call cannot leak its bytes into the next document's seed.
        h.reset()

from transformers import DataCollatorForLanguageModeling
class DeterministicDataCollatorForLanguageModeling (DataCollatorForLanguageModeling): 
    """
    Masked-language-modeling collator whose masking is seeded per example.

    For every row of a batch the global torch RNG is seeded from
    (row token ids, current epoch) before the mask is sampled. `torch_mask_tokens`
    reads `self.epoch`, which this class only assigns in `set_epoch`, so
    `set_epoch` has to be called before the first batch is collated.
    """
    def torch_mask_tokens(self, inputs, special_tokens_mask = None):
        """
        Adapted to make masking deterministic based on (text, epoch).
        Just wrapped the original implementation in a for loop where a seed based on (labels, epoch) is set for each individual example before masking.

        Args:
            inputs: 2-D tensor of token ids, one row per example; modified in place.
            special_tokens_mask: Optional mask of positions that must never be masked.
                When None it is computed from the tokenizer for every row.

        Returns:
            Tuple (inputs, labels); labels hold the original token id at masked
            positions and -100 everywhere else.
        """

        labels = inputs.clone()


        import torch


        if special_tokens_mask is None:
            special_tokens_mask = [
                self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
            ]
            special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
        else:
            special_tokens_mask = special_tokens_mask.bool()

        for i in range(0, labels.shape[0]):
            # Seed from the still-unmodified row i (labels is a fresh clone of inputs) plus the epoch.
            # NOTE(review): the whole row is hashed; if rows are padded to a common length upstream,
            # the seed also depends on the padded length and hence on the batch composition - confirm.
            # NOTE(review): torch.manual_seed reseeds PyTorch's global generator, so later
            # global-RNG draws in the same process are affected as well.
            torch.manual_seed(get_seed_for_document(labels[i], self.epoch))

            # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)

            # The i:i+1 slices keep a leading dimension of size 1 and are views, so the
            # indexed assignments below write through to `labels` / `inputs`.
            probability_matrix = torch.full(labels[i:i+1].shape, self.mlm_probability)


           
            probability_matrix.masked_fill_(special_tokens_mask[i:i+1], value=0.0)
            masked_indices = torch.bernoulli(probability_matrix).bool()
            labels[i:i+1][~masked_indices] = -100  # We only compute loss on masked tokens

            # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
            indices_replaced = torch.bernoulli(torch.full(labels[i:i+1].shape, 0.8)).bool() & masked_indices
            inputs[i:i+1][indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

            # 10% of the time, we replace masked input tokens with random word
            # (half of the masked positions that were not replaced by the mask token)
            indices_random = torch.bernoulli(torch.full(labels[i:i+1].shape, 0.5)).bool() & masked_indices & ~indices_replaced
            # Random ids are drawn from range(len(self.tokenizer)).
            random_words = torch.randint(len(self.tokenizer), labels[i:i+1].shape, dtype=torch.long)
            inputs[i:i+1][indices_random] = random_words[indices_random]

        ######################
        
        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
        return inputs, labels
    def set_epoch(self, epoch):
        """Store the epoch that is mixed into the per-example masking seed."""
        self.epoch = epoch

Overwriting util.py


In [2]:
%%writefile pretrain.py

import argparse
import os

# Command-line interface of the pretraining script.
parser = argparse.ArgumentParser("pretraining")
parser.add_argument("dataset_folder", help="Path to a dataset folder of .train files that can be read by calling load_dataset('text', <path>)")
parser.add_argument("model_output_dir", help="Where the model and checkpoints should be stored")
# parser.add_argument("epochs", help="Number of epochs", type=int)
# parser.add_argument("mode", help="Set to 'curriculum' for curriculum training, 'shuffle' for default Trainer behaviour")
parser.add_argument("curriculum_path", help="A path to a torch tensor of shape (epochs, training examples) with training data ids")
parser.add_argument("dataset_folder_eval", help="Path to a dataset folder of .train files that can be read by calling load_dataset('text', <path>)")

parser.add_argument("--per_device_train_batch_size", help="per_device_train_batch_size", type=int, nargs="?", const=1, default=64) # TODO
# parser.add_argument("--checkpoints_per_epoch", help="Checkpoints to store per epoch", type=int, nargs="?", const=1, default=1)
parser.add_argument("--cuda_visible_devices", help="Comma seperated GPU ids to use", nargs="?", const=1, default="0,1")

args = parser.parse_args()

# Environment values must be strings. Passing `--cuda_visible_devices` without a
# value yields the integer const 1, which used to raise TypeError here.
# This assignment happens before torch is imported further down in the script.
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.cuda_visible_devices)
# os.environ["CUDA_LAUNCH_BLOCKING"] = 1"
os.environ["WANDB_PROJECT"]="babylm_pretraining"

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(args.model_output_dir, exist_ok=True)


from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

# paths = [str(x) for x in Path(args.dataset_folder).glob("**/*.train")]

# # Initialize a tokenizer
# tokenizer = ByteLevelBPETokenizer()

# # Customize training
# tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
#     "<s>",
#     "<pad>",
#     "</s>",
#     "<unk>",
#     "<mask>",
# ])

# # Save files to disk
# tokenizer.save_model(args.model_output_dir)







from torch.utils.data import Dataset
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
import torch
from random import randrange
import cloudpickle

from datasets import load_dataset
import random







import util




from torch.utils.data import DataLoader

from transformers.trainer_utils import (

    seed_worker,

)

from transformers import Trainer, TrainingArguments
#https://discuss.huggingface.co/t/non-shuffle-training/6986/3
from torch.utils.data import SequentialSampler
class CurriculumTrainer(Trainer):
    """Trainer whose training batches are drawn in the order given by OrderedSampler."""

    def get_train_dataloader(self) -> DataLoader:
        """
        Build the training dataloader as an EpochVariableDataLoader.

        The loader is returned directly (the accelerator is skipped!). Its
        `set_epoch` forwards the epoch to both the sampler and the data collator.
        """
        collator = self.data_collator
        dataset = self._remove_unused_columns(self.train_dataset, description="training")

        loader_kwargs = dict(
            batch_size=self._train_batch_size,
            collate_fn=collator,
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory,
            persistent_workers=self.args.dataloader_persistent_workers,
        )

        # Map-style datasets additionally get the curriculum sampler and the
        # worker-related options; iterable datasets use the basic options only.
        if not isinstance(dataset, torch.utils.data.IterableDataset):
            start_epoch = 0 if self.state.epoch is None else self.state.epoch
            loader_kwargs.update(
                sampler=OrderedSampler(self.train_dataset, start_epoch),
                drop_last=self.args.dataloader_drop_last,
                worker_init_fn=seed_worker,
                prefetch_factor=self.args.dataloader_prefetch_factor,
            )

        # the Trainer class calls set_epoch on the dataloader, but we also need it in the data_collator
        return EpochVariableDataLoader(dataset, collator.set_epoch, **loader_kwargs)
        
class EpochVariableDataLoader(DataLoader):
    """DataLoader that passes epoch changes on to its sampler and to a callback."""

    def __init__(self, train_dataset, passtrough_function, **dataloader_params):
        """
        Args:
            train_dataset: Dataset handed to the underlying DataLoader.
            passtrough_function: Callable invoked with the epoch on every `set_epoch`.
            **dataloader_params: Remaining keyword arguments for DataLoader.
        """
        super().__init__(train_dataset, **dataloader_params)
        self.passtrough_function = passtrough_function

    def set_epoch(self, epoch):
        """Record `epoch` on the sampler, then notify the pass-through callback."""
        sampler = self.sampler
        sampler.epoch = epoch
        self.passtrough_function(epoch)

class OrderedSampler(SequentialSampler):
    """
    Sampler that yields dataset indices in a precomputed, per-epoch order.

    The order is read from the file at `args.curriculum_path` and indexed by epoch;
    `self.epoch` is expected to be updated from outside between epochs.
    """

    def __init__(self, data_source, epoch):
        """
        Args:
            data_source: The dataset being sampled from (stored, not inspected).
            epoch: Epoch whose ordering is used until `self.epoch` is changed.
        """
        self.data_source = data_source
        self.epoch = epoch
        self.curriculum = torch.load(args.curriculum_path, weights_only=True)

    def _current_order(self):
        """Return the curriculum entry for the current epoch."""
        # The epoch can arrive as a float (e.g. taken from the trainer state);
        # a float cannot be used as an index, so it is truncated to int.
        return self.curriculum[int(self.epoch)]

    def __iter__(self):
        print("getting new iterator in epoch", self.epoch,flush=True)
        return iter(self._current_order().tolist())

    def __len__(self):
        # Report the number of indices __iter__ actually yields for this epoch
        # rather than the inherited len(data_source).
        return len(self._current_order())
    


# class BrownTrainer(Trainer):
#     def _get_train_sampler(self):
#         return EpochVariableSampler(self.train_dataset, self.state.epoch, torch.randperm(len(self.train_dataset)).tolist())
        

# trainer = None

# if args.mode == "shuffle":
#     print("Random order!")

# elif args.mode == "curriculum":
#     print("Curriculum!")
#     trainer = CurriculumTrainer( 
#         model=model,
#         args=training_args,
#         data_collator=data_collator,
#         train_dataset=dataset["train"],
        

#         )
# elif args.mode == "brown":
#     print("Brown!")
#     trainer = BrownTrainer( 
#         model=model,
#         args=training_args,
#         data_collator=data_collator,
#         train_dataset=dataset["train"],

#         )

from tokenizers import ByteLevelBPETokenizer

# Raw text files that are only used if a tokenizer has to be trained from scratch.
# NOTE(review): the dataset itself is loaded with datasets.load_from_disk below; if
# args.dataset_folder holds no *.train files this list is empty - confirm that the
# fallback tokenizer training is still meaningful in that case.
paths = [str(x) for x in Path(args.dataset_folder).glob("**/*.train")]



from transformers import RobertaTokenizerFast

try:
    tokenizer = RobertaTokenizerFast.from_pretrained(args.model_output_dir, max_len=512)
except Exception as load_error:
    # `except Exception` (instead of a bare except) lets KeyboardInterrupt and
    # SystemExit propagate; the reason for the fallback is reported.
    print("Could not load a tokenizer from", args.model_output_dir, "- training a new one:", load_error, flush=True)

    # Initialize a tokenizer
    bpe_tokenizer = ByteLevelBPETokenizer()

    # Customize training
    bpe_tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ])

    # Save files to disk
    bpe_tokenizer.save_model(args.model_output_dir)
    tokenizer = RobertaTokenizerFast.from_pretrained(args.model_output_dir, max_len=512)
data_collator = util.DeterministicDataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

import datasets
# On-the-fly tokenization applied to both the training and the evaluation dataset.
t = lambda x : tokenizer(x["text"], return_special_tokens_mask=True, truncation=True, max_length=512)

dataset = datasets.load_from_disk(args.dataset_folder)
dataset.set_transform(t)



# from huggingface_hub import login
# login()
dataset_eval = datasets.load_from_disk(args.dataset_folder_eval)
dataset_eval.set_transform(t)

# The collator reads its epoch when masking, so give it a value before any batch
# is collated. NOTE(review): 1000 is presumably chosen to stay clear of real
# training epoch indices - confirm.
data_collator.set_epoch(1000)
# # https://github.com/ayoolaolafenwa/TrainNLP
# def insert_random_mask(batch):
#     features = [dict(zip(batch, t)) for t in zip(*batch.values())]
#     masked_inputs = data_collator(features)
#     return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}


# dataset_eval = dataset_eval.map(insert_random_mask,batched=True,)

# dataset_eval = dataset_eval.rename_columns({"masked_input_ids": "input_ids",
# "masked_attention_mask": "attention_mask","masked_labels": "labels"})


from transformers import RobertaConfig

# Architecture of the model that is trained from scratch.
config = RobertaConfig(
    vocab_size=41130,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=12,
    type_vocab_size=1,
    layer_norm_eps=1e-05,
    # pad_token_id=1  TODO change if using old tokenizers
    attention_probs_dropout_prob = 0.1,
    # bos_token_id = 0,
    # "eos_token_id": 2,
    hidden_act = "gelu",
    hidden_dropout_prob=0.1,
    hidden_size =768,
    initializer_range=0.02,
    intermediate_size=3072,
)

# Fail early with a readable message: token ids (including the random replacement
# ids the collator draws from range(len(tokenizer))) must fit into the embedding
# table, whose size is the hard-coded vocab_size above.
if len(tokenizer) > config.vocab_size:
    raise ValueError(
        "Tokenizer has {} tokens but the model config only allows vocab_size={}".format(
            len(tokenizer), config.vocab_size
        )
    )

# One curriculum entry per epoch, so its length is the number of epochs to train.
EPOCHS = len(torch.load(args.curriculum_path, weights_only=True))
print("Detected {} epochs".format(EPOCHS))


#steps_per_epoch = ((len(dataset) / (torch.cuda.device_count()*args.per_device_train_batch_size)) // args.checkpoints_per_epoch ), # roughly N times per epoch
training_args = TrainingArguments(
    output_dir=args.model_output_dir,
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS, #1000 !?!?!
    per_device_train_batch_size=args.per_device_train_batch_size,
    do_eval=True,
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="epoch",
    logging_steps=100,
    seed=42,
    prediction_loss_only=False,
    remove_unused_columns=False,
    warmup_steps=10000,
    learning_rate=3e-4,
    report_to="wandb",
    eval_on_start=True,
    label_names=["labels"],
    # compute_metrics is called once per evaluation batch with a compute_result flag.
    batch_eval_metrics=True,
    per_device_eval_batch_size=128
)


from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)


from datasets import load_metric

from datasets import load_metric
from collections import defaultdict
import numpy as np
# Per-batch metric tensors collected across one evaluation run.
batch_metrics = defaultdict(list)
def compute_metrics(eval_pred, compute_result):
    """
    Accumulate masked-token accuracy batch by batch.

    Args:
        eval_pred: Pair (logits, labels) of torch tensors for one evaluation batch;
            labels use -100 for positions that do not count.
        compute_result: True on the last batch of an evaluation run.

    Returns:
        An empty dict while accumulating. On the last batch a dict with
        "accuracy" (mean over *all* positions of "counted and predicted correctly")
        and "normalizer" (fraction of positions that are counted); the accuracy over
        counted positions only is accuracy / normalizer. The accumulator is cleared
        afterwards.
    """
    global batch_metrics

    logits, labels = eval_pred

    # https://github.com/huggingface/transformers/blob/main/examples/flax/language-modeling/run_mlm_flax.py
    # Count every position whose label is not the ignore index -100.
    label_mask = torch.where(labels != -100, 1.0, 0.0)

    # Element-wise comparison. torch.equal would collapse the whole batch into a
    # single Python bool and make every position share that one value.
    predictions = torch.argmax(logits, dim=-1)
    accuracy = torch.eq(predictions, labels).to(label_mask.dtype) * label_mask

    # Accumulate batch-level metrics
    batch_metrics["accuracy"].append(accuracy.flatten())
    batch_metrics["normalizer"].append(label_mask.flatten())

    if not compute_result:
        return {}

    result = {name: torch.cat(score).mean() for name, score in batch_metrics.items()}
    batch_metrics = defaultdict(list)
    return result

# Assemble the curriculum-aware trainer from the objects built above.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    eval_dataset=dataset_eval,
    compute_metrics=compute_metrics,
)
trainer = CurriculumTrainer(**trainer_kwargs)
# for batch in trainer.get_eval_dataloader(dataset_eval):
#     print(batch["labels"])
#     break
# print(data_collator(dataset_eval))
# print("DO NOT TRUST TQDM's time estimates: some modes have varying numbers of steps per epoch")

# Run training, then write the final model next to the checkpoints.
trainer.train()
trainer.save_model(args.model_output_dir)




Overwriting pretrain.py


In [None]:
%run pretrain.py curricula/datasets/curriculum_10M_2024 ./Curriculum_10M_2024 ./curricula/curriculum_10M_2024 curricula/datasets/curriculum_10M_2024_eval

Exception ignored in: <module 'collections.abc' from '/usr/lib/python3.10/collections/abc.py'>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
KeyboardInterrupt: 
2024-10-24 12:40:45.418419: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-24 12:40:45.435208: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-24 12:40:45.454169: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-24 12:40:45.459757: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Una

In [4]:
# Record the installed transformers version this notebook was run with.
import transformers
print(transformers.__version__)

4.46.0


In [5]:
# Sanity check: an i:i+1 slice keeps the array dimension (a length-1 array, not a scalar).
import numpy as np
np.array([0,1,2])[0:0+1]

array([0])

In [6]:


# clustering = None
# with open('brown_clustering', "rb") as handle:
#     clustering = cloudpickle.load(handle)

# class BrownDataset(Dataset):
#     def rewrite(self, x):
#         result = []
#         for doc in x:
#            # print("doc", doc, flush=True)
#             tokenized = tokenizer.tokenize(doc)
#             if len(tokenized) == 0:
#                 result.append(doc)
#                 continue
#             #print("tokenized", tokenized, flush=True)
#             IDX = randrange(len(tokenized))

#             r = []
#             for i, word in enumerate(tokenized):
#                 replacement = clustering.get_similar(word)
                
#                 if i == IDX and len(replacement):
#                     r.append(random.choice(replacement)[0])
#                 else:
#                     r.append(word)
#             print(doc,tokenizer.convert_tokens_to_string(r), flush=True)
#             result.append(tokenizer.convert_tokens_to_string(r))
            

#         return result
       
        

#     def __init__(self, data_dir):
#         self.size = float('inf')
#         self.data = load_dataset("text", data_dir=args.dataset_folder)
#         self.data.set_transform(lambda x : tokenizer(self.rewrite(x["text"]), return_special_tokens_mask=True, truncation=True, max_length=512))
#     def __len__(self):
#         return float('inf')
#         # TODO argue that an infinite training dataset is cognitively plausible 

#     def __getitem__(self, idx):
#      #   print(self.transform(self.data[idx]))
#      #   print(self.data[idx], idx)
#         return self.data[idx]#tokenizer(, return_special_tokens_mask=True, truncation=True, max_length=512)

# if args.mode == "brown":
#     dataset = BrownDataset(args.dataset_folder)
#     print(dataset["train"][0])
#     exit
# else:

In [7]:
%run pretrain.py ./train_test ./Test curriculum test_random_curriculum

FileNotFoundError: Directory ./train_test is neither a `Dataset` directory nor a `DatasetDict` directory.

In [8]:
# %run pretrain.py ./train_10M ./10MModel 10 shuffle

|     Groups     |Version|Filter|n-shot|Metric|Value |   |Stderr|
|----------------|-------|------|-----:|------|-----:|---|-----:|
|blimp_supplement|N/A    |none  |     0|acc   |0.4314|±  |0.0069|
|blimp_filtered  |N/A    |none  |     0|acc   |0.4831|±  |0.0019|

In [9]:
# %run pretrain.py ./10MCurriculum ./10MModelCurriculum 2