In [43]:
%%writefile util.py
from pathlib import Path
import os
def get_epoch_checkpoints(model_dir):
    """
    List the checkpoint directories found in `model_dir`, ordered by step number.

    Entries matching `checkpoint-*` are ordered by the integer that follows the
    last "-" in their path (so `checkpoint-10` sorts after `checkpoint-2`), and
    the returned paths are rebuilt as `<model_dir>/checkpoint-<id>`.

    Args:
        model_dir: Directory to search for `checkpoint-*` entries.

    Returns:
        List of checkpoint paths (strings) in ascending numeric order; an empty
        list when nothing matches.

    Raises:
        ValueError: If a matching entry's suffix is not an integer.
    """
    checkpoint_ids = sorted(int(str(x).split("-")[-1]) for x in Path(model_dir).glob("checkpoint-*"))

    # Every checkpoint is returned. An earlier variant kept only every 6th one;
    # its unreachable remains (including a return of an undefined name) were removed.
    return [os.path.join(model_dir, "checkpoint-{}".format(c)) for c in checkpoint_ids]

# def get_all_chunks(checkpoint_path,gradient_input_dir, gradients_per_file):
#     return [ os.path.join(gradient_input_dir, checkpoint_path.split("-")[-1] + "_" + str(i) + "_" + str(i + gradients_per_file)) for i in range(0, len(dataset["train"]), args.gradients_per_file)]
def get_epoch(checkpoint_path):
    """
    Return the zero-based rank of a checkpoint among its sibling checkpoints.

    All `checkpoint-*` entries in the parent directory of `checkpoint_path` are
    ordered by their integer suffix; the result is the position of
    `checkpoint_path`'s own suffix in that ordering.

    Args:
        checkpoint_path: Path (str or path-like) of a `checkpoint-<id>` entry.
            A trailing path separator is tolerated.

    Returns:
        Index of the checkpoint in the numerically sorted sibling list.

    Raises:
        ValueError: If a suffix is not an integer, or the checkpoint's id is not
            found among the entries on disk.
    """
    # Strip trailing separators: otherwise dirname() would return the checkpoint
    # itself (no siblings found) and the id would parse as e.g. "10/".
    separators = os.sep + (os.altsep or "")
    checkpoint_path = str(checkpoint_path).rstrip(separators)

    parent_dir = Path(os.path.dirname(checkpoint_path))
    checkpoint_ids = sorted(int(str(x).split("-")[-1]) for x in parent_dir.glob("checkpoint-*"))
    return checkpoint_ids.index(int(checkpoint_path.split("-")[-1]))


import xxhash

# Module-level hasher, reused (and reset) by every get_seed_for_document call.
h = xxhash.xxh64()
def get_seed_for_document(document, epoch):
    """
    Derive an integer seed from a document's contents and an epoch number.

    The document's buffer and then `bytes(epoch)` are fed to the shared xxh64
    hasher `h`; the integer digest is returned and the hasher is reset so the
    next call starts from a clean state.

    Args:
        document: Object exposing `.cpu().numpy()` (called here with a row of a
            token-id tensor).
        epoch: Epoch number. Note that `bytes(n)` for an int `n` is `n` zero
            bytes, so the epoch is encoded by length; a negative or non-integer
            epoch makes `bytes()` raise.

    Returns:
        The hasher's integer digest.
    """
    document_buffer = document.cpu().numpy()
    h.update(document_buffer)

    epoch_marker = bytes(epoch)
    h.update(epoch_marker)

    digest = h.intdigest()
    h.reset()
    return digest

from transformers import DataCollatorForLanguageModeling
class DeterministicDataCollatorForLanguageModeling (DataCollatorForLanguageModeling): 
    """
    Masked-language-modeling collator whose random masking is re-seeded per example.

    `set_epoch` must be called before masking: `torch_mask_tokens` reads
    `self.epoch`, and this class only assigns that attribute in `set_epoch`.
    """
    def torch_mask_tokens(self, inputs, special_tokens_mask = None):
        """
        Adapted to make masking deterministic based on (text, epoch).
        Just wrapped the original implementation in a for loop where a seed based on (labels, epoch) is set for each individual example before masking.

        Args:
            inputs: Tensor of token ids, indexed by example along dim 0
                (expected to be 2-D: each row is passed to the tokenizer as a
                list of ids). Modified in place.
            special_tokens_mask: Optional tensor marking special tokens, sliced
                per example like `inputs`; computed from the tokenizer when None.

        Returns:
            Tuple `(inputs, labels)`: `inputs` with selected positions replaced
            by the mask token or a random token id, and `labels` (a clone of the
            original ids) with -100 at every position that was not selected.
        """

        labels = inputs.clone()


        import torch


        if special_tokens_mask is None:
            special_tokens_mask = [
                self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
            ]
            special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
        else:
            special_tokens_mask = special_tokens_mask.bool()

        for i in range(0, labels.shape[0]):
            # Row i of `labels` is still untouched here (only earlier rows have been
            # overwritten), so the seed is a function of the row's original ids and the epoch.
            # torch.manual_seed re-seeds torch's global RNG, so later random draws in
            # this process are affected as well.
            # NOTE(review): the hashed row is whatever the collator received — if rows
            # are padded to the batch's longest example, the seed (and the shape of the
            # draws below) depends on the padding length. Confirm batches are composed
            # identically wherever masks must match.
            torch.manual_seed(get_seed_for_document(labels[i], self.epoch))

            # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
            # `[i:i+1]` slices keep the leading dimension and are views, so the
            # masked assignments below write through to `labels` / `inputs`.

            probability_matrix = torch.full(labels[i:i+1].shape, self.mlm_probability)


           
            # Special tokens are never selected for masking.
            probability_matrix.masked_fill_(special_tokens_mask[i:i+1], value=0.0)
            masked_indices = torch.bernoulli(probability_matrix).bool()
            labels[i:i+1][~masked_indices] = -100  # We only compute loss on masked tokens

            # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
            indices_replaced = torch.bernoulli(torch.full(labels[i:i+1].shape, 0.8)).bool() & masked_indices
            inputs[i:i+1][indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

            # 10% of the time, we replace masked input tokens with random word
            # (half of the 20% of selected positions that were not replaced by the mask token)
            indices_random = torch.bernoulli(torch.full(labels[i:i+1].shape, 0.5)).bool() & masked_indices & ~indices_replaced
            random_words = torch.randint(len(self.tokenizer), labels[i:i+1].shape, dtype=torch.long)
            inputs[i:i+1][indices_random] = random_words[indices_random]

        ######################
        
        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
        return inputs, labels
    def set_epoch(self, epoch):
        """Record the epoch number that is mixed into each example's masking seed."""
        self.epoch = epoch

Overwriting util.py


In [4]:
# Batch-size arithmetic for an 8-GPU setup: the product below is the number of
# examples per optimizer update (`update_freq` is presumably the number of
# gradient-accumulation steps — confirm).
NUM_GPUs = 8
per_device_batch_size = 16
update_freq = 16
NUM_GPUs*per_device_batch_size*update_freq

2048

In [68]:
# Same arithmetic for a 2-GPU setup; these values match the 2048 = 16*64*2 noted
# next to the learning rate in pretrain.py. (`update_freq` is presumably the
# number of gradient-accumulation steps — confirm.)
NUM_GPUs = 2
per_device_batch_size = 64
update_freq = 16
NUM_GPUs*per_device_batch_size*update_freq

2048

In [62]:
%%writefile pretrain.py

import argparse
import os

# Command-line interface. The only required argument is the path to a JSON
# config file; this script reads the keys "curriculum_path", "dataset_folder"
# and "eval_dataset_folder" from it.
parser = argparse.ArgumentParser("pretraining")
parser.add_argument("config", help="Path to a config.json file")
parser.add_argument("--per_device_train_batch_size", help="per_device_train_batch_size", type=int, nargs="?", const=1, default=64) # TODO
parser.add_argument("--cuda_visible_devices", help="Comma seperated GPU ids to use", nargs="?", const=1, default="0,1")

args = parser.parse_args()

import json
with open(args.config) as f:
    config = json.load(f)
print(config)

# The model directory is derived from the curriculum file's base name.
config["model_path"] = os.path.join("./models/",os.path.basename(config["curriculum_path"]))   

# Environment values must be strings: a bare `--cuda_visible_devices` (no value)
# yields the integer const 1, which os.environ would reject with a TypeError.
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.cuda_visible_devices)
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["WANDB_PROJECT"]="babylm_pretraining"

# exist_ok avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(config["model_path"], exist_ok=True)


from pathlib import Path

from tokenizers import ByteLevelBPETokenizer



from torch.utils.data import Dataset
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
import torch
from random import randrange
import cloudpickle

from datasets import load_dataset
import random







import util




from torch.utils.data import DataLoader

from transformers.trainer_utils import (

    seed_worker,

)

from transformers import Trainer, TrainingArguments
#https://discuss.huggingface.co/t/non-shuffle-training/6986/3
from torch.utils.data import SequentialSampler
class CurriculumTrainer(Trainer):
    """Trainer whose training dataloader follows a per-epoch curriculum order."""

    def get_train_dataloader(self) -> DataLoader:
        """
        Build the training dataloader as an EpochVariableDataLoader.

        The loader is returned as constructed here (skips accelerator!). For
        non-iterable datasets an OrderedSampler is attached, starting at the
        trainer state's current epoch (0 when that is unset).
        """
        collator = self.data_collator
        dataset = self._remove_unused_columns(self.train_dataset, description="training")

        loader_kwargs = dict(
            batch_size=self._train_batch_size,
            collate_fn=collator,
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory,
            persistent_workers=self.args.dataloader_persistent_workers,
        )

        if not isinstance(dataset, torch.utils.data.IterableDataset):
            start_epoch = 0 if self.state.epoch is None else self.state.epoch
            loader_kwargs.update(
                sampler=OrderedSampler(self.train_dataset, start_epoch),
                drop_last=self.args.dataloader_drop_last,
                worker_init_fn=seed_worker,
                prefetch_factor=self.args.dataloader_prefetch_factor,
            )

        # the Trainer class calls set_epoch on the dataloader, but we also need it in the data_collator
        return EpochVariableDataLoader(dataset, collator.set_epoch, **loader_kwargs)
        
class EpochVariableDataLoader(DataLoader):
    """DataLoader that forwards epoch changes to its sampler and to a callback."""

    def __init__(self, train_dataset, passtrough_function, **dataloader_params):
        """
        Args:
            train_dataset: Dataset handed to the underlying DataLoader.
            passtrough_function: Callable invoked with the epoch on every `set_epoch`.
            **dataloader_params: Remaining keyword arguments for DataLoader.
        """
        self.passtrough_function = passtrough_function
        DataLoader.__init__(self, train_dataset, **dataloader_params)

    def set_epoch(self, epoch):
        """Store `epoch` on the sampler, then pass it to the callback."""
        active_sampler = self.sampler
        active_sampler.epoch = epoch

        notify = self.passtrough_function
        notify(epoch)

class OrderedSampler(SequentialSampler):
    """
    Sampler that yields dataset indices in a precomputed order for the current epoch.

    `curriculum[e]` holds the index order for epoch `e` (it must support `len()`
    and `.tolist()`). The `epoch` attribute selects the entry and may be
    reassigned between epochs.
    """

    def __init__(self, data_source, epoch, curriculum=None):
        """
        Args:
            data_source: The dataset being sampled (stored, not otherwise used here).
            epoch: Initial epoch; an integral float such as 0.0 is accepted.
            curriculum: Optional per-epoch index orders. When omitted, it is
                loaded from `config["curriculum_path"]` (module-level config).
        """
        self.data_source = data_source
        self.epoch = epoch
        if curriculum is None:
            curriculum = torch.load(config["curriculum_path"], weights_only=True)
        self.curriculum = curriculum

    def _current_order(self):
        # The epoch may arrive as a float (the trainer state stores it that way);
        # only integers are valid indices.
        return self.curriculum[int(self.epoch)]

    def __iter__(self):
        print("getting new iterator in epoch", self.epoch,flush=True)
        return iter(self._current_order().tolist())

    def __len__(self):
        # Report the length of the current epoch's order rather than the
        # inherited len(data_source): the two differ whenever an epoch's
        # curriculum does not cover the dataset exactly once, and the dataloader
        # derives its number of batches from this value.
        return len(self._current_order())
    



from tokenizers import ByteLevelBPETokenizer


import datasets

from transformers import RobertaTokenizerFast
# Load the tokenizer stored alongside the model if there is one; otherwise train
# a byte-level BPE tokenizer on the training texts, save it into the model
# directory and load it from there.
tokenizer = None
try:
    tokenizer = RobertaTokenizerFast.from_pretrained(config["model_path"], max_len=512)
except Exception:
    # Catch Exception rather than using a bare `except:` so that Ctrl-C or
    # SystemExit during loading does not fall through into retraining, which
    # overwrites the tokenizer files in the model directory.

    dataset_tokenizer = datasets.load_from_disk(config["dataset_folder"]) # without set_transform
    # https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#train-tokenizer
    tokenizer = ByteLevelBPETokenizer()

    def batch_iterator(batch_size=1000):
        """Yield the dataset's "text" column in slices of `batch_size` rows."""
        for i in range(0, len(dataset_tokenizer), batch_size):
            yield dataset_tokenizer[i: i + batch_size]["text"]

    # Customized training
    tokenizer.train_from_iterator(batch_iterator(), vocab_size=52_000, min_frequency=2, special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ])

    # Save files to disk
    tokenizer.save_model(config["model_path"])
    tokenizer = RobertaTokenizerFast.from_pretrained(config["model_path"], max_len=512)

# We still use dynamic masking (a different mask at each epoch) as in the original RoBERTa paper, but do so deterministically.
# We do not use sentence packing, as that would defeat the purpose of applying an influence estimation method on a per-document basis.
# We compensate by increasing the batch size.
data_collator = util.DeterministicDataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)


# Tokenizes one example's "text" field, truncating to 512 tokens and returning
# the special-tokens mask along with the ids.
t = lambda x : tokenizer(x["text"], return_special_tokens_mask=True, truncation=True, max_length=512)


def _load_tokenized_dataset(folder):
    """Load a dataset saved on disk, tokenize it with `t`, drop the raw "text" column and switch it to torch format."""
    tokenized = datasets.load_from_disk(folder)
    tokenized = tokenized.map(t)
    tokenized = tokenized.remove_columns(["text"])
    tokenized.set_format("torch")
    return tokenized


# Training and evaluation data go through the same pipeline.
dataset = _load_tokenized_dataset(config["dataset_folder"])
print(dataset[0])



# from huggingface_hub import login
# login()
dataset_eval = _load_tokenized_dataset(config["eval_dataset_folder"])

# The collator reads its epoch when masking; give it a value before first use.
data_collator.set_epoch(0)
# # https://github.com/ayoolaolafenwa/TrainNLP
# def insert_random_mask(batch):
#     features = [dict(zip(batch, t)) for t in zip(*batch.values())]
#     masked_inputs = data_collator(features)
#     return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}


# dataset_eval = dataset_eval.map(insert_random_mask,batched=True,)

# dataset_eval = dataset_eval.rename_columns({"masked_input_ids": "input_ids",
# "masked_attention_mask": "attention_mask","masked_labels": "labels"})


from transformers import RobertaConfig
# Bare expression left over from interactive use; it has no effect when this file runs as a script.
tokenizer
# Model architecture. The vocabulary size is taken from the tokenizer loaded or trained in this script.
roberta_config = RobertaConfig(
    vocab_size=tokenizer.vocab_size,
    # NOTE(review): 514 is presumably the 512-token maximum length plus RoBERTa's
    # two reserved position slots — confirm.
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=12,
    type_vocab_size=1,
    layer_norm_eps=1e-05,
    attention_probs_dropout_prob = 0.1,
    # bos_token_id = 0,
    # "eos_token_id": 2,
    hidden_act = "gelu",
    hidden_dropout_prob=0.1,
    hidden_size =768,
    initializer_range=0.02,
    intermediate_size=3072,
)

# The number of training epochs is the number of entries in the curriculum file
# (the same file OrderedSampler indexes by epoch).
EPOCHS = len(torch.load(config["curriculum_path"], weights_only=True))
print("Detected {} epochs".format(EPOCHS))


#steps_per_epoch = ((len(dataset) / (torch.cuda.device_count()*args.per_device_train_batch_size)) // args.checkpoints_per_epoch ), # roughly N times per epoch
# Training hyperparameters. Checkpoints are saved and evaluation is run once per epoch.
training_args = TrainingArguments(
    output_dir=config["model_path"],
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS, # one epoch per curriculum entry (an earlier value was 1000)
    # NOTE(review): hard-coded; the --per_device_train_batch_size CLI argument is parsed but not used here.
    per_device_train_batch_size=64,#args.per_device_train_batch_size,
    eval_strategy="epoch",
    dataloader_num_workers=10,
    
   # eval_steps=2,#1000,
    save_strategy="epoch",
    fp16=False, # was True https://github.com/facebookresearch/fairseq/blob/main/examples/roberta/config/pretraining/base.yaml
    logging_steps=50,
    seed=42,
    prediction_loss_only=False,
    gradient_accumulation_steps=16, 
    remove_unused_columns=True,


    

    # Optimizer settings follow
    # https://github.com/facebookresearch/fairseq/blob/main/examples/roberta/README.pretraining.md
    learning_rate=5e-4, # effective batch size is 2048=16*64*2 (accumulation steps * per-device batch * GPUs)
    adam_beta1=0.9,
    adam_beta2=0.98,
    adam_epsilon=1e-06,
    weight_decay=0.01,
    lr_scheduler_type="polynomial",
    warmup_steps=10000, 
    report_to="wandb",
    #eval_on_start=True,
     label_names=["labels"],
     # compute_metrics is called once per evaluation batch (with compute_result
     # set on the final one) instead of once on all predictions.
     batch_eval_metrics=True,
    # eval_accumulation_steps=5,
     per_device_eval_batch_size=64

    
    
)


from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=roberta_config)


import evaluate

from datasets import load_metric
from collections import defaultdict
import numpy as np

from datasets import load_metric
# Legacy accumulators: kept so these module-level names still exist, but no
# longer populated (storing every batch's full logits is not needed).
intermediate_logits = [] 
intermediate_labels = []
# Running totals over the evaluation batches seen since the last reported result.
_eval_totals = {"loss_sum": 0.0, "correct": 0, "masked": 0}

def compute_metrics(eval_pred, compute_result=True):
    """
    Batch-wise masked-LM evaluation metrics.

    Each call folds one batch into running totals (summed cross-entropy, number
    of correct predictions, number of scored positions). Only positions whose
    label is not -100 are scored. When `compute_result` is true, the totals are
    turned into metrics and reset for the next evaluation run.

    Batches may differ in batch size and sequence length, since only per-batch
    sums are kept.

    Args:
        eval_pred: Pair `(logits, labels)` of tensors; `logits` has the
            vocabulary on its last dimension and `labels` matches the leading
            dimensions of `logits`.
        compute_result: True on the final batch of an evaluation run.

    Returns:
        `{}` while accumulating; otherwise a dict with float values for
        "accuracy", "mlm_perplexity" and "mlm_loss" (NaN if no position was scored).
    """
    import math

    logits, labels = eval_pred
    vocab_size = logits.shape[-1]
    flat_logits = logits.detach().float().reshape(-1, vocab_size)
    flat_labels = labels.reshape(-1).to(torch.int64)
    # -100 marks positions excluded from the loss; token id 0 is a legitimate
    # label, so test against the sentinel rather than `> 0`.
    scored = flat_labels != -100

    # Class indices with ignore_index give the summed loss over scored positions
    # directly, with the class dimension where cross_entropy expects it.
    batch_loss = torch.nn.functional.cross_entropy(
        flat_logits, flat_labels, ignore_index=-100, reduction="sum"
    )
    batch_correct = ((flat_logits.argmax(dim=-1) == flat_labels) & scored).sum()

    _eval_totals["loss_sum"] += batch_loss.item()
    _eval_totals["correct"] += int(batch_correct.item())
    _eval_totals["masked"] += int(scored.sum().item())

    if not compute_result:
        return {}

    loss_sum = _eval_totals["loss_sum"]
    correct = _eval_totals["correct"]
    masked = _eval_totals["masked"]
    # Reset before returning so the next evaluation run starts from zero.
    _eval_totals["loss_sum"] = 0.0
    _eval_totals["correct"] = 0
    _eval_totals["masked"] = 0

    if masked == 0:
        nan = float("nan")
        return {"accuracy": nan, "mlm_perplexity": nan, "mlm_loss": nan}

    mean_loss = loss_sum / masked
    return {
        "accuracy": correct / masked,
        "mlm_perplexity": math.exp(mean_loss),
        "mlm_loss": mean_loss,
    }
    #metrics = jax.lax.psum(metrics, axis_name="batch")

# Wire the model, arguments, deterministic collator, tokenized datasets and the
# batch-wise metric function into the curriculum-aware trainer.
trainer = CurriculumTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    eval_dataset=dataset_eval,
    compute_metrics=compute_metrics,
    
   
    )

# print("DO NOT TRUST TQDM's time estimates: some modes have varying numbers of steps per epoch")
# Train, then write the final model into the same directory as the checkpoints and tokenizer.
trainer.train()  
trainer.save_model(config["model_path"])




Overwriting pretrain.py


In [52]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu


Looking in indexes: https://download.pytorch.org/whl/cpu
Note: you may need to restart the kernel to use updated packages.


In [55]:
python -c "import torch; import intel_extension_for_pytorch as ipex; print(torch.__version__); print(ipex.__version__);"

SyntaxError: invalid syntax (2608791077.py, line 1)

In [None]:
%run pretrain.py curricula/datasets/curriculum_10M_2024 ./curriculum_10M_2024_random ./curricula/curriculum_10M_2024_random curricula/datasets/curriculum_10M_2024_eval

2024-10-25 07:48:49.555328: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-25 07:48:49.572431: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-25 07:48:49.593136: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-25 07:48:49.599024: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-25 07:48:49.615469: I tensorflow/core/platform/cpu_feature_guar

In [4]:
# Record the installed transformers version for reference.
import transformers
print(transformers.__version__)

4.46.0


In [5]:
# Sanity check: an `[i:i+1]` slice returns a length-1 array rather than a scalar
# (the same slicing pattern is used per row in the collator in util.py).
import numpy as np
np.array([0,1,2])[0:0+1]

array([0])

In [3]:


# clustering = None
# with open('brown_clustering', "rb") as handle:
#     clustering = cloudpickle.load(handle)

# class BrownDataset(Dataset):
#     def rewrite(self, x):
#         result = []
#         for doc in x:
#            # print("doc", doc, flush=True)
#             tokenized = tokenizer.tokenize(doc)
#             if len(tokenized) == 0:
#                 result.append(doc)
#                 continue
#             #print("tokenized", tokenized, flush=True)
#             IDX = randrange(len(tokenized))

#             r = []
#             for i, word in enumerate(tokenized):
#                 replacement = clustering.get_similar(word)
                
#                 if i == IDX and len(replacement):
#                     r.append(random.choice(replacement)[0])
#                 else:
#                     r.append(word)
#             print(doc,tokenizer.convert_tokens_to_string(r), flush=True)
#             result.append(tokenizer.convert_tokens_to_string(r))
            

#         return result
       
        

#     def __init__(self, data_dir):
#         self.size = float('inf')
#         self.data = load_dataset("text", data_dir=args.dataset_folder)
#         self.data.set_transform(lambda x : tokenizer(self.rewrite(x["text"]), return_special_tokens_mask=True, truncation=True, max_length=512))
#     def __len__(self):
#         return float('inf')
#         # TODO argue that an infinite training dataset is cognitively plausible 

#     def __getitem__(self, idx):
#      #   print(self.transform(self.data[idx]))
#      #   print(self.data[idx], idx)
#         return self.data[idx]#tokenizer(, return_special_tokens_mask=True, truncation=True, max_length=512)

# if args.mode == "brown":
#     dataset = BrownDataset(args.dataset_folder)
#     print(dataset["train"][0])
#     exit
# else:

In [7]:
%run pretrain.py ./train_test ./Test curriculum test_random_curriculum

FileNotFoundError: Directory ./train_test is neither a `Dataset` directory nor a `DatasetDict` directory.

In [8]:
# %run pretrain.py ./train_10M ./10MModel 10 shuffle

|     Groups     |Version|Filter|n-shot|Metric|Value |   |Stderr|
|----------------|-------|------|-----:|------|-----:|---|-----:|
|blimp_supplement|N/A    |none  |     0|acc   |0.4314|±  |0.0069|
|blimp_filtered  |N/A    |none  |     0|acc   |0.4831|±  |0.0019|

In [9]:
# %run pretrain.py ./10MCurriculum ./10MModelCurriculum 2

[1;34mwandb[0m: 🚀 View run [33m./asdfa13sdafra[0m at: [34mhttps://wandb.ai/loriss/babylm_pretraining/runs/t677v40s[0m
[1;34mwandb[0m: Find logs at: [1;35mwandb/run-20241025_073016-t677v40s/logs[0m
