In [11]:
%%writefile pretrain.py

import argparse
import os

# Command-line interface of the pretraining script.
# Positional arguments: dataset folder, output directory, epoch count, training mode.
parser = argparse.ArgumentParser("pretraining")
parser.add_argument("dataset_folder", help="Path to a dataset folder of .train files that can be read by calling load_dataset('text', <path>)")
parser.add_argument("model_output_dir", help="Where the model and checkpoints should be stored")
parser.add_argument("epochs", help="Number of epochs", type=int)
# Restricting the choices makes a mistyped mode fail here with a clear message
# instead of much later, when no trainer has been constructed.
parser.add_argument(
    "mode",
    choices=("curriculum", "shuffle", "brown"),
    help="Set to 'curriculum' for curriculum training, 'shuffle' for default Trainer behaviour, "
         "'brown' for training on Brown-cluster rewritten text",
)

parser.add_argument("--per_device_train_batch_size", help="per_device_train_batch_size", type=int, nargs="?", const=1, default=8)
parser.add_argument("--checkpoints_per_epoch", help="Checkpoints to store per epoch", type=int, nargs="?", const=1, default=3)
# `const` must be a string: argparse does not run it through a type conversion and
# os.environ only accepts str values (the previous int const raised TypeError).
parser.add_argument("--cuda_visible_devices", help="Comma separated GPU ids to use", nargs="?", const="1", default="0,1")

args = parser.parse_args()

# Set before torch is imported further down so the device mask is in place
# when CUDA gets initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_visible_devices


# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(args.model_output_dir, exist_ok=True)


from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

# paths = [str(x) for x in Path(args.dataset_folder).glob("**/*.train")]

# # Initialize a tokenizer
# tokenizer = ByteLevelBPETokenizer()

# # Customize training
# tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
#     "<s>",
#     "<pad>",
#     "</s>",
#     "<unk>",
#     "<mask>",
# ])

# # Save files to disk
# tokenizer.save_model(args.model_output_dir)



from transformers import RobertaConfig

# Architecture of the model that is trained from scratch: 6 hidden layers,
# 12 attention heads, a 52k-entry vocabulary and a single token type.
config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)


from transformers import RobertaTokenizerFast

# The tokenizer is loaded from model_output_dir, so its files must already be
# there (the tokenizer-training code earlier in this script is commented out).
# `model_max_length` is the documented name of the legacy `max_len` keyword.
tokenizer = RobertaTokenizerFast.from_pretrained(args.model_output_dir, model_max_length=512)


from transformers import RobertaForMaskedLM

# Randomly initialised masked-language model built from the config above
# (no pretrained weights are loaded).
model = RobertaForMaskedLM(config=config)




from torch.utils.data import Dataset
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
import torch
from random import randrange
import cloudpickle

from datasets import load_dataset
import random
# Brown clustering object used by BrownDataset.rewrite; stays None in the other
# modes, which never touch it, so they no longer need the file to be present.
clustering = None
if args.mode == "brown":
    # NOTE(security): unpickling executes arbitrary code embedded in the file.
    # Only load a 'brown_clustering' file that you produced yourself.
    with open('brown_clustering', "rb") as handle:
        clustering = cloudpickle.load(handle)

class BrownDataset(Dataset):
    """Text dataset whose documents are randomly rewritten on every access.

    Wraps the object returned by ``load_dataset("text", ...)`` and installs an
    on-the-fly transform: each document has one randomly chosen token swapped
    for an entry returned by ``clustering.get_similar`` before it is tokenized
    for the model. Indexing is forwarded to the wrapped object, so
    ``dataset["train"]`` yields the transformed training split.

    Relies on the module-level ``tokenizer`` and ``clustering`` objects.
    """

    def rewrite(self, x):
        """Return a copy of the documents in ``x`` with one token replaced each.

        Args:
            x: iterable of document strings.

        Returns:
            A list with one string per input document. A document that
            tokenizes to nothing is passed through unchanged; a document whose
            chosen token has no replacement candidates is re-assembled from
            its original tokens.
        """
        result = []
        for doc in x:
            tokenized = tokenizer.tokenize(doc)
            if len(tokenized) == 0:
                result.append(doc)
                continue
            IDX = randrange(len(tokenized))

            # Only the chosen position can be replaced, so the clustering is
            # queried for that single token rather than for every token.
            replacement = clustering.get_similar(tokenized[IDX])

            r = list(tokenized)
            if len(replacement):
                # Each candidate is indexed with [0] to obtain the token
                # (presumably (token, score) pairs -- TODO confirm).
                r[IDX] = random.choice(replacement)[0]
            result.append(tokenizer.convert_tokens_to_string(r))

        return result

    def __init__(self, data_dir):
        """Load the text files under ``data_dir`` and attach the rewrite transform."""
        # Nominal size kept for backwards compatibility; nothing in this file reads it.
        self.size = float('inf')
        self.data = load_dataset("text", data_dir=data_dir)
        self.data.set_transform(lambda x : tokenizer(self.rewrite(x["text"]), return_special_tokens_mask=True, truncation=True, max_length=512))

    def __len__(self):
        # __len__ must return an int; returning float('inf') makes len() raise
        # TypeError. Report the number of stored training documents instead.
        # TODO argue that an infinite training dataset is cognitively plausible
        return len(self.data["train"])

    def __getitem__(self, idx):
        # Forwarded to the wrapped dataset object, e.g. idx == "train".
        return self.data[idx]

# Build the dataset for the selected mode. In both branches `dataset["train"]`
# is the training split with an on-the-fly tokenization transform attached.
if args.mode == "brown":
    # Documents are rewritten via Brown clusters before tokenization.
    dataset = BrownDataset(args.dataset_folder)
    # Show one transformed sample as a sanity check.
    print(dataset["train"][0])
    # (The bare `exit` that used to follow was a no-op expression, not a call,
    # and has been removed; the script continues on to training as before.)
else:
    dataset = load_dataset("text", data_dir=args.dataset_folder)
    # Tokenize lazily on access; the special-tokens mask is requested alongside.
    dataset.set_transform(lambda x : tokenizer(x["text"], return_special_tokens_mask=True, truncation=True, max_length=512))



from transformers import DataCollatorForLanguageModeling

# Masked-language-modelling collator: masks 15% of the tokens of each batch.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)


from transformers import Trainer, TrainingArguments

# Checkpoint interval, "roughly N times per epoch".
# - device_count() is 0 on a CPU-only machine, which used to raise
#   ZeroDivisionError; treat that case as a single device.
# - the interval is forced to be an int >= 1 so that tiny datasets (or a
#   checkpoints_per_epoch of 0) do not produce save_steps == 0.
num_devices = max(torch.cuda.device_count(), 1)
batches_per_epoch = len(dataset["train"]) / (num_devices * args.per_device_train_batch_size)
save_every_n_steps = max(int(batches_per_epoch // max(args.checkpoints_per_epoch, 1)), 1)

training_args = TrainingArguments(
    output_dir=args.model_output_dir,
    overwrite_output_dir=True,
    num_train_epochs=args.epochs,
    per_device_train_batch_size=args.per_device_train_batch_size,
    save_strategy="steps",
    save_steps=save_every_n_steps, # roughly N times per epoch
    seed=42,
    prediction_loss_only=True,
    # The datasets are tokenized lazily via set_transform, so the raw "text"
    # column must not be stripped before the transform runs.
    remove_unused_columns=False,
)
from torch.utils.data import DataLoader

from transformers.trainer_utils import (

    seed_worker,

)
#https://discuss.huggingface.co/t/non-shuffle-training/6986/3
from torch.utils.data import SequentialSampler
class CurriculumTrainer(Trainer):
    """Trainer whose training dataloader uses ``OrderedSampler`` instead of the default sampler."""

    def get_train_dataloader(self) -> DataLoader:
        """
        Build the training dataloader as an EpochVariableDataLoader.

        Adapted to use EpochVariableDataLoader (skips accelerator!): the loader
        is returned directly and is not passed through the accelerator.

        Returns:
            An EpochVariableDataLoader over the training dataset. For map-style
            datasets it is driven by an OrderedSampler.

        Raises:
            ValueError: if the trainer was created without a training dataset.
        """
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        data_collator = self.data_collator
        train_dataset = self._remove_unused_columns(self.train_dataset, description="training")

        dataloader_params = {
            "batch_size": self._train_batch_size,
            "collate_fn": data_collator,
            "num_workers": self.args.dataloader_num_workers,
            "pin_memory": self.args.dataloader_pin_memory,
            "persistent_workers": self.args.dataloader_persistent_workers,
        }

        # Iterable datasets define their own order and cannot take a sampler.
        if not isinstance(train_dataset, torch.utils.data.IterableDataset):
            start_epoch = self.state.epoch if self.state.epoch is not None else 0
            # Build the sampler from the same dataset object the loader iterates.
            dataloader_params["sampler"] = OrderedSampler(train_dataset, start_epoch)
            dataloader_params["drop_last"] = self.args.dataloader_drop_last
            dataloader_params["worker_init_fn"] = seed_worker
            dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor
        return EpochVariableDataLoader(train_dataset, **dataloader_params)
        
class EpochVariableDataLoader(DataLoader):
    """DataLoader exposing ``set_epoch``, which records the epoch on its sampler."""

    def set_epoch(self, epoch):
        """Store ``epoch`` as the ``epoch`` attribute of this loader's sampler."""
        active_sampler = self.sampler
        active_sampler.epoch = epoch

class OrderedSampler(SequentialSampler):
    """Sampler that visits the dataset in its stored order on every epoch.

    The previous implementation returned ``torch.randperm`` indices, which
    shuffled the data and made 'curriculum' mode behave like 'shuffle' mode.

    Attributes:
        data_source: the dataset being sampled.
        epoch: current epoch; updated from outside via
            ``EpochVariableDataLoader.set_epoch``. Not used for ordering yet.
    """

    def __init__(self, data_source, epoch):
        super().__init__(data_source)
        self.data_source = data_source
        self.epoch = epoch

    def __iter__(self):
        # Called once per pass over the data; the message makes that visible.
        print("getting new iterator", flush=True)
        return iter(range(len(self.data_source)))
    


class EpochVariableSampler(torch.utils.data.Sampler):
    """Sampler that yields a precomputed index order and carries an ``epoch`` attribute.

    This class was referenced by BrownTrainer but never defined, so 'brown'
    mode failed with NameError. The constructor signature follows that call
    site: (dataset, epoch, list of indices).
    NOTE(review): the order is fixed at construction time, as the call site
    implies -- confirm whether a fresh permutation per epoch was intended.
    """

    def __init__(self, data_source, epoch, order):
        self.data_source = data_source
        self.epoch = epoch
        self.order = order

    def __iter__(self):
        return iter(self.order)

    def __len__(self):
        return len(self.order)


class BrownTrainer(Trainer):
    """Trainer that samples the training set through an EpochVariableSampler."""

    def _get_train_sampler(self, train_dataset=None):
        """Return a sampler over a random permutation of the training set.

        Args:
            train_dataset: optional dataset to sample from; newer Trainer
                versions pass it in, older ones call this without arguments.
                Falls back to ``self.train_dataset``.
        """
        source = train_dataset if train_dataset is not None else self.train_dataset
        return EpochVariableSampler(source, self.state.epoch, torch.randperm(len(source)).tolist())
        

# Banner and Trainer class per training mode. The three trainers are built
# with identical arguments and differ only in how they order the training data.
TRAINERS_BY_MODE = {
    "shuffle": ("Random order!", Trainer),  # shuffles the data at each epoch by default!
    "curriculum": ("Curriculum!", CurriculumTrainer),
    "brown": ("Brown!", BrownTrainer),
}

if args.mode not in TRAINERS_BY_MODE:
    # Previously an unknown mode left `trainer` as None and failed later with
    # an AttributeError on trainer.train().
    raise ValueError(
        f"Unknown mode {args.mode!r}; expected one of {sorted(TRAINERS_BY_MODE)}"
    )

banner, trainer_class = TRAINERS_BY_MODE[args.mode]
print(banner)
trainer = trainer_class(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
)

trainer.train()
# Final weights go to the same directory that holds the checkpoints.
trainer.save_model(args.model_output_dir)




Overwriting pretrain.py


In [12]:
%run pretrain.py ./train_test ./10MModelBrown 2 curriculum



Curriculum!
aaaaaaaaaaaaa 0
getting new iterator




Step,Training Loss




aaaaaaaaaaaaa 1
getting new iterator




In [3]:
# %run pretrain.py ./train_10M ./10MModel 10 shuffle

|     Groups     |Version|Filter|n-shot|Metric|Value |   |Stderr|
|----------------|-------|------|-----:|------|-----:|---|-----:|
|blimp_supplement|N/A    |none  |     0|acc   |0.4314|±  |0.0069|
|blimp_filtered  |N/A    |none  |     0|acc   |0.4831|±  |0.0019|

In [4]:
# %run pretrain.py ./10MCurriculum ./10MModelCurriculum 2 curriculum