In [3]:
import os
import sys
from os.path import expanduser
sys.path.insert(0, expanduser("~/nta/nupic.research/projects/transformers"))

import torch
from experiments import CONFIGS

In [16]:
from pprint import pprint
from hashlib import blake2b

from transformers import (
    HfArgumentParser,
    AutoTokenizer,
)
from datasets import load_dataset, DatasetDict, concatenate_datasets, load_from_disk

from experiments import CONFIGS
from run_args import CustomTrainingArguments, DataTrainingArguments, ModelArguments

from run_utils import preprocess_datasets_mlm, hash_dataset_folder_name


## Load the default arguments for our datasets

In [12]:
# Experiment config that supplies the default arguments.
bert_100k = CONFIGS["bert_100k"]

# One parser over the three argument dataclasses; the parsed result is unpacked
# into one object per dataclass, in the order the classes are listed.
argument_classes = (ModelArguments, DataTrainingArguments, CustomTrainingArguments)
exp_parser = HfArgumentParser(argument_classes)
model_args, data_args, training_args = exp_parser.parse_dict(bert_100k)

Modify the default arguments for the new, smaller dataset

In [19]:
# Overrides that point the data arguments at the custom combined dataset.
# NOTE(review): the name and config are wrapped in one-element tuples --
# presumably the downstream helpers accept several datasets at once; confirm
# against run_utils.
data_arg_overrides = {
    "dataset_name": ("wikipedia_plus_bookcorpus",),
    "dataset_config_name": (None,),
    "max_seq_length": 128,
}
for field_name, field_value in data_arg_overrides.items():
    setattr(data_args, field_name, field_value)

In [14]:
# Pretty-print every field currently set on the data arguments.
pprint(vars(data_args))

{'data_collator': 'DataCollatorForWholeWordMask',
 'dataset_config_name': None,
 'dataset_name': 'wikipedia_plus_bookcorpus',
 'line_by_line': False,
 'max_seq_length': 128,
 'mlm_probability': 0.15,
 'override_finetuning_results': False,
 'overwrite_cache': False,
 'pad_to_max_length': False,
 'preprocessing_num_workers': None,
 'reuse_tokenized_data': True,
 'save_tokenized_data': True,
 'task_name': None,
 'task_names': [],
 'tokenized_data_cache_dir': '/mnt/efs/results/preprocessed-datasets/text',
 'train_file': None,
 'validation_file': None,
 'validation_split_percentage': 5}


## Load a fraction of Wikipedia and a fraction of Book Corpus
These will be used to make a custom small dataset

Load 1% of Wikipedia for training and another 1% for validation

In [28]:
# Hugging Face datasets cache directory; reused by the Book Corpus cell below.
cache_dir = "/mnt/efs/results/cache/huggingface/datasets/"

# First 1% of the Wikipedia train split is used for training ...
wiki_dataset_train = load_dataset(
    "wikipedia", "20200501.en",
    cache_dir=cache_dir,
    split="train[:1%]",
)
# ... and the next 1% (1%-2%) of the same split is held out for validation.
# (Plain string: the previous f-string prefix had no placeholders.)
wiki_dataset_val = load_dataset(
    "wikipedia", "20200501.en",
    cache_dir=cache_dir,
    split="train[1%:2%]",
)

Reusing dataset wikipedia (/mnt/efs/results/cache/huggingface/datasets/wikipedia/20200501.en/1.0.0/4021357e28509391eab2f8300d9b689e7e8f3a877ebb3d354b01577d497ebc63)
Reusing dataset wikipedia (/mnt/efs/results/cache/huggingface/datasets/wikipedia/20200501.en/1.0.0/4021357e28509391eab2f8300d9b689e7e8f3a877ebb3d354b01577d497ebc63)


In [29]:
# Drop the "title" column from both Wikipedia slices, in place, so their
# features can match the Book Corpus slices (asserted further down).
# The membership check keeps this cell safe to re-run: once the column is gone,
# asking to remove it again would otherwise fail.
# NOTE(review): `remove_columns_` is the in-place variant; newer `datasets`
# releases favor the non-mutating `remove_columns` -- confirm against the
# installed version before switching.
for wiki_split in (wiki_dataset_train, wiki_dataset_val):
    if "title" in wiki_split.column_names:
        wiki_split.remove_columns_("title")

In [None]:
# Load 8% of Book Corpus for training and another 8% for validation

In [30]:
# Book Corpus: the first 8% of the train split is used for training and the
# following 8% (8%-16%) for validation. Both bounds of the validation slice
# carry an explicit "%" so it is unambiguously a percentage range, consistent
# with the Wikipedia slices above.
book_dataset_train = load_dataset(
    "bookcorpus", None, cache_dir=cache_dir, split="train[:8%]"
)
book_dataset_val = load_dataset(
    "bookcorpus", None, cache_dir=cache_dir, split="train[8%:16%]"
)

Reusing dataset bookcorpus (/mnt/efs/results/cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/af844be26c089fb64810e9f2cd841954fd8bd596d6ddd26326e4c70e2b8c96fc)
Reusing dataset bookcorpus (/mnt/efs/results/cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/af844be26c089fb64810e9f2cd841954fd8bd596d6ddd26326e4c70e2b8c96fc)


In [31]:
# All four slices must share the same feature type before they are concatenated.
# Neighbouring entries are compared pairwise, in the listed order.
feature_types = [
    split.features.type
    for split in (
        wiki_dataset_train,
        wiki_dataset_val,
        book_dataset_train,
        book_dataset_val,
    )
]
assert all(left == right for left, right in zip(feature_types, feature_types[1:]))


Concatenate the datasets

In [32]:
# Wikipedia slice first, Book Corpus slice second, for each split.
train_datasets = [wiki_dataset_train, book_dataset_train]
validation_datasets = [wiki_dataset_val, book_dataset_val]

# Assemble the combined train/validation splits into a single DatasetDict.
datasets = DatasetDict(
    {
        "train": concatenate_datasets(train_datasets),
        "validation": concatenate_datasets(validation_datasets),
    }
)

In [33]:
# Human-readable provenance for each split, stored on the dataset info.
train_description = """
A combination of Wikipedia plus Book Corpus. This uses the range from 0% to 1% of the train split of wikipedia, and 0% to 8% of the train split Book Corpus.
"""
validation_description = """
A combination of Wikipedia plus Book Corpus. This uses the range from 1% to 2% of the train split of wikipedia, and 8% to 16% of the train split Book Corpus.
"""

datasets["train"].info.description = train_description
datasets["validation"].info.description = validation_description



In [34]:
# Display the combined DatasetDict (splits, column names and row counts).
datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 5981122
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 5981122
    })
})

## Tokenize and save the little dataset
* It will be called `wikipedia_plus_bookcorpus`
* max_seq_length=128

In [41]:
# Folder name derived from the data arguments by the project's hashing helper.
hashed_folder = hash_dataset_folder_name(data_args)

# Absolute location of the tokenized dataset: <cache root>/<hashed folder>.
tokenized_cache_root = os.path.abspath(data_args.tokenized_data_cache_dir)
dataset_path = os.path.join(tokenized_cache_root, str(hashed_folder))
print(f"Tokenized dataset cache folder: {dataset_path}")

Hashing dataset folder name 'wikipedia_plus_bookcorpus_None (max_seq_length=128)' to 'facb56894f4824388c354b52e13d2c8a421ca1f3'
Tokenized dataset cache folder: /mnt/efs/results/preprocessed-datasets/text/facb56894f4824388c354b52e13d2c8a421ca1f3


In [20]:
# Pretty-print every field currently set on the data arguments.
pprint(vars(data_args))

{'data_collator': 'DataCollatorForWholeWordMask',
 'dataset_config_name': (None,),
 'dataset_name': ('wikipedia_plus_bookcorpus',),
 'line_by_line': False,
 'max_seq_length': 128,
 'mlm_probability': 0.15,
 'override_finetuning_results': False,
 'overwrite_cache': False,
 'pad_to_max_length': False,
 'preprocessing_num_workers': None,
 'reuse_tokenized_data': True,
 'save_tokenized_data': True,
 'task_name': None,
 'task_names': [],
 'tokenized_data_cache_dir': '/mnt/efs/results/preprocessed-datasets/text',
 'train_file': None,
 'validation_file': None,
 'validation_split_percentage': 5}


In [17]:

# Columns of the raw training split, and the one that holds the text.
text_column_name = "text"
column_names = datasets["train"].column_names

# Tokenizer options taken from the model arguments.
tokenizer_kwargs = {
    "cache_dir": model_args.cache_dir,
    "use_fast": model_args.use_fast_tokenizer,
    "revision": model_args.model_revision,
    # Pass True only when requested; otherwise pass None.
    "use_auth_token": True if model_args.use_auth_token else None,
}

tokenizer = AutoTokenizer.from_pretrained(
    model_args.tokenizer_name,  # 'bert-base-cased'
    **tokenizer_kwargs,
)
# Show the loaded tokenizer.
tokenizer

NameError: name 'datasets' is not defined

In [47]:
# Tokenize the combined dataset with the project's MLM preprocessing helper.
tokenized_datasets = preprocess_datasets_mlm(
    datasets,
    tokenizer,
    data_args,
    column_names,
    text_column_name,
)

# Persist the tokenized splits under the hashed cache folder.
print(f"Saving tokenized dataset to {dataset_path}")
tokenized_datasets.save_to_disk(dataset_path)

Token indices sequence length is longer than the specified maximum sequence length for this model (768 > 512). Running this sequence through the model will result in indexing errors
Loading cached processed dataset at /mnt/efs/results/cache/huggingface/datasets/wikipedia/20200501.en/1.0.0/4021357e28509391eab2f8300d9b689e7e8f3a877ebb3d354b01577d497ebc63/cache-65fd888ecc1b6bf6.arrow
Loading cached processed dataset at /mnt/efs/results/cache/huggingface/datasets/wikipedia/20200501.en/1.0.0/4021357e28509391eab2f8300d9b689e7e8f3a877ebb3d354b01577d497ebc63/cache-90f2357a182dabe4.arrow


HBox(children=(FloatProgress(value=0.0, max=5982.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5982.0), HTML(value='')))


Saving tokenized dataset to /mnt/efs/results/preprocessed-datasets/text/facb56894f4824388c354b52e13d2c8a421ca1f3


In [50]:
# Optional: uncomment the next line to reload the dataset saved at
# `dataset_path` instead of using the in-memory result of the tokenization cell.
# tokenized_datasets = load_from_disk(dataset_path)
# Display the tokenized DatasetDict (splits, column names and row counts).
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'special_tokens_mask', 'token_type_ids'],
        num_rows: 1132942
    })
    validation: Dataset({
        features: ['attention_mask', 'input_ids', 'special_tokens_mask', 'token_type_ids'],
        num_rows: 1133148
    })
})

## Load Tokenized Dataset

In [21]:
# Recompute the hashed folder name from the data arguments so the saved
# dataset can be located again.
hashed_folder = hash_dataset_folder_name(data_args)

# Absolute location of the tokenized dataset: <cache root>/<hashed folder>.
tokenized_cache_root = os.path.abspath(data_args.tokenized_data_cache_dir)
dataset_path = os.path.join(tokenized_cache_root, str(hashed_folder))
print(f"Tokenized dataset cache folder: {dataset_path}")

Hashing dataset folder name 'wikipedia_plus_bookcorpus_None (max_seq_length=128)' to 'facb56894f4824388c354b52e13d2c8a421ca1f3'
Tokenized dataset cache folder: /mnt/efs/results/preprocessed-datasets/text/facb56894f4824388c354b52e13d2c8a421ca1f3


In [22]:
# Read the tokenized splits back from the hashed cache folder.
loaded_datasets = load_from_disk(dataset_path=dataset_path)

In [23]:
# Display the reloaded DatasetDict (splits, column names and row counts).
loaded_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'special_tokens_mask', 'token_type_ids'],
        num_rows: 1132942
    })
    validation: Dataset({
        features: ['attention_mask', 'input_ids', 'special_tokens_mask', 'token_type_ids'],
        num_rows: 1133148
    })
})