In [2]:
from transformers import (AutoTokenizer,
                          AutoModelForMaskedLM,
                          DataCollatorForLanguageModeling,
                          TrainingArguments,
                          Trainer,
                          pipeline)

from datasets import load_dataset
from huggingface_hub import notebook_login

In [None]:
# Authenticate with the Hugging Face Hub through the interactive prompt.
# Never write an access token into the notebook source. A token that has ever
# been saved in a cell like this one must be treated as leaked and revoked.
notebook_login()

In [None]:
# Download the `marticampgin/ietf_texts` dataset from the Hugging Face Hub.
# Later cells read its "train" split and its "text" column.
dataset = load_dataset('marticampgin/ietf_texts')

In [49]:
# Load the pretrained roberta-base tokenizer and the matching masked-LM model.
# Both objects are modified by later cells (extra tokens, resized embeddings,
# fine-tuning).
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForMaskedLM.from_pretrained("roberta-base")

In [None]:
# Prompts used to probe the model; each one contains a single <mask> slot.
texts = [
    'i <mask> adoption of this draft',
    'develop <mask> protocols',
    'dear working <mask>',
    'i have updated the <mask>',
]

# Baseline: a fill-mask pipeline built from the stock roberta-base checkpoint,
# i.e. before any fine-tuning.
mask_filler_no_ft = pipeline(task='fill-mask', model='roberta-base')

# Display the three highest-scoring fillers for every prompt.
mask_filler_no_ft(texts, top_k=3)

In [50]:
# Add domain-specific tokens to the tokenizer, then resize the model's token
# embedding matrix to match the new tokenizer length.
# NOTE(review): `top_800_tokens` is not defined in any cell visible here; it
# must come from another cell or from leftover kernel state. Confirm it is
# available before relying on Restart & Run All.
_ = tokenizer.add_tokens(top_800_tokens)
model.resize_token_embeddings(len(tokenizer))

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 51065. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Embedding(51065, 768)

In [10]:
# Inspect the first raw training record.
dataset['train'][0]

{'text': 'the usage of the term "path" is actually misleading, at least i see that i am misusing it. we can only consider pairs of local ip address and remote ip address, not all the path. when related to section [link], i think that it should apply to the source-destination pair rather than the destination only, and this is because sctp cannot assume that reaching a destination has the same characteristics from all the sources at a certain time. when doing a "path" probing, part of the values of section [link] can be already available, for instance srtt, rto, pmtu, state. about source based routing, currently sctp is already used in scenarios involving security gateways so that a set of destination addresses can only be reached from a subset of source addresses, this is not prohibited from rfc4960.'}

In [11]:
def preprocess_function(examples):
    """Tokenize a batch of raw documents.

    Args:
        examples: Batch mapping as supplied by ``Dataset.map(batched=True)``;
            ``examples["text"]`` holds one string per document.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the list of texts.
    """
    # Every entry is already a complete string, so it is passed through as is.
    # Calling " ".join() on a string iterates over its characters and would put
    # a space between each of them, destroying the words before tokenization.
    return tokenizer(list(examples["text"]))

In [None]:
# Run the tokenizer over the dataset in batches. The columns of the raw
# dataset are removed from the result.
tokenized_dset = dataset.map(
    function=preprocess_function,
    remove_columns=dataset["train"].column_names,
    batched=True,
)

In [None]:
# Inspect the tokenizer output for the first training record.
tokenized_dset['train'][0]

In [16]:
# Length of each fixed-size chunk that group_texts cuts the concatenated
# token sequences into.
block_size = 128

In [17]:
def group_texts(examples):
    """Concatenate a batch of tokenized examples and cut it into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of per-example lists
            (the batch format of ``Dataset.map(batched=True)``).

    Returns:
        A dict with the same keys. Each value is a list of consecutive slices
        of length ``block_size`` (module-level) taken from the concatenated
        column. A trailing remainder shorter than ``block_size`` is dropped,
        except when the whole batch is shorter than ``block_size``: it is then
        kept as a single short chunk.
    """
    from itertools import chain

    # Flatten every column in linear time. sum(lists, []) copies the growing
    # accumulator on each addition, which is quadratic in the batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Round down to a whole number of blocks; a batch shorter than one block
    # is left untouched and yields one short chunk below.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    return {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

In [19]:
# Regroup the tokenized examples, batch by batch, into block_size-long chunks.
lm_dataset = tokenized_dset.map(group_texts, batched=True)

Map:   0%|          | 0/28005 [00:00<?, ? examples/s]

Map:   0%|          | 0/3112 [00:00<?, ? examples/s]

In [20]:
# Use the end-of-sequence token as the padding token, then build the
# language-modeling collator with a masking probability of 0.15.
# NOTE(review): the pretrained tokenizer presumably already defines its own pad
# token; confirm this override is intended, because this same tokenizer object
# is later uploaded to the Hub.
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
# Seed handed to TrainingArguments.
seed = 77

training_args = TrainingArguments(
    # Where checkpoints are written; at most five are kept on disk.
    output_dir="roberta_ietf_finetuned",
    save_total_limit=5,
    # Evaluation, logging and checkpointing all use the step-based strategy.
    evaluation_strategy='steps',
    logging_strategy='steps',
    save_strategy='steps',
    # Optimisation settings.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    seed=seed,
    # Reload the best checkpoint when training finishes; no automatic upload.
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
)

# Start fine-tuning; the call's return value is the cell output.
trainer.train()

In [None]:
# Show the log history recorded in the trainer state.
trainer.state.log_history

In [53]:
# Upload the trained model to the Hugging Face Hub.
trainer.push_to_hub()

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.03k [00:00<?, ?B/s]

'https://huggingface.co/marticampgin/roberta_ietf_finetuned/tree/main/'

In [56]:
# Upload the tokenizer, including the tokens added earlier, to the same Hub
# repository as the model.
tokenizer.push_to_hub('marticampgin/roberta_ietf_finetuned')

CommitInfo(commit_url='https://huggingface.co/marticampgin/roberta_ietf_finetuned/commit/bee736415884cf78d977348ca8b10dbe97332105', commit_message='Upload tokenizer', commit_description='', oid='bee736415884cf78d977348ca8b10dbe97332105', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Build a fill-mask pipeline from the fine-tuned checkpoint on the Hub.
mask_filler_ft = pipeline("fill-mask", "marticampgin/roberta_ietf_finetuned")