In [3]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
from torch.utils.data import Dataset
from transformers import BertModel, BertConfig
import torch
import transformers
from transformers import RobertaConfig, BertConfig
from transformers import RobertaTokenizerFast, BertTokenizerFast
from transformers import RobertaForMaskedLM, BertForMaskedLM
from transformers import LineByLineTextDataset
print(transformers.__version__)
print(torch.cuda.is_available())
torch.version.cuda
torch.__version__

3.0.2
True


'1.6.0'

In [None]:
paths = ['../datasets/tweets_corpus_bert.txt']

# Initialize a tokenizer
tokenizer_model = ByteLevelBPETokenizer()

# Customize training
tokenizer_model.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

In [None]:
tokenizer = ByteLevelBPETokenizer(
    "../models/tweet_bert/vocab.json",
    "../models/tweet_bert/merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=50)

print(
    tokenizer.encode("hola soy un tweet .")
)
print(
    tokenizer.encode("hola soy un tweet 😃 .").tokens
)

In [None]:
# Save files to disk
tokenizer.save_model("../models/tweet_bert")

In [None]:
dataset_tw = TwitterBertDataset()

In [None]:
class TwitterBertDataset(Dataset):
    def __init__(self, evaluate: bool = False):
        tokenizer = ByteLevelBPETokenizer(
            "../models/tokenizer_tweets-vocab.json",
            "../models/tokenizer_tweets-merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=50)
        # or use the RobertaTokenizer from `transformers` directly.

        self.examples = []

        src_files = Path("../datasets/").glob("tweets_corpus_bert.txt")
        for src_file in src_files:
            print("🔥", src_file)
            lines = src_file.read_text(encoding="utf-8").splitlines()
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        # We’ll pad at the batch level.
        return torch.tensor(self.examples[i])

In [9]:
from transformers import RobertaTokenizerFast, RobertaTokenizer
roberta_tokenizer = RobertaTokenizerFast.from_pretrained("../models/tweet_bert", max_len=50)
len(roberta_tokenizer)
roberta_tokenizer

from transformers import RobertaTokenizerFast, RobertaTokenizer
bert_tokenizer = BertTokenizerFast.from_pretrained("../models/tweet_bert", max_len=50)

I0801 22:07:47.860127 17300 tokenization_utils_base.py:1424] Model name '../models/tweet_bert' not found in model shortcut name list (roberta-base, roberta-large, roberta-large-mnli, distilroberta-base, roberta-base-openai-detector, roberta-large-openai-detector). Assuming '../models/tweet_bert' is a path, a model identifier, or url to a directory containing tokenizer files.
I0801 22:07:47.862130 17300 tokenization_utils_base.py:1454] Didn't find file ../models/tweet_bert\added_tokens.json. We won't load it.
I0801 22:07:47.863127 17300 tokenization_utils_base.py:1454] Didn't find file ../models/tweet_bert\special_tokens_map.json. We won't load it.
I0801 22:07:47.864140 17300 tokenization_utils_base.py:1454] Didn't find file ../models/tweet_bert\tokenizer_config.json. We won't load it.
I0801 22:07:47.865134 17300 tokenization_utils_base.py:1454] Didn't find file ../models/tweet_bert\tokenizer.json. We won't load it.
I0801 22:07:47.867147 17300 tokenization_utils_base.py:1509] loading fi

OSError: Model name '../models/tweet_bert' was not found in tokenizers model name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc, bert-base-german-dbmdz-cased, bert-base-german-dbmdz-uncased, TurkuNLP/bert-base-finnish-cased-v1, TurkuNLP/bert-base-finnish-uncased-v1, wietsedv/bert-base-dutch-cased). We assumed '../models/tweet_bert' was a path, a model identifier, or url to a directory containing vocabulary files named ['vocab.txt'] but couldn't find such vocabulary files at this path or url.

In [30]:
dataset = LineByLineTextDataset(
    tokenizer=roberta_tokenizer,
    file_path="../datasets/tweets_corpus_bert.txt",
    block_size=128,
)

I0801 21:21:08.593384  1852 language_modeling.py:89] Creating features from dataset file at ../datasets/tweets_corpus_bert.txt


In [7]:
# config = RobertaConfig(
#     vocab_size=52_000,
#     max_position_embeddings=52,
#     num_attention_heads=12,
#     num_hidden_layers=6,
#     type_vocab_size=1
# )
# model = RobertaForMaskedLM(config=config)
config = BertConfig(
    vocab_size=52_000,
    max_position_embeddings=52,
    num_attention_heads=6,
    num_hidden_layers=3,
)
model = BertForMaskedLM(config)

In [8]:
model.num_parameters()

62477344

In [33]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=roberta_tokenizer, mlm=True, mlm_probability=0.15,
)

In [40]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="../models/tweet_bert",
    do_train=True,
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_gpu_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2
)
trainer = Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    train_dataset = dataset,
    prediction_loss_only = True,
)

I0801 21:25:01.799694  1852 training_args.py:254] PyTorch: setting up devices
I0801 21:25:02.140770  1852 trainer.py:206] You are instantiating a Trainer but W&B is not installed. To use wandb logging, run `pip install wandb; wandb login` see https://docs.wandb.com/huggingface.


In [41]:
trainer.train()

I0801 21:25:03.000386  1852 trainer.py:460] ***** Running training *****
I0801 21:25:03.001385  1852 trainer.py:461]   Num examples = 1118383
I0801 21:25:03.001385  1852 trainer.py:462]   Num Epochs = 3
I0801 21:25:03.002384  1852 trainer.py:463]   Instantaneous batch size per device = 8
I0801 21:25:03.003395  1852 trainer.py:464]   Total train batch size (w. parallel, distributed & accumulation) = 8
I0801 21:25:03.003395  1852 trainer.py:465]   Gradient Accumulation steps = 1
I0801 21:25:03.004393  1852 trainer.py:466]   Total optimization steps = 419394


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=139798.0, style=ProgressStyle(description…





RuntimeError: CUDA error: device-side assert triggered

In [None]:

# model