In [6]:
from transformers import AutoModelForMaskedLM

model_checkpoint = "SZTAKI-HLT/hubert-base-cc"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/672 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

In [7]:
model.num_parameters()

110651649

In [8]:
text = "Esni fog az [MASK]."

In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [10]:
import torch

inputs = tokenizer(text, return_tensors="pt")
token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> Esni fog az is.'
'>>> Esni fog az olaj.'
'>>> Esni fog az USA.'
'>>> Esni fog az OTP.'
'>>> Esni fog az idő.'


In [12]:
torch.topk(mask_token_logits, 5, dim=1)

torch.return_types.topk(
values=tensor([[19.8728, 15.6464, 14.4843, 14.1546, 14.0488]],
       grad_fn=<TopkBackward0>),
indices=tensor([[7521, 2527, 2073, 4460, 5251]]))

In [6]:
import pandas as pd

In [8]:
import datasets
from pathlib import Path

data_dir = Path('/datasets/msg/')
unsupervised_dataset = datasets.Dataset.from_parquet((data_dir / "processed_msg.parquet").as_posix()).remove_columns(column_names=['username', 'date', 'prev_id', 'thread_id', 'comment_id', '__index_level_0__']).shuffle(seed=42).select(range(50_000))

Using custom data configuration default-e3c06abc361dc1b8
Reusing dataset parquet (/root/.cache/huggingface/datasets/parquet/default-e3c06abc361dc1b8/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/parquet/default-e3c06abc361dc1b8/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901/cache-f122fbb78ab5e024.arrow


In [9]:
unsupervised_dataset

Dataset({
    features: ['text'],
    num_rows: 50000
})

In [10]:
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = unsupervised_dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
tokenized_datasets

Loading cached processed dataset at /root/.cache/huggingface/datasets/parquet/default-e3c06abc361dc1b8/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901/cache-3a683c647f73a7ed.arrow


Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
    num_rows: 50000
})

In [11]:
tokenizer.model_max_length

1000000000000000019884624838656

In [12]:
chunk_size = 128

In [13]:
# Slicing produces a list of lists for each feature
tokenized_samples = tokenized_datasets[:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

'>>> Review 0 length: 19'
'>>> Review 1 length: 131'
'>>> Review 2 length: 17'


In [14]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 167'


In [15]:
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 128'
'>>> Chunk length: 39'


In [16]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [17]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Loading cached processed dataset at /root/.cache/huggingface/datasets/parquet/default-e3c06abc361dc1b8/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901/cache-af37b48d2a7a71a0.arrow


Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 12628
})

In [18]:
tokenizer.decode(lm_datasets[1]["input_ids"])

'HUF 3 590 HUF 340 db 465 db 3 503 HUF 3 591 HUF 100 db [SEP] [CLS] Irányíthatnád úgy hogy 4400 - ért lehessen venni, még a héten [SEP] [CLS] 300e drb valós lesz ) ) ) re hantás [SEP] [CLS] Nagyon kiváncsi vagyok a mai napi kereskedésre [SEP] [CLS] mire pályáztál Nati? Oti 3500 - öt vette irányba jól látom? [SEP] [CLS] Én is fel ajánlotam neki hogy társuljunk az én nyereségemből adnék is neki de nem reagált. [SEP] [CLS] Sima ügy. - : ) ) [SEP] [CLS] Nekem tetszik az a doji. [SEP]'

In [19]:
tokenizer.decode(lm_datasets[1]["labels"])

'HUF 3 590 HUF 340 db 465 db 3 503 HUF 3 591 HUF 100 db [SEP] [CLS] Irányíthatnád úgy hogy 4400 - ért lehessen venni, még a héten [SEP] [CLS] 300e drb valós lesz ) ) ) re hantás [SEP] [CLS] Nagyon kiváncsi vagyok a mai napi kereskedésre [SEP] [CLS] mire pályáztál Nati? Oti 3500 - öt vette irányba jól látom? [SEP] [CLS] Én is fel ajánlotam neki hogy társuljunk az én nyereségemből adnék is neki de nem reagált. [SEP] [CLS] Sima ügy. - : ) ) [SEP] [CLS] Nekem tetszik az a doji. [SEP]'

In [20]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [21]:
samples = [lm_datasets[i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] Most meg [MASK] össze nyomták az [MASK] ét nem [MASK] vissza menni, elemzők mik vannak [SEP] [CLS] Ajánlat Én könyv 400 db 3 [MASK]5 HUF 3 569 HUF [MASK] db 100 db 3 53 [MASK] HUF 3 570 HUF 90 db 730 db 3 531 HUF 3 574 HUF 170 db 200 db [MASK] 524 [MASK] 3 575 HUF 36 [MASK] db 1 318 db 3 520 [MASK] 3 57 [MASK] [MASK] 250 [MASK] 350 [MASK] 3állyal0 HUF 3 580 HUF 60 db 300 db 3 506 HUF 3 584 HUF 16 [MASK] 150 db 3 505 HUF 3 589 HUF 150 db 30 db 3 504'

'>>> HUF [unused636] 59 szavaz HUF [MASK]0 [MASK] 465 db 3 503 HUF [MASK] 59 [MASK] HUF [MASK] [MASK] [SEP] [CLS] Irányíthatnád úgy hogy 4400 - ért lehessen venni, még a héten [SEP] [CLS] 300e drb valós lesz ) ) ) re játékosok [MASK]ás [SEP] [CLS] Nagyon kiváncsi vagyok a mai [MASK] kereskedésre [SEP] [CLS] mire pályáztál Nati? Oti 350 [MASK] [MASK] öt vette irányba jól látom? [SEP] [CLS] Énetjük fel ajánlotam [MASK] hogy társuljunk az én [MASK]emből adnék is [MASK] de [MASK] reagált [MASK] [SEP] [CLS] Sima ügy. - : [MASK] ) [S

In [22]:
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id

    return default_data_collator(features)

In [23]:
samples = [lm_datasets[i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] Most meg úgy össze nyomták az OTP ét nem tud [MASK] [MASK], hogy [MASK] vannak [SEP] [CLS] [MASK] [MASK] [MASK] 400 db 3 535 HUF 3 [MASK] [MASK] HUF [MASK] [MASK] 100 db [MASK] 532 HUF 3 570 HUF 90 db 730 db 3 [MASK] [MASK] [MASK] 3 574 HUF 170 [MASK] 200 [MASK] 3 524 [MASK] 3 575 HUF 367 db [MASK] [MASK] [MASK] db 3 [MASK] [MASK] HUF [MASK] 579 HUF 250 [MASK] [MASK] db 3 510 HUF [MASK] [MASK] [MASK] HUF 60 db 300 [MASK] 3 506 HUF 3 584 HUF 16 db 150 [MASK] 3 505 [MASK] 3 589 HUF 150 db [MASK] db [MASK] [MASK] [MASK]'

'>>> HUF 3 590 HUF 340 db 465 db [MASK] 503 HUF 3 591 HUF 100 db [SEP] [CLS] Irányíthatnád úgy hogy 4400 - ért lehessen [MASK], még [MASK] [MASK] [SEP] [CLS] [MASK] [MASK] drb valós lesz ) [MASK] ) re hantás [SEP] [CLS] Nagyon kiváncsi vagyok a mai napi kereskedésre [SEP] [CLS] mire pályáztál Nati [MASK] Oti [MASK] [MASK] [MASK] öt vette irányba jól látom? [SEP] [CLS] Én [MASK] fel ajánlotam neki hogy társuljunk [MASK] én nyereségemből adnék [MASK] [MASK] [MA

In [24]:
train_size = 10_000
test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets.train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

Loading cached split indices for dataset at /root/.cache/huggingface/datasets/parquet/default-e3c06abc361dc1b8/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901/cache-50fbe0a6882b1785.arrow and /root/.cache/huggingface/datasets/parquet/default-e3c06abc361dc1b8/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901/cache-2cf19993d7aa53c0.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
})

In [49]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [59]:
!git --version
!apt-get install git-lfs
!git-lfs install

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
git version 2.25.1
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 28 not upgraded.
Need to get 3316 kB of archives.
After this operation, 11.1 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 git-lfs amd64 2.9.2-1 [3316 kB]
Fetch

In [60]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch
logging_steps = len(downsampled_dataset["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-forum",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [61]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Cloning https://huggingface.co/papsebestyen/hubert-base-cc-finetuned-forum into local empty directory.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Using amp half precision backend


In [62]:
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64


>>> Perplexity: 44.36


In [63]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10000
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 471


Epoch,Training Loss,Validation Loss
1,2.7966,2.513944
2,2.6303,2.460126
3,2.5525,2.450056


The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co

TrainOutput(global_step=471, training_loss=2.6578216967815553, metrics={'train_runtime': 615.3289, 'train_samples_per_second': 48.754, 'train_steps_per_second': 0.765, 'total_flos': 1974070172160000.0, 'train_loss': 2.6578216967815553, 'epoch': 3.0})

In [64]:
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64


>>> Perplexity: 11.88


In [65]:
trainer.push_to_hub()

Saving model checkpoint to hubert-base-cc-finetuned-forum
Configuration saved in hubert-base-cc-finetuned-forum/config.json
Model weights saved in hubert-base-cc-finetuned-forum/pytorch_model.bin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Upload file pytorch_model.bin:   0%|          | 32.0k/422M [00:00<?, ?B/s]

Upload file runs/May18_16-09-38_ne6ddpxez3/events.out.tfevents.1652890244.ne6ddpxez3.112.0: 100%|##########| 5…

Upload file training_args.bin: 100%|##########| 3.17k/3.17k [00:00<?, ?B/s]

Upload file runs/May18_16-09-38_ne6ddpxez3/1652890268.067156/events.out.tfevents.1652890268.ne6ddpxez3.112.1: …

Upload file runs/May18_16-09-38_ne6ddpxez3/events.out.tfevents.1652890890.ne6ddpxez3.112.2: 100%|##########| 3…

remote: Enforcing permissions...        
remote: Allowed refs: all        
To https://huggingface.co/papsebestyen/hubert-base-cc-finetuned-forum
   76365a9..08dbd3f  main -> main



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Masked Language Modeling', 'type': 'fill-mask'}}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

remote: Enforcing permissions...        
remote: Allowed refs: all        
To https://huggingface.co/papsebestyen/hubert-base-cc-finetuned-forum
   08dbd3f..baab836  main -> main



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'https://huggingface.co/papsebestyen/hubert-base-cc-finetuned-forum/commit/08dbd3f87eda1c1b2c867c3ace68259ac1b09a69'

# Accelerate

In [25]:
def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = whole_word_masking_data_collator(features)
    # Create a new "masked" column for each column in the dataset
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

In [26]:
#downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
        "masked_token_type_ids": "token_type_ids",
    }
)

Loading cached processed dataset at /root/.cache/huggingface/datasets/parquet/default-e3c06abc361dc1b8/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901/cache-c8bbbef98772ba87.arrow


In [28]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 64
train_dataloader = DataLoader(
    downsampled_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=whole_word_masking_data_collator,
)
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
)

In [29]:
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at SZTAKI-HLT/hubert-base-cc were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [30]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [31]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [32]:
from transformers import get_scheduler

num_train_epochs = 5
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [33]:
from huggingface_hub import get_full_repo_name

model_name = "hubert-base-cc-finetuned-forum"
repo_name = get_full_repo_name(model_name)
repo_name

'papsebestyen/hubert-base-cc-finetuned-forum'

In [34]:
from huggingface_hub import Repository

output_dir = model_name
repo = Repository(output_dir, clone_from=repo_name)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

/notebooks/hubert-base-cc-finetuned-forum is already a clone of https://huggingface.co/papsebestyen/hubert-base-cc-finetuned-forum. Make sure you pull the latest changes with `repo.git_pull()`.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [35]:
from tqdm.auto import tqdm
import torch
import math

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    losses = losses[: len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )

  0%|          | 0/785 [00:00<?, ?it/s]

>>> Epoch 0: Perplexity: 2.1391974882929667
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disab

In [None]:
from transformers import pipeline

mask_filler = pipeline(
    "fill-mask", model=model, tokenizer=tokenizer
)

In [53]:
text = 'A támasz [MASK]'

In [69]:
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

>>> A támasz.
>>> A támaszok
>>> A támasz?
>>> A támasz!
>>> A támasz :


-------------

In [1]:
!pip install -U pip
!pip install -U -r requirements.txt
!pip install accelerate
!pip install -U ipywidgets jupyter

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting pip
  Downloading pip-22.1-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 14.2 MB/s eta 0:00:01     |████████████████████            | 1.3 MB 14.2 MB/s eta 0:00:01
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.2.4
    Uninstalling pip-21.2.4:
      Successfully uninstalled pip-21.2.4
Successfully installed pip-22.1
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting transformers[torch]
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting datasets
  Downloading datasets-2.2.1-py3-none-any.whl (342 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m342.2/342.2 kB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
Coll