# example script to try to train and eval BERT model

## 1. eval pretrained

In [5]:
%pip freeze

aiohttp==3.7.4.post0
argon2-cffi==21.1.0
async-timeout==3.0.1
attrs==21.2.0
backcall==0.2.0
bleach==4.1.0
certifi==2021.10.8
cffi==1.15.0
chardet==4.0.0
charset-normalizer==2.0.7
click==8.0.3
colorama==0.4.4
datasets==1.13.3
debugpy==1.5.0
decorator==5.1.0
defusedxml==0.7.1
dill==0.3.4
entrypoints==0.3
filelock==3.3.1
fsspec==2021.10.1
huggingface-hub==0.0.19
idna==3.3
ipykernel==6.4.1
ipython==7.28.0
ipython-genutils==0.2.0
ipywidgets==7.6.5
jedi==0.18.0
Jinja2==3.0.2
joblib==1.1.0
jsonschema==4.1.0
jupyter-client==7.0.6
jupyter-core==4.8.1
jupyterlab-pygments==0.1.2
jupyterlab-widgets==1.0.2
MarkupSafe==2.0.1
matplotlib-inline==0.1.3
mistune==0.8.4
multidict==5.2.0
multiprocess==0.70.12.2
nbclient==0.5.4
nbconvert==6.2.0
nbformat==5.1.3
nest-asyncio==1.5.1
notebook==6.4.4
numpy==1.21.2
packaging==21.0
pandas==1.3.3
pandocfilters==1.5.0
parso==0.8.2
pickleshare==0.7.5
Pillow==8.4.0
portalocker==2.3.2
prometheus-client==0.11.0
prompt-toolkit==3.0.20
pyarrow==5.0.0
pycparser==2.20
Pygme

In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
  
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-sk")

model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-sk")

In [7]:
import torch
torch.cuda.is_available()

True

In [8]:
torch.device("cpu")

device(type='cpu')

In [9]:
from datasets import load_dataset

dataset = load_dataset("opus100", "en-sk")

Reusing dataset opus100 (C:\Users\marek\.cache\huggingface\datasets\opus100\en-sk\0.0.0\a87abd612d82947c7a2c3991f71095a98f55141af7ad37516dfb31bfa3511ddc)


  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
tokenizer

PreTrainedTokenizer(name_or_path='Helsinki-NLP/opus-mt-en-sk', vocab_size=60025, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'})

In [11]:
model

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(60025, 512, padding_idx=60024)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(60025, 512, padding_idx=60024)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0): MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
   

In [12]:
dataset

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 1000000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})

In [13]:
dataset['test'][1]

{'translation': {'en': 'What do you mean?', 'sk': 'Čo tým myslíš?'}}

In [14]:
dataset['train'][1]['translation']['en']

'My ankle!'

In [15]:
dataset['train'][1]['translation']['sk']

'Oh, môj členok!'

In [16]:
dataset

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 1000000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})

In [17]:
dataset['train'].features

{'translation': Translation(languages=['en', 'sk'], id=None)}

In [18]:
max_input_length = 512
max_target_length = 512
source_lang = "en"
target_lang = "sk"
prefix = ""

def preprocess_function(examples):
    inputs = [prefix + ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [19]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Loading cached processed dataset at C:\Users\marek\.cache\huggingface\datasets\opus100\en-sk\0.0.0\a87abd612d82947c7a2c3991f71095a98f55141af7ad37516dfb31bfa3511ddc\cache-db1709c076bb19c0.arrow
Loading cached processed dataset at C:\Users\marek\.cache\huggingface\datasets\opus100\en-sk\0.0.0\a87abd612d82947c7a2c3991f71095a98f55141af7ad37516dfb31bfa3511ddc\cache-289d30b4ede4ae62.arrow
Loading cached processed dataset at C:\Users\marek\.cache\huggingface\datasets\opus100\en-sk\0.0.0\a87abd612d82947c7a2c3991f71095a98f55141af7ad37516dfb31bfa3511ddc\cache-1a9712c8e1b58928.arrow


In [20]:
tokenized_datasets

DatasetDict({
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'translation'],
        num_rows: 1000000
    })
    validation: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'translation'],
        num_rows: 2000
    })
})

In [21]:
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [22]:
from transformers import Seq2SeqTrainingArguments

batch_size = 4
# model_name = model_checkpoint.split("/")[-1]

training_args = Seq2SeqTrainingArguments(
    "Helsinki-NLP/opus-mt-en-sk",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    no_cuda=True,
    fp16=False,
    push_to_hub=False,
)

In [23]:
from datasets import load_metric

metric = load_metric("sacrebleu")

In [24]:
import numpy as np

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
#     print(preds[0])
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [25]:
val_small = tokenized_datasets['validation'].train_test_split(test_size=0.01)

Loading cached split indices for dataset at C:\Users\marek\.cache\huggingface\datasets\opus100\en-sk\0.0.0\a87abd612d82947c7a2c3991f71095a98f55141af7ad37516dfb31bfa3511ddc\cache-acc57b045c23ac0b.arrow and C:\Users\marek\.cache\huggingface\datasets\opus100\en-sk\0.0.0\a87abd612d82947c7a2c3991f71095a98f55141af7ad37516dfb31bfa3511ddc\cache-99a8a36ac19b9583.arrow


In [26]:
val_small

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'translation'],
        num_rows: 1980
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'translation'],
        num_rows: 20
    })
})

In [27]:
from transformers import Seq2SeqTrainer

# model.to('cpu')

trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [28]:
val_small['test']

Dataset({
    features: ['attention_mask', 'input_ids', 'labels', 'translation'],
    num_rows: 20
})

In [29]:
trainer.predict(val_small['test'])

The following columns in the test set  don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: translation.
***** Running Prediction *****
  Num examples = 20
  Batch size = 4


PredictionOutput(predictions=array([[60024,    82,   237, ..., 60024, 60024, 60024],
       [60024,  2328,     7, ..., 60024, 60024, 60024],
       [60024,  8492,   622, ..., 60024, 60024, 60024],
       ...,
       [60024, 21366,   139, ..., 60024, 60024, 60024],
       [60024,  1463,   222, ..., 60024, 60024, 60024],
       [60024,   392,  5860, ..., 60024, 60024, 60024]], dtype=int64), label_ids=array([[   82,   237, 12716, ..., 60024, 60024, 60024],
       [ 2328,   244,   164, ..., 60024, 60024, 60024],
       [ 2100,    28,     7, ..., 60024, 60024, 60024],
       ...,
       [ 7519,     7,   521, ..., 60024, 60024, 60024],
       [ 1463,   222,     2, ..., 60024, 60024, 60024],
       [  392,  5860,    62, ..., 60024, 60024, 60024]], dtype=int64), metrics={'eval_loss': 1.4562461376190186, 'eval_bleu': 34.2418, 'eval_gen_len': 16.85, 'eval_runtime': 12.9938, 'eval_samples_per_second': 1.539, 'eval_steps_per_second': 0.385})

We can see **BLEU score of 34.2418**

Other metrics:

'eval_gen_len': 16.85, 'eval_runtime': 12.4035, 'eval_samples_per_second': 1.612, 'eval_steps_per_second': 0.403

## Quantized model evaluation

In [30]:
import torch

torch.backends.quantized.engine

'fbgemm'

In [31]:
import torch

# model.to('cpu')

# choose torch.backends.quantized.engine = 'qnnpack' if running on ARM device
torch.backends.quantized.engine = 'fbgemm'



quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)
quantized_model

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(60025, 512, padding_idx=60024)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(60025, 512, padding_idx=60024)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0): MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): DynamicQuantizedLinear(in_features=512, out_features=512, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (v_proj): DynamicQuantizedLinear(in_features=512, out_features=512, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (q_proj): DynamicQuantizedLinear(in_features=512, out_features=512, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (out_proj): DynamicQuantizedLinear(in_features=512, out_features=512, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (

In [32]:
quant_data_collator = DataCollatorForSeq2Seq(tokenizer, model=quantized_model)

In [35]:
from transformers import Seq2SeqTrainer

quantized_model.to('cpu')

quant_trainer = Seq2SeqTrainer(
    quantized_model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=quant_data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [36]:
quant_trainer.device

AttributeError: 'Seq2SeqTrainer' object has no attribute 'device'

In [37]:
quant_trainer.predict(val_small['test'])

The following columns in the test set  don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: translation.
***** Running Prediction *****
  Num examples = 20
  Batch size = 4


PredictionOutput(predictions=array([[60024,    82,   237, ..., 60024, 60024, 60024],
       [60024,  2328,     7, ..., 60024, 60024, 60024],
       [60024,  8492,   622, ..., 60024, 60024, 60024],
       ...,
       [60024, 21366,   139, ..., 60024, 60024, 60024],
       [60024,  1463,   222, ..., 60024, 60024, 60024],
       [60024,   392,  5860, ..., 60024, 60024, 60024]], dtype=int64), label_ids=array([[   82,   237, 12716, ..., 60024, 60024, 60024],
       [ 2328,   244,   164, ..., 60024, 60024, 60024],
       [ 2100,    28,     7, ..., 60024, 60024, 60024],
       ...,
       [ 7519,     7,   521, ..., 60024, 60024, 60024],
       [ 1463,   222,     2, ..., 60024, 60024, 60024],
       [  392,  5860,    62, ..., 60024, 60024, 60024]], dtype=int64), metrics={'eval_loss': 1.4729728698730469, 'eval_bleu': 33.2778, 'eval_gen_len': 16.8, 'eval_runtime': 7.4097, 'eval_samples_per_second': 2.699, 'eval_steps_per_second': 0.675})

We can see **BLEU score of 34.842**

Other metrics:

'eval_gen_len': 16.7, 'eval_runtime': 6.1269, 'eval_samples_per_second': 3.264, 'eval_steps_per_second': 0.816

## Test on whole validation set

In [40]:
# trainer.predict(tokenized_datasets["validation"])
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 4


{'eval_loss': 1.4373191595077515,
 'eval_bleu': 35.7476,
 'eval_gen_len': 14.988,
 'eval_runtime': 985.4955,
 'eval_samples_per_second': 2.029,
 'eval_steps_per_second': 0.507}

We can see **BLEU score of 35.7476**

Other metrics:

'eval_gen_len': 14.988, **'eval_runtime': 894.6245, 'eval_samples_per_second': 2.236,** 'eval_steps_per_second': 0.559

In [41]:
# quant_trainer.predict(tokenized_datasets["validation"])
quant_trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 4


{'eval_loss': 1.4801710844039917,
 'eval_bleu': 34.6891,
 'eval_gen_len': 15.038,
 'eval_runtime': 652.8809,
 'eval_samples_per_second': 3.063,
 'eval_steps_per_second': 0.766}

We can see **BLEU score of 35.7055**

Other metrics:

'eval_gen_len': 15.0445, **'eval_runtime': 501.4448, 'eval_samples_per_second': 3.988,** 'eval_steps_per_second': 0.997

In [None]:
quant_runtime= 501.4448
full_runtime= 894.6245
print(f"Quantization evaluation is {quant_runtime/full_runtime}% of full precision time")
print(f"Quantization evaluation is {full_runtime/quant_runtime}x faster than full precision time")

## different dataset

In [None]:
from datasets import load_dataset

dataset = load_dataset("open_subtitles", lang1="en",lang2="sk")

In [None]:
dataset

In [None]:
dataset['train']

In [None]:
dataset['train'][0]

In [None]:
dataset['train'][1]