# Evaluating a multilingual mT5 model on all four languages (English, German, Czech, and Slovene), using 1/4 of the data from each monolingual dataset


In [None]:
# Hugging Face Hub identifiers of the checkpoint under evaluation and of the dataset it is evaluated on.
model = "yawnick/mt5-small-paracrawl-multi-small"
dataset = "yawnick/para_crawl_multi_small"

## Environment Setup

We need a GPU

In [None]:
# Print the NVIDIA driver/GPU status to check that the runtime has a GPU attached.
!nvidia-smi

Wed May 24 11:15:13 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   65C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

We install all needed libraries

In [None]:
!pip install datasets==2.11.0 transformers==4.28.0 nltk==3.8.1 parascore==1.0.5 sentencepiece==0.1.98

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets==2.11.0
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
Collecting parascore==1.0.5
  Downloading parascore-1.0.5-py3-none-any.whl (15 kB)
Collecting sentencepiece==0.1.98
  Downloading sentencepiece-0.1.98-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets==2.11.0)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

Connect to Google Drive to save the results in the root folder of our Drive at `/content/drive/MyDrive/`.

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive/ so the evaluation table can be written to it later on.
drive.mount("/content/drive/")

Mounted at /content/drive/


## Data Download and Preparation

In [None]:
from datasets import load_dataset

We use the test split of our own paraphrase dataset.

In [None]:
# Fetch only the held-out test split of the configured dataset.
raw_dataset = load_dataset(path=dataset, split="test")
# Peek at a single example to confirm the 'Original' / 'Paraphrase' columns.
raw_dataset[5]

Downloading and preparing dataset csv/yawnick--para_crawl_multi_small to /root/.cache/huggingface/datasets/yawnick___csv/yawnick--para_crawl_multi_small-23612396dfcd540b/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/10.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.09M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/yawnick___csv/yawnick--para_crawl_multi_small-23612396dfcd540b/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


{'Original': 'Und sie sind die Feinde des lebendigen Gottes.',
 'Paraphrase': 'Und sie sind die Feinde des lebendigen Gottes.'}

## Generating paraphrases

First, we will initialize the pipeline

In [None]:
from transformers import pipeline
import tensorflow as tf
from tqdm import tqdm

In [None]:
# Ask TensorFlow for the name of a visible GPU; an empty string means none was found,
# in which case we fall back to the CPU device name.
# NOTE(review): this only probes TensorFlow's view of the hardware — confirm it is the right
# check for the framework the generation pipeline actually runs on.
device_name = tf.test.gpu_device_name()
if not device_name:
    device_name = "/device:CPU:0"
    print("No GPU, using {}.".format(device_name))
else:
    print("Found GPU at: {}".format(device_name))

Found GPU at: /device:GPU:0


In [None]:
# The checkpoint is loaded from PyTorch weights, so the pipeline must be placed on the GPU
# explicitly: without a `device` argument it runs on the CPU.
# NOTE(review): the TensorFlow GPU probe in the previous cell may already hold GPU memory —
# confirm enough is left for the model.
import torch

pipe = pipeline(
    'text2text-generation',
    model=model,
    device=0 if torch.cuda.is_available() else -1,  # 0 = first CUDA GPU, -1 = CPU fallback
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/773 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]



In [None]:
def data():
  """Lazily yield the 'Original' sentence of every row in `raw_dataset`, in dataset order."""
  yield from (example['Original'] for example in raw_dataset)

In [None]:
# Stream every source sentence through the pipeline in batches of 48 and keep the first
# generated candidate per sentence. Results are appended one by one, so `ps` holds the
# partial output if the run is interrupted.
ps = []
ds_length = raw_dataset.num_rows  # only used to give tqdm a total for the progress bar

# NOTE(review): `tf.device` presumably has no effect if the pipeline is backed by PyTorch — confirm.
with tf.device(device_name):
  generated = pipe(data(), batch_size=48)
  for outputs in tqdm(generated, total=ds_length):
    first_candidate = outputs[0]
    ps.append(first_candidate['generated_text'])

100%|██████████| 11532/11532 [38:46<00:00,  4.96it/s]


## Evaluating paraphrases

In [None]:
from parascore import ParaScorer

In [None]:
# Build the ParaScore scorer in its multilingual configuration.
scorer = ParaScorer(lang="multi")

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# ParaScorer.base_score expects (candidates, sources, references). The generated paraphrases
# are the candidates being judged, the original sentences are their sources, and the dataset's
# 'Paraphrase' column holds the references. Passing the originals as candidates would score
# the input against the reference instead of scoring the model output.
scores = scorer.base_score(ps, raw_dataset['Original'], raw_dataset['Paraphrase'], batch_size=16)
# Show a small sample only, rather than dumping one score per test sentence into the notebook.
scores[:10]

[0.9309726956864478,
 0.9500001192092895,
 0.8630440814662844,
 0.8950634135702862,
 0.9500001192092895,
 0.9499999403953552,
 0.8486874330749911,
 0.9719296860694885,
 0.8886326718330383,
 0.9146899788482641,
 0.9332211780548095,
 0.901831464767456,
 0.9148347070237168,
 0.8745751917201554,
 0.902580870183787,
 0.9509455132484436,
 0.9635963675927143,
 0.9185632038116455,
 0.95,
 0.9791528239202658,
 0.9499998211860656,
 0.9010124838754032,
 0.8844574856758117,
 0.9001422095298767,
 0.9813071060180664,
 0.9500001192092895,
 0.9204317498207092,
 0.9470431361879621,
 0.9254100477302467,
 0.9719155844155845,
 0.8610642719268798,
 0.95,
 1.0175,
 0.8683685442484882,
 0.9500001192092895,
 0.886628942489624,
 0.9014778769420068,
 0.8246336150169372,
 0.9499998807907104,
 0.95,
 0.9499997615814209,
 0.9499998807907104,
 0.9149647711219471,
 0.8843897151947021,
 1.0175,
 0.9185509068625314,
 0.7975274491310119,
 0.9500002384185791,
 0.9746201799995631,
 0.9064082670211792,
 0.9834848323262724

In [None]:
# Arithmetic mean of the per-sentence ParaScores.
score_total = sum(scores)
score = score_total / len(scores)
print('Average Parascore:', score)

Average Parascore: 0.9252384648973574


Generate and export the evaluation table

In [None]:
# Build the evaluation table as a new object, so `raw_dataset` stays untouched and this cell
# can be re-run. The dataset's own paraphrase becomes 'Reference'; the generated paraphrase
# and its score are added as new columns.
eval_table = (
    raw_dataset
    .rename_column('Paraphrase', 'Reference')
    .add_column(name='Paraphrase', column=ps)
    .add_column(name='Parascore', column=scores)
)
# Dataset.to_csv returns the number of bytes written, not a dataset, so its result is not kept.
eval_table.to_csv('/content/drive/MyDrive/data/eval_table_multi_small.csv')
eval_table

Creating CSV from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

3034257