### ЛАБОРАТОРНА РОБОТА 2

#### 1. Вибір задачі та датасету

https://huggingface.co/datasets/Helsinki-NLP/opus-100

In [1]:
from datasets import load_dataset
import pandas as pd

language_pairs = ["en-fr", "en-fi", "ar-en", "en-hi", "en-zh"]
subset_size = 1000  # number of training samples per pair

data = pd.DataFrame()

for pair in language_pairs:
    print(f"Loading {pair}...")
    ds = load_dataset("Helsinki-NLP/opus-100", pair, split=f"train[:{subset_size}]")
    data[pair] = ds['translation']


Loading en-fr...
Loading en-fi...
Loading ar-en...
Loading en-hi...
Loading en-zh...


#### 2. Аналіз даних та метрик

In [2]:
data.describe()

Unnamed: 0,en-fr,en-fi,ar-en,en-hi,en-zh
count,1000,1000,1000,1000,1000
unique,985,981,992,994,995
top,"{'en': 'Thank you.', 'fr': 'Merci.'}","{'en': 'Thank you.', 'fi': 'Kiitos.'}","{'ar': 'حسناً؟', 'en': 'Okay?'}",{'en': 'Failed to decrypt MIME part: protocol ...,"{'en': 'Introduction', 'zh': '一. 导言'}"
freq,6,7,5,2,4


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   en-fr   1000 non-null   object
 1   en-fi   1000 non-null   object
 2   ar-en   1000 non-null   object
 3   en-hi   1000 non-null   object
 4   en-zh   1000 non-null   object
dtypes: object(5)
memory usage: 39.2+ KB


In [4]:
data.isnull().sum()

en-fr    0
en-fi    0
ar-en    0
en-hi    0
en-zh    0
dtype: int64

In [5]:
from sklearn.model_selection import train_test_split


train_df, val_df = train_test_split(data, test_size=0.2)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

Метрики для оцінки моделей

In [6]:
from evaluate import load

prediction = ["The relationship between cats and dogs is not exactly friendly.", "a good bookshop is just a genteel black hole that knows how to read."]
reference = [["The relationship between dogs and cats is not exactly friendly.", ], ["A good bookshop is just a genteel Black Hole that knows how to read."]]

chrf = load("chrf")
# chrF (character n-grams) - word_order=0, chrF++ (char + word n-grams) - word_order=2.
chrf_results = chrf.compute(predictions=prediction, references=reference)
chrfplus_results = chrf.compute(predictions=prediction, references=reference, word_order=2)

print('chrF:', chrf_results)
print('chrF++:', chrfplus_results)

chrF: {'score': 84.64214891738334, 'char_order': 6, 'word_order': 0, 'beta': 2}
chrF++: {'score': 82.87263732906315, 'char_order': 6, 'word_order': 2, 'beta': 2}


In [7]:
from evaluate import load
import numpy as np

#B ilingual Evaluation Understudy
bleu = load("bleu")

data_example = {
    "en-fr": {
        "predictions": ["Le temps est agréable aujourd'hui.", "Elle aime lire des livres."],
        "references": [["Il fait beau aujourd'hui."], ["Elle adore lire des livres."]],
    },
    "en-ar": {
        "predictions": ["الطقس جميل اليوم.", "هي تحب قراءة الكتب."],
        "references": [["إنه يوم جميل."], ["إنها تحب قراءة الكتب."]],
    },
    "en-zh": {
        "predictions": ["今天天气很好。", "她喜欢看书。"],
        "references": [["今天的天气很好。"], ["她喜欢读书。"]],
    }
}

scores = []
for lang_pair, values in data_example.items():
    result = bleu.compute(predictions=values["predictions"], references=values["references"])
    print(f"{lang_pair} BLEU: {result['bleu']:.2f}")
    scores.append(result["bleu"])

macro_avg_bleu = np.mean(scores)
print(f"\nMacro-Averaged BLEU: {macro_avg_bleu:.2f}")


en-fr BLEU: 0.31
en-ar BLEU: 0.44
en-zh BLEU: 0.00

Macro-Averaged BLEU: 0.25


Бенчмарки

#### 3. Експериментальна частина


Бейслайн

In [None]:
en_only = val_df.map(lambda x: x.get('en') if isinstance(x, dict) else None)
en_only[:].to_csv('opus100_en_val.csv', index=False)

except_en = val_df.map(lambda x: [x[d] for d in x.keys() if d != 'en'][0] if isinstance(x, dict) else None)
except_en.to_csv('opus100_notEn_val.csv', index=False)
print('\n'.join(en_only['en-zh'].to_list()))

In [20]:
import json

gpt_4o_mini_predicted = pd.read_csv('opus100_gpt-4o-mini_translated_to_english.csv')

chrf = load("chrf")
# chrF (character n-grams) - word_order=0, chrF++ (char + word n-grams) - word_order=2.
chrf_results = {}
chrfplus_results = {}
for pair in gpt_4o_mini_predicted.columns.to_list():
    chrf_results[pair] = chrf.compute(predictions=gpt_4o_mini_predicted[pair], references=en_only[pair])
    chrfplus_results[pair] = chrf.compute(predictions=gpt_4o_mini_predicted[pair], references=en_only[pair], word_order=2)

print('chrF\n', json.dumps(chrf_results, indent=4))
print('\nchrF++\n', json.dumps(chrfplus_results, indent=4))

chrF
 {
    "en-fr": {
        "score": 5.914547598876224,
        "char_order": 6,
        "word_order": 0,
        "beta": 2
    },
    "en-fi": {
        "score": 6.5802290940836174,
        "char_order": 6,
        "word_order": 0,
        "beta": 2
    },
    "ar-en": {
        "score": 7.656335102287539,
        "char_order": 6,
        "word_order": 0,
        "beta": 2
    },
    "en-hi": {
        "score": 16.184023089548692,
        "char_order": 6,
        "word_order": 0,
        "beta": 2
    },
    "en-zh": {
        "score": 13.239483280331369,
        "char_order": 6,
        "word_order": 0,
        "beta": 2
    }
}

chrF++
 {
    "en-fr": {
        "score": 5.534151523094612,
        "char_order": 6,
        "word_order": 2,
        "beta": 2
    },
    "en-fi": {
        "score": 6.055987583711977,
        "char_order": 6,
        "word_order": 2,
        "beta": 2
    },
    "ar-en": {
        "score": 7.5534628105481,
        "char_order": 6,
        "word_order":

In [19]:
print(gpt_4o_mini_predicted['en-fr'][0], en_only['en-fr'][0])

Second Committee Open the gate!
