### ЛАБОРАТОРНА РОБОТА 2

#### 1. Вибір задачі та датасету

https://huggingface.co/datasets/Helsinki-NLP/opus-100

In [16]:
from datasets import load_dataset
import pandas as pd

language_pairs = ["en-fr", "en-fi", "ar-en", "en-hi", "en-zh"]
subset_size = 1000  # number of training samples per pair

data = pd.DataFrame()

for pair in language_pairs:
    print(f"Loading {pair}...")
    ds = load_dataset("Helsinki-NLP/opus-100", pair, split=f"train[:{subset_size}]")
    data[pair] = ds['translation']



Loading en-fr...
Loading en-fi...
Loading ar-en...
Loading en-hi...
Loading en-zh...


#### 2. Аналіз даних та метрик

In [18]:
data.describe()

Unnamed: 0,en-fr,en-fi,ar-en,en-hi,en-zh
count,1000,1000,1000,1000,1000
unique,985,981,992,994,995
top,"{'en': 'Thank you.', 'fr': 'Merci.'}","{'en': 'Thank you.', 'fi': 'Kiitos.'}","{'ar': 'حسناً؟', 'en': 'Okay?'}",{'en': 'Failed to decrypt MIME part: protocol ...,"{'en': 'Introduction', 'zh': '一. 导言'}"
freq,6,7,5,2,4


In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   en-fr   1000 non-null   object
 1   en-fi   1000 non-null   object
 2   ar-en   1000 non-null   object
 3   en-hi   1000 non-null   object
 4   en-zh   1000 non-null   object
dtypes: object(5)
memory usage: 39.2+ KB


In [20]:
data.isnull().sum()

en-fr    0
en-fi    0
ar-en    0
en-hi    0
en-zh    0
dtype: int64

In [17]:
from sklearn.model_selection import train_test_split


train_df, val_df = train_test_split(data, test_size=0.2)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

en_only = val_df.map(lambda x: x.get('en') if isinstance(x, dict) else None)
en_only.to_csv('opus100_en_val.csv', index=False)

en_except = val_df.map(lambda x: [x[d] for d in x.keys() if d != 'en'][0] if isinstance(x, dict) else None)
en_except.to_csv('opus100_notEn_val.csv', index=False)

In [None]:
print('\n'.join(en_only['en-fi'].to_list()))

Second Committee
Nadia ?
Instructions:
I'll see you people in a little while
Hello, Mr Gallbladder.
My Lord...
Leave me alone.
Where is he now?
I have a bit of a cold.
Oh, ridiculous.
Look, this watch is worth over $ 1100.
At the same meeting, statements in connection with the proposed oral amendments to draft decision A/HRC/2/L.44, as contained in document A/HRC/2/L.48, were made by the representatives of Algeria (on behalf of the African Group), Canada, Jordan and Switzerland and the observer for the Sudan.
You got it.
Again!
She left me certain clues.
Excuse me, darling.
Fortieth session
He was gone already.
zinc chloride, glycerol, metacresol, phenol, disodium phosphate dihydrate, sodium hydroxide, hydrochloric acid, protamine sulphate and water for injections.
My Uncle Edmure has his forces garrisoned there.
And I didn't think it should show up on his record.
Pressure.
Me. This wasn't for me. This was for her.
- Well, I mean, at least financially.
So, what do we do now?
- Wait...


Метрики для оцінки моделей

In [22]:
from evaluate import load

prediction = ["The relationship between cats and dogs is not exactly friendly.", "a good bookshop is just a genteel black hole that knows how to read."]
reference = [["The relationship between dogs and cats is not exactly friendly.", ], ["A good bookshop is just a genteel Black Hole that knows how to read."]]

chrf = load("chrf")
# chrF (character n-grams) - word_order=0, chrF++ (char + word n-grams) - word_order=2.
chrf_results = chrf.compute(predictions=prediction, references=reference)
chrfplus_results = chrf.compute(predictions=prediction, references=reference, word_order=2)

print('chrF:', chrf_results)
print('chrF++:', chrfplus_results)

chrF: {'score': 84.64214891738334, 'char_order': 6, 'word_order': 0, 'beta': 2}
chrF++: {'score': 82.87263732906315, 'char_order': 6, 'word_order': 2, 'beta': 2}


In [23]:
from evaluate import load
import numpy as np

#B ilingual Evaluation Understudy
bleu = load("bleu")

data_example = {
    "en-fr": {
        "predictions": ["Le temps est agréable aujourd'hui.", "Elle aime lire des livres."],
        "references": [["Il fait beau aujourd'hui."], ["Elle adore lire des livres."]],
    },
    "en-ar": {
        "predictions": ["الطقس جميل اليوم.", "هي تحب قراءة الكتب."],
        "references": [["إنه يوم جميل."], ["إنها تحب قراءة الكتب."]],
    },
    "en-zh": {
        "predictions": ["今天天气很好。", "她喜欢看书。"],
        "references": [["今天的天气很好。"], ["她喜欢读书。"]],
    }
}

scores = []
for lang_pair, values in data_example.items():
    result = bleu.compute(predictions=values["predictions"], references=values["references"])
    print(f"{lang_pair} BLEU: {result['bleu']:.2f}")
    scores.append(result["bleu"])

macro_avg_bleu = np.mean(scores)
print(f"\nMacro-Averaged BLEU: {macro_avg_bleu:.2f}")


en-fr BLEU: 0.31
en-ar BLEU: 0.44
en-zh BLEU: 0.00

Macro-Averaged BLEU: 0.25


Бенчмарки

#### 3. Експериментальна частина


Бейслайн

In [39]:
%pip install openai

Collecting openai
  Downloading openai-1.78.1-py3-none-any.whl.metadata (25 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Downloading pydantic-2.11.4-py3-none-any.whl.metadata (66 kB)
Collecting annotated-types>=0.6.0 (from pydantic<3,>=1.9.0->openai)
  Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.33.2 (from pydantic<3,>=1.9.0->openai)
  Downloading pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting typing-extensions<5,>=4.11 (from openai)
  Downloading typing_extensions-4.13.2-py3-none-any.whl.metadata (3.0 kB)
Collecting typing-inspection>=0.4.0 (from pydantic<3,>=1.9.0->openai)
  Downloading typing_inspection-0.4.0-py3-none-any.

In [None]:
import json

creds["OPENAI_API_KEY"] 

'lol'

In [12]:
import json
import pandas as pd
import openai
import time
from tqdm import tqdm

# Set your API key securely
with open('creds.json') as f:
    creds = json.load(f)

client = openai.OpenAI(api_key=creds['OPENAI_API_KEY'])  # Replace with your actual key or use os.environ

# Load the CSV
input_path = "opus100_en_val.csv"
output_path = "translated_opus100.csv"
df = pd.read_csv(input_path)

# Map of column names to target languages
language_map = {
    "en-fr": "French",
    "en-fi": "Finnish",
    "ar-en": "Arabic",
    "en-hi": "Hindi",
    "en-zh": "Chinese"
}


# Translation function for a batch of sentences
def translate_batch(sentences, target_language, retries=3):
    joined = "\n".join([f"{i+1}. {s}" for i, s in enumerate(sentences)])
    prompt = (
        f"Translate the following sentences into {target_language}. "
        f"Return only the translated sentences, each on a new line:\n\n{joined}"
    )

    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
            )
            output = response.choices[0].message.content.strip().split("\n")
            return [line.strip() for line in output if line.strip()]
        except Exception as e:
            print(f"Error: {e}, retrying ({attempt+1}/{retries})...")
            time.sleep(2)
    return [f"[Translation failed: {s}]" for s in sentences]

# Batch size per API call
batch_size = 30

# Translate all columns
translated_df = pd.DataFrame()
for col in df.columns:
    print(f"🔁 Translating column: {col}")
    target_lang = language_map[col]
    texts = df[col].astype(str).tolist()
    translated = []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i + batch_size]
        translated_batch = translate_batch(batch, target_lang)
        translated.extend(translated_batch)

    translated_df[col] = translated

# Save output
translated_df.to_csv(output_path, index=False)
print(f"\n✅ Translated file saved to: {output_path}")

🔁 Translating column: en-fr


  0%|          | 0/7 [00:00<?, ?it/s]

Error: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}, retrying (1/3)...
Error: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}, retrying (2/3)...


  0%|          | 0/7 [00:07<?, ?it/s]


KeyboardInterrupt: 