### ЛАБОРАТОРНА РОБОТА 2

#### 1. Вибір задачі та датасету

https://huggingface.co/datasets/Helsinki-NLP/opus-100

In [31]:
from datasets import load_dataset
import pandas as pd

language_pairs = ["en-fr", "en-fi", "ar-en", "en-hi", "en-zh"]
subset_size = 1000  # number of training samples per pair

data = pd.DataFrame()

for pair in language_pairs:
    print(f"Loading {pair}...")
    ds = load_dataset("Helsinki-NLP/opus-100", pair, split=f"train[:{subset_size}]")
    data[pair] = ds['translation']



Loading en-fr...
Loading en-fi...
Loading ar-en...
Loading en-hi...
Loading en-zh...


#### 2. Аналіз даних та метрик

In [18]:
data.describe()

Unnamed: 0,en-fr,en-fi,ar-en,en-hi,en-zh
count,1000,1000,1000,1000,1000
unique,985,981,992,994,995
top,"{'en': 'Thank you.', 'fr': 'Merci.'}","{'en': 'Thank you.', 'fi': 'Kiitos.'}","{'ar': 'حسناً؟', 'en': 'Okay?'}",{'en': 'Failed to decrypt MIME part: protocol ...,"{'en': 'Introduction', 'zh': '一. 导言'}"
freq,6,7,5,2,4


In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   en-fr   1000 non-null   object
 1   en-fi   1000 non-null   object
 2   ar-en   1000 non-null   object
 3   en-hi   1000 non-null   object
 4   en-zh   1000 non-null   object
dtypes: object(5)
memory usage: 39.2+ KB


In [20]:
data.isnull().sum()

en-fr    0
en-fi    0
ar-en    0
en-hi    0
en-zh    0
dtype: int64

In [37]:
from sklearn.model_selection import train_test_split


train_df, val_df = train_test_split(data, test_size=0.2)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

en_only = val_df.map(lambda x: x.get('en') if isinstance(x, dict) else None)
en_only.to_csv('opus100_en_val.csv', index=False)

en_except = val_df.map(lambda x: [x[d] for d in x.keys() if d != 'en'][0] if isinstance(x, dict) else None)
en_except.to_csv('opus100_notEn_val.csv', index=False)

In [38]:
en_except

Unnamed: 0,en-fr,en-fi,ar-en,en-hi,en-zh
0,J'aimerais le savoir.,- Vaikuttavaa.,نعم...,पूर्ववर्ती संस्करण से श्रेणीविस्तार असफल:,但对他的船员所知不多
1,"Starbuck, ici le Galactica. La piste est à vous.",Tulin maapallon toiselta puolelta... nähdäksen...,كيف حياة الزواج ؟,सब बदलें,我要那批货... 明天上船
2,Dis le.,"Arrow Fasteners Co. Ltd, Taipei 15,2 %, —",وذلك التعهد لا يعيد تأكيد إيماننا بالأمم المتح...,दिन@ info: whatsthis,年轻的好少年
3,C'est ma mère.,Albert.,لله ملك السماوات والأرض وما فيهما ملكًا وتدبير...,ईंट का सत्ता,荷兰（2004年4月30日）
4,Financement de la Base de soutien logistique d...,Haluan sen takaisin.,أنت في المراحل الأولى من الدخول في صدمة,फ़ाइल सूची को भी गोपित करें (_E),PHP 忒聊
...,...,...,...,...,...
195,Je l'espère bien.,Sarja Portti,سيد روس,केडीई यूएसबी प्रदर्शक,两种符号联接 可提高你个人的力量
196,Qu'est-ce que c'est que ça ?,"Huomaan, että tämä on sinulle tärkeää.",ما الذي تفعله هناك؟,जीवनसाथी,上帝在挑战你，他说你是笨蛋
197,Le cours est fini.,- Kutsuin erään auttamaan.,خذ وقتك,फ़ोल्डर नहीं खोल सका: अनुमति नहीं है,喔唔.
198,"Oh, on ne vous l'a pas dit ?",Pilvipää ulkona?,هـذا الطـريــق هـو جـزءٌ مـن قـصـة حـبـهــم,"DSUM( A1: C5; ""वेतन""; A9: A11)",- 滚开！


Метрики для оцінки моделей

In [22]:
from evaluate import load

prediction = ["The relationship between cats and dogs is not exactly friendly.", "a good bookshop is just a genteel black hole that knows how to read."]
reference = [["The relationship between dogs and cats is not exactly friendly.", ], ["A good bookshop is just a genteel Black Hole that knows how to read."]]

chrf = load("chrf")
# chrF (character n-grams) - word_order=0, chrF++ (char + word n-grams) - word_order=2.
chrf_results = chrf.compute(predictions=prediction, references=reference)
chrfplus_results = chrf.compute(predictions=prediction, references=reference, word_order=2)

print('chrF:', chrf_results)
print('chrF++:', chrfplus_results)

chrF: {'score': 84.64214891738334, 'char_order': 6, 'word_order': 0, 'beta': 2}
chrF++: {'score': 82.87263732906315, 'char_order': 6, 'word_order': 2, 'beta': 2}


In [23]:
from evaluate import load
import numpy as np

#B ilingual Evaluation Understudy
bleu = load("bleu")

data_example = {
    "en-fr": {
        "predictions": ["Le temps est agréable aujourd'hui.", "Elle aime lire des livres."],
        "references": [["Il fait beau aujourd'hui."], ["Elle adore lire des livres."]],
    },
    "en-ar": {
        "predictions": ["الطقس جميل اليوم.", "هي تحب قراءة الكتب."],
        "references": [["إنه يوم جميل."], ["إنها تحب قراءة الكتب."]],
    },
    "en-zh": {
        "predictions": ["今天天气很好。", "她喜欢看书。"],
        "references": [["今天的天气很好。"], ["她喜欢读书。"]],
    }
}

scores = []
for lang_pair, values in data_example.items():
    result = bleu.compute(predictions=values["predictions"], references=values["references"])
    print(f"{lang_pair} BLEU: {result['bleu']:.2f}")
    scores.append(result["bleu"])

macro_avg_bleu = np.mean(scores)
print(f"\nMacro-Averaged BLEU: {macro_avg_bleu:.2f}")


en-fr BLEU: 0.31
en-ar BLEU: 0.44
en-zh BLEU: 0.00

Macro-Averaged BLEU: 0.25


Бенчмарки

#### 3. Експериментальна частина


Бейслайн

In [39]:
%pip install openai

Collecting openai
  Downloading openai-1.78.1-py3-none-any.whl.metadata (25 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Downloading pydantic-2.11.4-py3-none-any.whl.metadata (66 kB)
Collecting annotated-types>=0.6.0 (from pydantic<3,>=1.9.0->openai)
  Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.33.2 (from pydantic<3,>=1.9.0->openai)
  Downloading pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting typing-extensions<5,>=4.11 (from openai)
  Downloading typing_extensions-4.13.2-py3-none-any.whl.metadata (3.0 kB)
Collecting typing-inspection>=0.4.0 (from pydantic<3,>=1.9.0->openai)
  Downloading typing_inspection-0.4.0-py3-none-any.

In [None]:
import json

creds["OPENAI_API_KEY"] 

'lol'

In [None]:
import json
import pandas as pd
import openai
import time

# Set your API key securely
with open('creds.json') as f:
    creds = json.load(f)

client = openai.OpenAI(api_key=creds['OPENAI_API_KEY'])  # Replace with your actual key or use os.environ

# Load the CSV
input_path = "opus100_en_val.csv"
output_path = "translated_opus100.csv"
df = pd.read_csv(input_path)

# Map of column names to target languages
language_map = {
    "en-fr": "French",
    "en-fi": "Finnish",
    "ar-en": "Arabic",
    "en-hi": "Hindi",
    "en-zh": "Chinese"
}

# Function to translate a single sentence
def translate(text, target_language, retries=3):
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": f"Translate the following sentence to {target_language}."},
                    {"role": "user", "content": text}
                ],
                temperature=0
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            print(f"Error: {e}, retrying ({attempt+1}/{retries})...")
            time.sleep(2)
    return f"[Translation failed: {text}]"

# Translate each column
translated_df = pd.DataFrame()
for col in df.columns:
    print(f"Translating column: {col}")
    target_language = language_map.get(col, "French")  # default to French if missing
    translated_df[col] = df[col].apply(lambda x: translate(str(x), target_language))

# Save to CSV
translated_df.to_csv(output_path, index=False)
print(f"✅ Translated CSV saved to: {output_path}")

Translating column: en-fr
Error: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742
, retrying (1/3)...
Error: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742
, retryi

KeyboardInterrupt: 