In [10]:
# Install the notebook's third-party dependencies with uv, one package per command.
# Only `accelerate` carries a version constraint; every other package is unpinned,
# so a fresh install may resolve to different versions than an earlier run did.
!uv pip install -qU sentence-transformers
!uv pip install -q transformers
!uv pip install -q datasets 
!uv pip install -q ipywidgets
!uv pip install -q pandas 
!uv pip install -q 'accelerate>=0.26.0'
!uv pip install -q openai
!uv pip install -q python-dotenv

#### Translating mteb/sts17-crosslingual-sts

We will use OpenAI's `gpt-4o-mini` to machine-translate the `en-en` pairs into `ua-ua` and `en-ua` pairs. For additional research, the benchmark may also be translated into languages that are close or similar to Ukrainian.


#### Define OpenAI client and test API key 

In [35]:
import os 

from openai import AsyncOpenAI
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()
# No API key is passed explicitly, so the client is expected to pick up its
# credentials from the environment -- presumably OPENAI_API_KEY; confirm.
client = AsyncOpenAI()

In [30]:
# Smoke test: send one short prompt to confirm that the client and API key work.
# Top-level `await` is used directly in the cell, so this must run in a kernel
# that supports awaiting at cell level (as the notebook kernel does here).
res = await client.responses.create(
    model='gpt-4o-mini',
    instructions='You are a helpful AI assistant.',
    input='Briefly answer: "What is Odessa?"'
)
# Display the text of the model's reply as the cell output.
res.output_text

'Odessa is a major port city on the Black Sea in southern Ukraine. It is known for its rich history, diverse culture, and significant role in trade and shipping. Odessa is also famous for its architecture, including the Potemkin Stairs, and its vibrant arts scene.'

#### Load dataset

In [31]:
from datasets import load_dataset

# Fetch the English-English test split of the STS17 cross-lingual benchmark.
dataset = load_dataset(
    path='mteb/sts17-crosslingual-sts',
    name='en-en',
    split='test',
)

# Switch the dataset's output format to pandas so that slicing it returns a
# DataFrame, then materialise every row into `df`.
dataset.set_format('pandas')
df = dataset[:]

# Summarise columns, dtypes and null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sentence1  250 non-null    object 
 1   sentence2  250 non-null    object 
 2   score      250 non-null    float64
 3   lang       250 non-null    object 
dtypes: float64(1), object(3)
memory usage: 7.9+ KB


In [19]:
# Preview the first rows of the source (en-en) sentence pairs.
df.head()

Unnamed: 0,sentence1,sentence2,score,lang
0,A person is on a baseball team.,A person is playing basketball on a team.,2.4,en-en
1,Our current vehicles will be in museums when e...,The car needs to some work,0.2,en-en
2,A woman supervisor is instructing the male wor...,A woman is working as a nurse.,1.0,en-en
3,A bike is next to a couple women.,A child next to a bike.,2.0,en-en
4,The group is eating while taking in a breathta...,A group of people take a look at an unusual tree.,2.2,en-en


#### Translation 

In [55]:
async def translate(
    client: AsyncOpenAI,
    sentence: str,
    language: str,
    model: str = 'gpt-4o-mini',
) -> str:
    '''Translate ``sentence`` into ``language`` using an OpenAI model.

    Args:
        client: Asynchronous OpenAI client used to issue the request.
        sentence: Source text to translate.
        language: Name of the target language, e.g. ``'Ukrainian'``.
        model: Model identifier passed to the API. Defaults to
            ``'gpt-4o-mini'``, the model this function originally hard-coded.

    Returns:
        The translated text with surrounding whitespace removed. Blank input
        is returned unchanged without calling the API.
    '''
    # Nothing to translate: skip the round-trip instead of letting the model
    # invent text for an empty prompt.
    if not sentence.strip():
        return sentence

    res = await client.responses.create(
        model=model,
        # The result goes straight into a benchmark dataset, so explicitly ask
        # for the bare translation: no quotes, no notes, and no answering of
        # sentences that happen to be questions or instructions.
        instructions=(
            f'You are a helpful AI assistant. Please translate the following sentence into {language} language, '
            'using the most natural and fluent wording a native speaker would use. '
            'Return only the translation, with no quotes, notes, or explanations.'
        ),
        input=sentence,
    )
    # Models occasionally pad the reply with leading/trailing whitespace or a
    # final newline; keep that out of the dataset.
    return res.output_text.strip()


In [56]:
# Target language name; it is passed to `translate` and interpolated into the prompt.
LANGUAGE = 'Ukrainian'

**Translate `sentence1` column asynchronously**

In [57]:
# One pending translation per `sentence1` entry. Calling `translate` only
# creates coroutine objects here; no request is sent until they are awaited.
tasks = [translate(client, text, LANGUAGE) for text in df['sentence1']]
len(tasks)

250

In [58]:
import time 
import asyncio 

# asynchronously translate 250 sentences!
start = time.time()
sentence1 = await asyncio.gather(*tasks)
end = time.time()

print(f'Translation complete in: {end - start}')

Translation complete in: 8.253586769104004


In [59]:
# Sanity check: how many translations came back, then a peek at the first three.
print(len(sentence1))
sentence1[:3]

['Людина є в бейсбольній команді.',
 'Наші нинішні автомобілі опиняться в музеях, коли у всіх будуть свої власні літаки.',
 'Жінка-наставник дає вказівки чоловікам-робітникам.']

**Translate `sentence2` column asynchronously**

In [61]:
# One pending translation per `sentence2` entry. Calling `translate` only
# creates coroutine objects here; no request is sent until they are awaited.
tasks = [translate(client, text, LANGUAGE) for text in df['sentence2']]
len(tasks)

250

In [62]:
import time 
import asyncio 

# asynchronously translate 250 sentences!
start = time.time()
sentence2 = await asyncio.gather(*tasks)
end = time.time()

print(f'Translation complete in: {end - start}')

Translation complete in: 7.393270492553711


In [63]:
# Sanity check: how many translations came back, then a peek at the first three.
print(len(sentence2))
sentence2[:3]

250


['Людина грає у баскетбол у команді.',
 'Автомобіль потребує ремонту.',
 'Жінка працює медсестрою.']

#### Update dataframes

In [64]:
import pandas as pd

# ua-ua: both sides of each pair are the translated sentences; the similarity
# scores are carried over unchanged from the English source frame.
# NOTE(review): 'ua' is the country code; the ISO 639-1 language code for
# Ukrainian is 'uk' -- confirm which label convention is intended.
ua_ua_df = pd.DataFrame(
    {
        'score': df['score'],
        'sentence1': sentence1,
        'sentence2': sentence2,
        'lang': 'ua-ua',
    }
)

ua_ua_df.head()

Unnamed: 0,score,sentence1,sentence2,lang
0,2.4,Людина є в бейсбольній команді.,Людина грає у баскетбол у команді.,ua-ua
1,0.2,"Наші нинішні автомобілі опиняться в музеях, ко...",Автомобіль потребує ремонту.,ua-ua
2,1.0,Жінка-наставник дає вказівки чоловікам-робітни...,Жінка працює медсестрою.,ua-ua
3,2.0,Поруч із кількома жінками стоїть велосипед.,Дитина біля велосипеда.,ua-ua
4,2.2,"Група їсть, насолоджуючись захоплюючим видом.",Група людей дивиться на незвичайне дерево.,ua-ua


In [65]:
# en-ua: the original English `sentence1` is paired with the translated
# `sentence2`; the similarity scores are carried over unchanged.
# NOTE(review): 'ua' is the country code; the ISO 639-1 language code for
# Ukrainian is 'uk' -- confirm which label convention is intended.
en_ua_df = pd.DataFrame(
    {
        'score': df['score'],
        'sentence1': df['sentence1'],  # en
        'sentence2': sentence2,  # ua
        'lang': 'en-ua',
    }
)

en_ua_df.head()

Unnamed: 0,score,sentence1,sentence2,lang
0,2.4,A person is on a baseball team.,Людина грає у баскетбол у команді.,en-ua
1,0.2,Our current vehicles will be in museums when e...,Автомобіль потребує ремонту.,en-ua
2,1.0,A woman supervisor is instructing the male wor...,Жінка працює медсестрою.,en-ua
3,2.0,A bike is next to a couple women.,Дитина біля велосипеда.,en-ua
4,2.2,The group is eating while taking in a breathta...,Група людей дивиться на незвичайне дерево.,en-ua


#### Save translated datasets

In [69]:
# `to_csv` does not create missing parent directories, so make sure the output
# folder exists first; exist_ok=True keeps re-runs from failing.
os.makedirs('./datasets', exist_ok=True)

# index=False keeps the pandas row index out of the saved files.
en_ua_df.to_csv('./datasets/sts17-en-ua-gpt-4o-mini.csv', index=False)
ua_ua_df.to_csv('./datasets/sts17-ua-ua-gpt-4o-mini.csv', index=False)