In [1]:
import os
os.environ["OPENAI_API_KEY"] = 'API_KEY'

import random
import pandas as pd

from openai import OpenAI

In [2]:
client = OpenAI()
train_df = pd.read_csv('../dataset/en_train.csv')
valid_df = pd.read_csv('../dataset/en_dev.csv')
test_df = pd.read_csv('../dataset/en_test.csv')

train_df['is_train'] = 0
valid_df['is_train'] = 1
test_df['is_train'] = 2

total_df = pd.concat([train_df, valid_df, test_df])
print(total_df.shape)

(14460, 4)


In [5]:
content = "Translate the following English sentence into conversational Korean. \
           Ensure that the special strings like #string# are not translated. \
           Names of people (e.g., Mr. Smith) and places (e.g., Hawkins) can be translated into Korean. \
           Please make korean sentence very naturally."

def translate_text_with_gpt(text):
    try:
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": content},
                {"role": "user", "content": text}
            ]
        )
        translation = completion.choices[0].message.content.strip()
        return translation
    except Exception as e:
        print(f"Error during translation: {e}")
        return text

In [6]:
idx = random.randint(0, total_df.shape[0]-1)
print(f"{idx:>08}")
sample_dialogue = total_df.loc[idx, 'dialogue']
sample_summary = total_df.loc[idx, 'summary']

translated_dialogue = translate_text_with_gpt(sample_dialogue)
translated_summary = translate_text_with_gpt(sample_summary)

print("Original Dialogue:\n",sample_dialogue,"\n")
print("Translated Dialogue:\n",translated_dialogue,"\n")

print("Original Summary:\n",sample_summary,"\n")
print("Translated Summary:\n",translated_summary,"\n")

00002546
Original Dialogue:
 #Person1#: What are you doing here today?
#Person2#: I would like to fight my ticket.
#Person1#: Is the arresting officer in the courtroom today?
#Person2#: Yes, the officer is here today.
#Person1#: What's your case?
#Person2#: I was pulled over for running a red light, but I never did.
#Person1#: Do you believe that the officer lied?
#Person2#: There was a camera on the signal. So, he was obviously lying.
#Person1#: Your license plate was caught on camera?
#Person2#: A picture of my license plate was never taken.
#Person1#: Since there is no picture of your license plate on record, I'm going to let you go.
#Person2#: Thank you. I'm glad that you believe me. 

Translated Dialogue:
 #Person1#: 오늘 여기서 뭐 하고 있어요?
#Person2#: 티켓을 취소하려고 왔어요.
#Person1#: 체포한 경찰관이 오늘 법정에 와 있나요?
#Person2#: 네, 오늘 경찰관이 여기 있어요.
#Person1#: 사건이 뭐예요?
#Person2#: 저는 빨간 신호등을 무시했다고 잡혔는데, 사실 그렇지 않았어요.
#Person1#: 경찰관이 거짓말을 했다고 생각해요?
#Person2#: 신호등에 카메라가 있었어요. 그래서 그가 분명히 거짓말했어요.
#Person1#: 당신의 차량

In [7]:
total_df['dialogue_ko'] = total_df['dialogue'].apply(translate_text_with_gpt)
total_df['summary_ko'] = total_df['summary'].apply(translate_text_with_gpt)

KeyboardInterrupt: 

In [8]:
print(total_df.shape)

(14460, 4)


In [None]:
train_df = total_df[total_df['is_train'] == 0].drop(columns=['is_train'])
valid_df = total_df[total_df['is_train'] == 1].drop(columns=['is_train'])
test_df = total_df[total_df['is_train'] == 2].drop(columns=['is_train'])

train_output_path = '../dataset/translated_train.csv'
valid_output_path = '../dataset/translated_valid.csv'
test_output_path = '../dataset/translated_test.csv'

train_df.to_csv(train_output_path, index=False)
valid_df.to_csv(valid_output_path, index=False)
test_df.to_csv(test_output_path, index=False)