In [1]:
import re
import torch
import pandas as pd

from tqdm import tqdm
from transformers import pipeline
from googletrans import Translator
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import MarianMTModel, MarianTokenizer

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training split to translate (path is relative to the notebook's working directory).
df = pd.read_csv("../dataset/cleaned_train.csv")
# Pretrained MarianMT checkpoint; the "ko-en" suffix suggests Korean -> English.
model_name = 'Helsinki-NLP/opus-mt-ko-en'
tokenizer = MarianTokenizer.from_pretrained(model_name)
# Load the weights and move them to the selected device.
model = MarianMTModel.from_pretrained(model_name).to(device)

In [None]:
def translate_sentences(sentences, model, tokenizer):
    """Translate several sentences with a single ``generate`` call.

    Args:
        sentences: List (or tuple) of source-language strings.
        model: Seq2seq model exposing ``generate`` and ``device``
            (a ``MarianMTModel`` in this notebook).
        tokenizer: Tokenizer matching ``model``.

    Returns:
        List of translated strings, one per input sentence, in input order.
        An empty list/tuple yields ``[]`` without touching the model.
    """
    if isinstance(sentences, (list, tuple)) and not sentences:
        # Nothing to translate: skip tokenization and generation, which are
        # not expected to cope with an empty batch.
        return []

    encoded = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
    # Send the inputs to wherever the model's weights live instead of relying
    # on a notebook-level `device` global that may be stale or mismatched.
    target_device = model.device
    encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}
    generated = model.generate(**encoded)
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

def translate_sentence(sentence, model, tokenizer):
    """Translate a single sentence.

    Args:
        sentence: Source-language string.
        model: Seq2seq model exposing ``generate`` and ``device``
            (a ``MarianMTModel`` in this notebook).
        tokenizer: Tokenizer matching ``model``.

    Returns:
        The decoded translation of the first (only) generated sequence.
    """
    encoded = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
    # Send the inputs to wherever the model's weights live instead of relying
    # on a notebook-level `device` global that may be stale or mismatched.
    target_device = model.device
    encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}
    generated = model.generate(**encoded)
    # Only the first generated sequence is decoded and returned.
    return tokenizer.decode(generated[0], skip_special_tokens=True)

def translation(dialogue):
    """Translate the utterances of a newline-separated dialogue.

    Each line containing ``': '`` is treated as ``"<speaker>: <utterance>"``:
    the part before the first ``': '`` is kept as-is and the rest is
    translated. Lines without ``': '`` are left untouched. All utterances are
    translated in one batch via ``translate_sentences`` using the
    notebook-level ``model`` and ``tokenizer``.

    Args:
        dialogue: Dialogue text with one turn per line.

    Returns:
        The dialogue with every utterance replaced by its translation.
        A dialogue with no ``': '`` line is returned unchanged without
        calling the translator.
    """
    lines = dialogue.split('\n')

    # Single pass: remember where each speaker turn sits and what it says.
    turns = []
    for idx, line in enumerate(lines):
        speaker, separator, utterance = line.partition(': ')
        if separator:
            turns.append((idx, speaker, utterance))

    if not turns:
        # No utterances to translate; avoid sending an empty batch downstream.
        return dialogue

    translated = translate_sentences([utterance for _, _, utterance in turns], model, tokenizer)

    # Translations come back in input order, so pair them with their turns
    # directly instead of repeatedly popping from the front of a list.
    for (idx, speaker, _), text in zip(turns, translated):
        lines[idx] = f"{speaker}: {text}"

    return '\n'.join(lines)

In [None]:
# Sanity check: print the first dialogue, then its translation.
example = df.iloc[0]['dialogue']
print(f"{example}\n\n")
print(translation(example))

In [None]:
# Translate every row of the current frame; results are collected in row order
# and attached as two new columns afterwards.
translated_dialogues, translated_summaries = [], []

row_pairs = zip(df['dialogue'], df['summary'])
for dialogue_text, summary_text in tqdm(row_pairs, total=len(df)):
    # Dialogues are translated turn by turn; summaries as a single sentence.
    translated_dialogues.append(translation(dialogue_text))
    translated_summaries.append(translate_sentence(summary_text, model, tokenizer))

df['translated_dialogue'] = translated_dialogues
df['translated_summary'] = translated_summaries

In [None]:
# Spot-check: print the translated dialogue and summary of the row at position 17.
print(df.iloc[17]['translated_dialogue'])
print(df.iloc[17]['translated_summary'])

In [None]:
# Display the column names (the translated_* columns are added by the translation cell).
df.columns

In [None]:
# Write the frame, including the translated columns, to CSV without the index column.
df.to_csv("../dataset/en_train.csv", index=False)

In [None]:
# Load the dev split. NOTE: this rebinds `df`, so the training frame is no longer reachable by that name.
df = pd.read_csv("../dataset/cleaned_dev.csv")

In [None]:
# Translate every row of the current frame; results are collected in row order
# and attached as two new columns afterwards.
translated_dialogues, translated_summaries = [], []

row_pairs = zip(df['dialogue'], df['summary'])
for dialogue_text, summary_text in tqdm(row_pairs, total=len(df)):
    # Dialogues are translated turn by turn; summaries as a single sentence.
    translated_dialogues.append(translation(dialogue_text))
    translated_summaries.append(translate_sentence(summary_text, model, tokenizer))

df['translated_dialogue'] = translated_dialogues
df['translated_summary'] = translated_summaries

In [None]:
# Write the frame, including the translated columns, to CSV without the index column.
df.to_csv("../dataset/en_dev.csv", index=False)

In [2]:
# Placeholder tags of the form `#...#`: the numbered speaker tags
# (#Person1# .. #Person7#) come first, followed by the remaining tags.
special_tokens = [f'#Person{number}#' for number in range(1, 8)] + [
    '#SSN#',
    '#Email#',
    '#Address#',
    '#Reaction#',
    '#CarNumber#',
    '#Movietitle#',
    '#DateOfBirth#',
    '#CardNumber#',
    '#PhoneNumber#',
    '#PassportNumber#',
]

In [3]:
# Read back the translated train/dev CSVs written by the earlier cells.
train_df = pd.read_csv("../dataset/en_train.csv")
valid_df = pd.read_csv("../dataset/en_dev.csv")

In [4]:
# Tag each row with its split (1 = train, 0 = dev) so the splits can be told apart after merging.
# Note: the flag column is added to train_df / valid_df in place.
train_df['is_train'] = 1
valid_df['is_train'] = 0
# Stack both splits into one frame with a fresh 0..n-1 index.
total_df = pd.concat([train_df, valid_df], ignore_index=True)

In [7]:
# Function that finds special-string patterns
def find_special_strings(text):
    """Return every ``#`` followed by word characters found in ``text``.

    The pattern is ``#\\w+``, so a closing ``#`` is not part of a match
    (``'#Person1#'`` is reported as ``'#Person1'``).

    Args:
        text: String to scan. Non-string values (e.g. the NaN pandas yields
            for an empty CSV field) are treated as containing no matches.

    Returns:
        List of matched substrings in order of appearance; ``[]`` when
        ``text`` is not a string.
    """
    if not isinstance(text, str):
        # re.findall raises TypeError on NaN/None; an absent text has no tags.
        return []
    return re.findall(r'#\w+', text)

# Extract the '#...' matches of every translated dialogue and summary.
total_df['dialogue_special_strings'] = total_df['translated_dialogue'].apply(find_special_strings)
total_df['summary_special_strings'] = total_df['translated_summary'].apply(find_special_strings)

# Flatten the per-row match lists into one set of distinct strings per column.
unique_dialogue_strings = {
    match
    for row_matches in total_df['dialogue_special_strings']
    for match in row_matches
}
unique_summary_strings = {
    match
    for row_matches in total_df['summary_special_strings']
    for match in row_matches
}

print(unique_dialogue_strings)
print(unique_summary_strings)

{'#Person1', '#This', '#Person6', '#1', '#사람1만기', '#Yes', '#or', '#All', '#Ratection', '#Person4', '#작은', '#Person3', '#in', '#I', '#5', '#How', '#Really', '#72', '#샐러드용', '#2', '#나', '#Oh', '#No', '#Person5', '#CardNumber', '#DeateOfBirth', '#right', '#고객님', '#Mobititle', '#Reaction', '#one', '#Adress', '#PhoneNumber', '#Pasport', '#Person', '#Rate', '#Camation', '#You', '#다음', '#CarNumber', '#Look', '#Person2', '#Addresss', '#여기서', '#Niel', '#어디', '#여기', '#Person7', '#time', '#Rection', '#잠깐만요', '#Rase', '#PhoneNomber', '#하지만', '#PoneNumber', '#Swice', '#Cliffs', '#Mobiettetle', '#karNumber', '#DateOfBirth', '#Moby', '#PasportNumber', '#B형', '#Email', '#페리에와', '#passport', '#Address', '#Mobiettele', '#SSN', '#Thank', '#Card', '#Hm'}
{'#Person1', '#Verson2', '#680', '#Serson1', '#1', '#Sfrights', '#Yon', '#aded', '#wanson1', '#The', '#I', '#Treate', '#Torress', '#Herson2', '#Painson1', '#Gerry', '#TM', '#Saturative', '#No', '#A', '#Amanda', '#Amenity', '#Secret', '#30', '#feson1', '#w