In [1]:
import pandas as pd
import re
from io import StringIO
import googletrans
import os
import sys
import urllib
import json
from tqdm import tqdm

In [2]:
def back_translate_google(source_text, target_lang="en"):
    """Back-translate text with googletrans, pivoting through ``target_lang``.

    The text is translated into ``target_lang`` first; that intermediate
    result is then translated into Korean ("ko").

    Args:
        source_text: Text to back-translate.
        target_lang: Language code used for the intermediate translation.

    Returns:
        The ``text`` attribute of the second (to-Korean) translation result.
    """
    # Author's note: en and ja behave similarly, as do zh-cn and zh-tw,
    # so ["en", "zh-cn"] was the pivot list under consideration.
    client = googletrans.Translator()

    pivot = client.translate(source_text, dest=target_lang)
    restored = client.translate(pivot.text, dest="ko")
    return restored.text
    

In [11]:
def _papago_request(url, client_id, client_secret, source_lang, target_lang, text):
    """POST one translation request to ``url`` and return the translated text.

    The body is sent as ``source=..&target=..&text=..`` with the text
    percent-encoded, and the two Naver credential headers are attached.
    On a non-200 status the code is printed and the process exits with 1.
    """
    # Import the submodules explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` / ``urllib.request`` are loaded.
    import urllib.parse
    import urllib.request

    payload = f"source={source_lang}&target={target_lang}&text=" + urllib.parse.quote(text)

    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id", client_id)
    request.add_header("X-Naver-Client-Secret", client_secret)

    # The context manager closes the HTTP response even on the exit path.
    with urllib.request.urlopen(request, data=payload.encode("utf-8")) as response:
        rescode = response.getcode()
        if rescode != 200:
            # rescode is an int; it must be converted before concatenation.
            print("error Code:" + str(rescode))
            sys.exit(1)
        response_body = response.read()

    res_json = json.loads(response_body.decode("utf-8"))
    return res_json["message"]["result"]["translatedText"]


def back_translate_papago(source_text, target_lang="en"):
    """Back-translate Korean text through the Naver Papago NMT endpoint.

    The text is sent as Korean ("ko") to be translated into ``target_lang``,
    and that result is sent back to be translated into Korean. The source,
    intermediate and final texts are printed, separated by dashed lines.

    Credentials are read from ``./data/translator_key.json``, which must
    contain the keys ``client_id`` and ``client_secret``.

    Args:
        source_text: Text to back-translate.
        target_lang: Language code used for the intermediate translation.

    Returns:
        The back-translated text taken from the second response.

    Raises:
        SystemExit: With code 1 when a response status is not 200.
    """
    # For security, use your own personal Naver API key.
    with open("./data/translator_key.json", encoding="utf-8") as key_file:
        key_dict = json.load(key_file)

    client_id = key_dict["client_id"]
    client_secret = key_dict["client_secret"]

    url = "https://openapi.naver.com/v1/papago/n2mt"

    # Korean -> foreign language
    trans_text = _papago_request(url, client_id, client_secret, "ko", target_lang, source_text)

    # Foreign language -> Korean
    back_trans_text = _papago_request(url, client_id, client_secret, target_lang, "ko", trans_text)

    print(source_text)
    print("-" * 30)
    print(trans_text)
    print("-" * 30)
    print(back_trans_text)

    return back_trans_text


In [4]:
def max_len_slicing(input_str, max_len=1021):
    """Return at most the first ``max_len`` items of ``input_str``.

    Args:
        input_str: Sequence to truncate (a string in this notebook).
        max_len: Maximum number of leading items to keep. Defaults to 1021,
            the limit originally hard-coded here (presumably chosen to fit a
            downstream length limit -- TODO confirm).

    Returns:
        ``input_str`` unchanged when it is no longer than ``max_len``,
        otherwise its first ``max_len`` items.

    Raises:
        ValueError: If ``max_len`` is negative; a negative slice bound would
            silently trim from the end instead of capping the length.
    """
    if max_len < 0:
        raise ValueError("max_len must be non-negative")
    return input_str[:max_len]

In [5]:
def back_trans_list(original_text_list, domain="google", lang='en'):
    """Back-translate every text in a list with the chosen backend.

    Args:
        original_text_list: Iterable of texts to back-translate.
        domain: Backend selector, either "papago" or "google".
        lang: Language code passed to the backend as ``target_lang``.

    Returns:
        A list with one back-translated text per input, in input order.
        If ``domain`` is neither "papago" nor "google", a message is printed
        and -1 is returned as soon as the first item is reached (an empty
        input therefore still yields an empty list).
    """
    translated = []
    for text in tqdm(original_text_list, ncols=100):
        # Guard clause: reject unknown backends before doing any work.
        if domain not in ("papago", "google"):
            print("Invalid Domain for back_translation.")
            return -1

        # Only the selected backend name is looked up.
        backend = back_translate_papago if domain == "papago" else back_translate_google
        translated.append(backend(text, target_lang=lang))
    return translated

In [14]:
def main(domain="google", data_dir="./data", langs=("zh-cn", "en")):
    """Expand the three labelled article datasets by back-translation.

    Reads ``strongbuy.csv``, ``sell.csv`` and ``holddown.csv`` from
    ``data_dir``. For every pivot language in ``langs`` it back-translates
    the ``article`` column and writes
    ``{label}_expand_{domain}_{lang without '-'}.csv`` next to the inputs.

    Each output file is written as soon as its translation finishes, so a
    failure partway through does not discard the files already completed.

    Args:
        domain: Backend passed to ``back_trans_list`` ("google" or "papago").
        data_dir: Directory holding the input CSVs and receiving the outputs.
        langs: Pivot language codes to back-translate through, in order.

    Raises:
        ValueError: If ``back_trans_list`` does not return a list (it returns
            -1 for an unknown ``domain``); otherwise that sentinel would be
            broadcast into the whole ``article`` column.
    """
    labels = ("strongbuy", "sell", "holddown")

    # data load
    frames = {label: pd.read_csv(f"{data_dir}/{label}.csv") for label in labels}

    # Keep the untouched source articles so every pivot language starts from
    # the original text rather than from a previous back-translation.
    articles = {label: list(frames[label]["article"]) for label in labels}

    for lang in langs:
        # File names use the language code without the dash ("zh-cn" -> "zhcn").
        suffix = lang.replace("-", "")
        for label in labels:
            translated = back_trans_list(articles[label], domain=domain, lang=lang)
            if not isinstance(translated, list):
                raise ValueError(f"Back-translation failed for domain {domain!r}.")

            # assign() builds a new frame, leaving the loaded data unmodified.
            expanded = frames[label].assign(article=translated)
            expanded.to_csv(f"{data_dir}/{label}_expand_{domain}_{suffix}.csv", index=False)


if __name__ == "__main__":
    main()

100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.48it/s]

이번엔 왜 안늘어나는거지?
------------------------------
Why isn't it increasing this time?
------------------------------
이번에는 왜 안 올라가요?
['이번에는 왜 안 올라가요?']



