In [6]:
import pandas as pd
import re
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

In [7]:
# The patterns below are compiled once at definition time rather than on every call.
# Raw strings are used so that `\.`, `\(` and `\s` reach the regex engine verbatim.

# Wiki-style markup: runs of apostrophes, "== heading ==", __TOC__, file links, language links.
_REMOVE_CHARS = re.compile(r"'+|(=+.{2,30}=+)|__TOC__|(ファイル:).+|:(en|de|it|fr|es|kr|zh|no|fi):", re.UNICODE)
_SPACE_CHARS = re.compile(r"(\s|゙|゚|　)+", re.UNICODE)
# Deliberately NOT anchored with ^...$: addresses embedded in longer text must match too.
_EMAIL_PATTERN = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+", re.UNICODE)
_URL_PATTERN = re.compile(r"(ftp|http|https)?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", re.UNICODE)
_MULTIPLE_SPACES = re.compile(r" +", re.UNICODE)
# Everything except space, Hangul jamo/syllables, '+', '|' and '.' is blanked out.
_EXCEPT_KOREAN = re.compile(r"[^ ㄱ-ㅣ가-힣+|.]", re.UNICODE)
# A period that is preceded by a space (left over after other characters were blanked).
_DANGLING_PERIOD = re.compile(r" [.]")
_STOP_WORDS = frozenset(
    "등 등이 에 에서 와 과 은 는 의 개 개의 년간 로 을 를 하는 총 월 목표주가 현재주가 연결 요약 재무제표 십억원 만주 주 주가 배 년 일 만 억원 우 좌 원 억 백만 약 각각 으로 십 액면가 종가 자본금 발행주식수 시가총액 외국인지분율 일평균거래량 일평균거래대금 주가수익률 절대수익률 상대수익률 배당수익률".split(" ")
)


def preprocessing(data):
    """Clean raw text down to space-separated Korean tokens.

    Steps, in order:
      1. Delete line breaks (no space is inserted, so wrapped words are re-joined).
      2. Replace e-mail addresses, URLs and wiki-style markup with a space.
      3. Replace every character that is not a space, Hangul, '+', '|' or '.'
         with a space.
      4. Collapse whitespace runs into a single space.
      5. Delete every " ." sequence (a period preceded by a space).
      6. Drop tokens that appear in the stop-word list.

    Args:
        data: The raw text to clean.

    Returns:
        The cleaned text, with the remaining tokens joined by single spaces.
        Leading/trailing spaces produced by the cleaning are not stripped.
    """
    data = re.sub(r'\n', '', data)
    data = _EMAIL_PATTERN.sub(' ', data)   # remove e-mail addresses
    data = _URL_PATTERN.sub(' ', data)     # remove URLs
    data = _REMOVE_CHARS.sub(' ', data)    # remove wiki-style markup
    data = _EXCEPT_KOREAN.sub(' ', data)   # keep only Korean text and a few symbols
    data = _SPACE_CHARS.sub(' ', data)
    data = _MULTIPLE_SPACES.sub(' ', data)
    data = _DANGLING_PERIOD.sub('', data)
    kept_words = [word for word in data.split(' ') if word not in _STOP_WORDS]
    return " ".join(kept_words)

In [8]:
def read_pdf_PDFMINER(pdf_file_path, max_pages=1):
    """
    Extract text from the leading pages of a PDF file using pdfminer.

    By default only the first page is read; pass ``max_pages=None`` to read
    the whole document.
    https://pdfminersix.readthedocs.io/en/latest/tutorials/composable.html

    Args:
        pdf_file_path: Path to the PDF file, e.g. 'dir/aaa.pdf'.
        max_pages: Maximum number of pages to extract, counted from the first
            page. ``None`` means no limit.

    Returns:
        The extracted text as a single string.
    """
    output_string = StringIO()
    with open(pdf_file_path, 'rb') as f:
        parser = PDFParser(f)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        # The converter writes the text of every processed page into output_string.
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page_number, page in enumerate(PDFPage.create_pages(doc)):
            # Stop before processing once the requested number of pages was read.
            if max_pages is not None and page_number >= max_pages:
                break
            interpreter.process_page(page)

    return output_string.getvalue()

### Google 번역기 활용
`pip install googletrans==4.0.0-rc1` 명령으로 설치할 것

In [107]:
import googletrans


def back_translate_google(source_text, lang_list=("en", "zh-cn")):
    """Back-translate text through each pivot language with googletrans.

    For every language in ``lang_list`` the text is translated into that
    language and the result is translated back into Korean ("ko").

    Args:
        source_text: The text to back-translate.
        lang_list: Pivot language codes. The default is ("en", "zh-cn"); the
            original author noted that "ja" behaves similarly to "en" and
            "zh-tw" similarly to "zh-cn".

    Returns:
        A list with one back-translated Korean string per pivot language,
        in the same order as ``lang_list``.
    """
    translator = googletrans.Translator()
    back_trans_list = []

    # Supported language codes are listed in googletrans.LANGUAGES.
    for lang in lang_list:
        trans_text = translator.translate(source_text, dest=lang)
        back_trans_text = translator.translate(trans_text.text, dest="ko")
        back_trans_list.append(back_trans_text.text)

    return back_trans_list
    

### Naver 파파고 활용

In [116]:
import os
import sys
import urllib
import json


_PAPAGO_NMT_URL = "https://openapi.naver.com/v1/papago/n2mt"


def _papago_translate(text, source_lang, target_lang, client_id, client_secret):
    """Send one translation request to the Papago NMT endpoint and return the translated text."""
    # `import urllib` alone does not load these submodules, so import them explicitly.
    import urllib.parse
    import urllib.request

    # urlencode escapes every field, so '&' or '=' inside the text cannot break the form body.
    payload = urllib.parse.urlencode(
        {"source": source_lang, "target": target_lang, "text": text}
    ).encode("utf-8")

    request = urllib.request.Request(_PAPAGO_NMT_URL)
    request.add_header("X-Naver-Client-Id", client_id)
    request.add_header("X-Naver-Client-Secret", client_secret)

    with urllib.request.urlopen(request, data=payload) as response:
        rescode = response.getcode()
        if rescode != 200:
            raise RuntimeError("error Code:" + str(rescode))
        res_json = json.loads(response.read().decode("utf-8"))

    return res_json["message"]["result"]["translatedText"]


def back_translate_papago(source_text, target_lang="en", key_path="./translator_key.json"):
    """Back-translate Korean text through a pivot language using the Naver Papago API.

    The text is translated ko -> ``target_lang`` and then ``target_lang`` -> ko.
    The original text, the intermediate translation and the back-translation
    are printed, separated by dashed lines.

    Args:
        source_text: The Korean text to back-translate.
        target_lang: Pivot language code (default "en").
        key_path: Path to a UTF-8 JSON file with the keys "client_id" and
            "client_secret". For security, use your own personal Naver API key.

    Returns:
        The back-translated Korean text.

    Raises:
        RuntimeError: If the API answers with a status code other than 200.
    """
    with open(key_path, encoding="utf-8") as json_file:
        key_dict = json.load(json_file)

    client_id = key_dict["client_id"]
    client_secret = key_dict["client_secret"]

    # Korean -> pivot language
    trans_text = _papago_translate(source_text, "ko", target_lang, client_id, client_secret)
    # pivot language -> Korean
    back_trans_text = _papago_translate(trans_text, target_lang, "ko", client_id, client_secret)

    print(source_text)
    print("-" * 30)
    print(trans_text)
    print("-" * 30)
    print(back_trans_text)

    return back_trans_text


In [1]:
def main():
    """Load the strongbuy, sell and holddown CSV files from ./data and print the strongbuy frame."""
    csv_paths = (
        "./data/strongbuy.csv",
        "./data/sell.csv",
        "./data/holddown.csv",
    )
    # All three files are read, in the listed order, before anything is printed.
    origin_strongbuy, origin_sell, origin_holddown = (
        pd.read_csv(csv_path) for csv_path in csv_paths
    )

    print(origin_strongbuy)

    # NOTE: back-translation (back_translate_papago) is not wired in here yet.


if __name__ == "__main__":
    main()

NameError: name 'pd' is not defined