## Ray

In [None]:
import ray
import kss
from glob import glob
from itertools import chain

In [None]:
def sorted_list(path_list):
    """Return a new list of paths ordered by length, then alphabetically.

    Shorter strings come first; strings of equal length keep ascending
    lexicographic order. For names that differ only in a numeric suffix
    this places e.g. ``x_2.txt`` before ``x_10.txt``.
    """
    return sorted(path_list, key=lambda path: (len(path), path))

In [None]:
def post_txt_file_path_list(corpus_list):
    """Derive the output paths that correspond to the given input paths.

    Each path is copied with every occurrence of the substring "pro"
    replaced by "post" (not only the first one), so callers should make
    sure "pro" appears only where the swap is wanted.
    """
    post_corpus_list = []
    for pro_path in corpus_list:
        post_corpus_list.append(pro_path.replace("pro", "post"))
    return post_corpus_list

In [None]:
def make_pro_post_txt_file_path_list(pro_corpus_path):
    """Collect the input files matching a glob pattern and their output paths.

    Args:
        pro_corpus_path: glob pattern selecting the input text files.

    Returns:
        A tuple ``(input_paths, output_paths)``: the matched paths ordered by
        ``sorted_list`` and the paths ``post_txt_file_path_list`` derives
        from them, in the same order.
    """
    matched_paths = sorted_list(glob(pro_corpus_path))
    return matched_paths, post_txt_file_path_list(matched_paths)

In [None]:
# Glob pattern for the input corpus text files, and the resulting
# (input paths, output paths) lists used by the cells below.
pro_corpus_path = "AIHUB_corpus/exploration/automatic_patent_classification_data_pro/AIHUB_automatic_patent_classification_data_" + "*.txt"
pro_total_corpus_path_list, post_total_corpus_path_list = make_pro_post_txt_file_path_list(pro_corpus_path)

In [None]:
# Sanity check: how many input files the glob pattern matched.
len(pro_total_corpus_path_list)

In [None]:
# Start Ray with num_cpus=4 before any remote function is defined or called.
ray.init(num_cpus = 4)

@ray.remote
def preprocessing_text(source):
    """Strip quote characters from a text and split it into lines.

    Runs as a Ray remote task. Every single quote (') and double quote (")
    is removed, then the text is split on "\\n".

    Args:
        source: the text to clean.

    Returns:
        A list of the resulting line strings (a one-element list when
        ``source`` contains no newline).
    """
    unquoted = source.replace("'", "").replace("\"", "")
    return unquoted.split("\n")

In [None]:
def preprocessing_corpus_txt(pro_total_corpus_path_list, post_total_corpus_path_list):
    """Clean every input corpus file with Ray and append the result to its output file.

    For each (input, output) pair the input file is read as UTF-8, split into
    lines, and the lines are sent to ``preprocessing_text`` in batches of
    ``process_num`` remote tasks. The cleaned sentences are joined with "\\n"
    and appended to the output file. Progress is printed as a running count of
    finished files.

    Every line of the input is processed exactly once and in order, including
    files shorter than two batches and the trailing partial batch.

    Args:
        pro_total_corpus_path_list: paths of the input text files.
        post_total_corpus_path_list: paths of the output text files, aligned
            index-by-index with the input list.
    """
    print("[Size]")
    print("The number of preprocessing corpus: " + str(len(pro_total_corpus_path_list)))
    print("\n[Order]")
    process_num = 10  # number of remote tasks submitted per batch

    path_pairs = zip(pro_total_corpus_path_list, post_total_corpus_path_list)
    for num, (pro, post) in enumerate(path_pairs, start=1):

        # Read everything up front so the input file is closed before the
        # (potentially slow) remote processing starts.
        with open(pro, 'r', encoding='utf-8') as f:
            lines = f.read().splitlines()

        sentence_list = []

        # Step through the file one batch at a time; the final slice is simply
        # shorter when the line count is not a multiple of process_num, so no
        # separate tail handling is needed and no batch result is overwritten.
        for start_line in range(0, len(lines), process_num):
            batch = lines[start_line:start_line + process_num]
            futures = [preprocessing_text.remote(line) for line in batch]
            # ray.get returns results in the order of the submitted futures,
            # which keeps the sentences in their original order.
            for result in ray.get(futures):
                sentence_list.extend(result)

        print(str(num), end=" ")

        # Append mode is kept from the original: existing output is extended,
        # not replaced.
        with open(post, 'a', encoding='utf-8') as fp:
            fp.write("\n".join(sentence_list))

In [None]:
# Stop the Ray runtime.
# NOTE(review): this cell is placed before the cell that calls
# preprocessing_corpus_txt; if the cells are run top to bottom, Ray is shut
# down before the remote tasks are submitted — confirm the intended order.
ray.shutdown()

In [None]:
# Run the preprocessing over every (input path, output path) pair.
preprocessing_corpus_txt(pro_total_corpus_path_list, post_total_corpus_path_list)

### Reference

<b>Blog</b>
<br>[BERT를 이용한 한국어 띄어쓰기 모델 만들기 - 01. 데이터 준비](https://bhchoi.github.io/post/nlp/dev/bert_korean_spacing_01/)