In [2]:
import re
from gensim.utils import to_unicode

# Pre-compiled regular expressions used by `tokenize`.
# All patterns are raw strings so that regex escapes such as `\.` and `\(` are
# not first interpreted (and warned about) as invalid string escapes.

# Wiki-markup residue that is replaced by a space: runs of apostrophes,
# `== heading ==` markers, `__TOC__`, a `ファイル:` prefix plus the rest of its
# line, `:xx:` language-code prefixes, and newlines.
# NOTE(review): this pattern matches the Japanese prefix `ファイル:` while
# WIKI_REMOVE_TOKEN_CHARS below matches the Korean `파일:` -- confirm which one
# is intended for the corpus being processed.
WIKI_REMOVE_CHARS = re.compile(r"'+|(=+.{2,30}=+)|__TOC__|(ファイル:).+|:(en|de|it|fr|es|kr|zh|no|fi):|\n", re.UNICODE)

# Runs of whitespace (plus a few extra characters treated as spacing) that are
# collapsed into a single space.
WIKI_SPACE_CHARS = re.compile(r"(\s|゙|゚|　)+", re.UNICODE)

# E-mail addresses anywhere in the text.
# The previous pattern was wrapped in `^...$`, so it could only match when the
# *entire* input was a single address and never removed addresses embedded in
# an article. The anchors are gone; the negative lookbehind only prevents the
# engine from retrying a match in the middle of a long local-part-like run,
# it does not change which addresses are matched.
EMAIL_PATTERN = re.compile(
    r"((?<![a-zA-Z0-9_.+-])[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+)",
    re.UNICODE,
)

# URLs with an optional ftp/http/https scheme followed by `://`.
URL_PATTERN = re.compile(r"(ftp|http|https)?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", re.UNICODE)

# Per-token clean-up: a trailing `*` or `:`, a whole token starting with
# `파일:`, or a leading `;` is removed.
WIKI_REMOVE_TOKEN_CHARS = re.compile(r"(\*$|:$|^파일:.+|^;)", re.UNICODE)

# One or more consecutive ASCII spaces.
MULTIPLE_SPACES = re.compile(r' +', re.UNICODE)


def tokenize(content, token_min_len=2, token_max_len=100, lower=True):
    """Split raw article text into a list of cleaned tokens.

    The text is first scrubbed by substituting a single space for every match
    of, in order: EMAIL_PATTERN, URL_PATTERN, WIKI_REMOVE_CHARS,
    WIKI_SPACE_CHARS and MULTIPLE_SPACES. The literal sequence ``", )"`` is
    then deleted and the result is split on single spaces.

    Each piece that does not start with an underscore has
    WIKI_REMOVE_TOKEN_CHARS matches stripped and is passed through
    ``to_unicode``; pieces that start with ``_`` and pieces that end up empty
    are dropped.

    Args:
        content: The text to tokenize.
        token_min_len: Accepted but not used by this implementation.
        token_max_len: Accepted but not used by this implementation.
        lower: Accepted but not used by this implementation (no lower-casing
            is performed).
            NOTE(review): these three are presumably kept so the function
            matches the call signature expected of a gensim ``tokenizer_func``
            -- confirm.

    Returns:
        list: The non-empty cleaned tokens, in their original order.
    """
    scrubbed = content
    # Order matters: e-mails and URLs go first, while their punctuation is intact.
    for pattern in (EMAIL_PATTERN, URL_PATTERN, WIKI_REMOVE_CHARS,
                    WIKI_SPACE_CHARS, MULTIPLE_SPACES):
        scrubbed = pattern.sub(' ', scrubbed)

    pieces = scrubbed.replace(", )", "").split(" ")
    cleaned = (
        to_unicode(WIKI_REMOVE_TOKEN_CHARS.sub('', piece))
        for piece in pieces
        if not piece.startswith('_')
    )
    return [tok for tok in cleaned if len(tok) > 0]


In [3]:
from gensim.corpora import WikiCorpus, Dictionary
from gensim.utils import to_unicode
from gensim.corpora.wikicorpus import tokenize


in_f = r"C:\Users\AI31\Downloads\kowiki-latest-pages-articles.xml.bz2"
out_f = r"C:\Users\AI31\Downloads\processed\processed_wiki_ko.txt"

# Convert the Wikipedia XML dump at `in_f` into a plain-text corpus at `out_f`,
# one article per line with tokens separated by single spaces.

# NOTE(review): this cell imports `tokenize` from gensim.corpora.wikicorpus,
# which rebinds the name. If a custom `tokenize` was defined in an earlier
# cell, it is gensim's tokenizer -- not the custom one -- that is passed here.
# Confirm which tokenizer is intended.
# NOTE(review): passing an explicit empty Dictionary() presumably avoids a
# separate vocabulary-building pass over the dump -- verify against the gensim
# WikiCorpus documentation.
wiki = WikiCorpus(in_f, tokenizer_func=tokenize, dictionary=Dictionary())

# The context manager guarantees the output file is flushed and closed even if
# iterating over the dump raises part-way through.
with open(out_f, 'w', encoding="utf-8") as output:
    for i, text in enumerate(wiki.get_texts(), start=1):
        # `' '.join(text)` already requires str tokens, so the former
        # encode-to-UTF-8-then-decode round trip was a no-op and is dropped.
        output.write(' '.join(text) + '\n')
        if i % 10000 == 0:
            print('Processed ' + str(i) + ' articles')
print('Processing complete!')



Processed 10000 articles
Processed 20000 articles
Processed 30000 articles
Processed 40000 articles
Processed 50000 articles
Processed 60000 articles
Processed 70000 articles
Processed 80000 articles
Processed 90000 articles
Processed 100000 articles
Processed 110000 articles
Processed 120000 articles
Processed 130000 articles
Processed 140000 articles
Processed 150000 articles
Processed 160000 articles
Processed 170000 articles
Processed 180000 articles
Processed 190000 articles
Processed 200000 articles
Processed 210000 articles
Processed 220000 articles
Processed 230000 articles
Processed 240000 articles
Processed 250000 articles
Processed 260000 articles
Processed 270000 articles
Processed 280000 articles
Processed 290000 articles
Processed 300000 articles
Processed 310000 articles
Processed 320000 articles
Processed 330000 articles
Processed 340000 articles
Processed 350000 articles
Processed 360000 articles
Processed 370000 articles
Processed 380000 articles
Processed 390000 arti

In [8]:
import json

corpus_fname = r"C:\Users\AI31\Downloads\KorQuAD_v1.0_train.json"
output_fname = r"C:\Users\AI31\Downloads\processed\processed_korquad.txt"

# Flatten the question-answering JSON at `corpus_fname` into plain text at
# `output_fname`. For every paragraph of every article the paragraph context
# is written on its own line, followed by one "<question> <answer>" line per
# answer of each question.
#
# The input is opened explicitly as UTF-8. Without `encoding`, open() falls
# back to the platform's locale encoding, which can fail to decode (or
# silently garble) non-ASCII JSON on systems whose default is not UTF-8; the
# output file was already written as UTF-8.
with open(corpus_fname, encoding='utf-8') as f1, \
        open(output_fname, 'w', encoding='utf-8') as f2:
    dataset_json = json.load(f1)
    dataset = dataset_json['data']
    for article in dataset:
        for paragraph in article['paragraphs']:
            # Lines are streamed straight to the file in the same order the
            # original buffered them (context first, then its Q/A pairs).
            f2.write(paragraph['context'] + "\n")
            for qa in paragraph['qas']:
                q_text = qa['question']
                for a in qa['answers']:
                    a_text = a['text']
                    f2.write(q_text + " " + a_text + "\n")

In [9]:
corpus_path = r"C:\Users\AI31\Desktop\ratings.txt"
output_fname = r"C:\Users\AI31\Downloads\processed\processed_ratings.txt"
with_label = False

# Convert the tab-separated file at `corpus_path` into one sentence per line
# at `output_fname`. Each data line is expected to split into exactly three
# tab-separated fields; the first is ignored, the second is the sentence and
# the third is the label. Lines with an empty sentence are skipped. When
# `with_label` is true the label is appended after a U+241E separator.
with open(corpus_path, 'r', encoding='utf-8') as src, \
        open(output_fname, 'w', encoding='utf-8') as dst:
    next(src)  # skip the first line (presumably a header row -- TODO confirm)
    for line in src:
        _, sentence, label = line.strip().split('\t')
        if sentence:
            record = sentence + "\u241E" + label if with_label else sentence
            dst.write(record + "\n")