In [153]:
import warnings
warnings.filterwarnings('ignore')
import csv
import glob
import io
import numpy as np
import re

In [154]:
# Function to parse corpus files

def parse_corpus(fname, corpus):
    """Parse one transcript file and append its utterances to ``corpus``.

    Each kept line of the file is cleaned and concatenated into one string in
    which every speaker turn starts with the separator ``'SSSS'``. That string
    is then cut at each separator and the pieces (each still prefixed with
    ``'SSSS'``) are appended to ``corpus``.

    Cleaning rules, applied per line:
      * lines starting with ``％ｃｏｍ：`` (supplementary info) or ``＠`` (meta
        info) are skipped;
      * ``＊＊＊`` and speaker ids of the form ``F###`` / ``M###`` inside the
        text become ``UNK``; remaining ``＊`` are dropped;
      * a leading speaker label (a line starting with ``F`` or ``M``, whose
        first five characters are dropped, or a leading ``Ｘ：``) is replaced
        by ``'SSSS'``;
      * text enclosed in ``（）``, ``＜＞`` or ``【】`` is removed.

    Args:
        fname: Path of the transcript file (read as UTF-8).
        corpus: List that is extended in place with the parsed utterances.

    Returns:
        None. A file that yields no utterance text leaves ``corpus`` untouched.
    """
    # The file is not CSV, but it contains no ASCII comma, so csv.reader
    # yields one single-field row per line (used to avoid text garbling).
    # `with` guarantees the handle is closed even if reading fails.
    with open(fname, 'r', encoding='utf-8') as f:
        lines = list(csv.reader(f))

    text = ''
    for line in lines:
        if len(line) == 0:
            continue

        s = line[0]
        if len(s) == 0:
            continue

        # ％ｃｏｍ： is supplementary info, ＠ is meta info: neither is dialogue.
        if s[0:5] == "％ｃｏｍ：" or s[0] == '＠':
            continue

        # Replace stars with UNK
        s = s.replace('＊＊＊', 'UNK')
        # Replace speaker symbols with a separator
        if s[0] == 'F' or s[0] == 'M':
            s = 'SSSS' + s[5:]
        if s[0:2] == 'Ｘ：':
            s = 'SSSS' + s[2:]

        # Speaker ids mentioned inside the utterance are anonymised as well.
        s = re.sub('F[0-9]{3}', "UNK", s)
        s = re.sub('M[0-9]{3}', "UNK", s)
        s = s.replace("＊", "")

        # Strip bracketed annotations.
        # TODO(original author): should this run after concatenating the
        # lines instead, so annotations spanning several lines are handled?
        for L, R in zip(['（', '＜', '【'], ['）', '＞', '】']):
            while s.find(L) != -1:
                left_pos = s.find(L)
                if s.find(R) != -1:
                    right_pos = s.find(R)
                    if left_pos > right_pos:
                        # A closing bracket precedes the first opening one:
                        # drop everything up to and including it, keeping the
                        # 'SSSS' separator when the line has one.
                        if s[0:4] == 'SSSS':
                            s = s.replace(s[4:right_pos+1], '', 1)
                        else:
                            s = s.replace(s[:right_pos+1], '', 1)
                    else:
                        # Well-formed pair: remove it (and any identical copy).
                        s = s.replace(s[left_pos:right_pos+1], '')
                else:
                    # Opening bracket never closed on this line: cut the tail.
                    s = s[0:left_pos]

        if s != "\n" and s != "SSSS":
            text += s

    # Nothing but meta/empty lines: do not emit a bare 'SSSS' utterance.
    if not text:
        return

    if text[0:4] != 'SSSS':
        text = 'SSSS' + text
    while text[0:4] == 'SSSS':
        next_pos = text[4:].find("SSSS")
        if next_pos == -1:
            # Last utterance of the file. NOTE: the loop exits here, so the
            # long-utterance shortening below is never applied to it.
            corpus.append(text)
            break
        else:
            corpus.append(text[:4+next_pos])
            text = text[4+next_pos:]
        # Break up a long utterance: keep only its first and its last
        # sentence (as two separate entries); sentences in between are dropped.
        # TODO(original author): the two halves then look like a two-party
        # exchange -- does that influence training?
        if len(corpus[-1]) > 50:
            xs = corpus[-1].split('。')
            if len(xs) == 1:
                continue
            corpus.pop()
            if len(xs[0]) > 30:
                # Still long: keep only the part before the first '、'.
                corpus.append(xs[0].split('、')[0] + '。')
            else:
                corpus.append(xs[0] + '。')
            # xs[0] always starts with 'SSSS', so this never empties xs.
            while xs[-1] == '' or xs[-1] == ' ' or xs[-1] == '　':
                xs.pop()
            if len(xs) > 1:
                if len(xs[-1]) > 30:
                    # Still long: keep only the part after the last '、'.
                    corpus.append('SSSS' + xs[-1].split('、')[-1] + '。')
                else:
                    corpus.append('SSSS' + xs[-1] + '。')


In [121]:
# Parse corpus: run parse_corpus over every file under data/nucc/ and write
# the collected utterances, one per line, to data/nucc_corpus.txt.

print('\n...Started parsing corpus files\n')

# glob returns paths in arbitrary (filesystem-dependent) order; sort them so
# the corpus, and every file derived from it, is reproducible across runs.
fname_list = sorted(glob.glob('data/nucc/*'))
print('The number of the corpus files are', len(fname_list), '\n')

# parse_corpus appends to the list it is given, so one list collects all files.
corpus = []
for fname in fname_list:
    parse_corpus(fname, corpus)
print('The number of the corpus is', len(corpus), '\n')

with open('data/nucc_corpus.txt', 'w', encoding='utf-8') as f:
    for line in corpus:
        f.write(line + "\n")
print('...completed parsing corpus into "data/nucc_corpus.txt"\n')


...Started parsing corpus files

The number of the corpus files are 129 

The number of the corpus is 83076 

...completed parsing corpus into "data/nucc_corpus.txt"



In [98]:
# Morphological analysis
# !! Note this step can take more than minutes !!

print('\n...Started morphological analysis on "data/nucc_corpus.txt"\n')
!jumanpp -f < data/nucc_corpus.txt > data/nucc_corpus_analyzed.txt
print('...completed the analysis into "data/nucc_corpus_analyzed.txt"\n')

print('...extracting only tokens from "data/nucc_corpus.txt"\n')
!cat data/nucc_corpus_analyzed.txt | cut -f1 -d\  > data/nucc_tokens.txt
print('...completed the extracting into "data/nucc_tokens.txt"\n')


...Started morphological analysis on "data/nucc_corpus.txt"

...completed the analysis into "data/nucc_corpus_analyzed.txt"

...extracting only tokens from "data/nucc_corpus.txt"

...completed the extracting into "data/nucc_tokens.txt"



In [171]:
# Function to tokenize parsed corpus file

def tokenize_corpus(src_fname, dst_fname):
    """Convert a one-token-per-line text file into a saved NumPy token array.

    The text is normalised before it is split into tokens:
      * any line containing ``SSSS`` becomes exactly ``SSSS`` and directly
        repeated separator lines are merged;
      * any line containing ``UNK`` becomes exactly ``UNK``;
      * ``@`` and ``EOS`` immediately followed by a newline are removed.

    A final ``'SSSS'`` is appended and the tokens are saved with ``np.save``
    as a string array of shape ``(n_tokens, 1)``.

    Args:
        src_fname: Path of the token text file (read as UTF-8).
        dst_fname: Destination path handed to ``np.save``.

    Returns:
        None.
    """
    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(src_fname, 'r', encoding='utf-8') as f:
        data = f.read()
    print('Input file is', src_fname.split('/')[-1], '\n')

    # NOTE: len(data) is the number of decoded characters, not bytes.
    print('The byte size of the input token is', len(data), '\n')
    data = re.sub('.*SSSS.*', 'SSSS', data)
    data = re.sub('SSSS\nSSSS', 'SSSS', data)
    data = re.sub('.*UNK.*', 'UNK', data)
    data = re.sub('@\n', '', data)
    data = re.sub('EOS\n', '', data)

    # File is non-csv format but comma "," does not exist.
    # Csv reader is used to avoid text garbling.
    rows = csv.reader(io.StringIO(data))

    # csv.reader yields a list per line. Flatten each row back to one string
    # (re-joining on ',' in case a comma did split it) and skip blank rows, so
    # the array is a homogeneous array of str. Mixing row lists with the bare
    # 'SSSS' string produced a ragged object array and raises ValueError on
    # NumPy >= 1.24.
    token_list = [','.join(row) for row in rows if row]
    print('The number of the input token is', len(token_list), '\n')
    token_list.append('SSSS')
    token_array = np.array(token_list, dtype=str).reshape(len(token_list), 1)

    print('The number of the output token is', token_array.shape, '\n')
    np.save(dst_fname, token_array)


In [172]:
# Tokenize corpus: turn the extracted token text file into a saved .npy array.

print('\n...Started tokenizing corpus file\n')

tokenize_corpus(
    src_fname='data/nucc_tokens.txt',
    dst_fname='data/nucc_tokens.npy',
)

print('...completed tokenizing into "data/nucc_tokens.npy"\n')


...Started tokenizing corpus file

Input file is nucc_tokens.txt 

The byte size of the input token is 2837186 

The number of the input token is 811177 

The number of the output token is (811178, 1) 

...completed tokenizing into "data/nucc_tokens.npy"



In [101]:
# Read the analyzed corpus as space-delimited rows.
# The rows are materialised inside `with` so the file handle is closed and so
# cells that iterate over `iter_obj` give the same result when re-run (a bare
# csv.reader is exhausted after its first pass).
with open('data/nucc_corpus_analyzed.txt', 'r', encoding='utf-8') as f:
    iter_obj = list(csv.reader(f, delimiter=' '))


In [102]:
# First space-separated field of every analyzed row.
li = [fields[0] for fields in iter_obj]

In [80]:
# First character of every entry in `li`.
mat = [entry[0] for entry in li]

In [103]:
# Number of first-field entries collected in `li` (one per parsed row).
len(li)

935512