In [112]:
import warnings
warnings.filterwarnings('ignore')
import csv
import glob
import io
import numpy as np
import pickle
import re

In [265]:
# Function to parse corpus files

def parse_corpus(fname, corpus):
    
    f = open(fname, 'r', encoding='utf-8')
    # File is non-csv format but comma "," does not exist.
    # Csv reader is used to avoid text garbling.
    iter_obj = csv.reader(f)
    lines = [v for v in iter_obj]
    f.close()
    
    text = ''
    for line in lines:
        if len(line) == 0:
            continue

        s = line[0]
        if len(s) == 0: continue

        # ％ｃｏｍ： is supplementary info
        if s[0:5] == "％ｃｏｍ：":
#                 print(' - Line', i, ': skip ％ｃｏｍ')
            continue

        if s[0] == '＠':
#                 print(' - Line', i, ': skip ＠ meta info ')
            continue
        else:
            #Replace stars with UNK
            s = s.replace('＊＊＊','UNK')
            #Replace speaker symbols with a separater
            if s[0] == 'F' or s[0] == 'M':
                s = 'SSSS'+s[5:]
            if s[0:2] == 'Ｘ：':
                s = 'SSSS'+s[2:]

            s = re.sub('F[0-9]{3}',"UNK",s)
            s = re.sub('M[0-9]{3}',"UNK",s)
            s = s.replace("＊","")


        # ?? Should be after concatinating the values ??
        for L, R in zip(['（', '＜', '【'], ['）', '＞', '】']):
            while s.find(L) != -1:
                left_pos = s.find(L)
                if s.find(R) != -1:
                    right_pos = s.find(R)
                    if left_pos > right_pos:
                        if s[0:4] == 'SSSS':
                            s = s.replace(s[4:right_pos+1], '', 1)
                        else:
                            s = s.replace(s[:right_pos+1], '', 1)
                    else:
                        s = s.replace(s[left_pos:right_pos+1], '')
                        if len(s) == 0:
                            continue
                else:
                    s=s[0:left_pos]

        if s != "\n" and s != "SSSS":
            text += s


    if text[0:4] != 'SSSS':
        text = 'SSSS' + text
    while text[0:4] == 'SSSS':
        next_pos = text[4:].find("SSSS")
        if next_pos == -1:
            corpus.append(text)
            break
        else:
            corpus.append(text[:4+next_pos])
            text = text[4+next_pos:]
        # Breaks up a long sentence
        # ?? The split lines are considered a talk between 2. Any influence in training ??
        if len(corpus[-1]) > 50:
            xs = corpus[-1].split('。')
            if len(xs) == 1:
                continue
            corpus.pop()
            if len(xs[0]) > 30:
                corpus.append(xs[0].split('、')[0] + '。')
            else:
                corpus.append(xs[0] + '。')
            while xs[-1] == '' or xs[-1] == ' ' or xs[-1] == '　':
                xs.pop()
            if len(xs) > 1:
                if len(xs[-1]) > 30:
                    corpus.append('SSSS' + xs[-1].split('、')[-1] + '。')
                else:
                    corpus.append('SSSS' + xs[-1] + '。')


In [266]:
# Parse corpus

print('\n...Started parsing corpus files\n')

fname_list = glob.glob('data/nucc/*')
print('The number of the corpus files are', len(fname_list), '\n')

corpus = []
for fname in fname_list:
    parse_corpus(fname, corpus)
print('The number of the corpus is', len(corpus), '\n')

with open('data/nucc_corpus.txt', 'w', encoding='utf-8') as f:
    for line in corpus:
        f.write(line + "\n")
print('...completed parsing corpus into "data/nucc_corpus.txt"\n')



...Started parsing corpus files

The number of the corpus files are 129 

hit
hit
hit
hit
hit
hit
hit
The number of the corpus is 83076 

...completed parsing corpus into "data/nucc_corpus.txt"



In [98]:
# Morphological analysis
# !! Note this step can take more than minutes !!

print('\n...Started morphological analysis on "data/nucc_corpus.txt"\n')
!jumanpp -f < data/nucc_corpus.txt > data/nucc_corpus_analyzed.txt
print('...completed the analysis into "data/nucc_corpus_analyzed.txt"\n')

print('...extracting only tokens from "data/nucc_corpus.txt"\n')
!cat data/nucc_corpus_analyzed.txt | cut -f1 -d\  > data/nucc_tokens.txt
print('...completed the extracting into "data/nucc_tokens.txt"\n')


...Started morphological analysis on "data/nucc_corpus.txt"

...completed the analysis into "data/nucc_corpus_analyzed.txt"

...extracting only tokens from "data/nucc_corpus.txt"

...completed the extracting into "data/nucc_tokens.txt"



In [25]:
# Function to tokenize parsed corpus file

def tokenize_corpus(src_fname, dst_fname):
    
    f = open(src_fname, 'r')
    data = f.read()
    f.close()
    print('Input file is', src_fname.split('/')[-1], '\n')
    
    # ?? white_space and some signs are counted as tokens(should be ommited?) -> check word_index and index_word ??
    print('The byte size of the input token is', len(data), '\n')
    data = re.sub('.*SSSS.*', 'SSSS', data)
    data = re.sub('SSSS\nSSSS', 'SSSS', data)
    data = re.sub('.*UNK.*', 'UNK', data)
    data = re.sub('@\n', '', data)
    data = re.sub('EOS\n', '', data)

    # File is non-csv format but comma "," does not exist.
    # Csv reader is used to avoid text garbling.
    iter_obj = csv.reader(io.StringIO(data))

    token_list = []
    for line in iter_obj:
        if len(line) == 0: # You will face errors later with 0 length values
            continue
        token_list.append(line[0])
    
    token_array = np.array(token_list)
    print('The shape of the output token(numpy array) is', token_array.shape, '\n')

    np.save(dst_fname, token_array)


In [274]:
# Tokenize corpus

print('\n...Started tokenizing corpus file\n')

tokenize_corpus('data/nucc_tokens.txt', 'data/nucc_tokens.npy')

print('...completed tokenizing into "data/nucc_tokens.npy"\n')



...Started tokenizing corpus file

Input file is nucc_tokens.txt 

The byte size of the input token is 2837186 

The shape of the output token(numpy array) is (811176,) 

...completed tokenizing into "data/nucc_tokens.npy"



In [331]:
# Function to create word/index dictionaries for training

def make_word_index_dict(src_fname):
    
    token_array = np.load(src_fname)
    words_uniq = sorted(set(token_array))
    cnt = np.zeros(len(words_uniq))
    
    print('The number of the words is', len(words_uniq), '\n')
    
    word_index = dict((w, i) for i, w in enumerate(words_uniq))
    
    for v in token_array:
        cnt[word_index[v]] += 1
    
    to_unk_list = []
    for i, c in enumerate(cnt): # Replace less frequent words with 'UNK'
        if c < 4:
            to_unk_list.append(words_uniq[i])
            words_uniq[i] = 'UNK'
    
    print('The number of the words replaced with "UNK" is', len(to_unk_list), '\n')
    
    # ?? Keras mask_zero=True can solve this. In need of more investigation ??
    words_uniq.append('\t') # \t comes first by sorting, to keep index 0 for RNN masking(for 0 padding issue?)
    words_uniq = sorted(set(words_uniq))
    
    print('The number of the words after UNK replacement is', len(words_uniq), '\n')
    
    word_index = dict((w, i) for i, w in enumerate(words_uniq))
    index_word = dict((i, w) for i, w in enumerate(words_uniq))

    # !! Note this step can take more than minutes !!
    token_index = np.zeros(len(token_array), dtype=int)
    for i, v in enumerate(token_array):
        if v in words_uniq:
            token_index[i] = word_index[v]
        else:
            token_index[i] = word_index['UNK']
    
    with open('data/word_index.pickle', 'wb') as f :
        pickle.dump(word_index , f)
    print('Created "data/word_index.pickle"\n')

    with open('data/index_word.pickle', 'wb') as f :
        pickle.dump(index_word , f)
    print('Created "data/index_word.pickle"\n')

    with open('data/words.pickle', 'wb') as f :
        pickle.dump(words_uniq , f)
    print('Created "data/words.pickle"\n')

    with open('data/token_index.pickle', 'wb') as f :
        pickle.dump(token_index , f)
    print('Created "data/token_index.pickle"\n')


In [332]:
# Create word/index dictionaries for training

print('\n...Started making dictionaries\n')

make_word_index_dict('data/nucc_tokens.npy')

print('...completed making dictionaries\n')



...Started making dictionaries

The number of the words is 22580 

The number of the words replaced with "UNK" is 15388 

The number of the words after UNK replacement is 7193 

Created "data/word_index.pickle"

Created "data/index_word.pickle"

Created "data/words.pickle"

Created "data/token_index.pickle"

...completed making dictionaries



In [128]:
# Function to create Encoder Input、Decoder Input and Labels for training

def make_training_dataset(src_fname):
    
    maxlen_e = 50 # Maximal encoder input word number
    maxlen_d = 50 # Maximal decoder input word number
    
    f = open('data/word_index.pickle', 'rb')
    word_index = pickle.load(f)
    f.close()
    
    f = open('data/token_index.pickle', 'rb')
    token_index = pickle.load(f)
    f.close()

    
    #
    # Convert corpus into dialog lists for input data
    #

    sep_idx = word_index['SSSS']
    dialog_list = []

    # ?? juman++ is not perfectly acurate we can check dialog_list then fix the original corpus ??
    for i, idx in enumerate(token_index):
        if idx == sep_idx:
            if i != 0:
                dialog_list.append(dialog)
            dialog = []
        else:
            dialog.append(idx)

    print('The number of the dialog is', len(dialog_list), '\n')

    
    # Shuffle data
    np.random.seed(12345)
    np.random.shuffle(dialog_list)
    
    
    enc_dialog = dialog_list[:-1]
    dec_dialog = [[sep_idx] + v for v in dialog_list[1:]] # Insert 'SSSS' index at the head of dialog
    lbl_dialog = [v + [sep_idx] for v in dialog_list[1:]] # Insert 'SSSS' index at the tail of dialog
    
    enc_input = []
    dec_input = []
    lbl_input = []

    
    # Keep dialogs only upto maxlen
    for enc, dec, lbl in zip(enc_dialog, dec_dialog, lbl_dialog):
        if len(enc) <= maxlen_e and len(dec) <= maxlen_d:
            enc_input.append(enc)
            dec_input.append(dec)
            lbl_input.append(lbl)


    # 0 padding upto maxlen
    for i in range(0, len(enc_input)):
        
        # Extend each length with 0
        enc_input[i] += [0] * maxlen_e
        dec_input[i] += [0] * maxlen_d
        lbl_input[i] += [0] * maxlen_d

        # Cut off by the length of maxlen
        enc_input[i] = enc_input[i][:maxlen_e]
        dec_input[i] = dec_input[i][:maxlen_d]
        lbl_input[i] = lbl_input[i][:maxlen_d]
        
    print('Each data was 0 padded upto maxlen\n')
    
    enc_input = np.array(enc_input).reshape(len(enc_input), maxlen_e, 1)
    dec_input = np.array(dec_input).reshape(len(dec_input), maxlen_d, 1)
    lbl_input = np.array(lbl_input).reshape(len(lbl_input), maxlen_d, 1)
    
    print('The shape of Encoder Input is', enc_input.shape, '\n')
    print('The shape of Decoder Input is', dec_input.shape, '\n')
    print('The shape of Label is', lbl_input.shape, '\n')


    with open('data/enc_input.pickle', 'wb') as f:
        pickle.dump(enc_input, f)
    print('Encoder Input is created in "data/enc_input.pickle"\n')

    with open('data/dec_input.pickle', 'wb') as f:
        pickle.dump(dec_input, f)
    print('Decoder Input is created in "data/dec_input.pickle"\n')

    with open('data/lbl_input.pickle', 'wb') as f:
        pickle.dump(lbl_input, f)
    print('Label is created in "data/lbl_input.pickle"\n')

    with open('data/maxlen.pickle', 'wb') as f:
        pickle.dump([maxlen_e, maxlen_d], f)
    print('maxlens for encoder/decoder is created in "data/maxlen.pickle\n')
        

In [129]:
# Create Encoder Input、Decoder Input and Labels for training

print('\n...Started making dictionaries\n')

make_training_dataset('data/nucc_tokens.npy')

print('...completed making dictionaries\n')


...Started making dictionaries

The number of the dialog is 83075 

Each data was 0 padded upto maxlen

The shape of Encoder Input is (83033, 50, 1) 

The shape of Decoder Input is (83033, 50, 1) 

The shape of Label is (83033, 50, 1) 

Encoder Input is created in ""data/enc_input.pickle""

Decoder Input is created in ""data/dec_input.pickle""

Label is created in ""data/lbl_input.pickle""

maxlens for encoder/decoder is created in ""data/maxlen.pickle""

...completed making dictionaries

