In [None]:
# Before running this code,
# Upload nucc_tokens.txt

!mkdir data
!mv nucc_tokens.txt data/

import warnings
warnings.filterwarnings('ignore')
import csv
import glob
import io
import numpy as np
import pickle
import re

# Function to tokenize parsed corpus file

def tokenize_corpus(src_fname, dst_fname):
    """Convert a parsed corpus text file into a NumPy array of tokens.

    The file is expected to hold one token per line.  Marker lines are
    normalised first, then every non-empty line contributes its first
    comma-separated field as a token.

    Args:
        src_fname: Path of the parsed corpus text file (read as UTF-8).
        dst_fname: Path handed to ``np.save`` for the resulting 1-D string
            array.

    Returns:
        None.  The token array is written to ``dst_fname``.
    """
    # The context manager closes the file even if read() raises.  The
    # encoding is pinned because the corpus contains Japanese text (see the
    # full-width space handled below); relying on the locale default makes
    # decoding platform dependent.
    with open(src_fname, 'r', encoding='utf-8') as f:
        data = f.read()
    print('Input file is', src_fname.split('/')[-1], '\n')

    # TODO: white space and some signs are still counted as tokens; check
    # word_index / index_word to decide whether they should be omitted.
    # NOTE: len(data) is the number of decoded characters, not bytes.
    print('The byte size of the input token is', len(data), '\n')

    # Applied strictly in this order: later rules rely on earlier ones
    # (e.g. separator lines are normalised before duplicates are merged).
    substitutions = (
        ('.*SSSS.*',   'SSSS'),  # any line mentioning the separator -> bare separator
        ('SSSS\nSSSS', 'SSSS'),  # merge two consecutive separator lines
        ('.*UNK.*',    'UNK'),   # any line mentioning UNK -> bare UNK
        ('@\n',        ''),      # drop "@" line endings
        ('EOS\n',      ''),      # drop "EOS" line endings
        ('　\n',       ''),      # drop double-byte (full-width) space line endings
    )
    for pattern, replacement in substitutions:
        data = re.sub(pattern, replacement, data)

    # The file is not CSV and is expected to contain no comma; the csv
    # reader is used only as a line splitter to avoid text garbling.
    # Empty rows are skipped because zero-length values cause errors later.
    # NOTE(review): only the first field of each row is kept, so a line that
    # does contain a comma would be truncated - confirm this never happens.
    rows = csv.reader(io.StringIO(data))
    token_list = [row[0] for row in rows if row]

    token_array = np.array(token_list)
    print('The shape of the output token(numpy array) is', token_array.shape, '\n')

    np.save(dst_fname, token_array)

    
# Step 1: tokenize the uploaded corpus.
# Reads 'data/nucc_tokens.txt' and saves the token array to
# 'data/nucc_tokens.npy', which the next step loads.

print('\n...Started tokenizing corpus file\n')

tokenize_corpus('data/nucc_tokens.txt', 'data/nucc_tokens.npy')

print('...completed tokenizing into "data/nucc_tokens.npy"\n')


# Function to create word/index dictionaries for training

def make_word_index_dict(src_fname, min_count=4, data_dir='data'):
    """Build the vocabulary dictionaries and the index-encoded corpus.

    Tokens occurring fewer than ``min_count`` times are mapped to 'UNK'.
    The following files are written into ``data_dir``:

    * ``word_index.pickle`` - dict mapping word -> integer index
    * ``index_word.pickle`` - dict mapping integer index -> word
    * ``words.pickle``      - sorted list of vocabulary words
    * ``token_index.npy``   - the corpus as an integer index array

    Args:
        src_fname: Path of the ``.npy`` token array to load.
        min_count: Minimum number of occurrences a word needs to keep its
            own vocabulary entry (default 4, the previous fixed threshold).
        data_dir: Existing directory that receives the output files
            (default 'data', the previous fixed location).

    Returns:
        None.  All results are written to disk.
    """
    token_array = np.load(src_fname)

    # np.unique yields the distinct tokens in sorted order together with
    # how often each one occurs, replacing the manual counting loop.
    uniq, counts = np.unique(token_array, return_counts=True)
    print('The number of the words is', len(uniq), '\n')

    rare = counts < min_count
    n_rare = int(np.count_nonzero(rare))
    print('The number of the words replaced with "UNK" is', n_rare, '\n')

    # .tolist() converts to plain Python str so the pickles hold str keys.
    vocab = set(uniq[~rare].tolist())
    if n_rare:
        vocab.add('UNK')  # Less frequent words are represented by 'UNK'

    # Keras embedding with mask_zero=True regards index 0 as padding, so the
    # word at index 0 is never used.  '\t' sorts before ordinary words and is
    # inserted as a placeholder so that no real word lands on index 0.
    vocab.add('\t')
    words_uniq = sorted(vocab)
    print('The number of the words after UNK replacement is', len(words_uniq), '\n')

    word_index = {w: i for i, w in enumerate(words_uniq)}
    index_word = dict(enumerate(words_uniq))

    # Dict lookup is O(1) per token.  The previous membership test against
    # the word *list* was O(vocabulary) per token, which is why this step
    # used to take minutes.  Every token missing from the vocabulary was
    # replaced above, so 'UNK' exists whenever the fallback is needed.
    unk_idx = word_index.get('UNK')
    token_index = np.array(
        [word_index.get(tok, unk_idx) for tok in token_array.tolist()],
        dtype=int,
    )

    pickled_outputs = (
        ('word_index.pickle', word_index),
        ('index_word.pickle', index_word),
        ('words.pickle', words_uniq),
    )
    for fname, obj in pickled_outputs:
        path = '{}/{}'.format(data_dir, fname)
        with open(path, 'wb') as f:
            pickle.dump(obj, f)
        print('Created "{}"\n'.format(path))

    path = '{}/token_index.npy'.format(data_dir)
    np.save(path, token_index)
    print('Created "{}"\n'.format(path))

    
# Step 2: create word/index dictionaries for training.
# Loads the token array saved by the tokenizing step above.

print('\n...Started making dictionaries\n')

make_word_index_dict('data/nucc_tokens.npy')

print('...completed making dictionaries\n')


# Function to create Encoder Input, Decoder Input and Labels for training

def make_training_dataset(src_fname, maxlen_e=50, maxlen_d=50, data_dir='data'):
    """Create the encoder input, decoder input and label arrays for training.

    Reads ``word_index.pickle`` and ``token_index.npy`` from ``data_dir``,
    splits the index sequence into utterances at every 'SSSS' separator and
    turns each pair of consecutive utterances into one training sample:

    * encoder input: the first utterance
    * decoder input: 'SSSS' + the following utterance
    * label:         the following utterance + 'SSSS'

    Samples longer than the maximum lengths are dropped, the rest are
    0-padded, shuffled with a fixed seed and saved to ``enc_input.npy``,
    ``dec_input.npy``, ``lbl_input.npy`` and ``maxlen.npy`` in ``data_dir``.

    Args:
        src_fname: Unused; kept so existing callers keep working.  The data
            is always read from ``data_dir``.
        maxlen_e: Maximal encoder input word number.
        maxlen_d: Maximal decoder input word number.
        data_dir: Existing directory holding the inputs and receiving the
            outputs (default 'data', the previous fixed location).

    Returns:
        None.  All results are written to disk.
    """
    with open('{}/word_index.pickle'.format(data_dir), 'rb') as f:
        word_index = pickle.load(f)

    token_index = np.load('{}/token_index.npy'.format(data_dir))

    #
    # Convert corpus into dialog lists for input data
    #
    sep_idx = word_index['SSSS']

    # TODO: juman++ is not perfectly accurate; dialog_list can be inspected
    # here to find and fix problems in the original corpus.
    dialog_list = _split_dialogs(token_index.tolist(), sep_idx)
    print('The number of the dialog is', len(dialog_list), '\n')

    # Pair each utterance with the one that follows it BEFORE shuffling.
    # Shuffling the utterances first (as was done previously) pairs every
    # encoder input with an unrelated, random decoder target.
    enc_rows, dec_rows, lbl_rows = [], [], []
    for request, response in zip(dialog_list[:-1], dialog_list[1:]):
        # Keep dialogs only up to maxlen (decoder side has one extra 'SSSS')
        if len(request) <= maxlen_e and len(response) + 1 <= maxlen_d:
            enc_rows.append(request)
            dec_rows.append([sep_idx] + response)  # 'SSSS' at the head
            lbl_rows.append(response + [sep_idx])  # 'SSSS' at the tail

    # Shuffle the samples; one permutation keeps the three arrays aligned.
    np.random.seed(12345)
    order = np.random.permutation(len(enc_rows))

    # 0 padding up to maxlen, then add the trailing feature axis of size 1.
    dtype = token_index.dtype
    enc_input = _pad_rows(enc_rows, maxlen_e, dtype)[order][:, :, np.newaxis]
    dec_input = _pad_rows(dec_rows, maxlen_d, dtype)[order][:, :, np.newaxis]
    lbl_input = _pad_rows(lbl_rows, maxlen_d, dtype)[order][:, :, np.newaxis]
    print('Each data was 0 padded upto maxlen\n')

    print('The shape of Encoder Input is', enc_input.shape, '\n')
    print('The shape of Decoder Input is', dec_input.shape, '\n')
    print('The shape of Label is',         lbl_input.shape, '\n')

    enc_path = '{}/enc_input.npy'.format(data_dir)
    np.save(enc_path, enc_input)
    print('Encoder Input is created in "{}"\n'.format(enc_path))

    dec_path = '{}/dec_input.npy'.format(data_dir)
    np.save(dec_path, dec_input)
    print('Decoder Input is created in "{}"\n'.format(dec_path))

    lbl_path = '{}/lbl_input.npy'.format(data_dir)
    np.save(lbl_path, lbl_input)
    print('Label is created in "{}"\n'.format(lbl_path))

    maxlen_path = '{}/maxlen.npy'.format(data_dir)
    np.save(maxlen_path, [maxlen_e, maxlen_d])
    print('maxlens for encoder/decoder is created in "{}"\n'.format(maxlen_path))


def _split_dialogs(token_ids, sep_idx):
    """Split a flat list of token ids into utterances at each separator id."""
    dialogs = []
    current = None  # None until the first token has been seen
    for idx in token_ids:
        if idx == sep_idx:
            # A separator closes the utterance collected so far (if any)
            if current is not None:
                dialogs.append(current)
            current = []
        else:
            # Tokens before the first separator start an utterance of their
            # own instead of failing on an unbound variable.
            if current is None:
                current = []
            current.append(idx)
    # NOTE(review): tokens following the last separator are not emitted,
    # which matches the previous behaviour - confirm the corpus always ends
    # with a separator, otherwise the final utterance is lost.
    return dialogs


def _pad_rows(rows, width, dtype):
    """Return a (len(rows), width) array with each row right-padded by 0."""
    padded = np.zeros((len(rows), width), dtype=dtype)
    for r, row in enumerate(rows):
        clipped = row[:width]  # Cut off by the length of maxlen
        if clipped:
            padded[r, :len(clipped)] = clipped
    return padded


# Step 3: create Encoder Input, Decoder Input and Labels for training.
# Uses the dictionaries and token index written by the previous step.
# The progress messages previously said "making dictionaries" (copied from
# the step above); they now name the step that is actually running.

print('\n...Started making training dataset\n')

make_training_dataset('data/nucc_tokens.npy')

print('...completed making training dataset\n')
