## Imports

In [1]:
import jieba, json, os, re, sys, time
from datetime import datetime
import numpy as np

import torch
import torch.nn as nn

## Functions

In [2]:
from fields import Field, Parms, Semantic, Vocab, _make_vocab
from utils import *

from nlp_db import nlp_db

from model_class import NLU_Classify

In [3]:
def read_json(file, thresh=20, k=None, func=None):
    """Read at most ``thresh`` records from a JSON-lines file.

    Every non-blank line of ``file`` is parsed as one JSON document.

    Args:
        file: Path of a UTF-8 encoded file holding one JSON document per line.
        thresh: Maximum number of records to return; ``None`` reads the
            whole file.
        k: Optional key. When given, only ``record[k]`` is kept.
        func: Optional callable applied to ``record[k]``. It is ignored when
            ``k`` is not given.

    Returns:
        list: The selected records in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If ``k`` is given and missing from a record.
    """
    rzlt = []
    with open(file, "r", encoding='utf-8') as f:
        # Iterate lazily so that a small ``thresh`` never loads the whole file.
        for l in f:
            # The limit is enforced on the number of collected records; the
            # previous counter was never incremented, so it never triggered.
            if thresh is not None and len(rzlt) >= thresh:
                break

            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not l.strip():
                continue

            record = json.loads(l)
            if k is not None:
                record = record[k]
                if func is not None:
                    record = func(record)
            rzlt.append(record)

    return rzlt


def json_iter(file, batch_size=1000, k=None, func=None, drop_last=True):
    """Yield the records of a JSON-lines file in lists of ``batch_size``.

    Every non-blank line of ``file`` is parsed as one JSON document.

    Args:
        file: Path of a UTF-8 encoded file holding one JSON document per line.
        batch_size: Number of records per yielded list.
        k: Optional key. When given, only ``record[k]`` is kept.
        func: Optional callable applied to ``record[k]``. It is ignored when
            ``k`` is not given.
        drop_last: When True (the default, matching the previous behaviour)
            a final batch with fewer than ``batch_size`` records is
            discarded. Pass False to receive it as well.

    Yields:
        list: Consecutive batches of records in file order.
    """
    with open(file, "r", encoding='utf-8') as f:
        rzlt = []
        # Stream the file instead of materialising every line up front.
        for l in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not l.strip():
                continue

            record = json.loads(l)
            if k is not None:
                record = record[k]
                if func is not None:
                    record = func(record)
            rzlt.append(record)

            if len(rzlt) == batch_size:
                yield rzlt
                rzlt = []

        # Leftover records that did not fill a complete batch.
        if rzlt and not drop_last:
            yield rzlt

## Data corpus

### Dialog

In [3]:
%ls -lct ~/Studio/dialog_db/chinese_chatbot_corpus-master/clean_chat_corpus/

total 2124000
-rw-r--r--  1 root  staff   21695199 Jun  7 13:48 xiaohuangji.tsv
-rw-r--r--  1 root  staff  465372773 Jun  7 13:48 weibo.tsv
-rw-r--r--  1 root  staff  298597018 Jun  7 13:46 tieba.tsv
-rw-r--r--  1 root  staff  151548740 Jun  7 13:46 subtitle.tsv
-rw-r--r--  1 root  staff    5594328 Jun  7 13:45 qingyun.tsv
-rw-r--r--  1 root  staff   18202714 Jun  7 13:45 ptt.tsv
-rw-r--r--  1 root  staff      34249 Jun  7 13:44 chatterbot.tsv
-rw-r--r--  1 root  staff   85680288 Jun  7 13:44 douban_single_turn.tsv


In [None]:
class Files:
    """Empty attribute container.

    Instances start without any attributes; callers attach whatever they
    need after construction (for example with ``setattr``).
    """

# Expose every entry of the dialog corpus directory as an attribute of
# ``files``. The attribute name is the text before the first '.' of the
# entry name; the value is the entry's absolute path.
files = Files()

path = os.path.abspath("../dialog_db/chinese_chatbot_corpus-master/clean_chat_corpus")
file_nms = os.listdir(path)

for file_nm in file_nms:
    attr_name = file_nm.split('.')[0]
    setattr(files, attr_name, os.path.join(path, file_nm))
    
# dirrs(files)

In [None]:
# Print the number of lines of every corpus file.
for name in file_nms:
    file = os.path.join(path, name)
    # Decode explicitly as UTF-8 (the encoding the JSON readers in this
    # notebook use) instead of relying on the platform's default encoding.
    with open(file, 'r', encoding='utf-8') as f:
        # Count while streaming; the largest corpus files are several
        # hundred MB, so loading every line into a list is wasteful.
        num = sum(1 for _ in f)
    print(name, ":", "{:,d}".format(num))


In [None]:
# Preview the beginning of the PTT corpus: print lines until more than 20
# have been shown (i.e. the first 21 lines).
cnt = 0
# Decode explicitly as UTF-8 instead of relying on the platform default, and
# stream the file so that only the previewed lines are ever read.
with open(files.ptt, 'r', encoding='utf-8') as f:
    for cnt, line in enumerate(f, start=1):
        print(line)
        if cnt > 20:
            break

### Classification data

In [4]:
# Absolute path of every entry in the classification data directory.
# NOTE(review): other cells select files by position (cfiles[0], cfiles[3],
# cfiles[4]) while os.listdir returns entries in arbitrary order, and the
# vocab file is later written into this same directory -- confirm the
# indices still point at the intended files after a re-run.
rel_path = "../nlp_db/tnews_public"
data_dir = os.path.abspath(rel_path)
cfiles = [os.path.join(data_dir, entry) for entry in os.listdir(rel_path)]

In [5]:
# print(cfiles[4]),list(read_json(cfiles[2],100,'sentence', lambda x: list(jieba.cut(x))))

### Vocab Preprocess

In [13]:
# print(cfiles[4]),(read_json(cfiles[0],100,k = 'label'))

In [14]:
# Objects shared by the rest of the notebook: a Semantic instance, a Parms
# settings holder, and a Vocab constructed from the Semantic instance.
semantic = Semantic()
args = Parms()
# Classification data directory and the vocabulary file kept inside it.
args.path = "../nlp_db/tnews_public"
args.vocab_path = "../nlp_db/tnews_public/vocab.txt"
vocab = Vocab(semantic)

In [15]:
# Build the vocabulary file only when it does not exist yet; later runs
# reuse the file already on disk.
if not os.path.isfile(args.vocab_path):
    # or just make new vocab
    # char level ?
    # or chinese word level | with jieba
    _make_vocab(json_file=cfiles[3], vocab_path=args.vocab_path, thres=2, level='word')

try:
    vocab.load(args.vocab_path)
except Exception as err:
    # Best effort: keep the notebook running, but report why loading failed.
    # A bare ``except:`` would also swallow KeyboardInterrupt/SystemExit and
    # hide the actual cause.
    print("Vocab not loaded:", repr(err))

In [17]:
# Sanity check: vocabulary size and the lookup result for one sample token.
vocab.size, vocab['吃']

(16718, 1061)

### Data Processing => Derive Model Parameters

In [18]:
# Longest sentence, measured in jieba tokens, among the records read from
# cfiles[4]; stored on ``args`` as well.
max_sent_len = max(
    len(tokens)
    for tokens in read_json(cfiles[4],
                            60000,
                            k='sentence',
                            func=lambda text: list(jieba.cut(text)))
)
args.max_sent_len = max_sent_len

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.634 seconds.
Prefix dict has been built successfully.


In [19]:
# Label values read from the 'label' field of cfiles[0].
labels = read_json(cfiles[0], 100, k='label')

# De-duplicate once, keeping first-seen order, so both mappings share the
# same ids and the ids do not depend on set/hash ordering between runs.
# Keys are the full label values: the batch iterator looks labels up as
# label_rdict[<raw 'label' field>], so keying on lb[0] (only the first
# character/element of each label) could not match those lookups.
unique_labels = list(dict.fromkeys(labels))
label_rdict = {lb: i for i, lb in enumerate(unique_labels)}  # label -> class id
label_dict = {i: lb for i, lb in enumerate(unique_labels)}   # class id -> label

# Number of distinct classes (not the number of label records read).
args.class_num = len(unique_labels)

args.max_sent_len = max_sent_len
args.lstm_step_num = 2
args.lstm_hid = 64

In [20]:
# Number of records per batch; passed to json_iter when the batch iterators
# are created.
args.batch_size = 1000
# Show the current settings (dirrm is presumably provided by `utils` --
# TODO confirm).
dirrm(args)

{'batch_size': 1000,
 'class_num': 15,
 'exts': ['.en.atok', '.de.atok'],
 'lstm_hid': 64,
 'lstm_step_num': 2,
 'max_dec_num': 50,
 'max_enc_num': 50,
 'max_sent_len': 81,
 'modes': ['train', 'val', 'test2016'],
 'n_sent': 5,
 'ndev': 1,
 'path': '../nlp_db/tnews_public',
 'vocab_path': '../nlp_db/tnews_public/vocab.txt'}

### Batch Data

In [40]:
# x_iter = json_iter(
#     file=cfiles[4],
#     batch_size=2,
#     k='sentence',
#     func=lambda sent: [vocab.__getitem__(token)
#                        for token in jieba.cut(sent)] + [0] *
#     (max_sent_len - len(list(jieba.cut(sent)))))

# y_iter = json_iter(file=cfiles[4],
#                    batch_size=2,
#                    k='label',
#                    func=lambda x: label_rdict[x])

In [41]:
# cnt = 0
# for batch_x, batch_y in zip(x_iter, y_iter):
#     cnt += 1
#     print("x:",np.array(batch_x))
#     print("y:",np.array(batch_y))
    
#     if cnt > 2:
#         break

### Forward, Loss & Train

In [21]:
class NLU_Classify(nn.Module):
    """Embedding -> stacked LSTM -> linear classifier on the last time step.

    NOTE(review): this definition shadows the ``NLU_Classify`` imported from
    ``model_class`` earlier in the notebook -- confirm which one is intended.

    Args:
        class_num: Number of output classes.
        vocab: Object exposing ``size``, used as the number of embedding rows.
        emb_dim: Embedding width (128, the previously hard-coded value).
        lstm_hid: LSTM hidden size. ``None`` falls back to ``args.lstm_hid``.
        lstm_layers: Number of stacked LSTM layers. ``None`` falls back to
            ``args.lstm_step_num``.
    """

    def __init__(self, class_num, vocab, emb_dim=128, lstm_hid=None, lstm_layers=None):
        super(NLU_Classify, self).__init__()
        # Fall back to the notebook-level settings object when sizes are not
        # passed explicitly (the original always read them from ``args``).
        if lstm_hid is None:
            lstm_hid = args.lstm_hid
        if lstm_layers is None:
            lstm_layers = args.lstm_step_num

        self.type = 'classifier'
        # Kept only so code reading these attributes keeps working; forward()
        # no longer depends on a fixed batch size.
        self.batch_size = 1000
        self.serial_len = lstm_layers
        self.emb = nn.Embedding(vocab.size, embedding_dim=emb_dim)
        self.lstm = nn.LSTM(emb_dim, lstm_hid, lstm_layers, batch_first=True)
        # Input width follows the LSTM hidden size instead of a literal 64.
        self.fc = nn.Linear(lstm_hid, class_num)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return unnormalised class scores (logits).

        Args:
            x: Integer tensor of token ids, shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, class_num)``. No softmax is applied:
            ``torch.nn.CrossEntropyLoss`` applies log-softmax itself, so
            feeding it probabilities would normalise twice. Use
            ``predict_proba`` for probabilities.
        """
        x = self.emb(x)
        # No explicit (h0, c0): nn.LSTM then starts from zeros sized to the
        # actual batch. The previous random state was hard-wired to a batch
        # of 1000 and made every forward pass non-deterministic.
        x, _ = self.lstm(x)
        # Keep only the output at the last time step, then classify it.
        # NOTE(review): the batch builder right-pads sequences with 0, so
        # this position may be padding -- consider packed sequences.
        x = x[:, -1, :]
        return self.fc(x)

    def predict_proba(self, x):
        """Return class probabilities, i.e. softmax over ``forward(x)``."""
        return self.softmax(self.forward(x))

In [22]:
model = NLU_Classify(class_num=args.class_num, vocab=vocab)

# NOTE: CrossEntropyLoss applies log-softmax internally, so it is meant to
# receive raw (unnormalised) class scores.
loss_func = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)


def encode_sentence(sent):
    """Turn a raw sentence into exactly ``max_sent_len`` token ids.

    The sentence is tokenised once with jieba (previously it was tokenised
    twice: once for the ids and once more just to compute the padding
    length). Longer sentences are truncated so that every row of a batch has
    the same length; shorter ones are right-padded with 0.
    """
    token_ids = [vocab.__getitem__(token) for token in jieba.cut(sent)]
    token_ids = token_ids[:max_sent_len]
    return token_ids + [0] * (max_sent_len - len(token_ids))


# Two generators over the same file: encoded sentences and their class ids.
# They are advanced in lock-step by the training loop.
x_iter = json_iter(file=cfiles[4],
                   batch_size=args.batch_size,
                   k='sentence',
                   func=encode_sentence)

y_iter = json_iter(file=cfiles[4],
                   batch_size=args.batch_size,
                   k='label',
                   func=lambda x: label_rdict[x])

In [None]:
# x_iter, y_iter
# Single pass over the paired batches: forward, loss, backward, update.
for batch_x, batch_y in zip(x_iter, y_iter):
    # Convert the Python lists produced by the iterators into tensors.
    batch_x, batch_y = torch.tensor(batch_x), torch.tensor(batch_y)

    # Clear gradients left over from the previous step.
    optimizer.zero_grad()

    y_hat = model(batch_x)
    loss = loss_func(y_hat, batch_y)
    print(loss)

    loss.backward()
    optimizer.step()
    


In [None]:
# def predict_to_sent(result):
#     return [''.join([vocab.vocab_rdict[tkid.tolist()] for tkid in line]) for line in result ]