## Imports

In [1]:
import jieba, json, os, re, sys, time
from datetime import datetime
import numpy as np

import torch
import torch.nn as nn

## Functions

In [2]:
from fields import Field, Parms, Semantic, Vocab, _make_vocab
from utils import *

from nlp_db import nlp_db

from model_class import NLU_Classify

In [3]:
def acc(y_hat, y_label):
    """Return the share of rows whose highest-scoring class matches the label.

    Args:
        y_hat: 2-D tensor of per-class scores; the arg-max is taken along dim 1.
        y_label: tensor of target class indices, one per row of ``y_hat``.

    Returns:
        A 0-d float tensor: number of matches divided by the number of rows.
    """
    predicted = y_hat.argmax(dim=1)
    hits = predicted.eq(y_label).float()
    return hits.sum() / len(hits)

def read_json(file, thresh=20, k=None, func=None):
    """Read records from a JSON-lines file (one JSON document per line).

    Args:
        file: path of the UTF-8 encoded JSON-lines file.
        thresh: maximum number of records to return; ``None`` reads the
            whole file.
        k: optional key; when given, only ``record[k]`` is kept.
        func: optional callable applied to ``record[k]``. It is only used
            when ``k`` is also given.

    Returns:
        A list with at most ``thresh`` entries, in file order.
    """
    rzlt = []
    with open(file, "r", encoding='utf-8') as f:
        # Iterate lazily so that a small ``thresh`` does not load the whole file.
        for l in f:
            if thresh is not None and len(rzlt) >= thresh:
                break
            if not l.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue

            record = json.loads(l)
            if k is not None:
                record = record[k]
                if func is not None:
                    record = func(record)
            rzlt.append(record)

    return rzlt


def json_iter(file, batch_size=1000, k=None, func=None, drop_last=True):
    """Yield successive batches of records from a JSON-lines file.

    Args:
        file: path of the UTF-8 encoded JSON-lines file.
        batch_size: number of records per yielded list.
        k: optional key; when given, only ``record[k]`` is kept.
        func: optional callable applied to ``record[k]``. It is only used
            when ``k`` is also given.
        drop_last: when True (the default, matching the historical
            behaviour) a trailing batch smaller than ``batch_size`` is
            discarded; when False it is yielded as a final, shorter batch.

    Yields:
        Lists of ``batch_size`` records (the last one may be shorter when
        ``drop_last`` is False).
    """
    with open(file, "r", encoding='utf-8') as f:
        rzlt = []
        # Stream the file instead of materialising every line up front.
        for l in f:
            if not l.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue

            record = json.loads(l)
            if k is not None:
                record = record[k]
                if func is not None:
                    record = func(record)
            rzlt.append(record)

            if len(rzlt) == batch_size:
                yield rzlt
                rzlt = []

        if rzlt and not drop_last:
            yield rzlt

def func_pad(sent):
    """Convert a sentence into a fixed-length list of vocabulary ids.

    The sentence is tokenised once with ``jieba.cut``, each token is looked
    up in the module-level ``vocab``, and the result is right-padded with 0
    up to the module-level ``max_sent_len``. Sentences with more than
    ``max_sent_len`` tokens are truncated so that every row has the same
    length and can be stacked into a tensor.

    Args:
        sent: the raw sentence string.

    Returns:
        A tuple ``(ids, n_tokens)`` where ``ids`` has exactly
        ``max_sent_len`` entries and ``n_tokens`` is the number of real
        (non-padding) tokens kept.
    """
    # Tokenise a single time; the original ran jieba three times per sentence.
    token_ids = [vocab.__getitem__(token) for token in jieba.cut(sent)][:max_sent_len]
    n_tokens = len(token_ids)
    return token_ids + [0] * (max_sent_len - n_tokens), n_tokens

def restart_iter(batch_size):
    """Create fresh, aligned batch iterators over the training file.

    Args:
        batch_size: number of records per batch for both iterators.

    Returns:
        ``(x_iter, y_iter)``: the first yields batches of ``func_pad``
        results for the 'sentence' field, the second yields batches of
        label indices (looked up in ``label_rdict``) for the 'label' field.
    """
    def to_label_id(x):
        return label_rdict[x]

    shared = dict(file=trainFile, batch_size=batch_size)
    sentence_batches = json_iter(k='sentence', func=func_pad, **shared)
    label_batches = json_iter(k='label', func=to_label_id, **shared)
    return sentence_batches, label_batches


### devices

In [4]:
# Use the first CUDA device when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

## Data corpus

### Dialog

In [3]:
%ls -lct ~/Studio/dialog_db/chinese_chatbot_corpus-master/clean_chat_corpus/

total 2124000
-rw-r--r--  1 root  staff   21695199 Jun  7 13:48 xiaohuangji.tsv
-rw-r--r--  1 root  staff  465372773 Jun  7 13:48 weibo.tsv
-rw-r--r--  1 root  staff  298597018 Jun  7 13:46 tieba.tsv
-rw-r--r--  1 root  staff  151548740 Jun  7 13:46 subtitle.tsv
-rw-r--r--  1 root  staff    5594328 Jun  7 13:45 qingyun.tsv
-rw-r--r--  1 root  staff   18202714 Jun  7 13:45 ptt.tsv
-rw-r--r--  1 root  staff      34249 Jun  7 13:44 chatterbot.tsv
-rw-r--r--  1 root  staff   85680288 Jun  7 13:44 douban_single_turn.tsv


In [None]:
class Files():
    """Plain attribute container; one attribute per corpus file is attached below."""

    def __init__(self,):
        pass


files = Files()

path = os.path.abspath("../dialog_db/chinese_chatbot_corpus-master/clean_chat_corpus")
file_nms = os.listdir(path)

# Expose each directory entry as files.<name before the first dot> -> full path.
for file_nm in file_nms:
    stem = file_nm.split('.')[0]
    setattr(files, stem, os.path.join(path, file_nm))

In [None]:
# Print the number of lines in every corpus file.
for name in file_nms:
    file = os.path.join(path, name)
    with open(file, 'r', encoding='utf-8') as f:
        # Count while streaming: some of these files are hundreds of MB, so
        # materialising them with readlines() is needlessly expensive.
        num = sum(1 for _ in f)
    print(name, ":", "{:,d}".format(num))


In [None]:
# Preview the first 21 lines of the PTT corpus.
cnt = 0
with open(files.ptt, 'r') as f:
    # Iterate lazily; only the first few lines are needed, not the whole file.
    for line in f:
        print(line)
        cnt += 1
        if cnt > 20:
            break

### Classification data filename

In [5]:
rel_path = "../nlp_db/tnews_public"

# Absolute path of every entry of the data directory, in os.listdir order.
cfiles = [os.path.join(os.path.abspath(rel_path), entry)
          for entry in os.listdir(rel_path)]
cfiles

['/home/ubuntu/Studio/nlp_db/tnews_public/labels.json',
 '/home/ubuntu/Studio/nlp_db/tnews_public/test.json',
 '/home/ubuntu/Studio/nlp_db/tnews_public/train.json',
 '/home/ubuntu/Studio/nlp_db/tnews_public/vocab.txt',
 '/home/ubuntu/Studio/nlp_db/tnews_public/.ipynb_checkpoints',
 '/home/ubuntu/Studio/nlp_db/tnews_public/dev.json']

In [6]:
# Select the data files by name. os.listdir returns entries in arbitrary
# order, so positional indexing into the listing can silently pick the wrong
# file (and vocab.txt may not exist yet when it is about to be generated).
data_dir = os.path.abspath(rel_path)
testFile = os.path.join(data_dir, 'test.json')
trainFile = os.path.join(data_dir, 'train.json')
vocabFile = os.path.join(data_dir, 'vocab.txt')

testFile, trainFile, vocabFile

('/home/ubuntu/Studio/nlp_db/tnews_public/test.json',
 '/home/ubuntu/Studio/nlp_db/tnews_public/train.json',
 '/home/ubuntu/Studio/nlp_db/tnews_public/vocab.txt')

In [6]:
# print(cfiles[4]),list(read_json(cfiles[2],100,'sentence', lambda x: list(jieba.cut(x))))

### Vocab Preprocess

In [7]:
# Semantic, Parms and Vocab are project-local classes imported from `fields`.
# NOTE(review): Parms appears to be a settings container (attributes are
# attached to `args` in later cells) and Vocab a token -> id mapping built on
# top of Semantic — confirm against fields.py.
semantic = Semantic()
args = Parms()
vocab = Vocab(semantic)

In [10]:
# args.path = "../nlp_db/tnews_public"
# Remember the vocabulary file location on args; the vocabulary is built at
# and loaded from this path.
args.vocab_path = vocabFile

In [11]:
# Set to False to reuse an existing vocabulary file instead of rebuilding it.
rebuild_vocab = True

# Only delete the file when it exists; an unconditional os.remove raises
# FileNotFoundError on a fresh checkout.
if rebuild_vocab and os.path.isfile(args.vocab_path):
    os.remove(args.vocab_path)

if not os.path.isfile(args.vocab_path):
    _make_vocab(json_file = trainFile, vocab_path = args.vocab_path, thres=2, level = 'word')
    # or just make new vocab
    # char level ?
    # or chinese word level | with jieba

try:
    vocab.load(args.vocab_path)
except Exception as err:
    # Keep the notebook running, but surface why loading failed instead of
    # swallowing every exception (including KeyboardInterrupt) silently.
    print("Vocab not loaded:", repr(err))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.635 seconds.
Prefix dict has been built successfully.


In [12]:
# Sanity check: vocabulary size followed by the ids of a few probe tokens.
(vocab.size, *[vocab.__getitem__(probe) for probe in ('吃', '<pad>', '<unk>', '<sos>')])

(16718, 1061, 0, 3, 1)

### Data Process => Model Parms Get

In [13]:
# Longest training sentence measured in jieba tokens; this is the padded
# sequence length used for every batch.
sentence_lengths = read_json(trainFile, 60000, k='sentence',
                             func=lambda x: len(list(jieba.cut(x))))
max_sent_len = max(sentence_lengths)
args.max_sent_len = max_sent_len

In [14]:
# Resolve labels.json by name next to the training file rather than by its
# position in a directory listing, whose order is not guaranteed.
labels_file = os.path.join(os.path.dirname(trainFile), 'labels.json')
labels = read_json(labels_file, 100, k='label')

# label -> class index, and class index -> label.
label_rdict = {l: i for i, l in enumerate(labels)}
label_dict = dict(enumerate(labels))

# Model hyper-parameters derived from the data or fixed by hand.
args.class_num = len(labels)

args.max_sent_len = max_sent_len
args.lstm_step_num = 2    # passed to nn.LSTM as the number of stacked layers
args.lstm_hid = 64        # LSTM hidden size

In [15]:
# Number of sentences per training batch.
args.batch_size = 1000
# dirrm comes from the `utils` star-import; it presumably displays the
# attributes currently set on args — TODO confirm against utils.py.
dirrm(args)

{'batch_size': 1000,
 'class_num': 15,
 'exts': ['.en.atok', '.de.atok'],
 'lstm_hid': 64,
 'lstm_step_num': 2,
 'max_dec_num': 50,
 'max_enc_num': 50,
 'max_sent_len': 81,
 'modes': ['train', 'val', 'test2016'],
 'n_sent': 5,
 'ndev': 1,
 'path': './data/multi30k/',
 'vocab_path': '/home/ubuntu/Studio/nlp_db/tnews_public/vocab.txt'}

### Batch Data Dev

In [None]:
# emb = nn.Embedding(
#     vocab.size,
#     embedding_dim=128,
#     padding_idx=0,
# )

# x_iter, y_iter = restart_iter(args.batch_size)

# cnt = 0
# for x, y in zip(x_iter, y_iter):
# #     print(list(zip(*[x])))
#     batch_x, sent_len = list(zip(*x))
#     cnt += 1
    
#     if cnt > 0:
#         break

# np.array([np.array(line) for line in batch_x])



### Forward, Loss

In [16]:
class NLU_Classify(nn.Module):
    """Sentence classifier: embedding -> stacked LSTM -> linear layer.

    Args:
        class_num: number of output classes.
        vocab: object exposing ``size`` (number of rows of the embedding).
        args: settings object exposing ``batch_size``, ``lstm_hid`` (LSTM
            hidden size) and ``lstm_step_num`` (number of stacked LSTM layers).
    """

    def __init__(self, class_num, vocab, args):
        super(NLU_Classify, self).__init__()
        self.type = 'classifier'
        self.batch_size = args.batch_size
        self.serial_len = args.lstm_step_num
        self.emb = nn.Embedding(vocab.size, embedding_dim=128)
        self.lstm = nn.LSTM(128,
                            args.lstm_hid,
                            args.lstm_step_num,
                            batch_first=True)
        # Size the classifier from the configured hidden size instead of a
        # hard-coded 64 so that changing args.lstm_hid keeps the model valid.
        self.fc = nn.Linear(args.lstm_hid, class_num)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x, sent_lengths):
        """Return unnormalised class scores (logits) for a batch.

        Args:
            x: LongTensor of token ids, shape (batch, seq_len), padded.
            sent_lengths: number of real tokens in each row of ``x``.

        Returns:
            Tensor of shape (batch, class_num) holding logits. They are NOT
            passed through softmax: ``CrossEntropyLoss`` applies log-softmax
            itself, so normalising here would squash the gradients. Use
            ``predict_proba`` when probabilities are needed.
        """
        embedded_x = self.emb(x)
        # pack_padded_sequence requires the lengths to live on the CPU.
        lengths = torch.as_tensor(sent_lengths).cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded_x,
                                                            lengths,
                                                            enforce_sorted=False,
                                                            batch_first=True)
        # Let the LSTM start from its default zero state. This works for any
        # batch size and device, and keeps inference deterministic (the
        # previous random h0/c0 were sized by a fixed batch_size and relied
        # on notebook globals).
        _, (hidden, _) = self.lstm(packed_embedded)
        # Final hidden state of the top LSTM layer, one row per sentence.
        return self.fc(hidden[-1])

    def predict_proba(self, x, sent_lengths):
        """Return class probabilities (softmax over the logits of ``forward``)."""
        return self.softmax(self.forward(x, sent_lengths))
    
    
# Unit test:
# x_iter, y_iter = restart_iter(args.batch_size)
# cnt = 0
# for batch_x, batch_y in zip(x_iter, y_iter):
#     batch_x, sent_len = list(zip(*batch_x))
#     batch_x = torch.tensor(np.array([np.array(line) for line in batch_x]))
#     sent_lengths = torch.tensor(sent_len)
#     batch_y = torch.tensor(batch_y)
    
#     cnt += 1
#     if cnt > 0:
#         break

# sent_lengths.shape, batch_y.shape

# y_hat = model(batch_x, sent_lengths)
# y_hat.shape

# loss_func(y_hat, batch_y)

In [17]:
# Build the classifier and place it on the selected device
# (nn.Module.to moves the parameters in place and returns the module itself).
model = NLU_Classify(class_num=args.class_num, vocab=vocab, args = args).to(device)

# Multi-class objective and plain SGD with momentum.
loss_func = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01, momentum=0.9)



## Train

In [None]:

# Tasks

# Algorithms

# Model State Monitoring and Recording



In [63]:
# - tensorboard with metrics? what are metrics - knowledge base
# - cuda device
# - save & restore models
# - try other models rather than lstm
# - train vs. eval mode

In [19]:
from tensorboardX import SummaryWriter
# SummaryWriter encapsulates everything: scalars logged through `writer` are
# written as event files under 'runs/exp-2'.
# NOTE(review): `comment` is documented as a suffix for the auto-generated log
# directory, so it likely has no effect when a log directory is passed
# explicitly — confirm against the tensorboardX docs.
writer = SummaryWriter('runs/exp-2', comment = 'lstm')

In [55]:
# Show the lines currently stored in the manual training log.
with open("manual_trainLog.log", 'r') as f:
    l = list(f)
l

[]

In [54]:
# Trained parameters: model.state_dict() is an ordered mapping from parameter
# name to tensor; display only the shapes.
{name: tensor.shape for name, tensor in model.state_dict().items()}

{'emb.weight': torch.Size([16718, 128]),
 'lstm.weight_ih_l0': torch.Size([256, 128]),
 'lstm.weight_hh_l0': torch.Size([256, 64]),
 'lstm.bias_ih_l0': torch.Size([256]),
 'lstm.bias_hh_l0': torch.Size([256]),
 'lstm.weight_ih_l1': torch.Size([256, 64]),
 'lstm.weight_hh_l1': torch.Size([256, 64]),
 'lstm.bias_ih_l1': torch.Size([256]),
 'lstm.bias_hh_l1': torch.Size([256]),
 'fc.weight': torch.Size([15, 64]),
 'fc.bias': torch.Size([15])}

In [None]:
# Training cell: optionally resume from a checkpoint, then train epoch by
# epoch until the mean training accuracy of an epoch reaches TARGET_ACC or
# MAX_EPOCHS epochs have been run.
os.makedirs('./model_stores', exist_ok=True)

args.model_path = './model_stores/model.pth'

first_train = False

TARGET_ACC = 0.8    # stop once an epoch's mean training accuracy reaches this
MAX_EPOCHS = 100    # safety cap so the loop ends even if the target is never met
LOG_EVERY = 20      # batches between log lines / TensorBoard points / checkpoints

# Load (map_location lets a GPU checkpoint be restored on a CPU-only machine):
if os.path.isfile(args.model_path) and not first_train:
    model.load_state_dict(torch.load(args.model_path, map_location=device))

last_epoch = 1
epoch = last_epoch
epochs_run = 0
global_step = 0         # monotonically increasing x-axis for TensorBoard
epoch_avg_acc = 0.0

while epoch_avg_acc < TARGET_ACC and epochs_run < MAX_EPOCHS:
    epoch += 1
    epochs_run += 1
    model.train()    # make sure the model is in training mode for every epoch
    x_iter, y_iter = restart_iter(args.batch_size)

    window_loss, window_acc = [], []    # values since the last log line
    epoch_loss, epoch_acc = [], []      # values for the whole epoch
    for ep_cnt, (batch_x, batch_y) in enumerate(zip(x_iter, y_iter), start=1):
        # Each element of batch_x is a (padded ids, length) pair.
        batch_x, sent_lengths = zip(*batch_x)

        batch_x = torch.tensor(np.array(batch_x), dtype=torch.long, device=device)
        # Lengths stay on the CPU: pack_padded_sequence requires CPU lengths.
        sent_lengths = torch.tensor(sent_lengths, dtype=torch.long)
        batch_y = torch.tensor(batch_y, dtype=torch.long, device=device)

        optimizer.zero_grad()
        y_hat = model(batch_x, sent_lengths)
        loss = loss_func(y_hat, batch_y)

        loss.backward()
        optimizer.step()
        global_step += 1

        # Store plain floats: keeping the loss tensors would retain every
        # batch's autograd graph and cannot be converted to numpy on CUDA.
        batch_loss = loss.item()
        batch_acc = float(acc(y_hat.detach(), batch_y))
        window_loss.append(batch_loss)
        window_acc.append(batch_acc)
        epoch_loss.append(batch_loss)
        epoch_acc.append(batch_acc)

        if ep_cnt % LOG_EVERY == 0:
            last_loss = sum(window_loss) / len(window_loss)
            last_avgac = sum(window_acc) / len(window_acc)
            print(epoch, "loss: ", last_loss, "Acc: ", last_avgac)

            writer.add_scalar('loss:', last_loss, global_step)
            writer.add_scalar('avg acc:', last_avgac, global_step)

            window_loss, window_acc = [], []

            # Save:
            torch.save(model.state_dict(), f=args.model_path)

    if not epoch_acc:
        raise RuntimeError(
            "No full batch of size {} could be read from the training file".format(
                args.batch_size))

    epoch_avg_loss = sum(epoch_loss) / len(epoch_loss)
    epoch_avg_acc = sum(epoch_acc) / len(epoch_acc)
    last_epoch = epoch

    # file.write takes a single string; the newline is part of the message.
    with open('./manuLog_lstm1.log', 'a') as af:
        af.write("epoch: {}, loss: {},train_avg_acc: {}\n".format(
            last_epoch, epoch_avg_loss, epoch_avg_acc))

In [None]:
# def predict_to_sent(result):
#     return [''.join([vocab.vocab_rdict[tkid.tolist()] for tkid in line]) for line in result ]