## Imports

In [1]:
import jieba, json, os, re, sys, time
from datetime import datetime
import numpy as np
import tensorboardX
import torch
import torch.nn as nn
import shutil

## Functions

In [None]:
from fields import Field, Parms, Semantic, Vocab, _make_vocab
from utils import *

from nlp_db import nlp_db

from model_class import NLU_Classify

semantic = Semantic()
args = Parms()
vocab = Vocab(semantic)

In [2]:
args.manul_log = './manuaLog_lstm1.log'

NameError: name 'args' is not defined

In [3]:
def read_json(file, thresh=np.infty, k=None, func=None):

    with open(file, "r", encoding='utf-8') as f:
        rzlt = []
        cnt = 0
        for l in f.readlines():

            if k != None and func != None:
                rzlt.append(func(json.loads(l)[k]))

            elif k != None:
                rzlt.append(json.loads(l)[k])

            else:
                rzlt.append(json.loads(l))

            if cnt > thresh:
                break

    return rzlt


def json_iter(file, batch_size=1000, k=None, func=None):
    with open(file, "r", encoding='utf-8') as f:
        rzlt = []
        for l in f.readlines():
            if k != None and func != None:
                rzlt.append(func(json.loads(l)[k]))

            elif k != None:
                rzlt.append(json.loads(l)[k])

            else:
                rzlt.append(json.loads(l))

            if len(rzlt) == batch_size:

                yield rzlt
                rzlt = []
                
def func_pad(sent, max_sent_len):
    return [vocab.__getitem__(token) for token in jieba.cut(sent)
            ] + [0] * (max_sent_len - len(list(jieba.cut(sent)))) , len(list(jieba.cut(sent)))



def acc(y_hat, y_label):
    correct = (torch.argmax(y_hat, dim = 1) == y_label).float()
    acc_rate = correct.sum() / len(correct)
    
    return acc_rate

def dump_log():
    with open('./manuLog_lstm1.log', 'a') as fp:
        json.dump(
            {
                "epoch": last_epoch,
                "loss": last_loss.data.item(),
                "train_avg_acc": last_avgac.data.item(),
                "dev_avg_acc": dev_acc.data.item()
            }, fp)
        fp.write('\n')

    with open('./manuLog_lstm1.log', 'r') as fp:
        last_line = fp.readlines()[-10:]
    
    with open('./manuLog_lstm1.log', 'w') as fp:
        fp.write(last_line)

In [4]:
with open('./manuLog_lstm1.log', 'r') as fp:
    last_line = fp.readlines()[-10:]

with open('./manuLog_lstm1.log', 'w') as fp:
    fp.write(last_line)

FileNotFoundError: [Errno 2] No such file or directory: './manuLog_lstm1.log'

In [8]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

## Classification Task

### Data Files

In [9]:
rel_path = "../nlp_db/tnews_public"
cfiles = [
    os.path.join(os.path.abspath(rel_path),
                 os.listdir(rel_path)[i])
    for i in range(len(os.listdir(rel_path)))
]
print(cfiles)
testFile = cfiles[1]
trainFile = cfiles[2]
vocabFile = cfiles[3]
devFile = cfiles[-1]
testFile, trainFile, vocabFile, devFile

['/home/ubuntu/Studio/nlp_db/tnews_public/labels.json', '/home/ubuntu/Studio/nlp_db/tnews_public/test.json', '/home/ubuntu/Studio/nlp_db/tnews_public/train.json', '/home/ubuntu/Studio/nlp_db/tnews_public/vocab.txt', '/home/ubuntu/Studio/nlp_db/tnews_public/.ipynb_checkpoints', '/home/ubuntu/Studio/nlp_db/tnews_public/dev.json']


('/home/ubuntu/Studio/nlp_db/tnews_public/test.json',
 '/home/ubuntu/Studio/nlp_db/tnews_public/train.json',
 '/home/ubuntu/Studio/nlp_db/tnews_public/vocab.txt',
 '/home/ubuntu/Studio/nlp_db/tnews_public/dev.json')

### Vocab Preprocess

In [10]:
args.vocab_path = vocabFile

os.remove(args.vocab_path)
if not os.path.isfile(args.vocab_path):
    _make_vocab(json_file=trainFile,
                vocab_path=args.vocab_path,
                thres=2,
                level='word')

try:
    vocab.load(args.vocab_path)
except:
    print("Vocab not loaded")
vocab.size, vocab.__getitem__('ÂêÉ'), vocab.__getitem__(
    '<pad>'), vocab.__getitem__('<unk>'), vocab.__getitem__('<sos>')

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.593 seconds.
Prefix dict has been built successfully.


(16718, 1061, 0, 3, 1)

### Data Process => Model Parms Get

In [11]:
max_sent_len = max([
    len(line) for line in read_json(
        trainFile, 60000, k='sentence', func=lambda x: list(jieba.cut(x)))
])
args.max_sent_len = max_sent_len

labels = read_json(cfiles[0], 100, k='label')
label_rdict = {l:i for i,l in enumerate(labels)}
label_dict = {i:l for i,l in enumerate(labels)}


args.class_num = len(labels)

args.max_sent_len = max_sent_len
args.lstm_step_num = 2
args.lstm_hid = 64

args.batch_size = 5000
dirrm(args)

{'batch_size': 5000,
 'class_num': 15,
 'exts': ['.en.atok', '.de.atok'],
 'lstm_hid': 64,
 'lstm_step_num': 2,
 'manul_log': './manuaLog_lstm1.log',
 'max_dec_num': 50,
 'max_enc_num': 50,
 'max_sent_len': 81,
 'modes': ['train', 'val', 'test2016'],
 'n_sent': 5,
 'ndev': 1,
 'path': './data/multi30k/',
 'vocab_path': '/home/ubuntu/Studio/nlp_db/tnews_public/vocab.txt'}

### Batch Data Dev

## Algorithm Process

### Forward, Loss

In [12]:
class NLU_Classify(nn.Module):
    def __init__(self, class_num, vocab, args):
        super(NLU_Classify, self).__init__()
        self.type = 'classifier'
        self.batch_size = args.batch_size
        self.serial_len = 2
        self.emb = nn.Embedding(vocab.size, embedding_dim=128)
        self.lstm = nn.LSTM(128,
                            args.lstm_hid,
                            args.lstm_step_num,
                            batch_first=True)
        self.fc = nn.Linear(64, class_num)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x, sent_lengths):
        # ? not sure serial_len , batch_size is 100% right
        embedded_x = self.emb(x)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded_x,
                                                    sent_lengths,
                                                    enforce_sorted=False,
                                                    batch_first=True)
        h0 = torch.randn(self.serial_len, self.batch_size, args.lstm_hid, device = device)
        c0 = torch.randn(self.serial_len, self.batch_size, args.lstm_hid, device = device)
        x, (hidden, cn) = self.lstm(packed_embedded, (h0, c0))
        hidden = hidden[-1,:,:]
        output = self.fc(hidden)
        output = self.softmax(output)
        result = output

        return result

### model, loss, optimizer - Definition

In [13]:
model = NLU_Classify(class_num=args.class_num, vocab=vocab, args = args)
model.to(device)

loss_func = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

### Dev Eval

In [15]:
dev_max_sent_len = max([
    len(line) for line in read_json(
        devFile, k='sentence', func=lambda x: list(jieba.cut(x)))
])
args.dev_max_sent_len = dev_max_sent_len

dirrm(args)

{'batch_size': 5000,
 'class_num': 15,
 'dev_max_sent_len': 72,
 'exts': ['.en.atok', '.de.atok'],
 'lstm_hid': 64,
 'lstm_step_num': 2,
 'max_dec_num': 50,
 'max_enc_num': 50,
 'max_sent_len': 81,
 'modes': ['train', 'val', 'test2016'],
 'n_sent': 5,
 'ndev': 1,
 'path': './data/multi30k/',
 'vocab_path': '/home/ubuntu/Studio/nlp_db/tnews_public/vocab.txt'}

### Training

- Evaluation
- Start from last checkpoint
- Matrix Capture
- Stop Rules ?


In [78]:
def get_last_epoch():
    with open(args.manual_log, 'r') as f:
        l = f.readlines()[-1]
        last_epoch = json.loads(l.strip())['epoch']
        
    return last_epoch

In [63]:
def restart_iter(batch_size, datafile):
    x_iter = json_iter(
        file=datafile,
        batch_size=batch_size,
        k='sentence',
        func=lambda sent: func_pad(sent, max_sent_len=args.max_sent_len))

    y_iter = json_iter(file=datafile,
                       batch_size=batch_size,
                       k='label',
                       func=lambda x: label_rdict[x])

    return x_iter, y_iter

In [64]:
# Prepare for Eval
eval_x, eval_sent_lengths = list(
    zip(*read_json(
        devFile,
        k='sentence',
        thresh=np.infty,
        func=lambda sent: func_pad(sent, max_sent_len=args.dev_max_sent_len))))

eval_y = read_json(file=devFile, k='label', func=lambda x: label_rdict[x])

eval_sent_lengths = torch.tensor(eval_sent_lengths)
eval_x = torch.tensor(np.array([np.array(line) for line in eval_x]))
eval_y = torch.tensor(eval_y)

eval_x = eval_x.to(device)
eval_sent_lengths = eval_sent_lengths.to(device)
eval_y = eval_y.to(device)

In [None]:
# Training 

if not os.path.isdir('./model_stores'):
    os.mkdir('./model_stores')

args.model_path = './model_stores/model1.pth'
first_train = True

# Load:
if os.path.isfile(args.model_path) and first_train == False:
    model.load_state_dict(torch.load(args.model_path))
    model.train()  # set model to train mode


if first_train:
    last_epoch = 1
    try:
        shutil.rmtree(os.path.abspath('./runs/')) 
        os.remove(os.path.abspath(args.manul_log))
    except:
        pass
else:
    last_epoch = get_last_epoch()

    

writer_train = tensorboardX.SummaryWriter('runs/train_0')
writer_test = tensorboardX.SummaryWriter('runs/test_0')
writer = tensorboardX.SummaryWriter('runs/net_0')
writer.add_graph(model, eval_x)


epoch = last_epoch
if not "acc_rates" in locals():
    acc_rates = [0] * 10

while True:
    # while np.array(acc_rates).sum() / len(acc_rates) < 0.8:
    epoch += 1
    x_iter, y_iter = restart_iter(args.batch_size, trainFile)

    ep_cnt = 0
    acc_loss = []
    acc_rates = []
    for batch_x, batch_y in zip(x_iter, y_iter):
        model.train()
        batch_x, sent_lengths = list(zip(*batch_x))

        batch_x = torch.tensor(np.array([np.array(line) for line in batch_x]))
        sent_lengths = torch.tensor(sent_lengths)
        batch_y = torch.tensor(batch_y)

        batch_x = batch_x.to(device)
        sent_lengths = sent_lengths.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        y_hat = model(batch_x, sent_lengths)
        loss = loss_func(y_hat, batch_y)

        loss.backward()
        # loss.backward(retain_graph=True)
        optimizer.step()

        acc_rate = acc(y_hat, batch_y)

        ep_cnt += 1
        acc_loss.append(loss)
        acc_rates.append(acc_rate)
        if ep_cnt % 10 == 0:
            # get metrics
            idx = epoch + 0.32 * (ep_cnt % 20)
            last_loss, last_avgac = np.array(acc_loss).sum() / len(
                acc_loss), np.array(acc_rates).sum() / len(acc_rates)

            print(epoch, "loss: ", last_loss.data, "Acc: ", last_avgac)

            writer_train.add_scalar('loss', last_loss, idx)
            writer_train.add_scalar('train_avgAcc:', last_avgac, idx)

            acc_loss = []
            acc_rates = []

            # Save Model Parameters:
            torch.save(model.state_dict(), f=args.model_path)

            # Eval
            model.eval()

            yhat = model(eval_x, eval_sent_lengths)
            dev_acc = acc(yhat, eval_y)

            print(epoch, "dev_acc: ", dev_acc)

            writer_test.add_scalar('dev_avgAcc', dev_acc.data.item(), idx)

            last_epoch = epoch

            dump_log()

In [None]:
# Model Trained Parameters
# model.state_dict()    # is a ordered dict
# {k: model.state_dict()[k].shape for k in model.state_dict()}