# LSTM Classifier

## Install dependent libraries
This section installs the required packages. Versions are pinned for reproducibility.

In [None]:
! pip install torchtext==0.5.0
! pip install janome==0.3.10
! pip install attrdict==2.0.1
! pip install tqdm==4.43.0
! pip install tensorboard==2.1.1

## Test library

Test all the libraries used in this notebook.

## Define parameters
Declare the parameters set by `papermill`.

In [None]:
# Run identity and I/O locations.
name = "test"
data_dir = "./data_sample/"
output_dir = "./output/"

# Model architecture.
embedding_size = 300
hidden_size = 300
num_layers = 1
dropout = 0
bidirectional = False

# Optimisation and training schedule.
batch_size = 32
learning_rate = 5e-5
max_grad_norm = 1.0
num_epochs = 30
patience = 3
seed = 1234

Create an attribute object `_params` from the parameters, then delete the parameter variables to keep this namespace clean.

In [None]:
import attrdict

# Bundle every notebook parameter into a single attribute-style container so
# that later cells read `_params.<key>` instead of loose globals.
_params = attrdict.AttrDict({
    "name": name,
    "data_dir": data_dir,
    "output_dir": output_dir,
    "batch_size": batch_size,
    "embedding_size": embedding_size,
    "hidden_size": hidden_size,
    "learning_rate": learning_rate,
    "max_grad_norm": max_grad_norm,
    "seed": seed,
    "patience": patience,
    "num_epochs": num_epochs,
    "num_layers": num_layers,
    "dropout": dropout,
    "bidirectional": bidirectional,
})

# Remove the loose parameter globals so nothing below can bypass `_params`.
# `name` is deleted as well; it used to be the only parameter left behind.
del (
    name,
    data_dir,
    output_dir,
    batch_size,
    embedding_size,
    hidden_size,
    learning_rate,
    max_grad_norm,
    seed,
    patience,
    num_epochs,
    num_layers,
    dropout,
    bidirectional,
)

## Define preprocessor and tokenizer

In [None]:
from janome.tokenizer import Tokenizer


class PreprocessingTokenizer:
    """Janome-backed tokenizer that also owns a token <-> id vocabulary.

    ``stoi`` (token -> id dict) and ``itos`` (token list ordered by id) only
    exist after ``build_vocab`` or ``load`` has been called.
    """

    def __init__(self):
        self._tokenizer = Tokenizer()

    def tokenize(self, text):
        """Tokenize ``text`` with Janome in ``wakati`` mode and return the result."""
        return self._tokenizer.tokenize(text, wakati=True)

    def encode(self, text):
        """Return the vocabulary ids of the tokens in ``text``.

        Tokens missing from the vocabulary map to the id of ``"<unk>"``.
        """
        return [self.stoi.get(tkn, self.stoi["<unk>"]) for tkn in self.tokenize(text)]

    def decode(self, ids):
        """Concatenate the tokens for ``ids`` into one string (no separator)."""
        return "".join([self.itos[x] for x in ids])

    def build_vocab(self, texts, pad=True, unk=True):
        """Build the vocabulary from ``texts`` and return ``self``.

        Ids are assigned in first-seen order. When enabled, ``"<pad>"`` and
        then ``"<unk>"`` take the lowest ids.
        """
        stoi = dict()
        if pad:
            stoi["<pad>"] = len(stoi)
        if unk:
            stoi["<unk>"] = len(stoi)
        for text in texts:
            for token in self.tokenize(text):
                # The next free id always equals the current vocabulary size.
                stoi.setdefault(token, len(stoi))

        self.stoi = stoi
        self.itos = self._build_itos_from_stoi(stoi)
        return self

    def _build_itos_from_stoi(self, stoi):
        """Return the tokens of ``stoi`` sorted by their id."""
        return [tkn for tkn, idx in sorted(stoi.items(), key=lambda x: x[1])]

    def save(self, save_dir):
        """Write the vocabulary to ``<save_dir>/vocab.txt`` as ``token<TAB>id`` lines."""
        path = os.path.join(save_dir, "vocab.txt")
        # Explicit UTF-8: the locale default encoding may be unable to
        # represent Japanese tokens.
        with open(path, "w", encoding="utf-8") as fd:
            for tkn, idx in self.stoi.items():
                print("{}\t{}".format(tkn, idx), file=fd)

    def load(self, path):
        """Read ``<path>/vocab.txt`` written by ``save`` and return ``self``."""
        stoi = dict()
        path = os.path.join(path, "vocab.txt")
        with open(path, encoding="utf-8") as fd:
            for line in fd:
                # Split on the last tab so a token that itself contains a tab
                # still round-trips.
                tkn, idx = line.strip("\n").rsplit("\t", 1)
                stoi[tkn] = int(idx)
        self.stoi = stoi
        self.itos = self._build_itos_from_stoi(stoi)
        return self

## Define dataset

In [None]:
class LabelMapper:
    """Convert between label strings (emoji) and the integer ids used in training.

    The mappings only exist after ``build`` or ``load`` has been called.
    """

    def __init__(self):
        pass

    def build(self, labels):
        """Assign ids 0..n-1 to the distinct ``labels`` in sorted order; return ``self``."""
        label_to_id = dict()
        id_to_label = dict()

        for id_, label in enumerate(sorted(set(labels))):
            label_to_id[label] = id_
            id_to_label[id_] = label
        self._label_to_id = label_to_id
        self._id_to_label = id_to_label
        return self

    def id(self, label):
        """Return the id of ``label``; raises ``KeyError`` for an unknown label."""
        return self._label_to_id[label]

    def label(self, id):
        """Return the label of ``id``; raises ``KeyError`` for an unknown id."""
        return self._id_to_label[id]

    def labels(self):
        """Return all known labels as a list."""
        return list(self._label_to_id.keys())

    def save(self, path):
        """Write the mapping to ``<path>/label.txt`` as ``id<TAB>label`` lines."""
        path = os.path.join(path, "label.txt")
        # Explicit UTF-8: the labels are emoji, which many locale default
        # encodings cannot represent.
        with open(path, "w", encoding="utf-8") as fd:
            for id_, label in self._id_to_label.items():
                print("{}\t{}".format(id_, label), file=fd)

    def load(self, path):
        """Read ``<path>/label.txt`` written by ``save`` and return ``self``."""
        label_to_id = dict()
        id_to_label = dict()

        path = os.path.join(path, "label.txt")
        with open(path, encoding="utf-8") as fd:
            for line in fd:
                # The id comes first, so split on the first tab only; a label
                # containing a tab then still round-trips.
                id_, label = line.strip("\n").split("\t", 1)
                id_ = int(id_)
                label_to_id[label] = id_
                id_to_label[id_] = label

        self._label_to_id = label_to_id
        self._id_to_label = id_to_label
        return self

In [None]:
import torch


# A Dataset must define __getitem__ and __len__.

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over ``(text, label)`` pairs that encodes text on access."""

    def __init__(self, text_label_list, tokenizer):
        # `tokenizer` only needs an `encode(text)` method.
        self._tokenizer = tokenizer
        self._text_label_list = text_label_list

    def __getitem__(self, index):
        """Return ``(encoded_text, label)`` for the sample at ``index``."""
        sample = self._text_label_list[index]
        text, label = sample
        return self._tokenizer.encode(text), label

    def __len__(self):
        """Return the number of samples."""
        return len(self._text_label_list)
    
class PaddingCollation:
    """Collate function that pads variable-length id sequences into one batch."""

    def __call__(self, ids_label_list):
        """Return ``(padded_ids, labels)`` for a list of ``(ids, label)`` samples.

        Example input: ``[([2, 4609, 3], 1), ([2, 10350, 25746, 28450, 3], 0)]``.
        With ``batch_first=False`` the padded tensor is laid out as
        ``(max_seq_len, batch)``; padding uses ``pad_sequence``'s default value.
        """
        sequences = []
        targets = []
        for sample in ids_label_list:
            sequences.append(torch.LongTensor(sample[0]))
            targets.append(sample[1])
        padded = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=False)
        return padded, torch.LongTensor(targets)


def build_data_loader(fd, label_mapper, tokenizer, batch_size, shuffle):
    """Build a ``DataLoader`` from an open ``label<TAB>text`` file.

    Every line of ``fd`` is read eagerly; the label string is converted to its
    id through ``label_mapper`` and the text is kept raw (``Dataset`` encodes
    it on access). A line without exactly one tab raises ``ValueError``.
    """
    def parse(line):
        label_str, text = line.strip("\n").split("\t")
        return text, label_mapper.id(label_str)

    samples = [parse(line) for line in fd]
    return torch.utils.data.DataLoader(
        Dataset(samples, tokenizer),
        batch_size=batch_size,
        collate_fn=PaddingCollation(),
        shuffle=shuffle,
    )

In [None]:
import torchtext

def build_dataset(data_dir):
    """Load the train/valid/test TSV splits in ``data_dir`` as torchtext datasets.

    Returns ``(train_ds, val_ds, test_ds, TEXT, LABEL)`` where the last two are
    the torchtext fields used for the text and label columns.
    """
    tokenizer = PreprocessingTokenizer()

    # The labels are not integers, so the label field needs its own vocabulary.
    # TODO: that vocabulary contains <unk>, which is not needed and should be
    # removed.
    label_field = torchtext.data.Field(sequential=False, use_vocab=True)
    text_field = torchtext.data.Field(tokenize=tokenizer.tokenize, use_vocab=True)

    # Column order in the TSV files: label first, then text.
    columns = [("label", label_field), ("text", text_field)]
    splits = torchtext.data.TabularDataset.splits(
        path=data_dir,
        train="train.tsv",
        validation="valid.tsv",
        test="test.tsv",
        format="tsv",
        fields=columns,
    )
    train_ds, val_ds, test_ds = splits

    return train_ds, val_ds, test_ds, text_field, label_field

## Define model

In [None]:
import torch


class LSTMClassifier(torch.nn.Module):
    """Embedding -> LSTM -> linear classifier over token-id sequences.

    Inputs are laid out as ``(seq_len, batch)``, matching the LSTM default of
    ``batch_first=False``.
    """

    def __init__(self, vocab_size, target_size, embedding_size, hidden_size, num_layers=1, dropout=0, bidirectional=False, padding_idx=1):
        """Create the layers and remember the constructor arguments for ``save``.

        Args:
            vocab_size: number of rows in the embedding table.
            target_size: number of output classes.
            embedding_size: embedding dimension (LSTM input size).
            hidden_size: LSTM hidden size per direction.
            num_layers: number of stacked LSTM layers.
            dropout: dropout between stacked LSTM layers. It is now passed to
                the LSTM; it used to be stored but never applied.
            bidirectional: whether the LSTM runs in both directions.
            padding_idx: id of the padding token in the embedding table.
        """
        super().__init__()

        num_directions = 2 if bidirectional else 1
        self.embedding = torch.nn.Embedding(vocab_size, embedding_size, padding_idx=padding_idx)
        # TODO: handle padded positions; the LSTM currently also runs over the
        # padding at the end of shorter sequences.
        self.lstm = torch.nn.LSTM(
            embedding_size,
            hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            bidirectional=bidirectional,
        )
        self.classifier = torch.nn.Linear(num_directions * hidden_size, target_size)

        # Constructor arguments, kept so `save` can store them in the checkpoint.
        self.vocab_size = vocab_size
        self.target_size = target_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.bidirectional = bidirectional
        self.padding_idx = padding_idx
        self.num_directions = num_directions

    def init_hidden(self, batch_size):
        """Return zero ``(h_0, c_0)`` tensors for ``batch_size`` sequences.

        Each has shape ``(num_layers * num_directions, batch_size, hidden_size)``.
        They are created on the default device; the caller moves them.
        """
        shape = (self.num_layers * self.num_directions, batch_size, self.hidden_size)
        return torch.zeros(*shape), torch.zeros(*shape)

    def forward(self, batch, h_0, c_0):
        """Return class logits of shape ``(batch, target_size)``.

        Args:
            batch: token ids of shape ``(seq_len, batch)``.
            h_0, c_0: initial LSTM states as returned by ``init_hidden``.
        """
        emb = self.embedding(batch)

        # lstm_output: (seq_len, batch, num_directions * hidden_size)
        lstm_output, _ = self.lstm(emb, (h_0, c_0))

        # Classify from the output at the last time step. NOTE(review): for
        # padded batches this is a padding position for the shorter sequences;
        # see the TODO in __init__.
        return self.classifier(lstm_output[-1])

    def save(self, save_dir):
        """Write the weights and constructor arguments to ``<save_dir>/model.pth``."""
        dic = {
            "state_dict": self.state_dict(),
            "args": {
                "vocab_size": self.vocab_size,
                "target_size": self.target_size,
                "embedding_size": self.embedding_size,
                "hidden_size": self.hidden_size,
                "num_layers": self.num_layers,
                "dropout": self.dropout,
                "bidirectional": self.bidirectional,
                "padding_idx": self.padding_idx,
            }
        }
        torch.save(dic, os.path.join(save_dir, "model.pth"))

    @classmethod
    def load(cls, save_dir):
        """Rebuild a model from ``<save_dir>/model.pth`` written by ``save``.

        The checkpoint is mapped to the CPU so that a model saved from a GPU
        can be loaded on a CPU-only host; the caller moves it to its device.
        """
        checkpoint = torch.load(os.path.join(save_dir, "model.pth"), map_location="cpu")
        net = cls(**checkpoint["args"])
        net.load_state_dict(checkpoint["state_dict"])
        return net

## Build and save vocabulary

In [None]:
import os

def build_vocab(output_dir, data_dir, name):
    """Build the label and token vocabularies from ``train.tsv`` and save them.

    Args:
        output_dir: parent directory for model artefacts.
        data_dir: directory containing ``train.tsv`` (``label<TAB>text`` lines).
        name: run name; the artefacts go to ``<output_dir>/<name>``.

    Returns:
        The model directory the vocabulary files were written to.
    """
    model_dir = os.path.join(output_dir, name)
    # makedirs also creates a missing `output_dir` and tolerates an existing
    # model directory.
    os.makedirs(model_dir, exist_ok=True)

    labels = []
    texts = []
    # Read the file once and close it afterwards.
    with open(data_dir + "/train.tsv") as fd:
        for line in fd:
            # Strip the trailing newline so it does not become part of the
            # text, matching how build_data_loader parses the same file.
            fields = line.rstrip("\n").split("\t")
            labels.append(fields[0])
            texts.append(fields[1])

    # Build label vocabulary
    label_mapper = LabelMapper().build(labels)
    label_mapper.save(model_dir)

    # Build vocabulary
    tokenizer = PreprocessingTokenizer().build_vocab(texts)
    tokenizer.save(model_dir)

    return model_dir

In [None]:
# Build and save the label and token vocabularies from the training data,
# then reload both from the files just written to the model directory.
_model_dir = build_vocab(_params.output_dir, _params.data_dir, _params.name)
_label_mapper = LabelMapper().load(_model_dir)
_tokenizer = PreprocessingTokenizer().load(_model_dir)

In [None]:
# Sanity check: number of labels and the labels themselves, followed by the
# first 100 vocabulary tokens in id order.
print(len(_label_mapper.labels()), _label_mapper.labels())
print(_tokenizer.itos[:100])

## Train and save model

In [None]:
import torch
import numpy as np
import random


def set_seed(seed):
    """Seed Python, NumPy and PyTorch RNGs and set cuDNN to deterministic mode."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,  # used when running on GPU
    )
    for seeder in seeders:
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [None]:
import os
import tqdm


def train_model(model_dir, net, dataloader_dict, train_config):
    """Run the train/validation loop with early stopping on validation loss.

    Epoch 0 skips the training phase, so it only measures the untrained (or
    freshly loaded) model on the validation data. With ``num_epochs=0`` the
    function is therefore a pure evaluation pass.

    Args:
        model_dir: directory the best model is saved to; falsy disables saving.
        net: model called as ``net(inputs, h_0, c_0)`` that also provides
            ``init_hidden(batch_size)`` and ``save(dir)``.
        dataloader_dict: maps ``"train"`` / ``"val"`` to data loaders yielding
            ``(inputs, labels)`` with inputs laid out as ``(seq_len, batch)``.
            ``"train"`` is only read when ``num_epochs > 0``.
        train_config: object with attributes ``device``, ``num_epochs``,
            ``patience``, ``criterion`` and ``writer`` (may be ``None``), plus
            ``optimizer`` and ``max_grad_norm`` when training.

    Returns:
        ``(best_model, best_metric, best_epoch)``. NOTE: ``best_model`` is the
        same object as ``net`` (no copy is made), so after further epochs it
        holds the latest weights; the best weights are the ones saved to
        ``model_dir``.
    """
    PHASE_TRAIN = "train"
    PHASE_VAL = "val"
    TOP_K = 5

    # Keep the best model.
    best_model = None
    best_epoch = 0
    best_metric = {"loss": float("infinity"), "top1": 0, "top5": 0}

    # Number of optimisation steps taken so far (TensorBoard x-axis).
    num_iters = 0

    # Number of consecutive validations without improvement.
    num_patience = 0

    device = train_config.device
    net.to(device)

    for epoch in range(train_config.num_epochs + 1):
        print("Epoch {}/{}".format(epoch, train_config.num_epochs))
        # Training phase followed by validation phase.
        for phase in [PHASE_TRAIN, PHASE_VAL]:
            is_train = phase == PHASE_TRAIN

            # Epoch 0: check validation performance before any training.
            if epoch == 0 and is_train:
                continue

            # Switch the network mode; this affects layers such as dropout.
            if is_train:
                net.train()
            else:
                net.eval()

            epoch_loss = 0
            epoch_corrects = 0
            epoch_topk_corrects = 0

            for inputs, labels in tqdm.tqdm(dataloader_dict[phase], disable=True):
                # Move the batch to the target device (GPU when available).
                inputs = inputs.to(device)
                labels = labels.to(device)

                # inputs is (seq_len, batch), so the batch size is dimension 1.
                batch_size = inputs.size()[1]

                # Initialize hidden states.
                h_0, c_0 = net.init_hidden(batch_size)
                h_0 = h_0.to(device)
                c_0 = c_0.to(device)

                # Reset accumulated gradients.
                if is_train:
                    train_config.optimizer.zero_grad()

                # Build the autograd graph only while training.
                with torch.set_grad_enabled(is_train):
                    logits = net(inputs, h_0, c_0)
                    loss = train_config.criterion(logits, labels)
                    _, preds = torch.max(logits, dim=1)
                    # Clamp k so models with fewer than TOP_K classes work too.
                    _, topk_preds = torch.topk(logits, k=min(TOP_K, logits.size(1)), dim=1)

                    if is_train:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(net.parameters(), train_config.max_grad_norm)
                        train_config.optimizer.step()
                        num_iters += 1

                # Weight the batch loss by the batch size. This used to be
                # inputs.size()[0], which is the sequence length.
                epoch_loss += loss.item() * batch_size
                # Update the number of correct predictions.
                epoch_corrects += (preds == labels).sum().item()
                epoch_topk_corrects += (topk_preds == labels.unsqueeze(1)).max(dim=1)[0].sum().item()

                # Plot the per-step loss to TensorBoard, training phase only.
                if is_train and train_config.writer:
                    train_config.writer.add_scalars("train/loss", {PHASE_TRAIN: loss.item()}, num_iters)

            num_samples = len(dataloader_dict[phase].dataset)
            epoch_loss = epoch_loss / num_samples
            epoch_acc = epoch_corrects / num_samples
            epoch_topk_acc = epoch_topk_corrects / num_samples

            print("phase {}, loss: {:.4f}, acc: {:.4f}, topk acc: {:.4f}".format(phase, epoch_loss, epoch_acc, epoch_topk_acc))

            if is_train:
                continue

            # Everything below runs for the validation phase only.
            if train_config.writer:
                train_config.writer.add_scalars("train/loss", {PHASE_VAL: epoch_loss}, num_iters)
                train_config.writer.add_scalars("acc/top1", {PHASE_VAL: epoch_acc}, num_iters)
                train_config.writer.add_scalars("acc/top5", {PHASE_VAL: epoch_topk_acc}, num_iters)

            if best_metric["loss"] > epoch_loss:
                best_model = net
                best_metric = {"loss": epoch_loss, "top1": epoch_acc, "top5": epoch_topk_acc}
                best_epoch = epoch
                num_patience = 0
                # Save model
                if model_dir:
                    print("Save model")
                    best_model.save(model_dir)
            else:
                num_patience += 1
                print("Patience {}, epoch: {}".format(num_patience, epoch))

            # Early stopping once validation loss has not improved for more
            # than `patience` consecutive epochs.
            if num_patience > train_config.patience:
                return best_model, best_metric, best_epoch
    return best_model, best_metric, best_epoch

In [None]:
from torch.utils.tensorboard import SummaryWriter


def train(params, model_dir):
    """Train an ``LSTMClassifier`` on ``train.tsv`` / ``valid.tsv`` in ``params.data_dir``.

    Args:
        params: attribute container holding the notebook parameters (seed,
            data_dir, output_dir, name, batch_size, model sizes, learning
            rate, ...).
        model_dir: directory holding the saved vocabularies; it is also passed
            to ``train_model`` as the place to save the best model.
    """
    # Fix seed for reproducibility
    set_seed(seed=params.seed)

    # Load vocabs
    label_mapper = LabelMapper().load(model_dir)
    tokenizer = PreprocessingTokenizer().load(model_dir)

    def _load_split(filename, shuffle):
        # build_data_loader reads every line up front, so the file can be
        # closed as soon as it returns.
        with open(params.data_dir + "/" + filename) as fd:
            return build_data_loader(fd, label_mapper, tokenizer, batch_size=params.batch_size, shuffle=shuffle)

    dataloader_dict = {
        "train": _load_split("train.tsv", shuffle=True),
        "val": _load_split("valid.tsv", shuffle=False),
    }
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    net = LSTMClassifier(
        vocab_size=len(tokenizer.itos),
        target_size=len(label_mapper.labels()),
        embedding_size=params.embedding_size,
        hidden_size=params.hidden_size,
        num_layers=params.num_layers,
        dropout=params.dropout,
        bidirectional=params.bidirectional,
        padding_idx=tokenizer.stoi["<pad>"],
    )
    optimizer = torch.optim.Adam(net.parameters(), lr=params.learning_rate)
    writer = SummaryWriter(log_dir=params.output_dir + "/runs/" + params.name)
    try:
        train_config = attrdict.AttrDict({
            "optimizer": optimizer,
            "writer": writer,
            "num_epochs": params.num_epochs,
            "max_grad_norm": params.max_grad_norm,
            "patience": params.patience,
            "device": device,
            "criterion": torch.nn.CrossEntropyLoss()
        })

        print(net)
        train_model(model_dir, net, dataloader_dict, train_config)
    finally:
        # Flush pending TensorBoard events and release the event file, even
        # when training fails.
        writer.close()

In [None]:
# Run training with the notebook parameters and the model directory created
# in the vocabulary-building cell.
train(_params, _model_dir)

## Evaluate the best model

In [None]:
def evaluate(model_dir, data_dir, batch_size):
    """Evaluate the model saved in ``model_dir`` on ``<data_dir>/test.tsv``.

    The evaluation reuses ``train_model`` with ``num_epochs=0``: epoch 0 skips
    the training phase, so only the ``"val"`` phase runs and prints loss and
    accuracy. The test set is therefore registered under the ``"val"`` key.
    """
    # Load vocabs
    label_mapper = LabelMapper().load(model_dir)
    tokenizer = PreprocessingTokenizer().load(model_dir)
    # Load model
    net = LSTMClassifier.load(model_dir)

    # build_data_loader reads every line up front, so the file can be closed
    # as soon as it returns.
    with open(data_dir + "/test.tsv") as fd:
        test_loader = build_data_loader(fd, label_mapper, tokenizer, batch_size=batch_size, shuffle=False)
    dataloader_dict = {
        "val": test_loader,
    }
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    train_config = attrdict.AttrDict({
        "writer": None,
        "num_epochs": 0,
        "patience": 1,
        "device": device,
        "criterion": torch.nn.CrossEntropyLoss(),
    })

    # model_dir=None: nothing is saved during evaluation.
    train_model(None, net, dataloader_dict, train_config)

In [None]:
# Evaluate the saved model on the test split using the notebook batch size.
evaluate(_model_dir, _params.data_dir, _params.batch_size)