# LSTM Classifier

TODO:
- Save torchtext vocabulary
- Remove <unk> from LABEL vocabulary

## Install dependent libraries
This section installs the required packages. Versions are pinned for reproducibility.

In [None]:
! pip install torchtext==0.5.0
! pip install janome==0.3.10
! pip install attrdict==2.0.1
! pip install tqdm==4.43.0
! pip install tensorboard==2.1.1

## Parameters
Declare the parameters that can be overridden by `papermill`.

In [None]:
# --- Run identification and I/O locations ---
# Run name.
name = "test"
# Directory holding the dataset files.
data_dir = "./data_sample/"
# Root directory for everything this notebook writes.
output_dir = "./output/"

# --- Optimisation ---
batch_size = 32
learning_rate = 5e-5
# Maximum gradient norm used for clipping.
max_grad_norm = 1.0
# Seed for the random number generators.
seed = 1234
# Early-stopping patience, counted in validation rounds.
patience = 3
num_epochs = 30

# --- Model architecture ---
embedding_size = 300
hidden_size = 300
num_layers = 1
dropout = 0
bidirectional = False

In [None]:
import attrdict

# Collect the notebook-level parameters into a single attribute-accessible
# config object, read elsewhere as ``params.<key>``.
params = attrdict.AttrDict(
    dict(
        name=name,
        data_dir=data_dir,
        output_dir=output_dir,
        batch_size=batch_size,
        embedding_size=embedding_size,
        hidden_size=hidden_size,
        learning_rate=learning_rate,
        max_grad_norm=max_grad_norm,
        seed=seed,
        patience=patience,
        num_epochs=num_epochs,
        num_layers=num_layers,
        dropout=dropout,
        bidirectional=bidirectional,
    )
)

# Remove the loose globals that were just copied into ``params`` so that
# later cells cannot read them directly. ``name`` is left defined, exactly as
# before, so any existing reference to that global keeps working.
del (
    data_dir,
    output_dir,
    batch_size,
    embedding_size,
    hidden_size,
    learning_rate,
    max_grad_norm,
    seed,
    patience,
    num_epochs,
    num_layers,
    dropout,
    bidirectional,
)

## Test library

Test all the libraries used in this notebook.

## Preprocessor and tokenizer

In [None]:
from janome.tokenizer import Tokenizer


class PreprocessingTokenizer:
    """Tokenizer backed by janome, exposing a ``tokenize`` callable."""

    def __init__(self):
        self._tokenizer = Tokenizer()

    def tokenize(self, text):
        """Split ``text`` into a list of tokens.

        Args:
            text: the raw string to tokenize.

        Returns:
            A ``list`` of the tokens produced by janome in ``wakati`` mode.
        """
        # Depending on the janome version, ``tokenize`` returns either a list
        # or a generator. Always hand back a real list so callers can take
        # ``len()`` of the result or iterate over it more than once.
        return list(self._tokenizer.tokenize(text, wakati=True))

## Dataset

In [None]:
import torchtext

def build_dataset(data_dir):
    """Load the train/validation TSV files and build the vocabularies.

    Reads ``train.tsv`` and ``valid.tsv`` from ``data_dir``. Each row is
    parsed as two tab-separated columns: the label, then the text.

    Args:
        data_dir: directory containing ``train.tsv`` and ``valid.tsv``.

    Returns:
        Tuple ``(train_ds, val_ds, TEXT, LABEL)``: the two datasets and the
        text/label fields, whose vocabularies are built from the training
        split only.
    """
    tokenizer = PreprocessingTokenizer()

    # The labels are strings, so they need a vocabulary. ``LabelField`` is a
    # non-sequential field without ``<unk>``/``<pad>`` specials, so
    # ``len(LABEL.vocab)`` equals the number of classes seen in training.
    # NOTE(review): without ``<unk>``, a validation label that never appears
    # in the training split is expected to fail loudly instead of being
    # mapped to a catch-all index - confirm this is acceptable for your data.
    LABEL = torchtext.data.LabelField()
    TEXT = torchtext.data.Field(
        tokenize=tokenizer.tokenize,
        use_vocab=True,  # default: True
    )

    train_ds, val_ds = torchtext.data.TabularDataset.splits(
        path=data_dir,
        train="train.tsv",
        validation="valid.tsv",
        format="tsv",
        fields=[("label", LABEL), ("text", TEXT)],
    )

    # Vocabularies come from the training split only, so no information
    # from the validation split leaks into them.
    LABEL.build_vocab(train_ds)
    TEXT.build_vocab(train_ds)

    return train_ds, val_ds, TEXT, LABEL

## Model

In [None]:
import torch


class LSTMClassifier(torch.nn.Module):
    """Embedding -> (optionally stacked / bidirectional) LSTM -> linear classifier.

    Args:
        device: device the model is meant to run on; stored as ``self.device``.
        vocab_size: number of rows in the embedding table.
        target_size: number of output classes (size of the logit vector).
        embedding_size: dimensionality of the token embeddings.
        hidden_size: LSTM hidden state size per direction.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability applied between stacked LSTM layers
            (PyTorch applies it only when ``num_layers > 1``).
        bidirectional: whether to run the LSTM in both directions.
    """

    def __init__(self, device, vocab_size, target_size, embedding_size, hidden_size, num_layers=1, dropout=0, bidirectional=False):
        super().__init__()

        num_directions = 2 if bidirectional else 1
        self.embedding = torch.nn.Embedding(vocab_size, embedding_size)
        # [TODO] Handle the PAD id (padded positions are currently fed to the LSTM).
        self.lstm = torch.nn.LSTM(
            embedding_size,
            hidden_size,
            num_layers=num_layers,
            dropout=dropout,  # previously accepted by __init__ but never forwarded
            bidirectional=bidirectional,
        )
        self.classifier = torch.nn.Linear(num_directions * hidden_size, target_size)

        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.num_directions = num_directions
        self.device = device

    def forward(self, batch):
        """Compute class logits for a batch of token-id sequences.

        Args:
            batch: integer tensor of token ids laid out as (seq_len, batch),
                the default (non batch-first) layout of ``torch.nn.LSTM``.

        Returns:
            Tensor of shape (batch, target_size) holding unnormalised logits.
        """
        emb = self.embedding(batch)

        # No explicit (h_0, c_0): the LSTM then starts from zero states
        # allocated on the same device as its input.
        # final_h: (num_layers * num_directions, batch, hidden_size)
        _, (final_h, _) = self.lstm(emb)

        if self.num_directions == 2:
            # The last two entries are the top layer's forward and backward
            # final states. The backward half of ``output[-1]`` has only seen
            # the last token, so the final states summarise the sequence better.
            features = torch.cat((final_h[-2], final_h[-1]), dim=1)
        else:
            # For a unidirectional LSTM this is identical to ``output[-1]``.
            features = final_h[-1]

        return self.classifier(features)

## Train model

In [None]:
import torch
import numpy as np
import random


def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also switches cuDNN to deterministic mode and turns off its benchmark
    mode.

    Args:
        seed: integer seed applied to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,  # covers the GPU generators
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [None]:
import os
import tqdm


def mkdir(path):
    """Create directory ``path``, including any missing parent directories.

    Does nothing when the directory already exists.

    Args:
        path: directory to create.

    Raises:
        FileExistsError: if ``path`` exists but is not a directory.
    """
    # makedirs creates the intermediate directories too, and exist_ok avoids
    # the check-then-create race of a separate ``os.path.exists`` test.
    os.makedirs(path, exist_ok=True)


def train_model(net, dataloader_dict, train_config):
    """Train ``net`` with per-epoch validation, checkpointing and early stopping.

    Epoch 0 runs the validation phase only, which records the performance of
    the untrained model. Whenever the validation loss improves, the weights
    are written to ``<model_output_dir>/epoch-<n>/model.pth``. Training stops
    once the validation loss has failed to improve more than
    ``train_config.patience`` times in a row.

    Args:
        net: model called as ``net(batch.text)``; its output is treated as
            logits with the classes along dimension 1.
        dataloader_dict: mapping with the keys ``"train"`` and ``"val"``.
            Each value is iterated to obtain batches exposing ``.text`` and
            ``.label`` and must provide ``.dataset`` supporting ``len()``.
        train_config: attribute-style config providing ``device``,
            ``num_epochs``, ``optimizer``, ``criterion``, ``max_grad_norm``,
            ``patience``, ``writer`` and ``model_output_dir``.

    Returns:
        Tuple ``(best_model, best_metric, best_epoch)``. ``best_model`` is
        ``net`` itself with the weights of the best validation epoch loaded
        back in, or ``None`` if the validation loss never improved.
        ``best_metric`` is a dict with the keys ``"loss"``, ``"top1"`` and
        ``"top5"``.
    """
    PHASE_TRAIN = "train"
    PHASE_VAL = "val"

    # Best-so-far bookkeeping, selected by the lowest validation loss.
    best_model = None
    best_state = None
    best_epoch = 0
    best_metric = {"loss": float("infinity"), "top1": 0, "top5": 0}

    # Number of optimizer steps taken so far; used as the TensorBoard step.
    num_iters = 0

    # Number of consecutive validation rounds without improvement.
    num_patience = 0
    stop_early = False

    net.to(train_config.device)

    for epoch in range(train_config.num_epochs+1):
        print("Epoch {}/{}".format(epoch, train_config.num_epochs))
        # One training pass followed by one validation pass.
        for phase in [PHASE_TRAIN, PHASE_VAL]:
            is_train = phase == PHASE_TRAIN

            # Epoch 0 only validates, to measure the untrained model.
            if epoch == 0 and is_train:
                continue

            # The mode changes the behavior of layers such as Dropout.
            if is_train:
                net.train()
            else:
                net.eval()

            epoch_loss = 0
            epoch_corrects = 0
            epoch_topk_corrects = 0

            for batch in tqdm.tqdm(dataloader_dict[phase], disable=True):
                # Move the batch to the configured device (GPU if available).
                inputs = batch.text.to(train_config.device)
                labels = batch.label.to(train_config.device)

                if is_train:
                    train_config.optimizer.zero_grad()

                # Build the autograd graph only while training.
                with torch.set_grad_enabled(is_train):
                    logits = net(inputs)
                    loss = train_config.criterion(logits, labels)
                    _, preds = torch.max(logits, dim=1)
                    # Clamp k so topk cannot fail with fewer than 2 classes.
                    # NOTE(review): this is top-2 accuracy although it is
                    # reported under the "top5" keys, which are kept so
                    # existing dashboards and callers keep working.
                    _, topk_preds = torch.topk(logits, k=min(2, logits.size(1)), dim=1)

                    if is_train:
                        loss.backward()
                        # Clip BEFORE stepping; clipping after the step
                        # cannot influence the update that was just applied.
                        torch.nn.utils.clip_grad_norm_(net.parameters(), train_config.max_grad_norm)
                        train_config.optimizer.step()
                        num_iters += 1

                # Weight the batch loss by the number of examples. The first
                # dimension of ``inputs`` is not reliable for this (it is
                # the sequence length for sequence-first batches), so the
                # count is taken from ``labels``.
                epoch_loss += loss.item() * labels.size(0)
                epoch_corrects += (preds == labels).sum().item()
                epoch_topk_corrects += (topk_preds == labels.unsqueeze(1)).max(dim=1)[0].sum().item()

                # Per-step loss is plotted for the training phase only.
                if is_train:
                    train_config.writer.add_scalars("train/loss", {PHASE_TRAIN: loss.item()}, num_iters)

            num_examples = len(dataloader_dict[phase].dataset)
            epoch_loss = epoch_loss / num_examples
            epoch_acc = epoch_corrects / num_examples
            epoch_topk_acc = epoch_topk_corrects / num_examples

            print("phase {}, loss: {:.4f}, acc: {:.4f}, topk acc: {:.4f}".format(phase, epoch_loss, epoch_acc, epoch_topk_acc))

            if phase == PHASE_VAL:
                train_config.writer.add_scalars("train/loss", {PHASE_VAL: epoch_loss}, num_iters)
                train_config.writer.add_scalars("acc/top1", {PHASE_VAL: epoch_acc}, num_iters)
                train_config.writer.add_scalars("acc/top5", {PHASE_VAL: epoch_topk_acc}, num_iters)

                if best_metric["loss"] > epoch_loss:
                    # ``net`` keeps training after this point, so holding a
                    # reference is not enough: snapshot the weights so the
                    # best ones can be restored before returning.
                    best_state = {key: value.detach().clone() for key, value in net.state_dict().items()}
                    best_model = net
                    best_metric = {"loss": epoch_loss, "top1": epoch_acc, "top5": epoch_topk_acc}
                    best_epoch = epoch
                    num_patience = 0
                    # save model
                    print("Save model, epoch:", epoch)
                    save_dir = os.path.join(train_config.model_output_dir, "epoch-{}".format(epoch))
                    mkdir(save_dir)

                    # [TODO] Save vocab dict
                    torch.save(net.state_dict(), os.path.join(save_dir, "model.pth"))
                else:
                    num_patience += 1
                    print("Patience {}, epoch: {}".format(num_patience, epoch))

                if num_patience > train_config.patience:
                    stop_early = True

        if stop_early:
            break

    # Put the best weights back so the returned model really is the best one.
    if best_state is not None:
        net.load_state_dict(best_state)
    return best_model, best_metric, best_epoch



In [None]:
from torch.utils.tensorboard import SummaryWriter


def train(params):
    """Build the datasets, model and training config, then run training.

    Args:
        params: attribute-style config providing ``name``, ``data_dir``,
            ``output_dir``, ``batch_size``, ``embedding_size``,
            ``hidden_size``, ``learning_rate``, ``max_grad_norm``, ``seed``,
            ``patience``, ``num_epochs``, ``num_layers``, ``dropout`` and
            ``bidirectional``.

    Returns:
        Whatever ``train_model`` returns: the tuple
        ``(best_model, best_metric, best_epoch)``.
    """
    # Fix seed for reproducibility
    set_seed(seed=params.seed)

    # Create the directory for model checkpoints. The run name is taken from
    # ``params`` rather than from a notebook global, and ``makedirs`` also
    # creates ``output_dir`` itself when it does not exist yet.
    model_output_dir = os.path.join(params.output_dir, params.name)
    os.makedirs(model_output_dir, exist_ok=True)

    train_ds, val_ds, TEXT, LABEL = build_dataset(params.data_dir)
    dataloader_dict = {
        # train=True enables shuffling of the training data.
        "train": torchtext.data.Iterator(train_ds, batch_size=params.batch_size, train=True),
        "val": torchtext.data.Iterator(val_ds, batch_size=params.batch_size, train=False, sort=False),
    }
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    net = LSTMClassifier(
        device=device,
        vocab_size=len(TEXT.vocab),
        target_size=len(LABEL.vocab),
        embedding_size=params.embedding_size,
        hidden_size=params.hidden_size,
        num_layers=params.num_layers,
        dropout=params.dropout,
        bidirectional=params.bidirectional,
    )
    writer = SummaryWriter(log_dir=os.path.join(params.output_dir, "runs", params.name))
    train_config = attrdict.AttrDict({
        "model_output_dir": model_output_dir,
        "optimizer": torch.optim.Adam(net.parameters(), lr=params.learning_rate),
        "writer": writer,
        "num_epochs": params.num_epochs,
        "max_grad_norm": params.max_grad_norm,
        "patience": params.patience,
        "device": device,
        "criterion": torch.nn.CrossEntropyLoss(),
    })

    print(net)
    try:
        return train_model(net, dataloader_dict, train_config)
    finally:
        # Flush pending TensorBoard events and release the log file, even
        # when training raises.
        writer.close()

In [None]:
# Run the whole training pipeline. The three results are kept as notebook
# globals so the evaluation cell below can report them.
best_model, best_metric, best_epoch = train(params)

## Evaluate model

In [None]:
# Report the epoch with the lowest validation loss and the metrics recorded
# for it.
print("Best model in epoch", best_epoch)
print("Scores:", best_metric)