In [1]:
# coding:utf-8
import janome
from janome.tokenizer import Tokenizer

from pathlib import Path
from typing import *
import torch
import torch.optim as optim
import numpy as np
import pandas as pd
from functools import partial
from overrides import overrides

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from allennlp.data import Instance
from allennlp.data.token_indexers import TokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.fields import TextField, LabelField
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.iterators import BucketIterator

from allennlp.nn import util as nn_util

from allennlp.models import Model
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.nn.util import get_text_field_mask
from allennlp.modules.text_field_embedders import TextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder

from allennlp.training.trainer import Trainer

from allennlp.predictors.sentence_tagger import SentenceTaggerPredictor
from allennlp.predictors.predictor import Predictor
from allennlp.common import JsonDict

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
SEED = 1
BATCH_SIZE=2
LR = 3e-4
EPOCHS = 2
HIDDEN_SZ = 64
MAX_SEQ_LEN = 100
MAX_VOCAB_SIZE = 100000

USE_GPU = torch.cuda.is_available()
DATA_ROOT = "./data/"

# データの読み込み

In [3]:
class MyReader(DatasetReader):
    def __init__(
        self, 
        tokenizer:Callable[[str], List[str]]=lambda x: x.split(),
        token_indexers:Dict[str, TokenIndexer]=None,
        max_seq_len:Optional[int]=MAX_SEQ_LEN
    ) -> None:
        super().__init__(lazy=False)
        self.tokenizer = tokenizer
        self.token_indexers = token_indexers or {"tokens":SingleIdTokenIndexer()}
        self.max_seq_len = max_seq_len
        
    def text_to_instance(self, tokens:List[Token], label:int=None) -> Instance:
        sentence_field = TextField(tokens, self.token_indexers)
        fields = {"tokens":sentence_field}
        if label is not None:
            label_field = LabelField(label, skip_indexing=True)
            fields["label"] = label_field
        return Instance(fields)
    
    def _read(self, data_path:str) -> Iterator[Instance]:
        df = pd.read_csv(data_path, header=None, names=["tokens","label"], sep="\t")
        for i, row in df.iterrows():
            yield self.text_to_instance(
                [Token(x) for x in self.tokenizer(row["tokens"])],
                row["label"]
            )
            

In [4]:
token_indexer = SingleIdTokenIndexer()

j_t = Tokenizer()
def tokenizer(text): 
    return [tok for tok in j_t.tokenize(text, wakati=True)][:MAX_SEQ_LEN]

In [5]:
tokenizer('今日は良い天気ですね。')

['今日', 'は', '良い', '天気', 'です', 'ね', '。']

In [6]:
reader = MyReader(tokenizer = tokenizer, token_indexers={"tokens":token_indexer})

In [7]:
train_dataset = reader.read(Path(DATA_ROOT) / "train_ja.tsv")
val_dataset = reader.read(Path(DATA_ROOT) / "val_ja.tsv")

4it [00:00, 330.21it/s]
4it [00:00, 678.22it/s]


### 読み込み結果確認

In [8]:
train_dataset

[<allennlp.data.instance.Instance at 0x7f8f8d5f0e48>,
 <allennlp.data.instance.Instance at 0x7f8f8ae5eb00>,
 <allennlp.data.instance.Instance at 0x7f8f8ae5efd0>,
 <allennlp.data.instance.Instance at 0x7f8f8076c0b8>]

In [9]:
vars(train_dataset[0].fields["tokens"])

{'tokens': [あなた, を, が, 好き, です, 。],
 '_token_indexers': {'tokens': <allennlp.data.token_indexers.single_id_token_indexer.SingleIdTokenIndexer at 0x7f901844edd8>},
 '_indexed_tokens': None,
 '_indexer_name_to_indexed_token': None}

### Vocabulary作成

In [10]:
vocab = Vocabulary.from_instances(train_dataset, max_vocab_size=MAX_VOCAB_SIZE)

100%|██████████| 4/4 [00:00<00:00, 6668.21it/s]


In [11]:
vocab.get_index_to_token_vocabulary()

{0: '@@PADDING@@',
 1: '@@UNKNOWN@@',
 2: 'が',
 3: 'です',
 4: '。',
 5: '好き',
 6: '私',
 7: 'は',
 8: '嫌い',
 9: 'あなた',
 10: 'を',
 11: 'マイク',
 12: 'マキ',
 13: 'ボブ'}

### バッチ化と番号ふり

In [12]:
iterator = BucketIterator(batch_size=BATCH_SIZE, sorting_keys=[("tokens", "num_tokens")])
iterator.index_with(vocab)

In [13]:
batch = next(iter(iterator(train_dataset)))
batch

{'tokens': {'tokens': tensor([[13,  2,  8,  3,  4,  0],
          [ 9, 10,  2,  5,  3,  4]])}, 'label': tensor([0, 1])}

# モデル定義

In [14]:
class Attn(nn.Module):
    def __init__(self, input_sz:int, nch:int=24) -> None:
        super(Attn, self).__init__()
        self.input_sz = input_sz
        self.main = nn.Sequential(
            nn.Linear(input_sz, nch),
            nn.ReLU(True),
            nn.Linear(nch,1)
        )
        
    def forward(self, 
                encoder_outputs:torch.Tensor # (batch_size, seq_len, hidden_sz(=input_sz))
               ):
        b_size = encoder_outputs.size(0)
        attn_ene = self.main(encoder_outputs.view(-1, self.input_sz)) # (b, s, h) -> (b*s, 1)
        return F.softmax(attn_ene.view(b_size, -1), dim=1).unsqueeze(2) # (b*s, 1) -> (b, s, 1)

In [15]:
class ClassifierWithAttn(Model):
    def __init__(
        self, 
        word_embeddings:TextFieldEmbedder, 
        encoder:Seq2SeqEncoder, # (batch_size, seq_len) -> (batch_size, seq_len. hidden_sz(*2 if bidirectional)) 
        vocab:Vocabulary) -> None:
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder 
        self.attn = Attn(self.encoder.get_output_dim()) # encoder.get_output_dim() = hidden_sz(*2 if bidirectional)
        self.main = nn.Linear(self.encoder.get_output_dim(), 1)
        self.loss = nn.BCEWithLogitsLoss()
        
    def forward(self, tokens:Dict[str, torch.Tensor], label:torch.Tensor = None) -> Dict[str, torch.Tensor]:
        mask = get_text_field_mask(tokens)
        embeddings = self.word_embeddings(tokens)
        encoder_outputs = self.encoder(embeddings, mask) # (batch_size, seq_len, hidden_sz)
        attns = self.attn(encoder_outputs) # (batch_size, seq_len, 1)
        feats = (encoder_outputs * attns).sum(dim=1) # (batch_size, hidden_sz)
        logits = self.main(feats).view(-1) # (batch_size, 1) -> (batch_size, )
        output = {"logits":logits, "attns":attns}
        
        if label is not None:
            loss = self.loss(logits, label.float())
            output["loss"] = loss
        
        return output

In [16]:
token_embedding = Embedding(num_embeddings=MAX_VOCAB_SIZE + 2, embedding_dim=300, padding_index=0)
word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens":token_embedding})

In [17]:
encoder: Seq2SeqEncoder = PytorchSeq2SeqWrapper(nn.LSTM(word_embeddings.get_output_dim(),
                                                       HIDDEN_SZ, bidirectional=True, batch_first=True))

In [18]:
model = ClassifierWithAttn(word_embeddings, encoder, vocab)

In [19]:
if USE_GPU: model.cuda()
else: model

### モデル動作確認

In [20]:
batch = nn_util.move_to_device(batch, 0 if USE_GPU else -1)

In [21]:
tokens = batch["tokens"]
label = batch["label"]

In [22]:
tokens

{'tokens': tensor([[13,  2,  8,  3,  4,  0],
         [ 9, 10,  2,  5,  3,  4]])}

In [23]:
mask = get_text_field_mask(tokens)
mask

tensor([[1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1]])

In [24]:
embeddings = word_embeddings(tokens)
encoder_outputs = encoder(embeddings, mask)

In [25]:
encoder_outputs.shape

torch.Size([2, 6, 128])

In [26]:
encoder.get_output_dim()

128

In [27]:
attns = model.attn(encoder_outputs) # (batch_size, seq_len, 1)
feats = (encoder_outputs * attns).sum(dim=1) # (batch_size, hidden_sz)
logits = model.main(feats) # (batch_size, 1)

In [28]:
attns.shape, feats.shape, logits.shape

(torch.Size([2, 6, 1]), torch.Size([2, 128]), torch.Size([2, 1]))

In [29]:
model(**batch)

{'logits': tensor([0.0153, 0.0156], grad_fn=<ViewBackward>),
 'attns': tensor([[[0.1671],
          [0.1670],
          [0.1668],
          [0.1666],
          [0.1664],
          [0.1661]],
 
         [[0.1670],
          [0.1669],
          [0.1668],
          [0.1667],
          [0.1664],
          [0.1662]]], grad_fn=<UnsqueezeBackward0>),
 'loss': tensor(0.6931, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)}

# 学習

In [30]:
optimizer = optim.Adam(model.parameters(), lr=LR)
iterator = BucketIterator(batch_size=BATCH_SIZE, sorting_keys=[("tokens", "num_tokens")])
iterator.index_with(vocab)

In [31]:
trainer = Trainer(
    model = model,
    optimizer = optimizer,
    iterator = iterator,
    train_dataset = train_dataset,
    cuda_device=0 if USE_GPU else -1,
    num_epochs = EPOCHS
)

In [32]:
metrics = trainer.train()

unable to check gpu_memory_mb(), continuing
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/allennlp2/lib/python3.6/site-packages/allennlp/common/util.py", line 379, in gpu_memory_mb
    encoding='utf-8')
  File "/home/ubuntu/anaconda3/envs/allennlp2/lib/python3.6/subprocess.py", line 356, in check_output
    **kwargs).stdout
  File "/home/ubuntu/anaconda3/envs/allennlp2/lib/python3.6/subprocess.py", line 438, in run
    output=stdout, stderr=stderr)
subprocess.CalledProcessError: Command '['nvidia-smi', '--query-gpu=memory.used', '--format=csv,nounits,noheader']' returned non-zero exit status 9.
loss: 0.6932 ||: 100%|██████████| 2/2 [00:00<00:00,  3.29it/s]
unable to check gpu_memory_mb(), continuing
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/allennlp2/lib/python3.6/site-packages/allennlp/common/util.py", line 379, in gpu_memory_mb
    encoding='utf-8')
  File "/home/ubuntu/anaconda3/envs/allennlp2/lib/python3.6/subprocess.py", line 3

In [33]:
metrics

{'best_epoch': 1,
 'peak_cpu_memory_MB': 1166.188,
 'training_duration': '00:00:01',
 'training_start_epoch': 0,
 'training_epochs': 1,
 'epoch': 1,
 'training_loss': 0.69261634349823,
 'training_cpu_memory_MB': 1166.188}

In [34]:
class MyPredictor(Predictor):
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = dataset_reader.tokenizer

    def predict(self, tokens: str) -> JsonDict:
        return self.predict_json({"tokens" : tokens})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        tokens = json_dict["tokens"]
        return self._dataset_reader.text_to_instance([Token(x) for x in self._tokenizer(tokens)])

In [35]:
pre = MyPredictor(model, reader)

In [36]:
pre.predict("あなたを愛しています。")

{'logits': 0.020878572016954422,
 'attns': [[0.14326539635658264],
  [0.14314626157283783],
  [0.14300484955310822],
  [0.14288753271102905],
  [0.14275595545768738],
  [0.1425764262676239],
  [0.142363503575325]]}