Experiment notebook for testing the code on Google Colaboratory.

* https://github.com/m3yrin/nar-latent-alignment

In [None]:
# Download the project code and make the checkout the working directory,
# so that relative paths and the `src.*` imports used later resolve against it.
!git clone https://github.com/m3yrin/nar-latent-alignment.git
%cd nar-latent-alignment
!ls

## Installation

In [None]:
# Install the dependencies listed in the repository's requirements.txt.
!pip install -r requirements.txt

## Download dataset

In [None]:
# Clone the small_parallel_enja corpus into datasets/, list its contents,
# then step back up to the parent directory.
%cd datasets
!git clone https://github.com/odashi/small_parallel_enja.git
!ls small_parallel_enja
%cd ../

In [None]:
# Display the installed AllenNLP version (echoed as the cell's last expression).
import allennlp
allennlp.__version__

In [None]:
import itertools

import torch
import torch.optim as optim
from allennlp.data.iterators import BucketIterator, BasicIterator
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers.character_tokenizer import CharacterTokenizer
from allennlp.data.tokenizers.word_tokenizer import WordTokenizer
from allennlp.data.vocabulary import Vocabulary
from allennlp.nn.activations import Activation
from allennlp.modules.attention import LinearAttention, BilinearAttention, DotProductAttention
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper, StackedSelfAttentionEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.predictors import SimpleSeq2SeqPredictor
from allennlp.training.trainer import Trainer

In [None]:
# Extra symbol that is added to the target-side vocabulary after it is built.
SPECIAL_BLANK_TOKEN = "@@BLANK@@"

# Vocabulary cut-off and embedding width.
embedding_dim = 128
min_count = 2

# Optimisation settings passed to the optimizer, iterator and trainer.
batch_size = 256
lr = 0.001
num_epochs = 50
patience = 2

In [None]:
from src.data.dataset_readers.tanaka_corpus_reader import TanakaCorpusReader

# A single reader instance is shared by all three splits.
reader = TanakaCorpusReader()

# Read the splits in the order train -> dev -> test.
train_dataset, valid_dataset, test_dataset = (
    reader.read(split_path)
    for split_path in (
        "datasets/small_parallel_enja/train",
        "datasets/small_parallel_enja/dev",
        "datasets/small_parallel_enja/test",
    )
)

In [None]:
from allennlp.data.vocabulary import Vocabulary

# Build the vocabulary from the training split only, applying the same
# frequency cut-off to the source and target namespaces.
vocab = Vocabulary.from_instances(
    train_dataset,
    min_count=dict.fromkeys(('source_tokens', 'target_tokens'), min_count),
)

In [None]:
# Add the blank symbol to the target-side namespace and display the value
# returned by the vocabulary (kept as `blank_idx`).
# NOTE(review): presumably this is the CTC blank used by the model - confirm.
blank_idx = vocab.add_token_to_namespace(SPECIAL_BLANK_TOKEN, namespace = 'target_tokens')
blank_idx

In [None]:
# Sanity check: print one training instance to inspect its fields.
print(train_dataset[2])

In [None]:
# Source-side embedding table wrapped in a text-field embedder under the key "tokens".
# NOTE(review): the table is sized from the 'tokens' namespace, while the
# vocabulary cells use 'source_tokens' / 'target_tokens'. Verify which namespace
# TanakaCorpusReader's source indexer writes to; if it is 'source_tokens', this
# size is wrong and lookups may go out of range.
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=embedding_dim)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

In [None]:
# Print the vocabulary summary to check the namespaces it contains.
print(vocab)

In [None]:
# Instantiate the project's model from the vocabulary and the embedder.
# The module and class names are spelled as they are in the project; do not "fix" them here.
from src.models.latent_alingment_ctc import LatentAignmentCTC
model = LatentAignmentCTC(vocab, word_embeddings)

In [None]:
# Use GPU 0 when CUDA is available; -1 means "stay on the CPU".
cuda_device = 0 if torch.cuda.is_available() else -1
if cuda_device >= 0:
    model = model.cuda(cuda_device)

In [None]:
# Adam over all model parameters with the configured learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=lr)

# Batches are bucketed by the number of source tokens.
iterator = BucketIterator(
    sorting_keys=[("source_tokens", "num_tokens")],
    batch_size=batch_size,
)
# Attach the vocabulary so the iterator can index the instances it yields.
iterator.index_with(vocab)

In [None]:
# Assemble the trainer: model and optimisation objects first, then the data,
# then the run settings. Output is written under 'tmp'.
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    cuda_device=cuda_device,
    train_dataset=train_dataset,
    validation_dataset=valid_dataset,
    num_epochs=num_epochs,
    patience=patience,
    serialization_dir='tmp',
)
trainer.train()

In [None]:
# Restore the weights stored in tmp/best.th into the model.
# map_location="cpu" deserialises the tensors onto the CPU first, so a
# checkpoint saved from a GPU run also loads on a runtime without CUDA;
# load_state_dict then copies the values into the model's existing parameters
# on whatever device they currently live.
with open("tmp/best.th", 'rb') as f:
    model.load_state_dict(torch.load(f, map_location="cpu"))
if cuda_device > -1:
    model.cuda(cuda_device)
# Switch to evaluation mode; the trailing semicolon suppresses the notebook's
# echo of the returned module.
model.eval();

In [None]:
# Wrap the model and the dataset reader in the project's predictor.
from src.predictor.tanaka_corpus_predictor import TanakaCorpusPredictor
predictor = TanakaCorpusPredictor(model, dataset_reader=reader)

In [None]:
# Run the predictor on one space-separated Japanese sentence and display the
# "predicted_tokens" entry of its output.
predictor.predict("私 は テニス 部員 で す 。")["predicted_tokens"]