In [1]:
# One-time environment setup: uncomment and run these on a fresh machine.
# !pip install allennlp==2.5.0
# !pip install allennlp-models==2.5.0
# !git clone https://github.com/mhagiwara/realworldnlp.git
# %cd realworldnlp

In [2]:
# Enter the realworldnlp checkout; presumably required so that the later
# `examples.sentiment.sst_classifier` import resolves.
# NOTE(review): this absolute path is specific to one machine/user -- adjust it
# (or switch to a relative path) before running the notebook elsewhere.
%cd /home/admin_paulhykim_altostrat_com/realworldnlp

/home/admin_paulhykim_altostrat_com/realworldnlp


In [3]:
import random
from typing import Dict

import numpy as np
import torch
import torch.optim as optim
from allennlp.common.file_utils import cached_path
from allennlp.data.data_loaders import MultiProcessDataLoader
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.fields import LabelField, TextField
from allennlp.data.instance import Instance
from allennlp.data.samplers import BucketBatchSampler
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers.character_tokenizer import CharacterTokenizer
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.training import GradientDescentTrainer
from overrides import overrides

from examples.sentiment.sst_classifier import LstmClassifier

In [4]:
# Model hyperparameters.
EMBEDDING_DIM = 16  # size of each character embedding; also the LSTM input size
HIDDEN_DIM = 16     # size of the LSTM hidden state

# Seed every RNG in play so that a Restart & Run All is repeatable as far as
# the libraries allow (GPU kernels may still introduce some nondeterminism).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [5]:
class TatoebaSentenceReader(DatasetReader):
    """Dataset reader for tab-separated "language id, sentence" files.

    Every sentence is split into characters with ``CharacterTokenizer`` and
    paired with its language id as the classification label.
    """

    def __init__(self, token_indexers: Dict[str, TokenIndexer]=None):
        """Create the reader.

        Args:
            token_indexers: Indexers applied to the ``tokens`` text field.
                When omitted (or empty), a single ``SingleIdTokenIndexer``
                registered under the key ``'tokens'`` is used.
        """
        super().__init__()
        self.tokenizer = CharacterTokenizer()
        self.token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

    @overrides
    def text_to_instance(self, tokens, label=None):
        """Build an ``Instance`` from already-tokenized text.

        Args:
            tokens: Tokens to wrap in a ``TextField`` under the key ``'tokens'``.
            label: Optional label; when truthy it is stored as a ``LabelField``
                under the key ``'label'``. A falsy value (``None`` or an empty
                string) produces an unlabeled instance.

        Returns:
            The assembled ``Instance``.
        """
        fields = {'tokens': TextField(tokens, self.token_indexers)}
        if label:
            fields['label'] = LabelField(label)

        return Instance(fields)

    @overrides
    def _read(self, file_path: str):
        """Yield one ``Instance`` per non-blank line of ``file_path``.

        Args:
            file_path: Local path or URL; it is resolved through ``cached_path``
                before being opened.

        Yields:
            Instances whose tokens are the characters of the sentence and whose
            label is the language id found in the first column.

        Raises:
            ValueError: If a non-blank line contains no tab separator.
        """
        file_path = cached_path(file_path)
        # The corpus is multilingual, so decode explicitly as UTF-8 instead of
        # relying on the platform's default encoding.
        with open(file_path, "r", encoding="utf-8") as text_file:
            for line in text_file:
                line = line.rstrip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue

                # Split on the first tab only, so a tab inside the sentence
                # stays part of the sentence instead of breaking the unpack.
                lang_id, sent = line.split('\t', 1)

                tokens = self.tokenizer.tokenize(sent)

                yield self.text_to_instance(tokens, lang_id)

In [6]:
def classify(text: str, model: LstmClassifier):
    """Print the language label that ``model`` predicts for ``text``.

    The text is split into characters, wrapped in a single-field ``Instance``
    and passed to ``model.forward_on_instance``. The index of the largest
    logit is then looked up in the ``'labels'`` namespace of ``model.vocab``.

    Args:
        text: Sentence to classify.
        model: Classifier whose per-instance output contains a ``'logits'`` entry.

    Returns:
        None. The result is printed as ``text: <text>, label: <label>``.
    """
    char_tokens = CharacterTokenizer().tokenize(text)
    text_field = TextField(char_tokens, {'tokens': SingleIdTokenIndexer()})

    output = model.forward_on_instance(Instance({'tokens': text_field}))
    best_index = np.argmax(output['logits'])
    predicted = model.vocab.get_token_from_index(best_index, 'labels')

    print('text: {}, label: {}'.format(text, predicted))

In [7]:
# Reader with its defaults: character-level tokens and a single-id indexer.
reader = TatoebaSentenceReader()
# Remote train/dev splits; the reader expects one tab-separated
# "language id, sentence" pair per line and fetches URLs via cached_path.
train_path = 'https://s3.amazonaws.com/realworldnlpbook/data/tatoeba/sentences.top10langs.train.tsv'
dev_path = 'https://s3.amazonaws.com/realworldnlpbook/data/tatoeba/sentences.top10langs.dev.tsv'

In [8]:
# Batches of 32 instances, bucketed on the "tokens" field (presumably so that
# sentences of similar length share a batch -- confirm in the sampler docs).
sampler = BucketBatchSampler(batch_size=32, sorting_keys=["tokens"])
# NOTE(review): the same sampler object is handed to both loaders.
train_data_loader = MultiProcessDataLoader(reader, train_path, batch_sampler=sampler)
dev_data_loader = MultiProcessDataLoader(reader, dev_path, batch_sampler=sampler)

loading instances: 0it [00:00, ?it/s]

loading instances: 0it [00:00, ?it/s]

In [9]:
# Build the vocabulary from the training instances only. min_count={'tokens': 3}
# presumably drops characters seen fewer than 3 times -- confirm against the
# AllenNLP Vocabulary documentation.
vocab = Vocabulary.from_instances(train_data_loader.iter_instances(),
                                  min_count={'tokens': 3})
# Hand the vocabulary to both loaders so their instances are indexed with it.
train_data_loader.index_with(vocab)
dev_data_loader.index_with(vocab)

building vocab: 0it [00:00, ?it/s]

In [10]:
# One EMBEDDING_DIM-sized vector per entry of the 'tokens' vocabulary namespace.
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)
# Route the "tokens" indexer key (the one the reader uses) to that embedding.
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

In [11]:
# Wrap a PyTorch LSTM (EMBEDDING_DIM inputs, HIDDEN_DIM hidden units,
# batch-first tensors) in AllenNLP's PytorchSeq2VecWrapper so the classifier
# can use it as its sentence encoder.
encoder = PytorchSeq2VecWrapper(
    torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

In [12]:
# Assemble the classifier from embedder, encoder and vocabulary.
# positive_label='eng' is presumably the class the precision/recall/F1 metrics
# are computed for -- confirm in examples/sentiment/sst_classifier.py.
# NOTE(review): cuda_device=0 hard-codes the first GPU; this likely needs
# changing on a CPU-only machine.
model = LstmClassifier(word_embeddings, encoder, vocab, positive_label='eng', cuda_device=0)

In [13]:
# Adam over all model parameters, with no hyperparameters overridden.
optimizer = optim.Adam(model.parameters())

In [14]:
# Train for up to 20 epochs, validating on the dev loader. patience=10 is
# presumably the early-stopping window (epochs without validation improvement)
# -- confirm in the GradientDescentTrainer docs.
# NOTE(review): cuda_device=0 hard-codes the first GPU and must agree with the
# device the model was placed on.
trainer = GradientDescentTrainer(
    model=model,
    optimizer=optimizer,
    data_loader=train_data_loader,
    validation_data_loader=dev_data_loader,
    patience=10,
    num_epochs=20,
    cuda_device=0)

# Run training; the value returned by train() is displayed as the cell output.
trainer.train()

  0%|          | 0/3125 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/3125 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/3125 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/3125 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/3125 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/3125 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/3125 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/3125 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/3125 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/3125 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/3125 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/3125 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/3125 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/3125 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/3125 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/3125 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/3125 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/3125 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/3125 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/3125 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

{'best_epoch': 19,
 'peak_worker_0_memory_MB': 5851.48828125,
 'peak_gpu_0_memory_MB': 11.189453125,
 'training_duration': '0:08:12.015885',
 'epoch': 19,
 'training_accuracy': 0.97045,
 'training_precision': 0.9761546850204468,
 'training_recall': 0.9743000268936157,
 'training_f1': 0.9752264618873596,
 'training_loss': 0.08975689003930427,
 'training_worker_0_memory_MB': 5851.48828125,
 'training_gpu_0_memory_MB': 11.189453125,
 'validation_accuracy': 0.9663,
 'validation_precision': 0.9701195359230042,
 'validation_recall': 0.9739999771118164,
 'validation_f1': 0.9720559120178223,
 'validation_loss': 0.10716271834582249,
 'best_validation_accuracy': 0.9663,
 'best_validation_precision': 0.9701195359230042,
 'best_validation_recall': 0.9739999771118164,
 'best_validation_f1': 0.9720559120178223,
 'best_validation_loss': 0.10716271834582249}

In [15]:
# English sample (expected label: eng).
classify('Take your raincoat in case it rains.', model)

text: Take your raincoat in case it rains., label: eng


In [16]:
# Spanish sample.
# NOTE(review): the recorded output labels this sentence 'fra', so the trained
# model appears to misclassify it.
classify('Tu me recuerdas a mi padre.', model)

text: Tu me recuerdas a mi padre., label: fra


In [17]:
# German sample (expected label: deu).
classify('Wie organisierst du das Essen am Mittag?', model)

text: Wie organisierst du das Essen am Mittag?, label: deu


In [18]:
# French sample (expected label: fra).
classify("Il est des cas où cette règle ne s'applique pas.", model)

text: Il est des cas où cette règle ne s'applique pas., label: fra


In [19]:
# Portuguese sample (expected label: por).
classify('Estou fazendo um passeio em um parque.', model)

text: Estou fazendo um passeio em um parque., label: por


In [20]:
# Esperanto sample (expected label: epo).
classify('Ve, postmorgaŭ jam estas la limdato.', model)

text: Ve, postmorgaŭ jam estas la limdato., label: epo


In [21]:
# Italian sample (expected label: ita).
classify('Credevo che sarebbe venuto.', model)

text: Credevo che sarebbe venuto., label: ita


In [22]:
# Hungarian sample (expected label: hun).
classify('Nem tudja, hogy én egy macska vagyok.', model)

text: Nem tudja, hogy én egy macska vagyok., label: hun


In [23]:
# Berber sample (expected label: ber).
classify('Nella ur nli qrib acemma deg tenwalt.', model)

text: Nella ur nli qrib acemma deg tenwalt., label: ber


In [24]:
# Turkish sample (expected label: tur).
classify('Kurşun kalemin yok, değil mi?', model)

text: Kurşun kalemin yok, değil mi?, label: tur
