In [1]:
from typing import Dict

import numpy as np
import torch
import torch.optim as optim
from allennlp.data.iterators import BucketIterator
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask
from allennlp.training.metrics import CategoricalAccuracy, F1Measure
from allennlp.training.trainer import Trainer

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
# Size of each token embedding vector; also passed as the LSTM encoder's input size.
EMBEDDING_DIM = 128
# Size of the LSTM encoder's hidden state.
HIDDEN_DIM = 128

In [3]:
from typing import Dict
import logging
import csv

from overrides import overrides
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import LabelField, TextField, Field
from allennlp.data.instance import Instance
from allennlp.data.tokenizers import Tokenizer, WordTokenizer
from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer


logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
# @DatasetReader.register("data-reader")
class MultilingualDatasetReader(DatasetReader):
    """
    Reads a comma-separated file whose rows are ``label,article`` and yields one
    classification ``Instance`` per well-formed row.

    Parameters
    ----------
    lazy : ``bool``, optional (default = ``False``)
        Forwarded unchanged to ``DatasetReader.__init__``.
    tokenizer : ``Tokenizer``, optional
        Tokenizer applied to the article text. Defaults to a ``WordTokenizer``
        built on ``JustSpacesWordSplitter``.
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        Indexers attached to the article ``TextField``. Defaults to
        ``{"tokens": SingleIdTokenIndexer()}``.
    """
    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = tokenizer or WordTokenizer(JustSpacesWordSplitter())
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    @overrides
    def _read(self, file_path):
        """
        Yield one ``Instance`` per two-column row of the CSV file at ``file_path``.

        Column 0 is used as the label and column 1 as the article text. Blank
        lines are ignored; rows with any other column count are skipped and
        reported once, after the file has been consumed.
        """
        logger.info("Reading instances from lines in file at: %s", file_path)
        skipped_rows = 0
        # The encoding is pinned so multilingual text does not depend on the
        # machine's locale; newline="" is what the csv module asks for so that
        # newlines embedded in quoted fields are parsed correctly.
        with open(file_path, "r", encoding="utf-8", newline="") as data_file:
            for row in csv.reader(data_file, delimiter=','):
                if not row:
                    # csv.reader returns an empty list for a blank line.
                    continue
                if len(row) != 2:
                    skipped_rows += 1
                    continue
                label, article = row
                yield self.text_to_instance(article=article, label=label)
        if skipped_rows:
            logger.warning("Skipped %d rows without exactly two columns in file at: %s",
                           skipped_rows, file_path)

    @overrides
    def text_to_instance(self,  # type: ignore
                         article: str,
                         label: str = None) -> Instance:
        """
        Convert one raw article (and optionally its label) into an ``Instance``.

        Parameters
        ----------
        article : ``str``
            Raw text; tokenized with the reader's tokenizer and stored under
            the ``"tokens"`` field.
        label : ``str``, optional
            Gold category. When given it is stored under the ``"label"`` field;
            when ``None`` the instance carries only ``"tokens"``.

        Returns
        -------
        An ``Instance`` holding the fields described above.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        tokenized_article = self._tokenizer.tokenize(article)
        fields["tokens"] = TextField(tokenized_article, self._token_indexers)
        if label is not None:
            fields['label'] = LabelField(label)
        return Instance(fields)



In [4]:
# from realworldnlp.DatasetReader import MultilingualDatasetReader
# Directory holding train.csv and dev.csv. Kept in one place so the notebook
# can be pointed at another copy of the dataset by editing a single line.
DATA_DIR = '/home/nesma/SemesterII/Neural Networks/Project/multilingual-text-categorization-dataset/realworldnlp'

reader = MultilingualDatasetReader()
train_dataset = reader.read(DATA_DIR + '/train.csv')
dev_dataset = reader.read(DATA_DIR + '/dev.csv')

3601it [00:00, 7135.48it/s]
902it [00:00, 4572.27it/s]


In [5]:
# The vocabulary is built from the training and dev instances together.
# min_count applies a frequency threshold of 3 to the 'tokens' namespace
# (presumably rarer tokens are left out of the vocabulary; see the AllenNLP docs).
all_instances = train_dataset + dev_dataset
vocab = Vocabulary.from_instances(all_instances, min_count={'tokens': 3})

100%|██████████| 4503/4503 [00:00<00:00, 16509.34it/s]


In [6]:
# One EMBEDDING_DIM-sized vector per entry in the 'tokens' vocabulary namespace.
token_vocab_size = vocab.get_vocab_size('tokens')
token_embedding = Embedding(
    num_embeddings=token_vocab_size,
    embedding_dim=EMBEDDING_DIM,
)
token_embedding

Embedding()

In [7]:
# The "tokens" key matches the reader's default token-indexer name, so ids produced
# by that indexer are looked up in `token_embedding`.
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

In [8]:
# An LSTM that consumes EMBEDDING_DIM-sized inputs (batch dimension first) and
# keeps a HIDDEN_DIM-sized state. The AllenNLP wrapper lets it be passed wherever
# a Seq2VecEncoder is expected.
lstm = torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True)
encoder = PytorchSeq2VecWrapper(lstm)

In [9]:
class LstmClassifier(Model):
    """
    Text classifier: embed the tokens, encode the sequence into a single vector,
    then project that vector onto the label space with a linear layer.

    Parameters
    ----------
    word_embeddings : ``TextFieldEmbedder``
        Converts the indexed ``tokens`` field into embedding vectors.
    encoder : ``Seq2VecEncoder``
        Reduces the embedded sequence to one vector per instance.
    vocab : ``Vocabulary``
        Its ``'labels'`` namespace size determines the number of output classes.
    """
    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        # We need the embeddings to convert word IDs to their vector representations
        self.word_embeddings = word_embeddings

        # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
        # (usually a sequence of embedded word vectors), processes it, and returns it as a single
        # vector. Oftentimes, this is an RNN-based architecture (e.g., LSTM or GRU), but
        # AllenNLP also supports CNNs and other simple architectures (for example,
        # just averaging over the input vectors).
        self.encoder = encoder

        # After converting a sequence of vectors to a single vector, we feed it into
        # a fully-connected linear layer to reduce the dimension to the total number of labels.
        self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                          out_features=vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()

        # We use the cross-entropy loss because this is a classification task.
        # Note that PyTorch's CrossEntropyLoss combines softmax and log likelihood loss,
        # which makes it unnecessary to add a separate softmax layer.
        self.loss_function = torch.nn.CrossEntropyLoss()

    # Instances are fed to forward after batching.
    # Fields are passed through arguments with the same name.
    def forward(self,
                tokens: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        """
        Compute class logits for a batch and, when gold labels are supplied, the loss.

        Parameters
        ----------
        tokens : ``Dict[str, torch.Tensor]``
            The batched, indexed ``tokens`` text field.
        label : ``torch.Tensor``, optional
            Gold label ids. When given, the accuracy metric is updated and a
            ``"loss"`` entry is added to the output.

        Returns
        -------
        A dict with ``"logits"`` and, if ``label`` was provided, ``"loss"``.
        """
        # In deep NLP, when sequences of tensors in different lengths are batched together,
        # shorter sequences get padded with zeros to make them of equal length.
        # Masking is the process to ignore extra zeros added by padding
        mask = get_text_field_mask(tokens)

        # Forward pass
        embeddings = self.word_embeddings(tokens)
        encoder_out = self.encoder(embeddings, mask)
        logits = self.hidden2tag(encoder_out)

        # In AllenNLP, the output of forward() is a dictionary.
        # Your output dictionary must contain a "loss" key for your model to be trained.
        output = {"logits": logits}
        if label is not None:
            # Update the running accuracy exactly once per batch. Calling the metric
            # only accumulates counts and returns nothing, so its result is not put
            # in the output dict; the value is read through get_metrics() instead.
            self.accuracy(logits, label)
            output["loss"] = self.loss_function(logits, label)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return the accumulated accuracy; ``reset`` is forwarded to the metric."""
        metrics = {'accuracy': self.accuracy.get_metric(reset)}
        return metrics



In [10]:
# Assemble the classifier from the embedder, the LSTM encoder and the vocabulary built above.
model = LstmClassifier(word_embeddings, encoder, vocab)

In [11]:
# Display the model's module tree as the cell output.
model

LstmClassifier(
  (word_embeddings): BasicTextFieldEmbedder(
    (token_embedder_tokens): Embedding()
  )
  (encoder): PytorchSeq2VecWrapper(
    (_module): LSTM(128, 128, batch_first=True)
  )
  (hidden2tag): Linear(in_features=128, out_features=45, bias=True)
  (loss_function): CrossEntropyLoss()
)

In [12]:
# Adam over every model parameter, with a small weight-decay penalty.
optimizer = optim.Adam(
    model.parameters(),
    lr=1e-3,
    weight_decay=1e-5,
)
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 1e-05
)

In [13]:
# Batches of 32 instances, sorted on the "num_tokens" padding key of the
# "tokens" field (presumably so each batch holds similar-length sequences).
iterator = BucketIterator(
    batch_size=32,
    sorting_keys=[("tokens", "num_tokens")],
)
iterator

<allennlp.data.iterators.bucket_iterator.BucketIterator at 0x7f543442fb70>

In [14]:
# Hand the vocabulary to the iterator (presumably so it can index instances when batching).
iterator.index_with(vocab)

In [25]:
from allennlp.nn.regularizers.regularizers import L1Regularizer

# Train on the training split and validate on the dev split, for at most 50 epochs.
# patience=10 is presumably the early-stopping window: the recorded run below
# stopped at epoch 13 rather than running all 50 epochs.
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_dataset,
    validation_dataset=dev_dataset,
    num_epochs=50,
    patience=10,
)

In [16]:
# Run training; the returned metrics dict is shown as the cell output.
trainer.train()

accuracy: 0.0436, loss: 3.7939 ||: 100%|██████████| 113/113 [00:14<00:00,  7.91it/s]
accuracy: 0.0355, loss: 3.7758 ||: 100%|██████████| 29/29 [00:00<00:00, 40.03it/s]
accuracy: 0.0780, loss: 3.5955 ||: 100%|██████████| 113/113 [00:12<00:00,  6.71it/s]
accuracy: 0.0787, loss: 3.5371 ||: 100%|██████████| 29/29 [00:01<00:00, 28.90it/s]
accuracy: 0.1155, loss: 3.3168 ||: 100%|██████████| 113/113 [00:12<00:00,  9.12it/s]
accuracy: 0.0998, loss: 3.4359 ||: 100%|██████████| 29/29 [00:00<00:00, 43.50it/s]
accuracy: 0.1727, loss: 2.9950 ||: 100%|██████████| 113/113 [00:13<00:00,  8.17it/s]
accuracy: 0.1452, loss: 3.2763 ||: 100%|██████████| 29/29 [00:00<00:00, 31.87it/s]
accuracy: 0.2560, loss: 2.5645 ||: 100%|██████████| 113/113 [00:13<00:00,  8.89it/s]
accuracy: 0.1818, loss: 3.2382 ||: 100%|██████████| 29/29 [00:00<00:00, 41.58it/s]
accuracy: 0.3852, loss: 2.1183 ||: 100%|██████████| 113/113 [00:11<00:00,  9.91it/s]
accuracy: 0.2040, loss: 3.2386 ||: 100%|██████████| 29/29 [00:00<00:00, 42.

{'best_epoch': 4,
 'peak_cpu_memory_MB': 388.492,
 'training_duration': '00:03:07',
 'training_start_epoch': 0,
 'training_epochs': 13,
 'epoch': 13,
 'training_accuracy': 0.8778117189669536,
 'training_loss': 0.3091622807283317,
 'training_cpu_memory_MB': 388.492,
 'validation_accuracy': 0.20509977827050999,
 'validation_loss': 4.472880568997613,
 'best_validation_accuracy': 0.18181818181818182,
 'best_validation_loss': 3.2382413683266473}

In [20]:
from overrides import overrides

from allennlp.common.util import JsonDict
from allennlp.data import Instance
from allennlp.predictors.predictor import Predictor


# @Predictor.register('text_classifier')
class TextClassifierPredictor(Predictor):
    """
    Predictor for models that take a single sentence and produce a single class
    for it, e.g. the :class:`~allennlp.models.basic_classifier.BasicClassifier`
    model. The dataset reader given to the predictor must provide
    ``text_to_instance(sentence)``.
    """
    def predict(self, sentence: str) -> JsonDict:
        """Wrap ``sentence`` as ``{"sentence": ...}`` and delegate to ``predict_json``."""
        payload = {"sentence": sentence}
        return self.predict_json(payload)

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Build an ``Instance`` from JSON shaped like ``{"sentence": "..."}`` by
        passing the sentence text to the dataset reader's ``text_to_instance``.
        """
        text = json_dict["sentence"]
        instance = self._dataset_reader.text_to_instance(text)
        return instance

In [21]:
# Classify one raw sentence with the trained model and print the predicted label.
tokens = 'available in itunes about bad wolves a los angeles californiabased heavy metal outfit with an impressive hard rock'
predictor = TextClassifierPredictor(model, dataset_reader=reader)
prediction = predictor.predict(tokens)
logits = prediction['logits']
# The highest-scoring logit's index is mapped back to its label string.
label_id = np.argmax(logits)
predicted_label = model.vocab.get_token_from_index(label_id, 'labels')
print(predicted_label)

hobbies_and_interests


In [22]:
# Classify a second raw sentence and print the predicted label.
tokens = 'jeans day julia and emma roberts sporting denim both emma and julia are rocking in their denim'
predictor = TextClassifierPredictor(model, dataset_reader=reader)
prediction = predictor.predict(tokens)
logits = prediction['logits']
# The highest-scoring logit's index is mapped back to its label string.
label_id = np.argmax(logits)
predicted_label = model.vocab.get_token_from_index(label_id, 'labels')
print(predicted_label)


clothing
