In [1]:
from typing import Iterator, List, Dict
import torch
import torch.optim as optim
import numpy as np
from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField
from allennlp.data.dataset_readers import DatasetReader
from allennlp.common.file_utils import cached_path
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.data.iterators import BucketIterator
from allennlp.training.trainer import Trainer
from allennlp.predictors import SentenceTaggerPredictor
from allennlp.models import crf_tagger

In [2]:
from allennlp.common.params import Params

In [3]:
from allennlp.modules import  ConditionalRandomField

In [4]:
class PosDatasetReader(DatasetReader):
    """
    DatasetReader for PoS tagging data, one sentence per line, like
        The###DET dog###NN ate###V the###DET apple###NN
    """

    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=False)
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    def text_to_instance(self, tokens: List[Token], tags: List[str] = None) -> Instance:
        sentence_field = TextField(tokens, self.token_indexers)
        fields = {"sentence": sentence_field}

        if tags:
            label_field = SequenceLabelField(labels=tags, sequence_field=sentence_field)
            fields["labels"] = label_field

        return Instance(fields)

    def _read(self, file_path: str) -> Iterator[Instance]:
        with open(file_path) as f:
            for line in f:
                pairs = line.strip().split()
                sentence, tags = zip(*(pair.split("###") for pair in pairs))
                yield self.text_to_instance([Token(word) for word in sentence], tags)


In [5]:
class LstmCRFTagger(Model):
    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                          out_features=vocab.get_vocab_size('labels'))
        self.crf = ConditionalRandomField(num_tags=vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()

    def forward(self,
                sentence: Dict[str, torch.Tensor],
                labels: torch.Tensor = None) -> torch.Tensor:
        mask = get_text_field_mask(sentence)
        embeddings = self.word_embeddings(sentence)
        encoder_out = self.encoder(embeddings, mask)
        tag_logits = self.hidden2tag(encoder_out)
        output = {"tag_logits": tag_logits}
        if labels is not None:
            self.accuracy(tag_logits, labels, mask)
            output["loss"] = sequence_cross_entropy_with_logits(tag_logits, labels, mask)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}


In [6]:
class LstmTagger(Model):
    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                          out_features=vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()

    def forward(self,
                sentence: Dict[str, torch.Tensor],
                labels: torch.Tensor = None) -> torch.Tensor:
        mask = get_text_field_mask(sentence)
        embeddings = self.word_embeddings(sentence)
        encoder_out = self.encoder(embeddings, mask)
        tag_logits = self.hidden2tag(encoder_out)
        output = {"tag_logits": tag_logits}
        if labels is not None:
            self.accuracy(tag_logits, labels, mask)
            output["loss"] = sequence_cross_entropy_with_logits(tag_logits, labels, mask)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}


In [7]:
reader = PosDatasetReader()
train_dataset = reader.read(
    '/home/pding/OneDrive/kph/kph/trainan.txt')
validation_dataset = reader.read(
    '/home/pding/OneDrive/kph/kph/testan.txt')
vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
EMBEDDING_DIM = 200
HIDDEN_DIM = 100


11543it [00:08, 1309.54it/s]
2885it [00:02, 1339.03it/s]
100%|██████████| 14428/14428 [00:03<00:00, 4553.81it/s]


In [12]:
#token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
#                            embedding_dim=EMBEDDING_DIM)

token_embedding = Embedding.from_params(
                            vocab=vocab,
                            params=Params({'pretrained_file':'/home/pding/Documents/glove/glove.840B.300d.txt',
                                           'embedding_dim' : EMBEDDING_DIM})
                            )
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

2196017it [00:13, 161347.29it/s]


In [8]:
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

In [None]:
vocab.save_to_files("/tmp/vocabulary")

In [9]:
lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))


In [None]:
model = LstmTagger(word_embeddings, lstm, vocab)
model.cuda()
optimizer = optim.SGD(model.parameters(), lr=0.01)
iterator = BucketIterator(batch_size=300, biggest_batch_first=True, sorting_keys=[("sentence", "num_tokens")])
iterator.index_with(vocab)
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  cuda_device=0,
                  patience=10,
                  num_epochs=10)


In [11]:
model2 = LstmCRFTagger(word_embeddings, lstm, vocab)
model2.cuda()
optimizer = optim.SGD(model2.parameters(), lr=0.01)
iterator = BucketIterator(batch_size=300, biggest_batch_first=True, sorting_keys=[("sentence", "num_tokens")])
iterator.index_with(vocab)
trainer2 = Trainer(model=model2,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  cuda_device=0,
                  patience=10,
                  num_epochs=10)

In [12]:
with open("model2.th", 'rb') as f:
    model2.load_state_dict(torch.load(f))

In [34]:
next(iter(iterator))

TypeError: 'BucketIterator' object is not iterable

In [None]:
trainer2.train()

accuracy: 0.9384, loss: 0.3140 ||: 100%|██████████| 39/39 [00:06<00:00,  5.88it/s]
accuracy: 0.9375, loss: 0.3160 ||: 100%|██████████| 10/10 [00:00<00:00, 10.33it/s]
accuracy: 0.9384, loss: 0.3125 ||: 100%|██████████| 39/39 [00:06<00:00,  5.95it/s]
accuracy: 0.9375, loss: 0.3146 ||: 100%|██████████| 10/10 [00:00<00:00, 10.64it/s]
accuracy: 0.9384, loss: 0.3111 ||: 100%|██████████| 39/39 [00:06<00:00,  5.96it/s]
accuracy: 0.9375, loss: 0.3134 ||: 100%|██████████| 10/10 [00:00<00:00, 10.68it/s]
accuracy: 0.9384, loss: 0.3099 ||: 100%|██████████| 39/39 [00:06<00:00,  5.98it/s]
accuracy: 0.9375, loss: 0.3121 ||: 100%|██████████| 10/10 [00:00<00:00, 10.60it/s]
accuracy: 0.9434, loss: 0.2783 ||:  33%|███▎      | 13/39 [00:03<00:04,  5.39it/s]

In [None]:
with open("model2.th", 'wb') as f:
    torch.save(model2.state_dict(), f)

In [18]:
predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
tag_logits = predictor.predict("The dog ate the apple")['tag_logits']
tag_ids = np.argmax(tag_logits, axis=-1)
print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])

Spacy models 'en_core_web_sm' not found.  Downloading and installing.


[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/home/pding/anaconda3/envs/anlp/lib/python3.7/site-packages/en_core_web_sm -->
/home/pding/anaconda3/envs/anlp/lib/python3.7/site-packages/spacy/data/en_core_web_sm
You can now load the model via spacy.load('en_core_web_sm')
['O', 'O', 'O', 'O', 'O']
