In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [40]:
!echo "This is Tokyo !" > tmp.txt
!echo "Japanese food is delicious ." >> tmp.txt
!cat tmp.txt

This is Tokyo !
Japanese food is delicious .


# Read data

In [44]:
from allennlp.data.dataset_readers.simple_language_modeling import SimpleLanguageModelingDatasetReader
from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter
from allennlp.data.tokenizers.word_tokenizer import WordTokenizer
from allennlp.data.token_indexers.single_id_token_indexer import SingleIdTokenIndexer
from allennlp.data.token_indexers.elmo_indexer import ELMoTokenCharactersIndexer

indexers = {"tokens": SingleIdTokenIndexer(),
            "token_characters": ELMoTokenCharactersIndexer()}
reader = SimpleLanguageModelingDatasetReader(tokenizer=WordTokenizer(JustSpacesWordSplitter()), 
                                             token_indexers=indexers, 
                                             max_sequence_length=400, 
                                             start_tokens=["<S>"], 
                                             end_tokens=["</S>"])

In [45]:
dataset = reader.read("tmp.txt")

In [49]:
for i in dataset:
    print(i)
    print(inum_tokens)

Instance with fields:
 	 source: TextField of length 6 with text: 
 		[<S>, This, is, Tokyo, !, </S>]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer', 'token_characters': 'ELMoTokenCharactersIndexer'} 



AttributeError: 'Instance' object has no attribute 'num_tokens'

# Setup data iterator

In [64]:
from allennlp.data.iterators.bucket_iterator import BucketIterator
from allennlp.common import Params

In [None]:
bucket_iterator = BucketIterator(sorting_keys=[["source", "num_tokens"]], 
                                 batch_size=512, 
                                 maximum_samples_per_batch=["num_tokens", 2000])


In [55]:
from allennlp.data.vocabulary import Vocabulary

# Prepare vocab

In [76]:
!head vocabs/*

==> vocabs/non_padded_namespaces.txt <==
*labels
*tags

==> vocabs/tokens.txt <==
</S>
<S>
@@UNKNOWN@@
the
,
.
to
of
and
a


In [74]:
# !cp non_padded_namespaces.txt vocabs/
# !cp tokens.txt vocabs/

In [75]:
vocab = Vocabulary.from_params(Params(params={"directory_path": "./vocabs"}))

# Prepare model

In [94]:
from allennlp.models.language_model import LanguageModel
from allennlp.modules.seq2seq_encoders.bidirectional_language_model_transformer import BidirectionalLanguageModelTransformer
from allennlp.modules.text_field_embedders.basic_text_field_embedder import BasicTextFieldEmbedder
from allennlp.modules.token_embedders.token_characters_encoder import TokenCharactersEncoder
from allennlp.modules.token_embedders.embedding import Embedding
from allennlp.modules.seq2vec_encoders.cnn_highway_encoder import CnnHighwayEncoder

In [95]:
embedding = Embedding(num_embeddings=2612, embedding_dim=16)
filters = [
    [1, 32],
    [2, 32],
    [3, 64],
    [4, 128],
    [5, 256],
    [6, 512],
    [7, 1024]]

encoder = CnnHighwayEncoder(activation="relu", 
                            embedding_dim=16, 
                            filters=filters, 
                            num_highway=2, 
                            projection_dim=512, 
                            projection_location="after_highway", 
                            do_layer_norm=True)

character_embedder = TokenCharactersEncoder(embedding=embedding, encoder=encoder)
token_embedders = {"token_characters": character_embedder}
text_field_embedder = BasicTextFieldEmbedder(allow_unmatched_keys=True, token_embedders=token_embedders)

In [98]:
contextualizer = BidirectionalLanguageModelTransformer(input_dim=512, 
                                                       hidden_dim=2048, 
                                                       num_layers=1, 
                                                       dropout=0.1, 
                                                       input_dropout=0.1)

model = LanguageModel(vocab=vocab, 
                      text_field_embedder=text_field_embedder, 
                      contextualizer=contextualizer, 
                      bidirectional=True, 
                      num_samples=8192, 
                      sparse_embeddings=True, 
                      dropout=0.2)

In [99]:
print(model)

LanguageModel(
  (_text_field_embedder): BasicTextFieldEmbedder(
    (token_embedder_token_characters): TokenCharactersEncoder(
      (_embedding): TimeDistributed(
        (_module): Embedding()
      )
      (_encoder): TimeDistributed(
        (_module): CnnHighwayEncoder(
          (char_conv_0): Conv1d(16, 32, kernel_size=(1,), stride=(1,))
          (char_conv_1): Conv1d(16, 32, kernel_size=(2,), stride=(1,))
          (char_conv_2): Conv1d(16, 64, kernel_size=(3,), stride=(1,))
          (char_conv_3): Conv1d(16, 128, kernel_size=(4,), stride=(1,))
          (char_conv_4): Conv1d(16, 256, kernel_size=(5,), stride=(1,))
          (char_conv_5): Conv1d(16, 512, kernel_size=(6,), stride=(1,))
          (char_conv_6): Conv1d(16, 1024, kernel_size=(7,), stride=(1,))
          (_highways): Highway(
            (_layers): ModuleList(
              (0): Linear(in_features=2048, out_features=4096, bias=True)
              (1): Linear(in_features=2048, out_features=4096, bias=True)
      