In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
!echo "This is Tokyo !" > tmp.txt
!echo "Japanese food is delicious ." >> tmp.txt
!cat tmp.txt

This is Tokyo !
Japanese food is delicious .


# Read data

In [3]:
from allennlp.data.dataset_readers.simple_language_modeling import SimpleLanguageModelingDatasetReader
from allennlp.data.dataset_readers.multiprocess_dataset_reader import MultiprocessDatasetReader

from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter
from allennlp.data.tokenizers.word_tokenizer import WordTokenizer
from allennlp.data.token_indexers.single_id_token_indexer import SingleIdTokenIndexer
from allennlp.data.token_indexers.elmo_indexer import ELMoTokenCharactersIndexer

# Every token is indexed twice: under "tokens" by a single id and under
# "token_characters" by ELMo character ids.
# Characters-only alternative:
#   indexers = {"token_characters": ELMoTokenCharactersIndexer()}
indexers = {
    "tokens": SingleIdTokenIndexer(),
    "token_characters": ELMoTokenCharactersIndexer(),
}

# Base reader: each sequence is wrapped in <S> ... </S> markers.
whitespace_tokenizer = WordTokenizer(JustSpacesWordSplitter())
base_reader = SimpleLanguageModelingDatasetReader(
    tokenizer=whitespace_tokenizer,
    token_indexers=indexers,
    max_sequence_length=400,
    start_tokens=["<S>"],
    end_tokens=["</S>"],
)

# Number of worker processes; reused later for the multiprocess iterator.
NUM_THREADS = 2
reader = MultiprocessDatasetReader(
    base_reader=base_reader,
    num_workers=NUM_THREADS,
    output_queue_size=1000,
)

In [4]:
# Reading is deferred: the worker processes only start once `dataset` is iterated.
dataset = reader.read(file_path="tmp.txt")

In [5]:
# Sanity check: dump every instance the reader produces for the sample corpus.
for instance in dataset:
    print(instance)

[INFO/MainProcess] starting worker 0
[INFO/MainProcess] starting worker 1
[INFO/Process-2] child process calling self.run()
[INFO/Process-3] child process calling self.run()
[INFO/Process-2] reading instances from tmp.txt
[INFO/MainProcess] worker 1 finished (1/2)
[INFO/Process-3] process shutting down
[INFO/Process-2] process shutting down
[INFO/MainProcess] worker 0 finished (2/2)
[INFO/Process-3] process exiting with exitcode 0
[INFO/Process-2] process exiting with exitcode 0


Instance with fields:
 	 source: TextField of length 6 with text: 
 		[<S>, This, is, Tokyo, !, </S>]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer', 'token_characters': 'ELMoTokenCharactersIndexer'} 

Instance with fields:
 	 source: TextField of length 7 with text: 
 		[<S>, Japanese, food, is, delicious, ., </S>]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer', 'token_characters': 'ELMoTokenCharactersIndexer'} 



# Prepare vocab

In [6]:
!head vocabs/*

==> vocabs/non_padded_namespaces.txt <==
*labels
*tags

==> vocabs/tokens.txt <==
</S>
<S>
@@UNKNOWN@@
the
,
.
to
of
and
a


In [59]:
# Count the lines in the token vocabulary file (presumably one token per line — confirm).
!wc -l vocabs/tokens.txt

  793471 vocabs/tokens.txt


In [9]:
# One-time setup, left disabled: copy the vocabulary files into ./vocabs.
# !cp non_padded_namespaces.txt vocabs/
# !cp tokens.txt vocabs/

In [10]:
from allennlp.data.vocabulary import Vocabulary
from allennlp.common import Params

# Load the pre-built vocabulary from the files under ./vocabs.
vocab_params = Params(params={"directory_path": "./vocabs"})
vocab = Vocabulary.from_params(vocab_params)

In [11]:
# Show the loaded vocabulary's summary (namespaces and sizes) as the cell output.
vocab

Vocabulary with namespaces:  tokens, Size: 793472 || Non Padded Namespaces: {'*labels', '*tags'}

# Prepare model

In [12]:
from allennlp.models.language_model import LanguageModel
from allennlp.modules.seq2seq_encoders.bidirectional_language_model_transformer import BidirectionalLanguageModelTransformer
from allennlp.modules.text_field_embedders.basic_text_field_embedder import BasicTextFieldEmbedder
from allennlp.modules.token_embedders.token_characters_encoder import TokenCharactersEncoder
from allennlp.modules.token_embedders.embedding import Embedding
from allennlp.modules.seq2vec_encoders.cnn_highway_encoder import CnnHighwayEncoder

In [54]:
# Character embedding width, shared by the embedding table and the CNN input.
char_embedding_dim = 16

# 262 rows: presumably the size of the ELMo character-id range — TODO confirm.
embedding = Embedding(num_embeddings=262, embedding_dim=char_embedding_dim)

# One [kernel_width, num_filters] pair per character convolution.
filters = [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256]]

# Character CNN -> one highway layer -> projection to 256 dimensions.
encoder = CnnHighwayEncoder(
    embedding_dim=char_embedding_dim,
    filters=filters,
    num_highway=1,
    projection_dim=256,
    activation="relu",
    projection_location="after_highway",
    do_layer_norm=True,
)

character_embedder = TokenCharactersEncoder(embedding=embedding, encoder=encoder)

# Only the character representation is embedded; allow_unmatched_keys=True is
# passed because the reader also emits a "tokens" index with no embedder here.
token_embedders = {"token_characters": character_embedder}
text_field_embedder = BasicTextFieldEmbedder(
    token_embedders=token_embedders,
    allow_unmatched_keys=True,
)

In [55]:
import numpy as np

# Column sums over the filter specs: [sum of kernel widths, total number of filters].
np.sum(filters, axis=0)

array([ 15, 512])

In [60]:
# Transformer contextualizer; input_dim=256 matches the character encoder's
# projection_dim.
contextualizer = BidirectionalLanguageModelTransformer(
    input_dim=256,
    hidden_dim=512,
    num_layers=2,
    dropout=0.1,
    input_dropout=0.1,
)

# Bidirectional language model configured with num_samples=8192 and sparse embeddings.
model = LanguageModel(
    vocab=vocab,
    text_field_embedder=text_field_embedder,
    contextualizer=contextualizer,
    dropout=0.2,
    num_samples=8192,
    sparse_embeddings=True,
    bidirectional=True,
)

In [61]:
def get_params(model, requires_grad=None):
    """Return the model's parameters as a list, optionally filtered.

    Args:
        model: Object exposing a ``parameters()`` iterable.
        requires_grad: ``None`` keeps every parameter; otherwise only the
            parameters whose ``requires_grad`` equals this value are kept.

    Returns:
        A list of the selected parameters, in ``model.parameters()`` order.
    """
    if requires_grad is None:
        return list(model.parameters())
    return [param for param in model.parameters()
            if param.requires_grad == requires_grad]


def count_parameters(model, requires_grad=None):
    """Return the total number of scalar elements in the selected parameters.

    Selection follows ``get_params``: ``requires_grad=None`` counts everything,
    ``True``/``False`` count only parameters with that ``requires_grad`` flag.
    """
    total = 0
    for param in get_params(model, requires_grad):
        total += param.numel()
    return total

# Report the trainable and the total parameter counts, then the module tree.
trainable_count = count_parameters(model, requires_grad=True)
total_count = count_parameters(model, requires_grad=None)
print(f"{trainable_count:,}")
print(f"{total_count:,}")
print(model)

206,726,880
206,726,880
LanguageModel(
  (_text_field_embedder): BasicTextFieldEmbedder(
    (token_embedder_token_characters): TokenCharactersEncoder(
      (_embedding): TimeDistributed(
        (_module): Embedding()
      )
      (_encoder): TimeDistributed(
        (_module): CnnHighwayEncoder(
          (char_conv_0): Conv1d(16, 32, kernel_size=(1,), stride=(1,))
          (char_conv_1): Conv1d(16, 32, kernel_size=(2,), stride=(1,))
          (char_conv_2): Conv1d(16, 64, kernel_size=(3,), stride=(1,))
          (char_conv_3): Conv1d(16, 128, kernel_size=(4,), stride=(1,))
          (char_conv_4): Conv1d(16, 256, kernel_size=(5,), stride=(1,))
          (_highways): Highway(
            (_layers): ModuleList(
              (0): Linear(in_features=512, out_features=1024, bias=True)
            )
          )
          (_projection): Linear(in_features=512, out_features=256, bias=True)
          (_layer_norm): LayerNorm()
        )
      )
    )
  )
  (_contextualizer): Bidirectiona

# Training

## Iterator

In [28]:
from allennlp.data.iterators.bucket_iterator import BucketIterator
from allennlp.data.iterators.multiprocess_iterator import MultiprocessIterator

# Batches are bucketed by the number of tokens in the "source" field.
base_iterator = BucketIterator(
    sorting_keys=[["source", "num_tokens"]],
    batch_size=512,
    maximum_samples_per_batch=["num_tokens", 2000],
)

# Wrap the bucket iterator so batches are produced by NUM_THREADS worker processes.
bucket_iterator = MultiprocessIterator(
    base_iterator=base_iterator,
    num_workers=NUM_THREADS,
    output_queue_size=500,
)
bucket_iterator.index_with(vocab)

In [29]:
import torch 
from allennlp.training.trainer import Trainer
from allennlp.training.optimizers import DenseSparseAdam
from allennlp.training.learning_rate_schedulers.noam import NoamLR

# Select the device and move the model FIRST. PyTorch's optimizer docs say a
# model should be moved with .cuda() before optimizers are constructed for it,
# so the optimizer must not be built ahead of this step.
if torch.cuda.is_available():
    print("Enable GPU")
    cuda_device = 0
    model = model.cuda(cuda_device)
else:
    print("Disable GPU")
    cuda_device = -1

# The optimizer and the Noam learning-rate schedule are built only after the
# model sits on its final device.
optimizer = DenseSparseAdam(model.parameters())
scheduler = NoamLR(optimizer, model_size=512, warmup_steps=6000)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=bucket_iterator,
                  train_dataset=dataset,
                  learning_rate_scheduler=scheduler,
                  num_epochs=10,
                  cuda_device=cuda_device)
trainer.train()

Disable GPU


  0%|          | 0/1 [00:00<?, ?it/s][INFO/SyncManager-44] child process calling self.run()
[INFO/SyncManager-44] created temp directory /var/folders/_r/l1tzvmmj3ys67045yy0fy3r9b2c7bz/T/pymp-t_xp6475
[INFO/SyncManager-44] manager serving at '/var/folders/_r/l1tzvmmj3ys67045yy0fy3r9b2c7bz/T/pymp-t_xp6475/listener-iy1wtc10'
[INFO/Process-45] child process calling self.run()
[INFO/Process-45] starting worker 0
[INFO/Process-46] child process calling self.run()
[INFO/Process-47] child process calling self.run()
[INFO/Process-45] starting worker 1
[INFO/Process-45:1] child process calling self.run()
[INFO/Process-45:1] reading instances from tmp.txt
[INFO/Process-45:2] child process calling self.run()
[INFO/Process-45:2] process shutting down
[INFO/Process-45:1] process shutting down
[INFO/Process-45] worker 1 finished (1/2)
[INFO/Process-45] worker 0 finished (2/2)
[INFO/Process-45:1] process exiting with exitcode 0
[INFO/Process-45:2] process exiting with exitcode 0
[INFO/Process-45] proc

  File "/Users/phiradet.bangcharoe/miniconda3/lib/python3.6/site-packages/allennlp/data/iterators/multiprocess_iterator.py", line 47, in _queuer
    for instance in instances:
  File "/Users/phiradet.bangcharoe/miniconda3/lib/python3.6/site-packages/allennlp/data/dataset_readers/multiprocess_dataset_reader.py", line 133, in _instances
    input_queue = manager.Queue(num_shards * self.epochs_per_read + self.num_workers)
  File "/Users/phiradet.bangcharoe/miniconda3/lib/python3.6/multiprocessing/managers.py", line 662, in temp
    token, exp = self._create(typeid, *args, **kwds)
  File "/Users/phiradet.bangcharoe/miniconda3/lib/python3.6/multiprocessing/managers.py", line 554, in _create
    conn = self._Client(self._address, authkey=self._authkey)
  File "/Users/phiradet.bangcharoe/miniconda3/lib/python3.6/multiprocessing/connection.py", line 487, in Client
    c = SocketClient(address)
  File "/Users/phiradet.bangcharoe/miniconda3/lib/python3.6/multiprocessing/connection.py", line 614,

KeyboardInterrupt: 

# Check params

'431,434,432'