Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #949 from himkt/allennlp-example
Add allennlp example
- Loading branch information
Showing
4 changed files
with
139 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
""" | ||
Optuna example that optimizes a classifier configuration for IMDB movie review dataset. | ||
This script is based on the example of allentune (https://github.com/allenai/allentune). | ||
In this example, we optimize the validation accuracy of sentiment classification using AllenNLP. | ||
Since it is too time-consuming to use the entire dataset, we here use a small subset of it. | ||
We have the following two ways to execute this example: | ||
(1) Execute this code directly. | ||
$ python allennlp_simple.py | ||
(2) Execute through CLI. | ||
$ STUDY_NAME=`optuna create-study --direction maximize --storage sqlite:///example.db` | ||
$ optuna study optimize allennlp_simple.py objective --n-trials=100 --study $STUDY_NAME \ | ||
--storage sqlite:///example.db | ||
""" | ||
|
||
import os | ||
import shutil | ||
|
||
import allennlp | ||
import allennlp.data | ||
import allennlp.models | ||
import allennlp.modules | ||
import torch | ||
|
||
import optuna | ||
|
||
|
||
# Device selection for both the model and the AllenNLP Trainer:
# -1 runs everything on CPU; set DEVICE = 0 to use the first GPU.
DEVICE = -1

# Cap on the number of train/validation instances; the full IMDB set is too
# slow for an example, so only a small subset is used.
MAX_DATA_SIZE = 3000

DIR = os.getcwd()
# Per-trial serialization directories ("trial_<n>") are created under here
# and the whole tree is removed at the end of the study.
MODEL_DIR = os.path.join(DIR, "result")

# Pretrained GloVe embeddings (50-dim) fetched by AllenNLP at model-build time.
GLOVE_FILE_PATH = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.50d.txt.gz"
|
||
|
||
def prepare_data():
    """Load a truncated IMDB sentiment dataset and build its vocabulary.

    Returns:
        A ``(train_dataset, valid_dataset, vocab)`` triple. Both datasets are
        cut down to at most ``MAX_DATA_SIZE`` instances so the example stays
        fast; the vocabulary is built from the training split only.
    """
    # Whitespace tokenization + lowercased single-id indexing keeps the
    # pipeline compatible with the GloVe embeddings used by the model.
    indexer = allennlp.data.token_indexers.SingleIdTokenIndexer(lowercase_tokens=True)
    splitter = allennlp.data.tokenizers.word_splitter.JustSpacesWordSplitter()
    tokenizer = allennlp.data.tokenizers.WordTokenizer(word_splitter=splitter)

    reader = allennlp.data.dataset_readers.TextClassificationJsonReader(
        token_indexers={"tokens": indexer},
        tokenizer=tokenizer,
    )

    train_dataset = reader.read(
        "https://s3-us-west-2.amazonaws.com/allennlp/datasets/imdb/train.jsonl"
    )[:MAX_DATA_SIZE]

    valid_dataset = reader.read(
        "https://s3-us-west-2.amazonaws.com/allennlp/datasets/imdb/dev.jsonl"
    )[:MAX_DATA_SIZE]

    vocab = allennlp.data.Vocabulary.from_instances(train_dataset)
    return train_dataset, valid_dataset, vocab
|
||
|
||
def create_model(vocab, trial):
    """Build a ``BasicClassifier`` whose hyperparameters are drawn from *trial*.

    Args:
        vocab: AllenNLP ``Vocabulary`` used to size the embedding layer.
        trial: Optuna ``Trial`` supplying ``output_dim``, ``max_filter_size``,
            ``num_filters`` and ``dropout``.

    Returns:
        An ``allennlp.models.BasicClassifier`` (GloVe embeddings + CNN encoder).
    """
    embedding = allennlp.modules.Embedding(
        embedding_dim=50,  # must match the 50-dim GloVe file below
        trainable=True,
        pretrained_file=GLOVE_FILE_PATH,
        num_embeddings=vocab.get_vocab_size("tokens"),
    )

    embedder = allennlp.modules.text_field_embedders.BasicTextFieldEmbedder({"tokens": embedding})

    output_dim = trial.suggest_int("output_dim", 16, 128)
    max_filter_size = trial.suggest_int("max_filter_size", 3, 6)
    num_filters = trial.suggest_int("num_filters", 16, 128)
    # Bug fix: the original used range(1, max_filter_size), which excludes the
    # upper bound, so the suggested "max_filter_size" was never actually used
    # as a filter width. Make the range inclusive so the parameter's name and
    # effect agree.
    encoder = allennlp.modules.seq2vec_encoders.CnnEncoder(
        ngram_filter_sizes=range(1, max_filter_size + 1),
        num_filters=num_filters,
        embedding_dim=50,
        output_dim=output_dim,
    )

    dropout = trial.suggest_uniform("dropout", 0, 0.5)
    model = allennlp.models.BasicClassifier(
        text_field_embedder=embedder, seq2vec_encoder=encoder, dropout=dropout, vocab=vocab,
    )

    return model
|
||
|
||
def objective(trial):
    """Optuna objective: train one classifier and report validation accuracy.

    Args:
        trial: Optuna ``Trial`` used to sample the learning rate and, via
            ``create_model``, the model hyperparameters.

    Returns:
        The best validation accuracy reached during training (maximized).
    """
    train_dataset, valid_dataset, vocab = prepare_data()
    model = create_model(vocab, trial)

    # Move the model to GPU only when a CUDA device index was configured.
    if DEVICE > -1:
        model.to(torch.device("cuda:{}".format(DEVICE)))

    learning_rate = trial.suggest_loguniform("lr", 1e-1, 1e0)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    iterator = allennlp.data.iterators.BasicIterator(batch_size=10)
    iterator.index_with(vocab)

    # Each trial serializes checkpoints under its own subdirectory.
    serialization_dir = os.path.join(MODEL_DIR, "trial_{}".format(trial.number))

    trainer = allennlp.training.Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=valid_dataset,
        patience=3,  # early-stop after 3 epochs without improvement
        num_epochs=6,
        cuda_device=DEVICE,
        serialization_dir=serialization_dir,
    )
    metrics = trainer.train()
    return metrics["best_validation_accuracy"]
|
||
|
||
if __name__ == "__main__": | ||
study = optuna.create_study(direction="maximize") | ||
study.optimize(objective, n_trials=80, timeout=600) | ||
|
||
print("Number of finished trials: ", len(study.trials)) | ||
print("Best trial:") | ||
trial = study.best_trial | ||
|
||
print(" Value: ", trial.value) | ||
print(" Params: ") | ||
for key, value in trial.params.items(): | ||
print(" {}: {}".format(key, value)) | ||
|
||
shutil.rmtree(MODEL_DIR) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters