Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

In [40]:
import os
import sys
nlp_path = os.path.abspath('../../')
if nlp_path not in sys.path:
    sys.path.insert(0, nlp_path)

from utils_nlp.dataset.squad import load_pandas_df
from utils_nlp.models.bert.common import Language, Tokenizer
from utils_nlp.models.bert.question_answering import BERTQAExtractor
from utils_nlp.models.bert.qa_utils import postprocess_answers

In [3]:
SQUAD_VERSION = "v1.1" 
CACHE_DIR = "./temp"

LANGUAGE = Language.ENGLISH
DO_LOWER_CASE = True

MAX_SEQUENCE_LENGTH = 128

DOC_TEXT_COL = "doc_text"
QUESTION_TEXT_COL = "question_text"
ANSWER_START_COL = "answer_start"
ANSWER_TEXT_COL = "answer_text"
QA_ID_COL = "qa_id"
IS_IMPOSSIBLE_COL = "is_impossible"

In [4]:
train_df = load_pandas_df(local_cache_path=".", squad_version="v1.1", file_split="train")
dev_df = load_pandas_df(local_cache_path=".", squad_version="v1.1", file_split="dev")

In [5]:
train_df.head()

Unnamed: 0,doc_text,question_text,answer_start,answer_text,qa_id,is_impossible
0,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,92,a golden statue of the Virgin Mary,5733be284776f4190066117e,False
1,"As at most other universities, Notre Dame's st...",In what year did the student paper Common Sens...,908,1987,5733bf84d058e614000b61c1,False
2,The university is the major seat of the Congre...,Which prize did Frederick Buechner create?,675,Buechner Prize for Preaching,5733bed24776f4190066118c,False
3,The College of Engineering was established in ...,The College of Science began to offer civil en...,155,the 1870s,5733a6424776f41900660f52,False
4,All of Notre Dame's undergraduate students are...,Which organization declared the First Year of ...,647,U.S. News & World Report,5733a70c4776f41900660f65,False


In [6]:
dev_df.head()

Unnamed: 0,doc_text,question_text,answer_start,answer_text,qa_id,is_impossible
0,Super Bowl 50 was an American football game to...,What 2015 NFL team one the AFC playoff?,,,56d9895ddc89441400fdb510,False
1,The Panthers finished the regular season with ...,What year did the Carolina Panthers form?,,,56d98a59dc89441400fdb52e,False
2,The Broncos took an early lead in Super Bowl 5...,How many tackles did Von Miller accomlish by h...,,,56d98b33dc89441400fdb53e,False
3,"CBS broadcast Super Bowl 50 in the U.S., and c...",What performer lead the Super Bowl XLVIII half...,,,56d98c53dc89441400fdb548,False
4,"In early 2012, NFL Commissioner Roger Goodell ...",What year did Roger Goodell announce that Supe...,,,56d98d0adc89441400fdb54f,False


In [7]:
tokenizer = Tokenizer(language=LANGUAGE, to_lower=DO_LOWER_CASE,cache_dir=CACHE_DIR)

In [8]:
train_features, qa_examples = tokenizer.tokenize_qa(
    doc_text=train_df[DOC_TEXT_COL], 
    question_text=train_df[QUESTION_TEXT_COL], 
    answer_start=train_df[ANSWER_START_COL], 
    answer_text=train_df[ANSWER_TEXT_COL],
    qa_id=train_df[QA_ID_COL],
    is_impossible=train_df[IS_IMPOSSIBLE_COL],
    is_training=True,
    max_len=MAX_SEQUENCE_LENGTH)

In [15]:
dev_features, dev_examples = tokenizer.tokenize_qa(
    doc_text=dev_df[DOC_TEXT_COL], 
    question_text=dev_df[QUESTION_TEXT_COL], 
    answer_start=dev_df[ANSWER_START_COL], 
    answer_text=dev_df[ANSWER_TEXT_COL],
    qa_id=dev_df[QA_ID_COL],
    is_impossible=dev_df[IS_IMPOSSIBLE_COL],
    is_training=False,
    max_len=MAX_SEQUENCE_LENGTH)

In [9]:
sample_feature = train_features[0]
for f in type(sample_feature)._fields:
    print(f)
    print(getattr(sample_feature, f))
    print()

unique_id
1000000000

example_index
0

tokens
['[CLS]', 'what', 'sits', 'on', 'top', 'of', 'the', 'main', 'building', 'at', 'notre', 'dame', '?', '[SEP]', 'architectural', '##ly', ',', 'the', 'school', 'has', 'a', 'catholic', 'character', '.', 'atop', 'the', 'main', 'building', "'", 's', 'gold', 'dome', 'is', 'a', 'golden', 'statue', 'of', 'the', 'virgin', 'mary', '.', 'immediately', 'in', 'front', 'of', 'the', 'main', 'building', 'and', 'facing', 'it', ',', 'is', 'a', 'copper', 'statue', 'of', 'christ', 'with', 'arms', 'up', '##rai', '##sed', 'with', 'the', 'legend', '"', 've', '##ni', '##te', 'ad', 'me', 'om', '##nes', '"', '.', 'next', 'to', 'the', 'main', 'building', 'is', 'the', 'basilica', 'of', 'the', 'sacred', 'heart', '.', 'immediately', 'behind', 'the', 'basilica', 'is', 'the', 'gr', '##otto', ',', 'a', 'marian', 'place', 'of', 'prayer', 'and', 'reflection', '.', 'it', 'is', 'a', 'replica', 'of', 'the', 'gr', '##otto', 'at', 'lou', '##rdes', ',', 'france', 'where', 'the', 'vi

In [11]:
train_features = train_features[0:63]

In [22]:
qa_extractor = BERTQAExtractor(language=LANGUAGE, cache_dir=CACHE_DIR)
qa_extractor.fit(train_features,
                 num_gpus=1,
                 num_epochs=1,
                 batch_size=32,
                 lr=2e-5,
                 warmup_proportion=0.1,
                 max_grad_norm=1.0,
                 model_output_dir=CACHE_DIR)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]
Iteration:   0%|          | 0/2 [00:00<?, ?it/s][A
Iteration:  50%|█████     | 1/2 [00:01<00:01,  1.54s/it][A
Epoch: 100%|██████████| 1/1 [00:02<00:00,  2.99s/it]/it][A


Average training loss: 4.6988489627838135


In [24]:
qa_results = qa_extractor.predict(dev_features)

Evaluating:   0%|          | 0/127 [00:00<?, ?it/s]



Evaluating: 100%|██████████| 127/127 [01:02<00:00,  2.14it/s]


In [39]:
final_answers = postprocess_answers(dev_examples, 
                                    dev_features, 
                                    qa_results, 
                                    do_lower_case=DO_LOWER_CASE)

In [42]:
for k, v in final_answers.items():
    print(k)
    print(v)

56d9895ddc89441400fdb510
Levi's Stadium in the San Francisco Bay Area at Santa
56d98a59dc89441400fdb52e
XLIX by defeating them 20–18 in the AFC Championship Game. They joined the Patriots, Dallas Cowboys, and Pittsburgh Steelers
56d98b33dc89441400fdb53e
. Newton was limited by Denver's defense, which
56d98c53dc89441400fdb548
XLVII and Super Bowl XLVIII halftime shows
56d98d0adc89441400fdb54f
Goodell stated that the league planned to make the 50th Super Bowl "spectacular" and that it would be "an important game for us
56d98db6dc89441400fdb554
Benz Superdome, Miami's Sun
56d98f0ddc89441400fdb55c
XLIV in 2010. The San Francisco Bay Area last hosted in 1985 (Super Bowl XIX), held at Stanford
56d98fbfdc89441400fdb565
s Stadium. The $
56d99179dc89441400fdb570
XLVI
56d9943fdc89441400fdb57a
DeAngelo Williams and losing top wide receiver Kelvin Benjamin
56d997cddc89441400fdb58a
Corey Brown (31 receptions for 447 yards). The Panthers
56d9992fdc89441400fdb5a0
in tackles (118) forced two fumbles, 

ghis". It
572758c3dd62a815002e9b7c
成吉思汗; pinyin: Chéngjísī Hán, Turkic: Cengiz Han,
5726d8bd708984140094d35f
(or "poison").[n
5726d9935951b619008f7ff1
acists may also be small-business proprietors, owning the pharmacy in which they practice. Since
5726da89dd62a815002e92b6
acist (if employed in a hospital pharmacy)
5726db5add62a815002e92d8
oc
5726dcbddd62a815002e9324
acist assistants—were assigned status superior to all others in
5726ddf6f1498d1400e8ee07
869), was, however
5726deed5951b619008f80c9
least 1422. The oldest is claimed to have been set up in 122
5726e08e5951b619008f8113
ensary is subject to pharmacy legislation; with requirements for storage conditions, compulsory texts, equipment, etc., specified in legislation. Where it
5726e179dd62a815002e93b1
acists and more
5726e313f1498d1400e8eeb6
stock a larger range of medications, including more specialized medications, than would be feasible in the community setting. Most hospital medications are unit-dose,
5726e3c4dd62a815002e9408