In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 14.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 11.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 42.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.8.1 tokenizers-0.12.1 transformers-4.21.1


In [2]:
from transformers import BertForQuestionAnswering,BertTokenizer

## Now, we download and load the model. 
- We use the **'bert-large-uncased-whole-word-masking-finetuned-squad'** model, which is fine-tuned on the Stanford Question Answering Dataset (SQuAD):


In [5]:
# Instantiate the question-answering model from the named pretrained checkpoint.
# As the output below shows, this fetches config.json and a ~1.25 GB pytorch_model.bin.
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

Downloading config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

## Next, we download and load the tokenizer

In [7]:
# Load the tokenizer from the same checkpoint name that was used for the model.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

## Now that we have downloaded the model and tokenizer, let's preprocess the input.

In [8]:
# The question to answer and the context passage the answer is extracted from.
question = 'What is the immune system?'
paragraph = '\n'.join([
    'The immune system is a system of many biological structures',
    'and processes within an organism that protects against disease. To function',
    'properly, an immune system must detect a wide variety of agents, known as',
    'pathogens, from viruses to parasitic worms, and distinguish them from the',
    "organism's own healthy tissue.",
])

In [9]:
# Add BERT's special tokens by hand: [CLS] opens the sequence and [SEP] closes
# each segment. The guards keep this cell idempotent -- re-running it must not
# stack a second set of markers onto strings that are already wrapped.
if not question.startswith('[CLS]'):
    question = '[CLS]' + question + '[SEP]'
if not paragraph.endswith('[SEP]'):
    paragraph = paragraph + '[SEP]'

In [10]:
# Split both strings into sub-word tokens using the checkpoint's tokenizer.
question_tokens, paragraph_tokens = map(tokenizer.tokenize, (question, paragraph))

In [11]:
# Show the question tokens and the paragraph tokens, one list per line.
print(question_tokens, paragraph_tokens, sep='\n')

['[CLS]', 'what', 'is', 'the', 'immune', 'system', '?', '[SEP]']
['the', 'immune', 'system', 'is', 'a', 'system', 'of', 'many', 'biological', 'structures', 'and', 'processes', 'within', 'an', 'organism', 'that', 'protects', 'against', 'disease', '.', 'to', 'function', 'properly', ',', 'an', 'immune', 'system', 'must', 'detect', 'a', 'wide', 'variety', 'of', 'agents', ',', 'known', 'as', 'pathogen', '##s', ',', 'from', 'viruses', 'to', 'parasitic', 'worms', ',', 'and', 'distinguish', 'them', 'from', 'the', 'organism', "'", 's', 'own', 'healthy', 'tissue', '.', '[SEP]']


## Combine the question and paragraph tokens and convert them to input_ids

In [12]:
# One flat token sequence: the question comes first, then the paragraph.
tokens = [*question_tokens, *paragraph_tokens]
# Map every token to its integer id in the tokenizer's vocabulary.
input_ids = tokenizer.convert_tokens_to_ids(tokens)

In [13]:
# Inspect the ids; in the output below 101 and 102 sit at the positions where
# [CLS] and [SEP] appear in the token list.
print(input_ids)

[101, 2054, 2003, 1996, 11311, 2291, 1029, 102, 1996, 11311, 2291, 2003, 1037, 2291, 1997, 2116, 6897, 5090, 1998, 6194, 2306, 2019, 15923, 2008, 18227, 2114, 4295, 1012, 2000, 3853, 7919, 1010, 2019, 11311, 2291, 2442, 11487, 1037, 2898, 3528, 1997, 6074, 1010, 2124, 2004, 26835, 2015, 1010, 2013, 18191, 2000, 26045, 16253, 1010, 1998, 10782, 2068, 2013, 1996, 15923, 1005, 1055, 2219, 7965, 8153, 1012, 102]


## Next, we define segment_ids: 0 for every token of the question and 1 for every token of the paragraph

In [23]:
# Segment ids mark which part of the input each position belongs to:
# 0 for every question token, 1 for every paragraph token.
segment_ids_1 = [0 for _ in question_tokens]
segment_ids_2 = [1 for _ in paragraph_tokens]
segment_ids = [*segment_ids_1, *segment_ids_2]

In [24]:
# Expect len(question_tokens) zeros followed by len(paragraph_tokens) ones.
print(segment_ids)

[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


## Now we convert input_ids and segment_ids to tensors

In [28]:
import torch
# Give both id sequences a leading batch dimension of size 1 for the model.
# as_tensor + reshape accept the original Python list as well as an already
# converted tensor, so re-running this cell is harmless; the earlier
# torch.tensor([input_ids]) form broke on a second run because the name had
# by then been rebound to a tensor.
input_ids = torch.as_tensor(input_ids).reshape(1, -1)
segment_ids = torch.as_tensor(segment_ids).reshape(1, -1)

## Now that we have processed the input, let's feed it to the model and get the result.
## We feed input_ids and segment_ids to the model, which returns the start score and end score for all of the tokens:

In [35]:
# A single forward pass produces both score tensors -- invoking the model a
# second time just to read the other output doubled the inference cost.
# no_grad() skips autograd bookkeeping, which is only needed when training.
with torch.no_grad():
    outputs = model(input_ids, token_type_ids=segment_ids)

# Position 0 of the output holds the start scores, position 1 the end scores.
start_scores, end_scores = outputs[0], outputs[1]

## Now, we select start_index, which is the index of the token that has the highest start score, and end_index, which is the index of the token that has the highest end score: 

In [37]:
# The position with the highest score in each tensor marks an answer boundary.
start_index = start_scores.argmax()
end_index = end_scores.argmax()

In [38]:
# argmax returns 0-d tensors, which is why the output reads tensor(...) rather
# than plain integers.
print(start_index,end_index)

tensor(12) tensor(26)


## That's it! Now, we print the text span between the start and end indexes as our answer:

In [39]:
# Slice out the answer tokens (end_index is inclusive) and let the tokenizer
# rebuild the text, so that sub-word pieces such as 'pathogen', '##s' are merged
# back into whole words instead of being printed with a stray ' ##' in between.
answer_tokens = tokens[start_index:end_index + 1]
print(tokenizer.convert_tokens_to_string(answer_tokens))

a system of many biological structures and processes within an organism that protects against disease
