In [None]:
# Importing the libraries

In [1]:
!pip install transformers #Huggingface
!pip install torch

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.2-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.5/268.5 kB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m99.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m72.8 MB/s[0m eta [36m0:00:0

In [2]:
import numpy as np
import torch #pytorch
from transformers import BertForQuestionAnswering, AutoModelForQuestionAnswering
from transformers import BertTokenizer, AutoTokenizer

In [None]:
# Loading the pretrained models

In [3]:
# Name of the pretrained checkpoint, shared by the model and its tokenizer so
# the two cannot drift apart.
MODEL_NAME = 'bert-large-uncased-whole-word-masking-finetuned-squad'

# Question-answering model loaded from the checkpoint above.
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)
# Tokenizer loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading (…)lve/main/config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Tokenization walkthrough and the question-answering function

In [4]:
# Sample question and passage used by the tokenization walkthrough cells below.
question = "Where did the FIFA world cup 2022 happen ?"
text = (
    "The 2022 FIFA World Cup was the 22nd FIFA World Cup, "
    "the quadrennial world championship for national football teams organized by FIFA. "
    "It took place in Qatar from 20 November to 18 December 2022, "
    "after the country was awarded the hosting rights in 2010."
)

In [5]:
# Encode the question and the passage together as one sequence pair of token ids.
input_ids = tokenizer.encode(text=question, text_pair=text)

In [6]:
# Map every id back to its token string so the encoding can be inspected.
tokens = tokenizer.convert_ids_to_tokens(ids=input_ids)

In [7]:
# id -> token lookup for the ids present in this input; because it is keyed by
# id, a token that occurs several times in the sequence appears only once.
{token_id: token for token_id, token in zip(input_ids, tokens)}

{101: '[CLS]',
 2073: 'where',
 2106: 'did',
 1996: 'the',
 5713: 'fifa',
 2088: 'world',
 2452: 'cup',
 16798: '202',
 2475: '##2',
 4148: 'happen',
 1029: '?',
 102: '[SEP]',
 2001: 'was',
 13816: '22nd',
 1010: ',',
 17718: 'quad',
 7389: '##ren',
 6200: '##nia',
 2140: '##l',
 2528: 'championship',
 2005: 'for',
 2120: 'national',
 2374: 'football',
 2780: 'teams',
 4114: 'organized',
 2011: 'by',
 1012: '.',
 2009: 'it',
 2165: 'took',
 2173: 'place',
 1999: 'in',
 12577: 'qatar',
 2013: 'from',
 2322: '20',
 2281: 'november',
 2000: 'to',
 2324: '18',
 2285: 'december',
 2044: 'after',
 2406: 'country',
 3018: 'awarded',
 9936: 'hosting',
 2916: 'rights',
 2230: '2010'}

In [8]:
# Id of the tokenizer's separator token; question_answer() below searches for
# its first occurrence to find where the question ends and the passage begins.
tokenizer.sep_token_id

102

In [9]:
def question_answer(question, text):
    """Print the span of ``text`` that the model picks as the answer to ``question``.

    Input 1 = Question
    Input 2 = Passage/Text
    Output 1 = Answer (printed, not returned)

    The question and passage are encoded together as one sequence pair, the
    module-level ``model`` scores every token as a possible answer start and
    answer end, and the tokens between the two highest-scoring positions are
    joined back into a string (word pieces prefixed with ``##`` are glued onto
    the preceding token).

    Args:
        question: The question to ask about the passage.
        text: The passage the answer is extracted from.

    Returns:
        None. The answer is printed under an "Answer:" heading after being
        passed through ``str.capitalize()``. A fixed "unable to find" message
        is printed instead when the predicted end precedes the predicted
        start, or when the predicted span begins at the ``[CLS]`` token.

    Note:
        No truncation is requested from the tokenizer, so the encoded pair is
        handed to the model at whatever length the tokenizer produces.
    """
    not_found = "Unable to find the answer to your question."

    # Tokenize question and text into ids as a pair.
    input_ids = tokenizer.encode(question, text)

    # String version of the tokenized ids.
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Segment ids: everything up to and including the first [SEP] token is
    # segment A (the question); the remainder is segment B (the text).
    sep_idx = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_idx + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b

    assert len(segment_ids) == len(input_ids)

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        output = model(
            torch.tensor([input_ids]),
            token_type_ids=torch.tensor([segment_ids]),
        )

    # Highest-scoring start and end positions, as plain ints for indexing.
    answer_start = int(torch.argmax(output.start_logits))
    answer_end = int(torch.argmax(output.end_logits))

    if answer_end >= answer_start:
        # Reconstruct the answer, merging "##" word pieces into the previous token.
        answer = tokens[answer_start]
        for token in tokens[answer_start + 1:answer_end + 1]:
            if token.startswith("##"):
                answer += token[2:]
            else:
                answer += " " + token
    else:
        # The end was predicted before the start, so there is no usable span.
        answer = not_found

    if answer.startswith("[CLS]"):
        answer = not_found

    print("\nAnswer:\n{}".format(answer.capitalize()))

In [10]:
# Passage for a quick check of question_answer (note: no trailing period).
text = (
    "The 2022 FIFA World Cup was the 22nd FIFA World Cup, "
    "the quadrennial world championship for national football teams organized by FIFA. "
    "It took place in Qatar from 20 November to 18 December 2022, "
    "after the country was awarded the hosting rights in 2010"
)

question = "When did the event take place?"

question_answer(question=question, text=text)


Answer:
20 november to 18 december 2022


In [None]:
# Playing with the chatbot

In [11]:
# Interactive session: read one passage, then answer questions about it until
# the user declines to ask another.
text = input("Please enter your text: \n")
question = input("\nPlease enter your question: \n")

while True:
    question_answer(question, text)

    # Keep asking until the reply starts with Y or N. The reply is stripped and
    # upper-cased first, so an empty line just re-prompts instead of raising
    # IndexError, and lowercase "y"/"n" are accepted.
    choice = ""
    while choice not in ("Y", "N"):
        reply = input("\nDo you want to ask another question based on this text (Y/N)? ")
        choice = reply.strip()[:1].upper()

    if choice == "N":
        print("\nBye!")
        break

    question = input("\nPlease enter your question: \n")

Please enter your text: 
Chennai is the biggest city in TamilNadu state. It is a metro. It has diverse culture.

Please enter your question: 
What is the capital of TamilNadu state?

Answer:
Chennai

Do you want to ask another question based on this text (Y/N)? N

Bye!
