
**IR Project on Question Answering System**

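The code below implements a retrieval-based question answering (QA) pipeline: for each question it retrieves candidate Wikipedia articles and extracts an answer span from each article with a BERT reader fine-tuned on SQuAD.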

In [None]:
#necessary installs
!pip install torch  torchvision -f https://download.pytorch.org/whl/torch_stable.html
!pip install transformers
!pip install wikipedia

In [None]:
#necessary imports
import torch
import wikipedia as wiki
import pprint as pp
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from collections import OrderedDict


In [None]:
# To make the output more readable, turn off the "token sequence length is
# longer than the specified maximum" warning. Long articles are split into
# model-sized chunks by DocumentReader, so the warning is expected noise here.
import logging

# Older transformers releases emit the warning from `tokenization_utils`;
# newer ones emit it from `tokenization_utils_base`. Silence both so the
# suppression works regardless of the installed version.
for _logger_name in (
    "transformers.tokenization_utils",
    "transformers.tokenization_utils_base",
):
    logging.getLogger(_logger_name).setLevel(logging.ERROR)

In [None]:
class DocumentReader:
    """Extractive question-answering reader built on a pretrained QA model.

    Typical use is ``reader.tokenize(question, text)`` followed by
    ``reader.get_answer()``. Inputs longer than the model's maximum number of
    positions are split into several chunks, each prefixed with the question,
    and the highest-scoring span over all chunks is returned.

    Attributes:
        READER_PATH: model name or path the tokenizer and model were loaded from.
        tokenizer: tokenizer loaded with ``AutoTokenizer.from_pretrained``.
        model: model loaded with ``AutoModelForQuestionAnswering.from_pretrained``.
        max_len: ``max_position_embeddings`` from the model config.
        chunked: True when the most recently tokenized input had to be chunked.
        inputs: the encoded input of the last ``tokenize`` call; a dict of
            per-chunk input dicts when ``chunked`` is True.
        input_ids: flat list of token ids of the full (un-chunked) input.
    """

    def __init__(self, pretrained_model_name_or_path):
        """Load the tokenizer and QA model identified by the given name/path."""
        self.READER_PATH = pretrained_model_name_or_path
        self.tokenizer = AutoTokenizer.from_pretrained(self.READER_PATH)
        self.model = AutoModelForQuestionAnswering.from_pretrained(self.READER_PATH)
        self.max_len = self.model.config.max_position_embeddings
        self.chunked = False

    def tokenize(self, question, text):
        """Encode a (question, text) pair and chunk it if it exceeds ``max_len``.

        The result is stored on ``self.inputs`` / ``self.input_ids`` and
        ``self.chunked`` is updated to describe this input.
        """
        self.inputs = self.tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
        self.input_ids = self.inputs["input_ids"].tolist()[0]

        # Recompute the flag on every call: the reader is reused across
        # documents, and a stale True left over from an earlier long document
        # would make get_answer() treat a short input as a dict of chunks.
        self.chunked = len(self.input_ids) > self.max_len
        if self.chunked:
            self.inputs = self.chunkify()

    def chunkify(self):
        """
        Break up a long article into chunks that fit within the max token
        requirement for that Transformer model.

        Every chunk has the layout
        [CLS] question tokens [SEP] context tokens [SEP].
        The question part is located through ``token_type_ids``, so the
        tokenizer must produce that key.

        Returns:
            OrderedDict mapping chunk index -> dict with the same keys as the
            un-chunked model input, each value a tensor with a leading batch
            dimension of 1.

        Raises:
            ValueError: if the question leaves no room for any context token.
        """

        # create question mask based on token_type_ids
        # value is 0 for question tokens, 1 for context tokens
        qmask = self.inputs['token_type_ids'].lt(1)
        qt = torch.masked_select(self.inputs['input_ids'], qmask)
        # the "-1" accounts for having to add an ending [SEP] token to the end
        chunk_size = self.max_len - qt.size()[0] - 1
        if chunk_size < 1:
            raise ValueError(
                f"Question uses {qt.size()[0]} tokens, leaving no room for "
                f"context within the model limit of {self.max_len} tokens."
            )

        # Ask the tokenizer for its separator id instead of hard-coding BERT's
        # 102, so other vocabularies are closed with the correct token.
        sep_token_id = self.tokenizer.sep_token_id

        # create a dict of dicts; each sub-dict mimics the structure of pre-chunked model input
        chunked_input = OrderedDict()
        for k, v in self.inputs.items():
            q = torch.masked_select(v, qmask)
            c = torch.masked_select(v, ~qmask)
            chunks = torch.split(c, chunk_size)

            # Value appended to close a chunk: the separator id for input_ids,
            # 1 for every other key (context segment / attended position).
            closing = torch.tensor(
                [sep_token_id if k == 'input_ids' else 1], dtype=v.dtype
            )
            last = len(chunks) - 1

            for i, chunk in enumerate(chunks):
                # The final chunk already ends with the original closing token.
                parts = (q, chunk) if i == last else (q, chunk, closing)
                chunked_input.setdefault(i, {})[k] = torch.unsqueeze(torch.cat(parts), dim=0)
        return chunked_input

    def _best_span(self, inputs):
        """Run the model on one input dict; return (score, start, end) of the top span.

        ``start``/``end`` are the argmax start index and argmax end index + 1,
        and ``score`` is the sum of the two corresponding scores.
        """
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            start_scores, end_scores = self.model(**inputs, return_dict=False)

        start = int(torch.argmax(start_scores))
        end = int(torch.argmax(end_scores)) + 1
        score = float(start_scores[0][start] + end_scores[0][end - 1])
        return score, start, end

    def get_answer(self):
        """Return the answer text for the most recently tokenized input.

        For chunked inputs the span with the highest start+end score across
        all chunks wins (the first chunk wins ties). An empty string is
        returned if no chunk yields a comparable score.
        """
        if not self.chunked:
            _, start, end = self._best_span(self.inputs)
            return self.convert_ids_to_string(self.inputs['input_ids'][0][start:end])

        best_score = float('-inf')
        best_answer = ''
        for chunk in self.inputs.values():
            score, start, end = self._best_span(chunk)
            if score > best_score:
                best_score = score
                best_answer = self.convert_ids_to_string(chunk['input_ids'][0][start:end])
        return best_answer

    def convert_ids_to_string(self, input_ids):
        """Decode a sequence of token ids back into a text string."""
        return self.tokenizer.convert_tokens_to_string(self.tokenizer.convert_ids_to_tokens(input_ids))

In [None]:
#Validation part

# Number of top Wikipedia search results to read for each question.
TOP_K_PAGES = 2

questions = [
    'Who is the Prime Minister of India?',
    'Who is the CEO of Google?',
    'Who is John Snow from Game of Thrones?'
]

# if you trained your own model using the training cell earlier, you can access it with this:
#reader = DocumentReader("./models/bert/bbu_squad2")
reader = DocumentReader("bert-large-uncased-whole-word-masking-finetuned-squad")

for question in questions:
    print(f"Question: {question}")
    results = wiki.search(question)
    print(results)
    # Slice instead of indexing so a search that returns fewer than
    # TOP_K_PAGES titles does not raise IndexError.
    for title in results[:TOP_K_PAGES]:
        try:
            page = wiki.page(title, auto_suggest=False)
        except (wiki.exceptions.DisambiguationError, wiki.exceptions.PageError) as err:
            # One ambiguous or missing title should not abort the whole run.
            print(f"Skipping {title!r}: {type(err).__name__}")
            continue
        #print(f"Top wiki result: {page}")
        reader.tokenize(question, page.content)
        print(f"Answer: {reader.get_answer()}")
    print()

Question: Who is the Prime Minister of India?
['Deputy Prime Minister of India', 'List of prime ministers of India', 'Prime Minister of India', 'List of prime ministers of India by longevity', 'List of prime ministers of India by previous experience', 'Living prime ministers of India', 'Spouse of the prime minister of India', 'Union Council of Ministers', "Prime Minister's Office (India)", 'Minister of Defence (India)']


Token indices sequence length is longer than the specified maximum sequence length for this model (626 > 512). Running this sequence through the model will result in indexing errors


Answer: prime minister of india
Answer: narendra modi

Question: Who is the CEO of Google?
['Google', 'Alphabet Inc.', 'Susan Wojcicki', 'Sundar Pichai', 'Google Maps', 'Google Meet', 'Larry Page', 'Google Drive', 'Google Sheets', 'Google Lens']
Answer: sundar pichai
Answer: sundar pichai

Question: Who is John Snow from Game of Thrones?
['A Game of Thrones', 'List of Game of Thrones characters', 'Game of Thrones (season 6)', 'The Iron Throne (Game of Thrones)', 'Winterfell (Game of Thrones episode)', 'Game of Thrones (season 8)', 'Game of Thrones (season 5)', 'Lord Snow', 'Game of Thrones (season 1)', 'Game of Thrones (season 3)']
Answer: illegitimate son of eddard stark
Answer: [CLS]

