In [13]:
import torch
from transformers import BertForQuestionAnswering, BertTokenizer
from rank_bm25 import BM25Okapi
import nltk

In [14]:
# Both the tokenizer and the QA model come from the same checkpoint, so the
# identifier is spelled out once and reused for the two loads.
MODEL_NAME = "bert-large-uncased-whole-word-masking-finetuned-squad"

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForQuestionAnswering.from_pretrained(MODEL_NAME)


In [15]:
# Toy retrieval corpus: one short passage per landmark. Each passage is a
# single string, written as adjacent literals (one sentence per line) that
# Python concatenates at compile time.
documents = [
    (
        "The Eiffel Tower is a wrought iron lattice tower on the Champ de Mars in Paris, France. "
        "It is named after the engineer Gustave Eiffel, whose company designed and built the tower. "
        "The tower stands at 324 meters tall and was the world's tallest structure when it was completed in 1889."
    ),
    (
        "The Great Wall of China is a series of fortifications made of stone, brick, tamped earth, wood, and other materials, generally built along an east-to-west line across the historical northern borders of China. "
        "The main purpose of the wall was to protect China from invasions by nomadic tribes."
    ),
    (
        "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of the cathedral of the Italian city of Pisa, known worldwide for its nearly four-degree lean, the result of an unstable foundation. "
        "Construction of the tower began in 1173 and took nearly 200 years to complete."
    ),
    (
        "Mount Everest is Earth's highest mountain above sea level, with a peak that rises to an elevation of 8,848.86 meters. "
        "It is located in the Mahalangur Himal subrange of the Himalayas and sits on the border between Nepal and China's Tibet Autonomous Region."
    ),
    (
        "The Amazon Rainforest, also known as the Amazon Jungle, is a moist broadleaf forest that covers most of the Amazon basin of South America. "
        "It is the world's largest rainforest, and it is home to a diverse range of plants and animals, many of which are found nowhere else on Earth."
    ),
]

In [16]:

# tokenize documents
tokenized_documents = [nltk.word_tokenize(doc) for doc in documents]

# initialize bm25 with the documents
bm25 = BM25Okapi(tokenized_documents)


In [17]:
question = "why the pisa tower is leaning?"

k = 2
top_k_documents = [documents[i] for i in bm25.get_top_n(nltk.word_tokenize(question), range(len(documents)), k)]


In [18]:
# split the retrieved documents into individual sentences
sentences = [sentence for document in top_k_documents for sentence in nltk.sent_tokenize(document)]

# iterate through sentences and calculate start and end scores with BERT
# The scores are raw logits, which are unbounded and may be negative, so the
# running maximum starts at -inf to guarantee the first valid span is accepted.
best_score = float("-inf")
best_sentence = None  # despite the name, holds the extracted answer text

for sentence in sentences:
    inputs = tokenizer.encode_plus(question, sentence, return_tensors="pt", max_length=512, truncation=True)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    start_scores, end_scores = outputs.start_logits, outputs.end_logits

    # get tokens with highest start and end scores
    start_index = int(torch.argmax(start_scores))
    end_index = int(torch.argmax(end_scores))

    # The two argmaxes are independent, so the end can land before the start.
    # Such a span slices to an empty answer; ignore it instead of letting it
    # win on score.
    if end_index < start_index:
        continue

    # check if current sentence has better score
    combined_score = start_scores[0][start_index].item() + end_scores[0][end_index].item()
    if combined_score > best_score:
        best_score = combined_score
        answer_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][start_index:end_index + 1])
        best_sentence = tokenizer.convert_tokens_to_string(answer_tokens)

print(f"Answer: {best_sentence}")

Answer: the result of an unstable foundation
