In [7]:
# Question:  how are glacier caves formed ?
# wikipedia page - https://en.wikipedia.org/wiki/Glacier_cave   
# paragraph : ‘A glacier cave is a cave formed within the ice of a glacier. Glacier caves are often called ice caves, but the latter term is properly used to describe bedrock caves that contain year-round ice’ (summary of the page). 

# Question - how much is 1 tablespoon of water ?
# wikipedia page -https://en.wikipedia.org/wiki/Tablespoon  
# paragraph - There are multiple possible answers. It could be, for example -
# ‘In most places, except Australia, one tablespoon equals three teaspoons—and one US tablespoon is 14.8 ml (0.50 US fl oz; 0.52 imp fl oz) or 15 ml (0.51 US fl oz; 0.53 imp fl oz).’ 
# Or
#  ‘In nutrition labeling in the U.S. and the U.K., a tablespoon is defined as 15 ml (0.51 US fl oz).[7] In Australia, the definition of the tablespoon is 20 ml (0.70 imp fl oz)’ etc.

# Question - how did anne frank die 
# wikipedia page - https://en.wikipedia.org/wiki/Anne_Frank 
# Paragraph - ‘Following their arrest, the Franks were transported to concentration camps. On 1 November 1944,[2] Anne and her sister, Margot, were transferred from Auschwitz to Bergen-Belsen concentration camp, where they died (probably of typhus) a few months later. They were originally estimated by the Red Cross to have died in March, with Dutch authorities setting 31 March as the official date. Later research has suggested they died in February or early March.’


import requests
from bs4 import BeautifulSoup
import os
import json

import warnings

# Turn off all warnings
warnings.filterwarnings("ignore")


def get_paragraphs_from_wikipedia(url, timeout=10):
    """Download a web page and return all of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch (a Wikipedia article URL in this file).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The result of ``soup.find_all('p')``: the paragraph tags in document
        order. Each tag exposes its text via ``.text``.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status, so an error page is never parsed as article content.
        requests.exceptions.Timeout: If the server does not respond in time.
    """
    # Send a GET request to the page; fail fast on a hung connection
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors instead of silently scraping the error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all the paragraphs on the page
    return soup.find_all('p')

from transformers import BertTokenizer, BertForQuestionAnswering
import torch

from transformers import BertTokenizer, BertForQuestionAnswering
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util

# Set a similarity score threshold -- based on test data
threshold = 0.7
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Rank paragraphs by semantic similarity to the question
def filter_and_sort_paragraphs(question, paragraphs, threshold):
    """Score every non-empty paragraph against the question and rank them.

    Args:
        question: Question text to embed.
        paragraphs: Iterable of objects exposing a ``.text`` attribute (the
            ``<p>`` tags produced by ``get_paragraphs_from_wikipedia``).
        threshold: Accepted for interface compatibility only. No filtering by
            score happens here; callers compare the returned scores against
            their own threshold.

    Returns:
        A list of ``(paragraph, score)`` tuples sorted by cosine similarity in
        descending order. Paragraphs whose text is empty or whitespace-only
        are omitted. Returns ``[]`` when no paragraph has usable text.
    """
    # Keep the paragraph *objects* (not just their text) so that each score is
    # paired with the paragraph it was actually computed from. Indexing back
    # into the unfiltered list would misalign as soon as one paragraph is empty.
    non_empty_paragraphs = [p for p in paragraphs if p.text.strip() != ""]
    if not non_empty_paragraphs:
        # Nothing to embed; also avoids loading the model needlessly.
        return []

    # Load a pre-trained model for sentence embeddings
    model_name = "paraphrase-MiniLM-L6-v2"
    model = SentenceTransformer(model_name)

    # Encode the question and paragraphs
    question_embedding = model.encode(question, convert_to_tensor=False)
    paragraph_embeddings = model.encode(
        [p.text for p in non_empty_paragraphs], convert_to_tensor=False
    )

    # Calculate cosine similarity scores; row 0 holds question-vs-paragraph scores
    similarity_scores = cosine_similarity([question_embedding], paragraph_embeddings)

    # Sort relevant paragraphs by similarity score in descending order
    relevant_paragraphs = list(zip(non_empty_paragraphs, similarity_scores[0]))
    relevant_paragraphs.sort(key=lambda x: x[1], reverse=True)
    return relevant_paragraphs

def load_wikiQA_data(data_dir=None):
    """Load and concatenate the JSON record lists stored in a directory.

    Args:
        data_dir: Directory containing the JSON files. Defaults to
            ``data/archive_small`` relative to the working directory, built
            with ``os.path.join`` so it resolves on every platform.

    Returns:
        A single list with the records of every file, files being processed
        in sorted name order so the result is deterministic.

    Raises:
        FileNotFoundError: If ``data_dir`` does not exist.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    if data_dir is None:
        data_dir = os.path.join("data", "archive_small")

    data = []
    # os.listdir order is arbitrary; sort so data[0] is stable between runs
    for file_name in sorted(os.listdir(data_dir)):
        path = os.path.join(data_dir, file_name)
        if not os.path.isfile(path):
            # Skip sub-directories rather than failing on open()
            continue
        # JSON is UTF-8 by specification; do not rely on the locale default
        with open(path, 'r', encoding='utf-8') as handle:
            jsons = json.load(handle)
        data.extend(jsons)
        print(len(jsons))

    return data

import nltk
nltk.download('punkt')  # Download the punkt tokenizer data (only needed once)
from nltk.tokenize import sent_tokenize

def split_wiki_text(wiki_text):
    """Split *wiki_text* into sentences with NLTK's ``sent_tokenize``.

    The sentence list is printed to stdout and then returned.
    """
    result = sent_tokenize(wiki_text)
    print(result)
    return result

from transformers import pipeline
def init():
    """Load the pre-trained models and tokenizers into module-level globals.

    Sets ``similarity_tokenizer``, ``similarity_model``, ``tokenizer``,
    ``model`` and ``text2text_generator``. Must run before any function that
    reads those globals.
    """
    global text2text_generator, similarity_tokenizer, similarity_model, tokenizer, model

    base_checkpoint = "bert-base-uncased"
    qa_checkpoint = "bert-large-uncased-whole-word-masking-finetuned-squad"

    # Tokenizer + sequence-classification head intended for text similarity
    similarity_tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
    similarity_model = AutoModelForSequenceClassification.from_pretrained(base_checkpoint)

    # BERT tokenizer paired with the SQuAD-finetuned question-answering model
    tokenizer = BertTokenizer.from_pretrained(base_checkpoint)
    model = BertForQuestionAnswering.from_pretrained(qa_checkpoint)

    # Generation pipeline created with the library's default model for the task
    text2text_generator = pipeline("text2text-generation")

def test_set():
    """Declare the sample questions and the Wikipedia pages that answer them.

    NOTE(review): both lists are local and the function returns ``None``, so
    callers currently receive nothing. ``urls`` also has one entry fewer than
    ``questions`` (no page for the "smoots" question) -- confirm the intent.
    """
    questions = [
        "how are glacier caves formed",
        "how much is 1 tablespoon of water ?",
        "how did anne frank die",
        "how a water pump works",
        "how old was sue lyon when she made lolita",
        "how are fire bricks made",
        "how tall is an indoor girls volleyball net",
        "how many calories in a cup of white rice",
        "what countries did immigrants come from during the immigration",
        "how many smoots in a mile",
    ]

    urls = [
        "https://en.wikipedia.org/wiki/Glacier_cave",
        "https://en.wikipedia.org/wiki/Tablespoon",
        "https://en.wikipedia.org/wiki/Anne_Frank",
        "https://en.wikipedia.org/wiki/Water_pump",
        "https://en.wikipedia.org/wiki/Sue_Lyon",
        "https://en.wikipedia.org/wiki/Fire_brick",
        "https://en.wikipedia.org/wiki/Volleyball",
        "https://en.wikipedia.org/wiki/Rice",
        "https://en.wikipedia.org/wiki/History_of_immigration_to_the_United_States",
    ]

# Define a function that generates an answer based on the question and URL
def generate_answer(question, url, max_answers=2):
    """Generate answers to *question* from the most relevant paragraphs of a page.

    Args:
        question: The question to answer.
        url: Page whose ``<p>`` paragraphs are used as candidate contexts.
        max_answers: Maximum number of answers to return (default 2, the
            original fixed limit).

    Returns:
        Up to ``max_answers`` generated strings, taken in the order the ranked
        paragraphs are returned, using only paragraphs whose similarity score
        exceeds the module-level ``threshold``. Empty if none qualifies.
    """
    paragraphs = get_paragraphs_from_wikipedia(url)
    # Find and rank the paragraphs by similarity score
    relevant_paragraphs = filter_and_sort_paragraphs(question, paragraphs, threshold)

    responses = []
    for paragraph, score in relevant_paragraphs:
        # Stop as soon as enough answers exist: every generator call is an
        # expensive model inference, and extra answers would be discarded.
        if len(responses) >= max_answers:
            break
        if not (score > threshold):
            continue
        response = text2text_generator(f"question: {question}? context: {paragraph.text}")
        responses.append(response[0]['generated_text'])

    return responses

# "https://en.wikipedia.org/wiki/Glacier_cave"
# "https://en.wikipedia.org/wiki/Tablespoon"
# "https://en.wikipedia.org/wiki/Anne_Frank"
# Load every model/tokenizer into module globals before anything uses them.
init()
# Read all JSON records from the local data directory.
test_data = load_wikiQA_data()
#print(test_data[0]['text'])
# Smoke test: sentence-split the 'text' field of the first loaded record.
split_wiki_text(test_data[0]['text'])
# print(generate_answer("how are glacier caves formed", test_data))
# print(generate_answer("how much is 1 tablespoon of water ?", test_data))
# print(generate_answer("how did anne frank die ", test_data))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rapanuga\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model

9982
14679
['M-137 was a state trunkline highway in the US state of Michigan that served as a spur route to the Interlochen Center for the Arts and Interlochen State Park.', 'It started south of the park and ran north between two lakes in the area and through the community of Interlochen to US Highway 31 (US 31) in Grand Traverse County.', 'The highway was first shown without a number label on maps in 1930 and labeled after an extension the next year.', "The highway's current routing was established in the 1950s.", 'Jurisdiction of the roadway was transferred from the Michigan Department of Transportation (MDOT) to the Grand Traverse County Road Commission in June 2020, and the highway designation was decommissioned in the process; signage was removed by August 2020 to reflect the changeover.', '==Route description== M-137 began at the southern end of Interlochen State Park at an intersection with Vagabond Lane.', 'Farther south, the roadway continues toward Green Lake Airport as Count

['M-137 was a state trunkline highway in the US state of Michigan that served as a spur route to the Interlochen Center for the Arts and Interlochen State Park.',
 'It started south of the park and ran north between two lakes in the area and through the community of Interlochen to US Highway 31 (US 31) in Grand Traverse County.',
 'The highway was first shown without a number label on maps in 1930 and labeled after an extension the next year.',
 "The highway's current routing was established in the 1950s.",
 'Jurisdiction of the roadway was transferred from the Michigan Department of Transportation (MDOT) to the Grand Traverse County Road Commission in June 2020, and the highway designation was decommissioned in the process; signage was removed by August 2020 to reflect the changeover.',
 '==Route description== M-137 began at the southern end of Interlochen State Park at an intersection with Vagabond Lane.',
 'Farther south, the roadway continues toward Green Lake Airport as County Roa