# DS8008 Final Project
## Manav Patel - 500967756

In [1]:
# Mount Google Drive and set the working directory
from google.colab import drive
import os

# Project folder on Drive, and the short local alias that is symlinked to it.
source_folder = '/content/drive/MyDrive/Colab Notebooks/DS8008/FinalProject'
target_folder = '/content/Final_Project'

drive.mount('/content/drive', force_remount=True)

# lexists() is true for live paths and for dangling symlinks alike, so any
# leftover alias from a previous run is removed before the link is recreated.
if os.path.lexists(target_folder):
    os.unlink(target_folder)
os.symlink(source_folder, target_folder)
os.chdir(target_folder)

print("Working directory:", os.getcwd())
print("Files:", os.listdir(os.curdir))


Mounted at /content/drive
Working directory: /content/drive/MyDrive/Colab Notebooks/DS8008/FinalProject
Files: ['FinalProjectDocuments.txt', 'numbered_questions.txt', '.ipynb_checkpoints', 'Final Project', 'documents_chunks.npy', 'document_embeddings.npy', 'faiss_index.idx', 'qa_results.csv']


In [2]:

# import data
import pandas as pd
import nltk
import re # Import re for regular expressions
from nltk.tokenize import sent_tokenize

# Download the NLTK 'punkt_tab' resource (presumably the data sent_tokenize
# needs for sentence splitting — confirm against the installed NLTK version).
nltk.download('punkt_tab')

# Load documents from FinalProjectDocuments.txt
# The whole corpus is read into one string; `documents_text` is the input to the
# chunking step further down the notebook.
with open("FinalProjectDocuments.txt", "r", encoding="utf-8") as f:
    documents_text = f.read()

# Display the first 2500 characters of the text
print('\n\nExample text from the document: ',documents_text[:2500])

# Quick sanity checks: head, tail and total size (in characters) of the corpus.
document_length = len(documents_text)
print(f"\n\nFirst 10 characters: {documents_text[:10]}")
print(f"Last 10 characters: {documents_text[-10:]}")
print(f"Length of document: {document_length} characters")



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!




Example text from the document:  Beginners BBQ Class Taking Place in Missoula!
Do you want to get better at making delicious BBQ? You will have the opportunity, put this on your calendar now. Thursday, September 22nd join World Class BBQ Champion, Tony Balay from Lonestar Smoke Rangers. He will be teaching a beginner level class for everyone who wants to get better with their culinary skills.
He will teach you everything you need to know to compete in a KCBS BBQ competition, including techniques, recipes, timelines, meat selection and trimming, plus smoker and fire information.
The cost to be in the class is $35 per person, and for spectators it is free. Included in the cost will be either a t-shirt or apron and you will be tasting samples of each meat that is prepared.
Discussion in 'Mac OS X Lion (10.7)' started by axboi87, Jan 20, 2012.
I've got a 500gb internal drive and a 240gb SSD.
When trying to restore using disk utility i'm given the error "Not enough space on disk ____ to r

In [3]:
# Load questions from numbered_questions.txt (one question per line, each
# optionally prefixed with "<number>. ").
# The encoding is given explicitly so the read does not depend on the platform
# default, matching how the corpus file is opened.
with open("numbered_questions.txt", "r", encoding="utf-8") as f:
    raw_question_lines = f.readlines()

# Strip the leading "N. " numbering, then drop entries that end up empty
# (blank lines or lines holding only a number) so they are never sent to the
# retriever / QA model as empty questions.
stripped_questions = (re.sub(r'^\d+\.\s*', '', line.strip()) for line in raw_question_lines)
questions_list = [question for question in stripped_questions if question]
questions_df = pd.DataFrame(questions_list, columns=["question"])

print("Loaded {} questions.".format(len(questions_list)))
print(questions_df)

Loaded 20 questions.
                                             question
0   How many scrolls were in the grand library of ...
1            Where can the bluefire crystal be found?
2   In what year did Dr. Helena Carter win the Nob...
3             What is the national bird of Veridonia?
4   What was the name of the first space station o...
5                    What is Lake Virelia famous for?
6      When was 'Echoes of Tomorrow' first published?
7   How long did it take the Zephyr-9 to fly aroun...
8   Who developed the first AI capable of composin...
9                   How many moons does Xyphora have?
10  What is the seating capacity of the Grand Arca...
11         Where is the silver-tailed lynx native to?
12                 When was the ChronoGem discovered?
13                         How tall is Solaris Tower?
14                       What was Novaterra built on?
15                       When was Vortexium invented?
16          How long does the Luminara Festival last?
17     

In [4]:
!pip install faiss-cpu




In [5]:
# FAISS indexing with Sentence-BERT (150-word chunks with 1-sentence overlap)
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
import os

# Load model
# `model` is a notebook-level global: embed_documents_in_batches and
# retrieve_top_k_chunks both call model.encode directly rather than taking the
# model as an argument.
print("Loading SentenceTransformer model: all-MiniLM-L6-v2...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded.\n")



Loading SentenceTransformer model: all-MiniLM-L6-v2...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Model loaded.



In [6]:
from nltk.tokenize import sent_tokenize
import numpy as np
import os

# chunking by sentence groups with overlap
def chunk_by_sentences_with_overlap(text, max_words=150, overlap_sentences=1, sentence_splitter=None):
    """
    Split ``text`` into chunks of whole sentences holding at most ``max_words`` words.

    Consecutive chunks share up to ``overlap_sentences`` trailing sentences so
    that content near a chunk boundary appears in both neighbours.

    A sentence that is longer than ``max_words`` on its own is not discarded:
    it is cut into consecutive windows of ``max_words`` whitespace-separated
    words, each emitted as its own chunk. Once a chunk reaches the last
    sentence, chunking stops, so no extra tail chunk repeating only the
    overlapped sentence(s) is produced.

    Args:
        text (str): Text to chunk.
        max_words (int): Maximum number of whitespace-separated words per chunk.
        overlap_sentences (int): Number of trailing sentences repeated at the
            start of the next chunk.
        sentence_splitter (callable | None): Function mapping ``text`` to an
            iterable of sentence strings. Defaults to NLTK's ``sent_tokenize``.

    Returns:
        list[str]: Chunk texts in document order.

    Raises:
        ValueError: If ``max_words`` < 1 or ``overlap_sentences`` < 0.
    """
    if max_words < 1:
        raise ValueError("max_words must be at least 1")
    if overlap_sentences < 0:
        raise ValueError("overlap_sentences must be non-negative")

    print(f"Splitting into {max_words}-word chunks with {overlap_sentences} sentence overlap\n")
    split_sentences = sent_tokenize if sentence_splitter is None else sentence_splitter
    sentences = list(split_sentences(text))
    word_counts = [len(s.split()) for s in sentences]

    chunks = []
    total_sentences = len(sentences)
    next_report = 100  # progress is reported each time another 100 chunks exist
    i = 0

    while i < total_sentences:
        if word_counts[i] > max_words:
            # Oversized sentence: emit it as fixed-size word windows instead of dropping it.
            words = sentences[i].split()
            for start in range(0, len(words), max_words):
                chunks.append(" ".join(words[start:start + max_words]))
            next_i = i + 1
        else:
            # Greedily take sentences while the word budget allows; sentence i always fits here.
            j = i
            current_word_count = 0
            while j < total_sentences and current_word_count + word_counts[j] <= max_words:
                current_word_count += word_counts[j]
                j += 1
            chunks.append(" ".join(sentences[i:j]))

            if j >= total_sentences:
                # The last sentence is already covered; stepping back for overlap
                # would only duplicate the tail of this chunk.
                next_i = total_sentences
            else:
                # Step back for overlap, but never to (or before) i, so the loop always advances.
                next_i = j - overlap_sentences if j - overlap_sentences > i else j

        if len(chunks) >= next_report:
            print(f"Chunks created: {len(chunks)} | Current sentence index: {i}/{total_sentences}")
            next_report = (len(chunks) // 100 + 1) * 100

        i = next_i

    print(f"\nChunking complete. Total chunks created: {len(chunks)}\n")
    return chunks

# Reuse the cached chunk list when it exists; otherwise chunk the corpus and cache it.
if not os.path.exists("documents_chunks.npy"):
    print("Starting chunking process...")
    documents_chunks = chunk_by_sentences_with_overlap(documents_text, max_words=150, overlap_sentences=1)
    np.save("documents_chunks.npy", np.array(documents_chunks))
    print("Chunks saved to 'documents_chunks.npy'.\n")
else:
    print("Found 'documents_chunks.npy'. Loading from file...\n")
    # NOTE(review): allow_pickle=True lets NumPy unpickle objects stored in the
    # file, so this cache should only ever be a file this notebook wrote itself.
    documents_chunks = np.load("documents_chunks.npy", allow_pickle=True).tolist()

# Show the first (up to) five chunks as a quick sanity check.
for chunk_number, chunk_text in enumerate(documents_chunks[:5], start=1):
    print(f"Chunk {chunk_number} Preview:\n{chunk_text}\n")


Found 'documents_chunks.npy'. Loading from file...

Chunk 1 Preview:
Beginners BBQ Class Taking Place in Missoula! Do you want to get better at making delicious BBQ? You will have the opportunity, put this on your calendar now. Thursday, September 22nd join World Class BBQ Champion, Tony Balay from Lonestar Smoke Rangers. He will be teaching a beginner level class for everyone who wants to get better with their culinary skills. He will teach you everything you need to know to compete in a KCBS BBQ competition, including techniques, recipes, timelines, meat selection and trimming, plus smoker and fire information. The cost to be in the class is $35 per person, and for spectators it is free. Included in the cost will be either a t-shirt or apron and you will be tasting samples of each meat that is prepared. Discussion in 'Mac OS X Lion (10.7)' started by axboi87, Jan 20, 2012.

Chunk 2 Preview:
Discussion in 'Mac OS X Lion (10.7)' started by axboi87, Jan 20, 2012. I've got a 500gb intern

In [7]:
# Compute Embeddings in Batches of 512
def embed_documents_in_batches(docs, batch_size=512):
    """
    Encode ``docs`` with the notebook-level ``model`` in slices of ``batch_size``.

    Args:
        docs (list[str]): Texts to embed.
        batch_size (int): Number of texts handed to ``model.encode`` per call.

    Returns:
        numpy.ndarray: The per-batch embedding arrays stacked row-wise, in the
        same order as ``docs``.
    """
    print(f"Embedding {len(docs)} chunks in batches of {batch_size}...")
    batch_starts = range(0, len(docs), batch_size)
    encoded_batches = [
        model.encode(docs[start:start + batch_size], convert_to_numpy=True)
        for start in tqdm(batch_starts, desc="Embedding chunks")
    ]
    print("Embedding complete.\n")
    return np.vstack(encoded_batches)

# Load cached embeddings only when they line up with the current chunk list.
# FAISS returns row positions, which are used directly as indices into
# documents_chunks, so a cache built from a different chunking would silently
# map hits to the wrong text (or raise IndexError).
embeddings = None
if os.path.exists("document_embeddings.npy"):
    print("Loading from 'document_embeddings.npy'...")
    embeddings = np.load("document_embeddings.npy")
    if embeddings.ndim != 2 or embeddings.shape[0] != len(documents_chunks):
        print(f"Cached embeddings (shape {embeddings.shape}) do not match "
              f"{len(documents_chunks)} chunks. Recomputing...")
        embeddings = None
else:
    print("No saved embeddings found. Starting embedding process...")

if embeddings is None:
    embeddings = embed_documents_in_batches(documents_chunks)
    np.save("document_embeddings.npy", embeddings)
    print("Embeddings saved to 'document_embeddings.npy'.\n")

Loading from 'document_embeddings.npy'...


In [8]:
# building FAISS Index
def build_faiss_index(embeddings):
    """
    Create a ``faiss.IndexFlatL2`` and add every row of ``embeddings`` to it.

    Args:
        embeddings: 2-D array of vectors; the index dimensionality is taken
            from ``embeddings.shape[1]``.

    Returns:
        The populated FAISS index.
    """
    vector_count = len(embeddings)
    print(f"Building FAISS index with {vector_count} vectors...")
    flat_index = faiss.IndexFlatL2(embeddings.shape[1])
    flat_index.add(embeddings)
    print("FAISS index built.\n")
    return flat_index

# Build the index over every embedding row and write it to disk so that it can
# be reloaded with faiss.read_index. There is no cache check here: the index is
# rebuilt and the file overwritten each time this cell runs.
print("Building and saving FAISS index...")
index = build_faiss_index(embeddings)
faiss.write_index(index, "faiss_index.idx")
print("FAISS index saved to 'faiss_index.idx'")

Building and saving FAISS index...
Building FAISS index with 91561 vectors...
FAISS index built.

FAISS index saved to 'faiss_index.idx'


In [9]:
# FAISS retrieval for input question(s)
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Load saved components
# Re-read the persisted index and chunk list from disk, replacing the in-memory
# `index` and `documents_chunks` globals that retrieve_top_k_chunks reads.
print("Loading FAISS index from 'faiss_index.idx'...")
index = faiss.read_index("faiss_index.idx")
print("FAISS index loaded.")

# NOTE(review): allow_pickle=True lets NumPy unpickle objects stored in the
# file; only load a cache file that this notebook produced.
print("Loading document chunks from 'documents_chunks.npy'...")
documents_chunks = np.load("documents_chunks.npy", allow_pickle=True).tolist()
print(f"Loaded {len(documents_chunks)} document chunks.\n")



Loading FAISS index from 'faiss_index.idx'...
FAISS index loaded.
Loading document chunks from 'documents_chunks.npy'...
Loaded 91561 document chunks.



In [10]:

def retrieve_top_k_chunks(question, k=3, verbose=True):
    """
    Encode the question and search the FAISS index for the k nearest chunks.

    Reads the notebook-level globals ``model``, ``index`` and
    ``documents_chunks``.

    Args:
        question (str): Natural language question.
        k (int): Number of top chunks to retrieve.
        verbose (bool): When True (default), print the query and every
            retrieved chunk; pass False to keep batch runs quiet.

    Returns:
        List of tuples: (chunk_text, distance), nearest first. ``distance`` is
        the value returned by ``index.search`` (the index in this notebook is
        built as ``IndexFlatL2``, so a smaller value means a closer match).
        The list can hold fewer than ``k`` entries when the index returns
        fewer hits.
    """
    if verbose:
        print(f"Encoding question: \"{question}\"")
    question_embedding = model.encode([question], convert_to_numpy=True)

    if verbose:
        print(f"Querying FAISS index for top {k} relevant chunks...")
    distances, indices = index.search(question_embedding, k)

    # FAISS fills missing results with label -1 when it finds fewer than k
    # neighbours; indexing documents_chunks with -1 would silently return the
    # LAST chunk, so those placeholder hits are dropped.
    top_chunks = [
        (documents_chunks[chunk_id], distance)
        for chunk_id, distance in zip(indices[0], distances[0])
        if chunk_id >= 0
    ]

    if verbose:
        if top_chunks:
            print(f'top_chunks: {top_chunks[0]}')
        print("\nTop retrieved chunks:")
        for idx, (chunk, score) in enumerate(top_chunks):
            print(f"\n--- Chunk #{idx+1} (Score: {score:.4f}) ---")
            print(chunk)

    return top_chunks

# Test example
# The k=50 result stored in `top_chunks` here is the same variable that the
# QA example cell later passes to get_best_answer_from_chunks.
sample_question = questions_df.iloc[0]['question']  # first question in questions_df
print(sample_question)
top_chunks = retrieve_top_k_chunks(sample_question, k=50)


How many scrolls were in the grand library of Zandoria?
Encoding question: "How many scrolls were in the grand library of Zandoria?"
Querying FAISS index for top 50 relevant chunks...
top_chunks: ('The first is the Elene Akhvlediani ex libris, the second is the Sergo Kobuladze ex libris. We did not have a separate collection of ex librises until now. Of course there are such materials in archives and in books, but so far we have not had such an amount of ex librises,” he said. “Ex librises were invented along with book printing and were an integral part of a book through the ages. Today, some of them are considered works of art. That is why participation in such a project is very important for ProCredit Bank, and we are very pleased that this unique ex libris collection has been added to the heritage of Georgian manuscripts,” said Sascha Ternes, General Director of JSC ProCredit Bank.', 1.1369743)

Top retrieved chunks:

--- Chunk #1 (Score: 1.1370) ---
The first is the Elene Akhvledia

In [11]:
# Run QA over the top-k chunks and select the best answer
from transformers import pipeline

# Load QA model
# `qa_pipeline` is a notebook-level global; get_best_answer_from_chunks calls it
# once per retrieved chunk.
print("Loading QA model: 'deepset/roberta-base-squad2'...")
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2", tokenizer="deepset/roberta-base-squad2")
print("QA model loaded.\n")


Loading QA model: 'deepset/roberta-base-squad2'...


Device set to use cuda:0


QA model loaded.



In [12]:

def get_best_answer_from_chunks(question, top_chunks, verbose=True):
    """
    Run the QA pipeline on each retrieved chunk and return the most confident answer.

    Reads the notebook-level global ``qa_pipeline``.

    Args:
        question (str): Question to answer.
        top_chunks (list[tuple]): ``(chunk_text, retrieval_score)`` pairs, as
            returned by ``retrieve_top_k_chunks``.
        verbose (bool): When True (default), print each per-chunk answer and
            the final choice.

    Returns:
        dict: The pipeline result with the highest ``'score'``, extended with
        ``'chunk_index'`` (position of the source chunk in ``top_chunks``) and
        ``'retrieval_score'`` (the value paired with that chunk).

    Raises:
        ValueError: If ``top_chunks`` is empty.
    """
    if len(top_chunks) == 0:
        # Fail with a clear message instead of max()'s "arg is an empty sequence".
        raise ValueError("top_chunks is empty: no retrieved context to answer from")

    answers = []

    if verbose:
        print(f"Evaluating QA over top {len(top_chunks)} chunks...\n")

    for i, (chunk, score) in enumerate(top_chunks):
        result = qa_pipeline(question=question, context=chunk)
        result["chunk_index"] = i
        # Value from the FAISS search. With the IndexFlatL2 index built in this
        # notebook it is a distance (lower = closer), not a similarity.
        result["retrieval_score"] = score
        answers.append(result)

        if verbose:
            print(f"Chunk {i + 1} Answer: {result['answer']} | Score: {result['score']:.4f}")

    # Choose the answer with the highest QA confidence score
    best_answer = max(answers, key=lambda x: x['score'])

    if verbose:
        print("\nBest Answer Chosen:")
        print(f"Answer: {best_answer['answer']}")
        print(f"Confidence Score: {best_answer['score']:.4f}")
        print(f"From Chunk #{best_answer['chunk_index'] + 1}")
    return best_answer

# Example usage:
# Uses the `top_chunks` left behind by the retrieval example cell (k=50 there).
sample_question = questions_df.iloc[0]['question']  # first question in questions_df
print(sample_question)
best_answer_result = get_best_answer_from_chunks(sample_question, top_chunks)


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


How many scrolls were in the grand library of Zandoria?
Evaluating QA over top 50 chunks...

Chunk 1 Answer: ex librises | Score: 0.0000
Chunk 2 Answer: 1032 | Score: 0.0000
Chunk 3 Answer: 2 | Score: 0.0000
Chunk 4 Answer: eleven | Score: 0.0041
Chunk 5 Answer: Fantastic | Score: 0.0001
Chunk 6 Answer: over 50,000 | Score: 0.4578
Chunk 7 Answer: 3000 | Score: 0.0069
Chunk 8 Answer: 72-2; Bonet Correa, 159 | Score: 0.0001
Chunk 9 Answer: ex librises | Score: 0.0000
Chunk 10 Answer: 8 | Score: 0.0002
Chunk 11 Answer: three floors where the pottery is on display. | Score: 0.0000
Chunk 12 Answer: 8254 | Score: 0.0000
Chunk 13 Answer: 13 | Score: 0.0170
Chunk 14 Answer: 3 | Score: 0.0000
Chunk 15 Answer: Venice Museum Mile project. | Score: 0.0000
Chunk 16 Answer: 224 | Score: 0.0015
Chunk 17 Answer: Octopus | Score: 0.0000
Chunk 18 Answer: From Ashes Into Light. | Score: 0.0000
Chunk 19 Answer: 30 | Score: 0.0000
Chunk 20 Answer: 이영주 | Score: 0.0000
Chunk 21 Answer: Xerxes | Score: 0.0000

In [13]:
import re
from sklearn.metrics import precision_score, recall_score, f1_score

def compute_token_f1(predicted_answer, context):
    """
    Score how well the predicted answer overlaps with any single context sentence.

    Both the answer and each sentence of ``context`` are reduced to a set of
    lower-cased ``\\w+`` tokens. For every sentence sharing at least one token
    with the answer, precision is |shared| / |answer tokens|, recall is
    |shared| / |sentence tokens|, and F1 is their harmonic mean. The first
    sentence reaching the highest F1 wins.

    Returns:
        (f1, best_matched_sentence); ``(0.0, "N/A")`` when the answer has no
        tokens or no sentence shares a token with it.
    """
    def _token_set(text):
        return set(re.findall(r'\w+', text.lower()))

    # A blank answer and an answer without any word characters are treated alike.
    answer_tokens = _token_set(predicted_answer) if predicted_answer.strip() else set()
    if not answer_tokens:
        return 0.0, "N/A"

    top_f1, top_sentence = 0.0, ""
    for candidate in sent_tokenize(context):
        candidate_tokens = _token_set(candidate)
        shared = answer_tokens & candidate_tokens
        if not shared:
            # Also covers token-less sentences; with no overlap the F1 is zero.
            continue

        precision = len(shared) / len(answer_tokens)
        recall = len(shared) / len(candidate_tokens)
        # precision and recall are both > 0 here, so the denominator is never zero.
        candidate_f1 = 2 * precision * recall / (precision + recall)

        # Strict comparison keeps the earliest sentence on ties.
        if candidate_f1 > top_f1:
            top_f1, top_sentence = candidate_f1, candidate

    return top_f1, (top_sentence or "N/A")


In [14]:
import pandas as pd

# Store results (one dict per question; turned into a DataFrame once at the end)
all_results = []

# Loop over all questions
for idx, row in tqdm(questions_df.iterrows(), total=len(questions_df), desc="Evaluating all questions"):
    # Fall back to the DataFrame index when there is no explicit 'id' column.
    question_id = row['id'] if 'id' in row else idx
    question = row['question']

    # Step 1: Retrieve relevant chunks
    top_chunks = retrieve_top_k_chunks(question)

    # Step 2: Run QA and pick best answer
    best_answer_result = get_best_answer_from_chunks(question, top_chunks)

    # Step 3: Compute token-F1 against matched sentence in context
    context_text = top_chunks[best_answer_result['chunk_index']][0]
    # Named token_f1 (not f1_score) so the sklearn.metrics.f1_score function
    # imported earlier in the notebook is not overwritten by a float.
    token_f1, matched_sentence = compute_token_f1(best_answer_result['answer'], context_text)

    # Step 4: Store results
    # NOTE: "Retrieval Similarity" holds the value returned by the FAISS search;
    # the index is built as IndexFlatL2, so lower means closer. The column name
    # is kept unchanged so the CSV schema stays the same.
    all_results.append({
        "Question ID": question_id,
        "Question": question,
        "Predicted Answer": best_answer_result['answer'],
        "Confidence Score": best_answer_result['score'],
        "Retrieval Similarity": best_answer_result['retrieval_score'],
        "Best-Matched Sentence": matched_sentence,
        "Token-Level F1": round(token_f1, 4)
    })

# Convert to DataFrame
results_df = pd.DataFrame(all_results)

# Display first few results
from IPython.display import display
print("\n✅ Final Results Table (first 10 rows):")
display(results_df.head(10))

# Optional: Save to CSV
results_df.to_csv("qa_results.csv", index=False)
print("\nResults saved to 'qa_results.csv'.")


Evaluating all questions:  10%|█         | 2/20 [00:00<00:01, 13.89it/s]

Encoding question: "How many scrolls were in the grand library of Zandoria?"
Querying FAISS index for top 3 relevant chunks...
top_chunks: ('The first is the Elene Akhvlediani ex libris, the second is the Sergo Kobuladze ex libris. We did not have a separate collection of ex librises until now. Of course there are such materials in archives and in books, but so far we have not had such an amount of ex librises,” he said. “Ex librises were invented along with book printing and were an integral part of a book through the ages. Today, some of them are considered works of art. That is why participation in such a project is very important for ProCredit Bank, and we are very pleased that this unique ex libris collection has been added to the heritage of Georgian manuscripts,” said Sascha Ternes, General Director of JSC ProCredit Bank.', 1.1369743)

Top retrieved chunks:

--- Chunk #1 (Score: 1.1370) ---
The first is the Elene Akhvlediani ex libris, the second is the Sergo Kobuladze ex libris

Evaluating all questions:  20%|██        | 4/20 [00:00<00:01, 15.45it/s]

Chunk 3 Answer: 1976 | Score: 0.0000

Best Answer Chosen:
Answer: 1992
Confidence Score: 0.0000
From Chunk #2
Encoding question: "What is the national bird of Veridonia?"
Querying FAISS index for top 3 relevant chunks...
top_chunks: ('However, unification is not something which should, or indeed could, be forced upon the public. It is a long-term endeavour which depends for its successful outcome upon a carefully planned law reform programme which seeks community participation and support. How customary is African customary law? The national bird of the fictional country of Veridonia is the emerald falcon, known for its vibrant green feathers. Doris and her late husband Buddy Williams co-founded Merrifield Garden Center in the spring of 1971 with their good friends and neighbors, Bob and Billie Jean Warhurst. Over the years, Doris has literally done it all, helping customers, purchasing products for the store, designing the store layout and even stocking the shelves.', 0.99093866)

Top

Evaluating all questions:  30%|███       | 6/20 [00:00<00:00, 16.30it/s]

Chunk 1 Answer: The greater part of the vast area contained in the bed of this immense lake | Score: 0.0001
Chunk 2 Answer: natural utopia it provides to hundreds of different plant and animal species | Score: 0.0000
Chunk 3 Answer: glacial lake and has some of the clearest, blue-green water | Score: 0.0003

Best Answer Chosen:
Answer: glacial lake and has some of the clearest, blue-green water
Confidence Score: 0.0003
From Chunk #3
Encoding question: "When was 'Echoes of Tomorrow' first published?"
Querying FAISS index for top 3 relevant chunks...
top_chunks: ("He will be sharing how he got started, where he finds his inspiration and what it’s like to be a part of the Project Nursery Team. Next to join the ladies will be Rachel Cedar from You Plus 2. Rachel coaches moms on time management and how to juggle their busy lives while still remaining sane, and she will be sharing her tips with us. She will also be sharing how she manages to run a business and her household all while living 

Evaluating all questions:  40%|████      | 8/20 [00:00<00:00, 16.81it/s]

Chunk 1 Answer: 2035 | Score: 0.9744
Chunk 2 Answer: 1990 | Score: 0.0000
Chunk 3 Answer: sixties | Score: 0.0000

Best Answer Chosen:
Answer: 2035
Confidence Score: 0.9744
From Chunk #1
Encoding question: "How long did it take the Zephyr-9 to fly around the world?"
Querying FAISS index for top 3 relevant chunks...
top_chunks: ('If you still want to add a record when there are already five entries, and your record is not in the top-5, you can create a sub-page for the particular record. 2777 km (1736 miles), according to Pavel Trcala, from Czech Republic. Time 28 November 1998 07:30 - 29 November 1998 06:30. The route: Miami to Chicago via Jacksonville (Florida), Savannah (Georgia), Columbia (South Carolina), Asheville (North Carolina), Knoxville (Tennessee), Lexington (Kentucky), Toledo (Ohio), South Bend (Indiana), and Harvey (Illinois). The route was done in five lifts. 2750 km, according to Vladas Sapranavicius, member of Vilnius Hitch-hiking Club. Time: 21 August 1996 22:00 – 22 A

Evaluating all questions:  50%|█████     | 10/20 [00:00<00:00, 17.18it/s]

top_chunks: ('Click the appropriate button to start searching the book to get it in the format you are interested in. The fictional planet of Xyphora has three moons: Trelos, Mornis, and Zephyria. 10 moments away from the heart of the city, this 4-star accommodation offers simple accessibility to the flourishing economic and industrial area of Kuala Lumpur. Minutes far from the Golden Triangular, visitors will enjoy checking out Bukit Bintang Road, Kuala Lumpur City Centre (KLCC) and a lot more. Lodging Maluri houses more than 238 well provided rooms that integrate all typical features fit to the necessities of home business and convenience tourists. A contemporary and elegant lodging, home business visitors will be content to discover well selected business and conference centers, whilst convenience tourists would certainly like to relax at their swimming pool and even go to The Restaurant and The Lobby.', 0.8753109)

Top retrieved chunks:

--- Chunk #1 (Score: 0.8753) ---
Click the a

Evaluating all questions:  60%|██████    | 12/20 [00:00<00:00, 17.33it/s]

top_chunks: ('Written by Stan Lee with art by Steve Ditko. The Grand Arcadium in Lysoria is the largest indoor arena in the world, with a seating capacity of 150,000. Standing out in the noisy world of social media is very important because these are the platform where most people spend their time on. Through the right implementation and strategies, we help our client increase more than 100% in social engagement. We also design attract banner ads and leads collection system to further increase the social engagement. Now, social media drives more than 30% of the traffic to Super Belgians each month. Volunteers are given training and support to talk to their communities about how to save water. In the UK we use approximately 150 litres of water per person per day.', 0.93075633)

Top retrieved chunks:

--- Chunk #1 (Score: 0.9308) ---
Written by Stan Lee with art by Steve Ditko. The Grand Arcadium in Lysoria is the largest indoor arena in the world, with a seating capacity of 150,000. Sta

Evaluating all questions:  80%|████████  | 16/20 [00:00<00:00, 17.60it/s]

Chunk 3 Answer: 36-story | Score: 0.1558

Best Answer Chosen:
Answer: 180-metre
Confidence Score: 0.5480
From Chunk #1
Encoding question: "What was Novaterra built on?"
Querying FAISS index for top 3 relevant chunks...
top_chunks: ('Mobitron is located in Huskvarna, Sweden and is a leading supplier of wireless data loggers with inbuilt sensors. Nordic Aircraft carries out assignments in the areas of advanced composite structures and aerodynamics mainly for the aerospace sector. Novator is a high-tech engineering company, specialized in production processes such as Orbital Drilling™. Oxeon have established itself as one of the market leading companies within tape weaving technologies and tape woven materials. Oxeon produces, develops and sells materials towards the composites industry under the brand name “TeXtreme®, mainly using carbon fiber. Permanova Lasersystem offers laser materials processing production equipment to sheet metal using manufacturing industries. The core consists of 

Evaluating all questions: 100%|██████████| 20/20 [00:01<00:00, 16.99it/s]

Chunk 1 Answer: over 10 million | Score: 0.5371
Chunk 2 Answer: Yakut | Score: 0.0000
Chunk 3 Answer: Gr. petrai^ | Score: 0.0000

Best Answer Chosen:
Answer: over 10 million
Confidence Score: 0.5371
From Chunk #1
Encoding question: "Which cities does the HyperLoop-5 connect?"
Querying FAISS index for top 3 relevant chunks...
top_chunks: ('Our Teleport team making cross-border delivery/fulfillment like KL-Singapore possible in less 24hrs. The beauty of our network and flight frequency which allows magic to happen for logistics. Watch this space for more on Teleport by @petechareon and team. A new podcast will be recorded straight after #AFLHawksCats featuring new DPPs, navigating the short week and more! -Throwing stuff at pro players. Seriously, don’t do that sh*t.\nWhere do you stay in The City? I should first make clear that New York, or more specifically Manhattan, is “The City.” This is not a statement of arrogance or superiority. It is simply tradition. I grew up in one of the co




Unnamed: 0,Question ID,Question,Predicted Answer,Confidence Score,Retrieval Similarity,Best-Matched Sentence,Token-Level F1
0,0,How many scrolls were in the grand library of ...,ex librises,4.9e-05,1.136974,We did not have a separate collection of ex li...,0.2857
1,1,Where can the bluefire crystal be found?,caves of Mount Eldoria,0.374722,0.48828,The rare bluefire crystal can only be found in...,0.3478
2,2,In what year did Dr. Helena Carter win the Nob...,1992,3e-06,1.210575,Alumnus James Orbinski accepted the Nobel Peac...,0.0952
3,3,What is the national bird of Veridonia?,emerald falcon,0.666178,0.990939,The national bird of the fictional country of ...,0.2222
4,4,What was the name of the first space station o...,Nexus-1,0.977796,1.27426,The first space station of the Andromeda Feder...,0.25
5,5,What is Lake Virelia famous for?,"glacial lake and has some of the clearest, blu...",0.000313,1.078923,It’s a glacial lake and has some of the cleare...,0.7586
6,6,When was 'Echoes of Tomorrow' first published?,2035,0.974424,0.965056,The novel 'Echoes of Tomorrow' by Julian Spenc...,0.1053
7,7,How long did it take the Zephyr-9 to fly aroun...,58 miles,0.085545,1.041595,ft.) at 11:25 <i- landing half-way between Wil...,0.1818
8,8,Who developed the first AI capable of composin...,Professor Elliot Grayson,0.919166,0.954964,Professor Elliot Grayson developed the first A...,0.375
9,9,How many moons does Xyphora have?,three,0.853132,0.875311,The fictional planet of Xyphora has three moon...,0.1538



Results saved to 'qa_results.csv'.


In [15]:
# !pip install -U notebook-as-pdf
# !pyppeteer-install

In [20]:
from google.colab import drive
drive.mount('/content/drive') # Mount your Google Drive if not already mounted.

!cp "/content/drive/MyDrive/Colab Notebooks/DS8008/FinalProject/Final Project" /content/ # Copy Notebook to local
!jupyter nbconvert --to pdf "/content/Final Project.ipynb" # change Notebook name

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    