# Imports

In [1]:
!pip install -q \
    torch \
    torchvision \
    torchaudio \
    transformers \
    datasets \
    nltk \
    faiss-cpu \
    matplotlib \
    tqdm \
    numpy \
    pandas \
    accelerate


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import torch
torch.cuda.is_available()

True

In [3]:
import os
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from typing import Optional
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from nltk.translate.bleu_score import sentence_bleu
import nltk
import faiss
import torch.nn.functional as F
from torch.utils.data import Dataset

import os
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

import itertools
import time
import warnings

# Chargement des données

In [4]:
from datasets import load_dataset
questions = load_dataset("rag-datasets/rag-mini-wikipedia", "question-answer")
text = load_dataset("rag-datasets/rag-mini-wikipedia", "text-corpus")

In [5]:
print(questions)
print(text)

DatasetDict({
    test: Dataset({
        features: ['question', 'answer', 'id'],
        num_rows: 918
    })
})
DatasetDict({
    passages: Dataset({
        features: ['passage', 'id'],
        num_rows: 3200
    })
})


In [6]:
from datasets import DatasetDict

questions_all = questions["test"]

# split = questions_all.train_test_split(test_size=0.2, seed=42)
# questions_train = split["train"]
# temp = split["test"].train_test_split(test_size=0.5, seed=42)
# questions_val = temp["train"]
# questions_test = temp["test"]

# questions_split = DatasetDict({
#     "train": questions_train,
#     "validation": questions_val,
#     "test": questions_test
# })

questions_val = questions_all
questions_split = DatasetDict({
    "validation": questions_val
})

In [7]:
print(questions_split)

DatasetDict({
    validation: Dataset({
        features: ['question', 'answer', 'id'],
        num_rows: 918
    })
})


## Constantes

In [8]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 16

## Encodeur

### Chargement de l'encodeur

In [9]:
from transformers import AutoTokenizer, AutoModel
model_name = 'BAAI/bge-base-en-v1.5'
emb_tokenizer = AutoTokenizer.from_pretrained(model_name)
emb_model = AutoModel.from_pretrained(model_name)
emb_model.to(DEVICE)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

### Stockage dans FAISS

In [10]:
def encode_sequences(sequences: list, tokenizer, model, device=DEVICE, batch_size=BATCH_SIZE, training=False) -> torch.Tensor:
    model.train() if training else model.eval()
    context = torch.enable_grad() if training else torch.no_grad()

    embeddings = []
    with context:
        for i in range(0, len(sequences), batch_size):
            batch = sequences[i:i + batch_size]

            # Tokenisation des textes avant entrée du modèle
            inputs = tokenizer(
                batch, padding=True, truncation=True, return_tensors="pt"
            ).to(device)

            # Passage dans le modèle encodeur
            output = model(**inputs).last_hidden_state

            # Pooling : moyenne des vecteurs de tous les tokens pour chaque séquence
            pooled = output.mean(dim=1)  # [batch_size, hidden_size]

            embeddings.append(pooled)

    # Concatène tous les embeddings pour obtenir un seul tensor de sortie
    return torch.cat(embeddings, dim=0)

In [11]:
def retrieve_top_k_passages_faiss(questions: list, vector_index: faiss.IndexFlatL2, embedding_model_tokenizer, embedding_model, k: int):
    question_embeddings = encode_sequences(questions, embedding_model_tokenizer, embedding_model)
    question_embeddings = question_embeddings.cpu().numpy()
    faiss.normalize_L2(question_embeddings)
    _, top_k_indices = vector_index.search(question_embeddings, k)
    
    return top_k_indices

In [12]:
passages = text["passages"]["passage"]
passage_embed = encode_sequences(passages, emb_tokenizer, emb_model)
passage_embeddings = passage_embed.cpu().numpy()
faiss.normalize_L2(passage_embeddings)

d = passage_embeddings.shape[1]
index = faiss.IndexFlatIP(d)
index.add(passage_embeddings)

## HyDE

## Generator

### Chargement du générateur

In [None]:
from huggingface_hub import login
login(token = '')

In [15]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "mistralai/Mistral-7B-Instruct-v0.1"

gen_tokenizer = AutoTokenizer.from_pretrained(model_name)
gen_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",           # auto-placement sur GPU si dispo
    torch_dtype=torch.float16,   # allège la mémoire (fp16)
    trust_remote_code=True,       # pour activer les modèles qui utilisent du code perso
)
gen_tokenizer.pad_token = gen_tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### HyDE

In [16]:
def generate_prompts_no_context(questions):
    prompts = []
    for question in questions:
        prompt = (
            "Answer precisely to the following question.\n"
            f"Question: {question}\n\nAnswer:"
        )
        prompts.append(prompt)
    return prompts

In [17]:
questions_val = questions_split["validation"]["question"]
prompts_no_context = generate_prompts_no_context(questions_val)

print(prompts_no_context)

['Answer precisely to the following question.\nQuestion: Was Abraham Lincoln the sixteenth President of the United States?\n\nAnswer:', 'Answer precisely to the following question.\nQuestion: Did Lincoln sign the National Banking Act of 1863?\n\nAnswer:', 'Answer precisely to the following question.\nQuestion: Did his mother die of pneumonia?\n\nAnswer:', "Answer precisely to the following question.\nQuestion: How many long was Lincoln's formal education?\n\nAnswer:", 'Answer precisely to the following question.\nQuestion: When did Lincoln begin his political career?\n\nAnswer:', 'Answer precisely to the following question.\nQuestion: What did The Legal Tender Act of 1862 establish?\n\nAnswer:', 'Answer precisely to the following question.\nQuestion: Who suggested Lincoln grow a beard?\n\nAnswer:', 'Answer precisely to the following question.\nQuestion: When did the Gettysburg address argue that America was born?\n\nAnswer:', 'Answer precisely to the following question.\nQuestion: Did 

In [18]:
def process_batch_HyDE_prompts(batch_prompts, tokenizer, model, device, max_new_tokens=64):
    formatted_prompts = [
        f"<s>[INST] {prompt.strip()} [/INST]" for prompt in batch_prompts
    ]

    print(formatted_prompts)

    inputs = tokenizer(formatted_prompts, return_tensors="pt", padding=True, truncation=True).to(device)

    # Génération
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id
    )

    # Décodage et extraction de la réponse
    responses = [
        tokenizer.decode(output, skip_special_tokens=True).split('[/INST]')[-1].strip()
        for output in outputs
    ]

    print(responses)

    return responses

def chunked_iterable(iterable, size):
    it = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(it, size))
        if not chunk:
            return
        yield chunk

def generate_hypothetical_docs(gen_model, gen_tokenizer, prompts, batch_size = 32) : 
    batch_responses = []
    
    for batch_index, batch_prompts in enumerate(chunked_iterable(prompts, batch_size)):
        print(f"Batch {batch_index}] Started generation...")
        start_time = time.time()
        
        batch_responses.extend(process_batch_HyDE_prompts(batch_prompts, gen_tokenizer, gen_model, DEVICE))
        
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"[Batch {batch_index}] Time for generation = {elapsed_time:.2f} seconds")
    return batch_responses

In [19]:
hyde_docs = generate_hypothetical_docs(gen_model, gen_tokenizer, prompts_no_context)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Batch 0] Started generation...
['<s>[INST] Answer precisely to the following question.\nQuestion: Was Abraham Lincoln the sixteenth President of the United States?\n\nAnswer: [/INST]', '<s>[INST] Answer precisely to the following question.\nQuestion: Did Lincoln sign the National Banking Act of 1863?\n\nAnswer: [/INST]', '<s>[INST] Answer precisely to the following question.\nQuestion: Did his mother die of pneumonia?\n\nAnswer: [/INST]', "<s>[INST] Answer precisely to the following question.\nQuestion: How many long was Lincoln's formal education?\n\nAnswer: [/INST]", '<s>[INST] Answer precisely to the following question.\nQuestion: When did Lincoln begin his political career?\n\nAnswer: [/INST]', '<s>[INST] Answer precisely to the following question.\nQuestion: What did The Legal Tender Act of 1862 establish?\n\nAnswer: [/INST]', '<s>[INST] Answer precisely to the following question.\nQuestion: Who suggested Lincoln grow a beard?\n\nAnswer: [/INST]', '<s>[INST] Answer precisely to th

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Yes, Abraham Lincoln was the sixteenth President of the United States. He served from 1861 until his assassination in 1865.', 'No, Lincoln did not sign the National Banking Act of 1863. The National Banking Act was signed into law by President Andrew Jackson in 1832. The misconception may arise due to the similarity in the names of the two acts and the fact that Lincoln was a well', "I don't have access to personal or medical information about individuals unless it has been shared with me in the course of our conversation. I am designed to respect user privacy and confidentiality. Therefore, I cannot provide an answer to this question.", 'Abraham Lincoln, the 16th President of the United States, received a total of 12 years of formal education. He attended a one-room school in Hardin County, Kentucky for one year when he was six years old. After that, he received the remainder of his formal education in a self-', 'Abraham Lincoln began his political career in 1832 when he was elected

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Lorenzo Romano Amedeo Carlo Avogadro was an Italian mathematician and physicist, not a savant. A savant is a person with exceptional abilities in specific areas, often related to autism spectrum disorder. While Avogadro was a brilliant scientist who made significant contributions', 'No, Amedeo Avogadro was not born in Turin on August 9th 1776 to a noble ancient family of Piedmont, Italy. He was actually born in Turin on September 14th 1792 to a middle-class family. His', 'Yes, he is most noted for his contributions to the theory of molarity and molecular weight.', 'No, King Victor Emmanuel III was not present to pay homage to Avogadro. The event you are referring to is the unveiling of the statue of Avogadro in Turin, Italy, on October 10, 1906. The statue was unveiled by', "Yes, Avogadro's number is commonly used to compute the results of chemical reactions. It is a fundamental constant in chemistry that relates the number of particles in a substance to its mass. Avogadro's number i

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Beetles and grasshoppers are both insects that belong to the order Blattodea. They have several similarities, including:\n\n1. Body structure: Both beetles and grasshoppers have a similar body structure, with three main body segments: the head, thorax, and', 'Yes, certain species of beetles are considered pests. These include the Japanese beetle, the German cockroach, the red-hot pepper beetle, the Colorado potato beetle, and many others. These pest beetles can cause significant damage to crops, gardens, and', "No, a beetle's general anatomy is not uniform. Beetles belong to a very large and diverse order, with over 400,000 described species. As such, their anatomy can vary greatly depending on the species. Some beetles have hard, protective", 'Yes, beetles are endopterygotes. This means that they have complete metamorphosis, meaning that they go through four stages in their life cycle: egg, larva (caterpillar), pupa (chrysalis), and adult. During the pupa stage, beet', 'As of 2021, 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Yes, Calvin Coolidge was a Republican. He served as the 30th President of the United States and was a member of the Republican Party.', 'Yes, Calvin Coolidge was indeed the Governor of Massachusetts from 1919 to 1921.', 'Calvin Coolidge, the 30th President of the United States, was born on August 4, 1872.', 'I apologize, but I cannot provide an answer to that question as it is not specified in the question who "Coolidge\'s grandfather" refers to. Could you please provide more context or clarify the question?', 'Calvin Coolidge was born in Plymouth, Vermont, United States.', 'No, Calvin Jr. is not older than John Coolidge. Calvin Coolidge Jr. was born on August 23, 1906, and John Calvin Coolidge III was born on January 21, 1908. Therefore, John Coolidge III is older than', 'Yes, Calvin Coolidge did attend Amherst College. He enrolled in 1894 and graduated in 1898.', "Calvin Jr.'s death was caused by a fall from a tree while he was playing with his friends.", 'No, John Calvin Coolidge 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['The largest country in the world by total area is Russia, with an area of approximately 17,098,242 square kilometers (6,601,668 square miles).', 'The largest religious group in Canada is Roman Catholicism.', "Canada has had one of the largest armed forces in the world at various points in history, particularly during the Cold War era. In the 1950s and 1960s, Canada's armed forces were significantly larger than they are today, due in part to the country's commitment to", 'Aboriginal peoples have been inhabiting various regions of the world for millennia. These include regions such as Australia, New Guinea, the Pacific Islands, North and South America, and parts of Europe, Asia, and Africa. The exact time period varies depending on the specific region and the group of aboriginal peoples', 'European books and maps began referring to this region as Canada in 1555.', "Canada's most important industry until the 1800s was fur trading. The fur trade was central to the economy of early Canada

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Ducks are sometimes confused with other water birds such as geese, swans, and shorebirds like sandpipers and plovers. These birds share similar habitats and feeding behaviors, leading to the confusion.', 'The word "duck" can refer to several different things. It can refer to a common waterfowl bird, characterized by its short neck, flat bill, and webbed feet. It can also refer to a type of vehicle, often used for transportation on water. Additionally, "duck" can', 'Some common predators of ducks include:\n\n1. Alligators\n2. Crocodiles\n3. Foxes\n4. Raccoons\n5. Otters\n6. Beavers\n7. Muskrats\n8. Weasels\n9. Raptors such as', 'A Mallard\'s tongue, also known as a "forked tongue," is a specialized organ found in some species of ducks, including Mallards. It works by allowing the duck to filter food particles from water while foraging. The tongue is divided into two sections, or "forks," which', 'No, diving ducks are not heavier than dabbling ducks. The weight of a duck species is not

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Yes, an elephant can kill a rhinoceros. Elephants have been known to trample and gore rhinos to death. However, it is not a common occurrence as the two animals generally avoid direct confrontation.', 'Yes, elephants are good swimmers. They are known to be able to swim long distances in search of water and have been observed swimming in rivers, lakes, and even the ocean. They use their trunks to breathe while swimming and their large bodies to propel themselves through the water.', 'There are two recognized living species of African Elephants: the savanna, or bush elephant (Loxodonta africana), and the forest elephant (Loxodonta cyclotis).', 'The average lifespan of an elephant varies depending on its species, but generally, elephants can live anywhere from 60 to 70 years in the wild and up to 80 years in captivity.', 'The phrase "elephants never forget" is a proverb that means that elephants have exceptional memories and can remember other elephants and humans they have had interact

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['The body of water that lies to the south of Finland is the Gulf of Finland.', 'One animal that is growing in number due to recent conservation efforts is the mountain gorilla.', 'No, as of 2021, there are more cell phone subscriptions than people in Finland. According to the Finnish Communications Regulation Authority, there were approximately 11.7 million cell phone subscriptions in Finland in the second quarter of 2021. On the other hand, the', 'No, Matti Vanhanen is not the current Prime Minister of Finland. As of my knowledge up to October 2021, the current Prime Minister of Finland is Mika Salmiakki.', 'No, Oslo is not the capital city. The capital city is [insert the name of the capital city here].', 'Yes, Sweden has a border with Norway. The border between the two countries is approximately 1,632 kilometers long and runs north to south along the Norwegian mountains, across the Swedish mountains, and down to the Gulf of Bothnia.', 'The average life expectancy for men varies by 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['No, Ford is not related to the assassination of John F. Kennedy. The Warren Commission, established by President Lyndon B. Johnson to investigate the assassination, concluded that Lee Harvey Oswald acted alone in assassinating Kennedy. Ford, a businessman and philanthropist, had no involvement in', 'Yes, Gerald Ford was the 38th President of the United States. He served as President from 1974 to 1977.', 'No, Gerald Ford never served as the Republican Minority Leader of the House of Representatives. He was a member of the House of Representatives from 1947 to 1953 and served as Speaker of the House from 1961 to 1963 and from', 'There is no factual evidence to support the claim that Henry Ford, the founder of the Ford Motor Company, was an Eagle Scout. While Ford was known for his philanthropic efforts and support of various youth organizations, there is no record of him ever earning the Eagle Scout rank.', 'Ford was elected from the 7th Congressional District of Michigan.', "There is 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Old Ghana, also known as the Kingdom of Ghana, was a medieval kingdom that existed in what is now modern-day Ghana from the 7th to the 11th century. The kingdom was located in the central region of Ghana, covering parts of what are now the Ashanti, Br', "Ghana's Parliament is made up of two houses: the Senate and the House of Representatives. The Senate is composed of 96 members, 80 of whom are elected by the people of Ghana for a four-year term, while the remaining 16 members are appointed by the President.", 'Ghana was established as a British colony in 1874, and it gained independence from the United Kingdom on March 6, 1957. Therefore, the United Kingdom is the European country that established a state in Ghana.', 'Yes, the gray wolf and the domestic dog share a common ancestry. Both are members of the Canidae family, and they are closely related, with the domestic dog being descended from the gray wolf through a process of domestication.', 'No, wolf pups tend to have lighter fur

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


["Grover Cleveland was indeed married in the White House. He was the only president to be married while in office, and his wedding took place in the White House's Blue Room on June 21, 1886. He married his second wife, Calvin Coolidge's sister, Elizabeth.", 'I would be happy to help you find information. However, I need more specific details about the person you are referring to in order to provide an accurate answer. Can you please provide me with the name of the individual you are asking about?', "I don't have access to personal or historical data about individuals unless it has been shared with me in the course of our conversation. I am designed to respect user privacy and confidentiality. Therefore, I don't have an answer to this question. Can you please provide more context or details about the person you are", 'Grover Cleveland won the presidential election of 1884 and the presidential election of 1888.', 'Grover Cleveland was born on March 16, 1837, in Caldwell, New Jersey, Unit

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['No, Indonesia does not have the highest level of biodiversity in the world. That title goes to Brazil, which is home to a greater variety of plant and animal species than any other country.', 'No, Indonesia was not named after it became an independent country. The name "Indonesia" was derived from the Latin words "Indus" and "nesos", meaning "islands" and "peninsula", respectively. The term was first used in the 17th century to refer to the Dutch', "Indonesia broke off relations with the People's Republic of China in 1967 due to several reasons. One of the main reasons was China's support for Indonesia's communist party, the PKI, during the 1965 Indonesian mass killings. The PKI had been gaining", "The spice that originally attracted Europeans to Indonesia was likely pepper. Pepper, also known as piper methysticum, is a flowering plant that produces the world's most widely used spice. It was highly valued in ancient times for its pungent flavor and medicinal properties, and it", 'Jak

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['James Monroe was shot in the chest while riding his horse through the streets of Baltimore, Maryland on June 28, 1826.', 'James Monroe, the 5th President of the United States, married Elizabeth Monroe.', 'Before being elected governor, James Monroe served as a member of the Virginia House of Delegates and the Continental Congress. He was also a lawyer and a plantation owner.', 'John Monroe was elected as President in the years 1816 and 1820.', 'John Monroe is not present in the painting of Washington Crossing the Delaware. The painting depicts George Washington leading his troops across the Delaware River during the American Revolutionary War.', 'James Monroe was appointed as the Secretary of War on March 4, 1817.', 'James Monroe, the 5th President of the United States, died on June 16, 1836.', 'James Monroe graduated from William and Mary on June 20, 1786.', 'James Monroe was elected president of the United States on November 4, 1816.', 'James Monroe sold his estate at Monroe Hall i

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


["No, John Adams did not represent the Continental Congress in Europe. He was the United States' second president, serving from 1797 to 1801. During his presidency, he negotiated the Jay Treaty with Great Britain, but he did not represent the Continental Congress in Europe", "I don't have access to personal information about individuals unless it has been shared with me in the course of our conversation. I am designed to respect user privacy and confidentiality. Therefore, I don't have any information about whether Adams was raised Congregationalist or not.", 'Yes, John Adams was an opponent of the Stamp Act. He was a prominent figure in the American Revolution and played a key role in organizing and leading the opposition to the Stamp Act, which was a tax imposed by the British government on the American colonies in 1765. Adams believed that', 'I don\'t have access to personal information about individuals unless it has been shared with me in the course of our conversation. Can you pl

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['The collective noun for kangaroos is a "mob" or a "group".', "Joey's complete postnatal development occurs in their mother's pouch.", 'Kangaroos do not use "crawl-walking". They are marsupials and are capable of hopping long distances.', 'Kangaroos have a wide bite to help them grip and crush the tough, fibrous grasses that make up their primary diet. Their powerful jaw muscles and extended lower jaw bones work together to create a broad, hinged mouth that allows them to take large bites and grind their food efficiently.', 'The bacterium Acetobacter is responsible for converting the hydrogen byproduct of fermentation into acetate through a process called acetogenesis.', 'Yes, wild kangaroos are hunted for meat in some countries. In Australia, where kangaroos are native, it is legal to hunt them for meat under certain regulations. However, commercial hunting of kangaroos for meat is not a significant industry in Australia. In other countries, such as', 'Since European settlement in th

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Larger kangaroos, such as the eastern grey kangaroo and the red kangaroo, have adapted much better to changes in their environment compared to smaller kangaroo species. These larger kangaroos have stronger hind legs and larger feet, which allow them to jump farther and', 'It is not accurate to say that larger kangaroos have adapted better to changes than smaller ones. Adaptation is a complex process that depends on various factors, including the specific environment, resources, and genetic makeup of the organism. While larger kangaroos may have certain advantages in certain situations, such', 'Yes, the leopard is smaller than the other members of the Panthera genus, which includes lions, tigers, jaguars, and leopard cats.', 'Yes, a leopard is generally larger and less lanky than a cheetah. Leopards are one of the largest big cats and can weigh up to 600 pounds, while cheetahs are the smallest big cats and weigh up to 75 pounds. In terms of body length', "Yes, large pythons can be pot

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Yes, a panther can refer to several species of large felids, including the cougar, leopard, jaguar, and mountain lion. These animals are all members of the Felidae family and are commonly referred to as panthers in different regions of the world.', 'Yes, the leopard is an agile and graceful predator. Leopards are known for their exceptional physical abilities, including their strength, speed, and agility. They are skilled climbers and can move quickly and silently through the trees, allowing them to surprise and catch their prey. Leopards are also', 'Yes, Liechtenstein is bordered by Switzerland. It shares its entire border with Switzerland, which is approximately 16 kilometers long.', 'Yes, Liechtenstein is the smallest German-speaking country in the world. It has a total area of 61 square kilometers.', 'No, Liechtenstein was not part of the ancient Roman province of Raetia. The territory of Liechtenstein was part of the Roman Empire, but it was not included in the province of Raeti

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['It is not specified in the question what specific location or place is being referred to. Therefore, it is impossible to provide a definitive answer. However, it is worth noting that many popular winter sports resorts are also known for their tax incentives or favorable tax laws, which may contribute to their popularity among wealthy', "I don't have enough information to accurately determine if the location being referred to is a winter sports resort. Can you please provide more details or context?", 'No, Liechtenstein is not the smallest German-speaking country in the world. San Marino is the smallest German-speaking country in the world.', 'Moravia, Lower Austria, Silesia, and Styria were all territories that were held in fief under other more senior feudal lords. However, the specific events that occurred in each of these territories during this time period are not specified in the question. Can you provide more context or', 'No, Webster did not write "I can now sleep of nights." 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


["I'm sorry, but I need more context or specific details about the events that took place in the log cabin in Summerhill, New York. Can you please provide me with more information?", 'The person who was born precisely at midnight during an electrical storm, to a Serbian family in the village of Smiljan near Gospić, in the Lika region of the Croatian Krajina in Military Frontier (part of the Austrian Empire), in the present-day Croatia', 'Yes, there are at least two films describing Tesla\'s life. The first is "Tesla: The Man Who Invented the Future" (2000), directed by Michael Apted. The second is "Tesla" (2016), directed by James Marsh', 'Yes, there have been several live theatrical plays based on the life of Nikola Tesla produced and staged worldwide. Some of these plays include:\n\n1. "Tesla: A Musical" by David St. Louis and Mark H. Dunn, which premiered in 20', 'No, sea otters do not have long muscular tails. Their tails are relatively short and fluky, adapted for swimming in the 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Yes, it is true that otters eat a variety of fish. Otters are highly carnivorous and their diet primarily consists of fish. Depending on the species of otter, they may eat a variety of fish such as trout, salmon, eel, bass, and even sturgeon', 'Yes, otters are known for being playful animals. They are highly intelligent and enjoy engaging in various forms of play, such as swimming, sliding, and exploring their surroundings. They are also known for their curiosity and love of exploring new things, which often leads to playful behavior.', 'Yes, otters are very active animals. They are known for their playful and energetic behavior in the water and on land. In the wild, otters are constantly swimming, hunting, and exploring their surroundings. They are also known to play with each other and engage in a variety of activities, such as ch', 'No, male otters are not dog-otters. Male otters belong to the family Mustelidae, while dog-otters belong to the family Canidae. Mustelidae is commonly

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['No, penguins are not particularly agile. While they are adept at swimming and diving, their walking speed on land is relatively slow, typically around 1-2 miles per hour. Their agility is limited by their streamlined bodies and heavy flippers, which are adapted for swimming, not for', 'Yes, penguins are at risk due to several factors such as habitat loss, climate change, oil spills, and hunting. Some species of penguins are also threatened by predation, disease, and competition for food. Conservation efforts are underway to protect penguins and their habitats, but', 'Yes, it is possible that the word "penguin" comes from the Latin word "pinguis," which means "fat." This is one of the theories about the origin of the word "penguin," although it is not the most widely accepted theory.', 'No, penguins do not have an average sense of hearing for birds. While they do have some hearing capabilities, their hearing is not as acute as that of many other birds. Penguins primarily rely on their

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['The statement "Are studies insufficient evidence for global protection?" is not a clear question and therefore cannot be answered precisely. More context and specifics about the studies, the topic being referred to, and the desired level of evidence for global protection are needed in order to provide a precise answer.', 'Yes, polar bears are excellent swimmers. They are well adapted for swimming in the Arctic waters, with their thick fur providing insulation and waterproofing, and their large paws and strong muscles allowing them to swim efficiently and for long distances.', 'The polar bear is a mammal, and as such, it has mammary glands for nursing its young, hair or fur for insulation, and three middle ear bones for hearing. It also has a thick layer of blubber for insulation and energy storage. Additionally, polar bears have sharp claws', 'Garbage is waste material that is discarded and typically consists of organic and inorganic substances that are no longer useful or needed by 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['It is not clear what allophones you are referring to and in what positions you are asking about. Allophones are variations in the pronunciation of a phoneme, which is the basic unit of sound in a language. In English, there are many allophones that can occur in different positions within', 'Yes, that is correct. Qatar is an Arab emirate in southwest Asia, and its official language is Arabic.', 'Yes, Romania does border Hungary. The two countries share a land border that is approximately 343 kilometers long. The border separates the two countries in the northern part of Romania, with Hungary to the northwest and Romania to the southeast.', 'Romania is a secular state. The Constitution of Romania states that the country is a secular state and guarantees freedom of religion and conscience. This means that the government does not officially recognize or endorse any particular religion, and religious institutions are separate from the government.', "No, the President of the United States 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


["No, Singapore's standard of living has not decreased since independence. In fact, it has significantly increased. Singapore is consistently ranked as one of the wealthiest and most developed countries in the world. Its economy has grown rapidly, and its citizens enjoy a high standard of living with access to quality education, healthcare,", "No, the media did not contribute 15.6% to Singapore's GDP in 2001. According to the Infocomm Media Authority of Singapore, the media industry contributed 4.3% to Singapore's GDP in 2001.", 'The number of people who use the bus network daily varies depending on the location and the specific bus network in question. It would be most accurate to obtain this information from the local transit authority or transportation department for the specific city or region you are inquiring about.', 'The Orang Laut people are an indigenous ethnic group of Malaysia and Indonesia. They are believed to have originated from the islands of Sumatra, Java, and Borneo.

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


["Yes, Singapore has a network of reservoirs and water catchment areas. These are man-made structures designed to collect and store rainwater, as well as to treat and supply water to the city-state's residents and industries. The network includes more than 100 reservoirs, with a total storage", 'No, urbanization has not eliminated all areas of primary rainforest, and the Bukit Timah Nature Reserve is not the only remaining area of primary rainforest. While it is true that primary rainforests have been severely impacted by deforestation and urbanization, there are still many areas of', 'Yes, the economy does depend heavily on exports refining imported goods, especially in manufacturing. Exports are a crucial source of revenue for many countries, and they often involve the refining or processing of imported goods to meet local demand or to create new products. In manufacturing, this can involve the use of raw materials or', 'Yes, Singapore is an island nation located at the southern tip 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['No, sea turtles do not lay their eggs on dry sandy beaches. They lay their eggs in nests on sandy beaches, but the eggs are not dry. The eggs are covered with a protective coating that keeps them moist.', "Yes, Pleurodira are commonly known as side-necked turtles. This is because of the unique shape of their necks, which are flatter on one side and more curved on the other, resembling the side of a ship's neck.", 'No, turtles are not a part of the Flying Spaghetti Monster. The Flying Spaghetti Monster is a mythical creature often depicted as a giant, flying, spaghetti-like creature with meatballs for eyes. It is not a real animal, and', 'Turtles are divided into two suborders: Testudines and Cryptodira.', 'One turtle that is popularly kept as a pet is the common box turtle (Rubber Ducky).', '1. "The Biology and Behavior of Turtles"\n2. "Turtle Conservation and Protection"\n3. "Turtles as Pets: Care and Handling"\n4. "Turtle Diet and Nutrition"', 'If a turtle egg was kept warm, it wou

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


["Ulysses S. Grant's brother-in-law was Simon Bolivar Buckley.", 'Grant was born in Point Pleasant, Ohio.', 'Ulysses S. Grant was a Republican.', 'Grant said "Damn, I had nothing to do with this batte." because he was falsely accused of being involved in a battle and wanted to clarify that he had no participation in it.', 'Yes, Horace Greeley lost the presidential election of 1872. He was the candidate of the Liberal Republican Party and ran against Ulysses S. Grant, who was seeking re-election as the Republican Party nominee. Grant won the election, receiving 212 electoral votes', 'Ulysses S. Grant, the 18th President of the United States, did not express any public statements indicating that he thought war was unjust during his time in office. However, it is worth noting that after the Civil War, Grant wrote his memoirs, titled "The Personal Memoirs of', 'General Ulysses S. Grant achieved international fame as the leading Union general in the American Civil War.', 'Grant was elected 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Uruguay is a country in South America, and it has a total land area of approximately 28,284 square kilometers. Of this total land area, 97.8% is continental land, while the remaining 2.2% is coastal and island land. Therefore, the', 'The percentage of the global population that is of white European descent varies widely depending on the source and the specific definition of "white European descent." However, according to the United Nations, the global population with European ancestry was estimated to be around 1.1 billion in 2019, which accounts for approximately', "The capital of Uruguay, Montevideo, was founded on December 15, 1819, as a result of the country's independence from Spain. It was strategically located on the eastern coast of the country, at the mouth of the Sacramento River, and served as a key", 'Uruguay has won the FIFA World Cup twice. They first won the tournament in 1930, hosting the event in Montevideo. They then won again in 2010, in South Africa.', 'I cannot p

### Answer Prompts

In [None]:
def generate_prompts_answer(docs, questions, index, emb_tokenizer, emb_model, k, passages):
    prompts = []
    faiss_passages = retrieve_top_k_passages_faiss(docs, index, emb_tokenizer, emb_model, k)
    help_instruction = "Give an answer to the question using one word if possible."

    for i, passages_ids in enumerate(faiss_passages):
        # Récupérer les textes directement via les indices
        retrieved_passages = [passages[j] for j in passages_ids]

        # Construire le prompt
        context_string = (
            f"{help_instruction}\n" +
            "\n".join(retrieved_passages) +
            "\n" +
            f"Question: {questions[i]}\n"
        )
        prompts.append(context_string)

    return prompts

In [21]:
questions_val = questions_split["validation"]["question"]
passages = text["passages"]["passage"]

k_values = [1, 2, 3, 4]
prompts_k = {}

for k in k_values:
    prompts_k[k] = generate_prompts_answer(hyde_docs, questions_val, index, emb_tokenizer, emb_model, k, passages)

# Affichage exemple
print(prompts_k[3][0])

Give an answer to the question using one word if possible.
Abraham Lincoln (February 12, 1809 â April 15, 1865) was the sixteenth President of the United States, serving from March 4, 1861 until his assassination. As an outspoken opponent of the expansion of slavery in the United States, "[I]n his short autobiography written for the 1860 presidential campaign, Lincoln would describe his protest in the Illinois legislature as one that 'briefly defined his position on the slavery question, and so far as it goes, it was then the same that it is now." This was in reference to the anti-expansion sentiments he had then expressed. Doris Kearns Goodwin, Team of Rivals: The Political Genius of Abraham Lincoln (2005) p. 91.  Holzer pg. 232.  Writing of the Cooper Union  speech, Holzer notes, "Cooper Union proved a unique confluence of political culture, rhetorical opportunity, technological innovation, and human genius, and it brought Abraham Lincoln to the center stage of American politics at

### Génération

In [22]:
def process_batch_prompts_answer(batch_prompts, tokenizer, model, device, max_new_tokens=64):
    formatted_prompts = [
        f"<s>[INST] {prompt.strip()} [/INST]" for prompt in batch_prompts
    ]

    print(formatted_prompts)

    inputs = tokenizer(formatted_prompts, return_tensors="pt", padding=True, truncation=True).to(device)

    # Génération
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        do_sample=False,
        # top_p=0.9,
        # temperature=0.7,
        eos_token_id=tokenizer.eos_token_id
    )

    # Décodage et extraction de la réponse
    responses = [
        tokenizer.decode(output, skip_special_tokens=True).split('[/INST]')[-1].strip()
        for output in outputs
    ]

    print(responses)

    return responses

def chunked_iterable(iterable, size):
    it = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(it, size))
        if not chunk:
            return
        yield chunk

def generate_responses(gen_model, gen_tokenizer, prompts, batch_size = 32) : 
    batch_responses = []
    
    for batch_index, batch_prompts in enumerate(chunked_iterable(prompts, batch_size)):
        print(f"[k={k} | Batch {batch_index}] Started generation...")
        start_time = time.time()
        
        batch_responses.extend(process_batch_prompts_answer(batch_prompts, gen_tokenizer, gen_model, DEVICE))
        
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"[k={k} | Batch {batch_index}] Time for generation = {elapsed_time:.2f} seconds")
    return batch_responses

In [23]:
k_values = [3]
generated_responses = {}
for k in k_values:
    generated_responses[k] = generate_responses(gen_model, gen_tokenizer, prompts_k[k])
    print(f"Done with k = {k}!")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


[k=3 | Batch 0] Started generation...
['<s>[INST] Give an answer to the question using one word if possible.\nAbraham Lincoln (February 12, 1809 â\x80\x93 April 15, 1865) was the sixteenth President of the United States, serving from March 4, 1861 until his assassination. As an outspoken opponent of the expansion of slavery in the United States, "[I]n his short autobiography written for the 1860 presidential campaign, Lincoln would describe his protest in the Illinois legislature as one that \'briefly defined his position on the slavery question, and so far as it goes, it was then the same that it is now." This was in reference to the anti-expansion sentiments he had then expressed. Doris Kearns Goodwin, Team of Rivals: The Political Genius of Abraham Lincoln (2005) p. 91.  Holzer pg. 232.  Writing of the Cooper Union  speech, Holzer notes, "Cooper Union proved a unique confluence of political culture, rhetorical opportunity, technological innovation, and human genius, and it brought A

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Yes.', 'Yes.', 'No.', '18 months.', '1832.', 'Paper currency.', 'Grace Bedell', 'November 19, 1863.', 'Yes.', 'No. Abraham Lincoln was the sixteenth President of the United States.', 'Yes.', 'Yes.', 'Hardin', 'March 4, 1861.', 'Booth.', 'Yes.', 'McClellan', 'To free slaves in Confederate-held territories.', 'Yes.', 'Yes.', 'Yes.', 'Seven.', '1846.', "Lincoln's legal career spanned from 1837 to 1861.", 'Court trial.', 'Yes.', 'No.', 'Avogadro.', 'John Adams', 'No.', 'Not specified.', 'No.']
[k=3 | Batch 0] Time for generation = 4.71 seconds
[k=3 | Batch 1] Started generation...
['<s>[INST] Give an answer to the question using one word if possible.\nLorenzo Romano Amedeo Carlo Avogadro, Count of Quaregna and Cerreto (August 9, 1776 July 9, 1856) was an Italian savant.  He is most noted for his contributions to the theory of molarity and molecular weight. As a tribute to him, the number of elementary entities (atoms, molecules, ions or other particles) in one mole of a substance, 6.0221

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Yes.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'No.', 'Yes.', 'Not mentioned.', 'Anders Celsius.', 'Lincoln', 'Newton did not publish a collection.', 'No.', 'No.', 'Yes.', 'Yes', 'Yes.', 'Anders Celsius.', 'Celsius', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'Not specified.', '1745 is the year of the Battle of Culloden, a decisive battle in the Jacobite rebellion against British rule in Scotland.', 'Yes.', 'No.', 'Yes.', 'Head, Thorax, Abdomen.', 'Mimicry.', 'Potato.', 'Head, mouthparts, spiracles.', 'Beetles eat a variety of things including plants, fungi, and other invertebrates. Some species are highly specialized in their diet, feeding on only one type of plant or animal, while others are generalists and will eat a wide range of foods.']
[k=3 | Batch 1] Time for generation = 6.24 seconds
[k=3 | Batch 2] Started generation...
['<s>[INST] Give an answer to the question using one word if possible.\nBeetles have mouthparts similar to those of grasshoppers. Of these parts, the most commonly known are p

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Mandibles.', 'Yes.', 'Yes.', 'Yes.', 'Approximately 350,000.', 'During pairing.', 'Entomology.', 'Yes.', 'No.', 'No.', 'Various animals including birds and mammals prey on decaying organic matter.', 'Coleoptera', 'Species within the Coleoptera order vary greatly in form.', 'Yes.', 'Yes.', 'Gas.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'Nothing.', 'No.', 'Yes.', 'Yes.', '1874', '1905', 'Roaring Twenties', 'Moved to Northampton, Massachusetts to practice law.', 'Massachusetts', 'Coolidge.', 'No.']
[k=3 | Batch 2] Time for generation = 3.87 seconds
[k=3 | Batch 3] Started generation...
['<s>[INST] Give an answer to the question using one word if possible.\nJohn Calvin Coolidge, Jr. (July 4 1872   January 5 1933), more commonly known as Calvin Coolidge, was the thirtieth President of the United States (1923â\x80\x931929).  A lawyer from Vermont, Coolidge worked his way up the ladder of Massachusetts state politics, eventually becoming governor of that state. His actions during the Bosto

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Yes.', 'Yes.', 'July 4, 1872.', 'Plymouth.', 'Vermont', 'No.', 'No.', 'Infection.', 'No.', 'No.', 'Yes.', '1898', 'Phi Gamma Delta.', 'Grace Anna Goodhue.', 'He did not attend law school.', 'No.', '1918', 'No.', 'Yes.', 'No.', 'Chinese.', 'Quebec City', 'British North America Act.', 'France', 'Significantly.', '10.7%', 'Yes.', 'Yes.', 'No.', 'Ten provinces and three territories.', '5.9%', 'Quebec City-Windsor Corridor.']
[k=3 | Batch 3] Time for generation = 3.59 seconds
[k=3 | Batch 4] Started generation...
["<s>[INST] Give an answer to the question using one word if possible.\nAt 1,919,440 square kilometers (741,050 sq mi), Indonesia is the world's 16th-largest country in terms of land area.    Its average population density is 134 people per square kilometer (347 per sq mi), 79th in the world,    although Java, the world's most populous island,    has a population density of 940 people per square kilometer (2,435 per sq mi). At 4,884 meters (16,024 ft), Puncak Jaya in Papua is Ind

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Russia.', 'Catholics.', 'During the Cold War.', 'Lands.', '1545', 'Fur trade.', 'Yes.', 'Unrelated.', 'Yes.', 'Maple Leaf Flag.', 'Books.', 'Yes.', 'Yes.', 'Yes.', 'No.', 'Yes.', 'No.', 'Yes.', 'Not provided. The question refers to events in 1865.', 'Yes.', 'Yes.', 'No.', 'Smew.', 'Duck.', 'Meat.', 'Daffy Duck is a cartoon character.', 'Weight.', 'Food, sport, feathers, and economic uses.', 'Yes.', 'Yes.', 'Ducks have economic uses such as meat, eggs, feathers, and display in zoos.', 'None.']
[k=3 | Batch 4] Time for generation = 3.37 seconds
[k=3 | Batch 5] Started generation...
['<s>[INST] Give an answer to the question using one word if possible.\nDucks are sometimes confused with several types of unrelated water birds with similar forms, such as loons or divers, grebes, gallinules, and coots.\nDuck is the common name for a number of species in the Anatidae family of birds. The ducks are divided between several subfamilies listed in full in the Anatidae article. Ducks are mostly a

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Loons, divers, grebes, gallinules, coots.', 'Bird.', 'Predators.', 'Flat.', 'Yes.', 'Yes.', 'Yes.', 'Loons, divers, grebes, gallinules, coots.', 'Vulnerable.', 'Ducks have several economic uses, including meat, eggs, feathers, and display in zoos.', 'Lamellae.', 'Duck test.', 'Hunting.', 'Yes.', 'Yes.', 'Yes.', '1967', 'Islam, Christianity, and Judaism.', '78 million.', 'Agriculture.', 'No.', 'Akhenaten was a Pharaoh during the New Kingdom period.', 'No.', 'Yes.', 'Yes.', '1953', 'Old Kingdom.', 'Libya, Sudan, Gaza Strip, Israel, Mediterranean Sea, Red Sea.', '78 million.', 'Yes.', 'Organized agriculture appeared in the Nile Valley by about 6000 BC.', 'Yes.']
[k=3 | Batch 5] Time for generation = 4.02 seconds
[k=3 | Batch 6] Started generation...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Yes.', 'Yes.', 'Two.', '70 years.', 'No.', 'Two.', '113 kg (250 lb).', 'Aristotle did not write about elephants.', 'Yes.', 'No.', 'Yes.', 'Infrasound.', 'Rarely.', '22 months.', 'Two.', 'Regulation.', 'Elephant.', 'Yes.', 'Yes.', 'Yes.', 'Approximately 60,000.', 'Ears.', 'Angola.', 'Yes.', 'There is no land animal larger than an elephant.', 'No.', 'No.', 'Yes.', 'No.', 'Alexander I', '75', '1952']
[k=3 | Batch 6] Time for generation = 3.54 seconds
[k=3 | Batch 7] Started generation...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Gulf of Finland.', 'Tiger', 'Yes.', 'Yes.', 'No.', 'Yes.', '75 years.', 'Helsinki.', 'HIM.', 'Industrialized.', 'Europe.', 'Hot.', 'No.', 'Yes.', 'Yes.', 'Twelfth century.', 'Montevideo.', 'Christmas.', 'No.', 'No.', 'None.', 'Yes.', '24 years.', 'Yes.', 'Yes.', 'Yes.', 'Center and linebacker.', 'Ford described his biological father as a "carefree, well-to-do man."', 'Rockefeller.', 'No.', 'Complex.', 'Yes.']
[k=3 | Batch 7] Time for generation = 3.31 seconds
[k=3 | Batch 8] Started generation...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['No.', 'Yes.', 'Yes.', 'Yes.', 'Fifth.', '400.', 'Jay Berwanger.', 'No.', 'No.', 'Yes.', 'Eniwetok and Kwajalein are both islands in the Pacific Ocean.', 'Ford received the patent for the automatic transmission on April 13, 1942.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'No.', 'Yes.', 'Yes.', 'Yes.', 'No.', '1957', '10.', 'President John Agyekum Kufuor.', 'Christianity.', 'Kwa and Gur.', 'Warm.', 'No.', 'Yes.', 'Yes.', 'Warrior King.', 'Kwame Nkrumah was the founder and first president of Ghana.']
[k=3 | Batch 8] Time for generation = 5.53 seconds
[k=3 | Batch 9] Started generation...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Old Ghana is in relation to present Ghana as the medieval Ghana Empire, which was located in West Africa and controlled parts of what is now Ghana.', "Ghana's Parliament is made up of the President, the Senate, and the Chamber of Deputies.", 'Portugal.', 'Yes.', 'Yes.', 'Yes.', 'Wolves have two layers of coats.', 'All over their bodies.', 'Wolves weigh between 32 to 62 kilograms (70 to 135 pounds).', 'Communication.', 'Surplus killing is the killing of multiple prey animals that a predator cannot eat at one sitting due to an overwhelming number of prey.', 'No.', 'Yes.', 'Yes.', 'Yes.', 'Kazakhstan.', 'Canis', 'Wolves molt in spring and autumn.', 'Dominance.', 'Darting and foot hold traps.', 'Alertness', 'No.', 'No.', 'No.', 'White House.', 'Heart attack.', 'One.', 'Opponents accused Cleveland of being corrupt.', 'James G. Blaine', 'To avoid market panic.', 'No.', 'Yes.']
[k=3 | Batch 9] Time for generation = 4.54 seconds
[k=3 | Batch 10] Started generation...
["<s>[INST] Give an answ

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Yes.', 'Not provided.', '1929', '1884', "Cleveland's parents were Richard Cleveland and Anne Neal.", 'Five.', 'Grover Cleveland.', 'No.', 'Yes.', 'Yes.', 'No.', 'No.', 'Yes.', 'No.', '1894', 'Discovery of radioactivity.', 'No.', 'No.', 'Not mentioned.', 'Yes.', 'Yes.', 'Antoine Henri Becquerel', 'No.', 'No.', '1896 was the year of the first modern Olympic Games held in Athens, Greece.', 'The bromide emulsion in two sheets of very thick black paper was used to protect a LumiÃ¨re photographic plate from being clouded upon being exposed to the sun for a day.', 'Yes.', 'Yes.', 'No.', 'No.', '1999.', '16th century.']
[k=3 | Batch 10] Time for generation = 6.61 seconds
[k=3 | Batch 11] Started generation...
["<s>[INST] Give an answer to the question using one word if possible.\nIndonesia's size, tropical climate, and archipelagic geography, support the world's second highest level of biodiversity (after Brazil),    and its flora and fauna is a mixture of Asian and Australasian species.    

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['No.', 'No.', "The reason for Indonesia breaking off relations with the People's Republic of China was due to anti-communist purges during the Suharto era.", 'Nutmeg.', 'City', 'Indonesia', 'Javanese.', 'Flourishing.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'State', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'Language', 'N/A', '1825.', 'Spence', 'None. James Monroe did not have any daughters.', '1812-1824', 'Yes.', 'Declaration.', 'Yes.', 'Yes.', 'Yes.', 'Westmoreland County, Virginia.']
[k=3 | Batch 11] Time for generation = 5.38 seconds
[k=3 | Batch 12] Started generation...
["<s>[INST] Give an answer to the question using one word if possible.\nUpon Elizabeth's death in 1830, Monroe moved to New York City to live with his daughter Maria Hester Monroe Gouverneur who had married Samuel L. Gouverneur in the first White House wedding.  Monroe died there from heart failure and tuberculosis on July 4, 1831, becoming the third president to die on the 4th of July. His death came 55 y

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Trenton.', 'Elizabeth Kortright.', 'Monroe served in the Continental Congress from 1783 to 1786.', '1816, 1820', 'John Monroe was not standing behind in the painting of Washington Crossing the Delaware.', '1814', 'July 4, 1831.', '1776', '1816.', "Monroe sold his family farm, now the University of Virginia's modern campus, in 1817.", 'December 2, 1823.', 'Daughter', 'Discussion of his religious beliefs.', 'Rejection of the Jay Treaty led to the War of 1812.', 'Washington.', 'The Federalist Party.', 'No.', 'Tours.', 'No.', 'Yes.', 'Not specified.', 'Yes.', 'Fillmore.', 'Not specified.', 'Monroe was elected president in the election of 1816.', 'Margaret', 'Andrew Jackson', 'Yes.', 'Yes.', 'Yes.', 'Experimenting with steam.', 'No.']
[k=3 | Batch 12] Time for generation = 4.48 seconds
[k=3 | Batch 13] Started generation...
['<s>[INST] Give an answer to the question using one word if possible.\nCongress chose Adams to represent the fledgling union in Europe in 1777, and again in 1779. On 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Yes.', 'Yes.', 'Yes.', 'The question does not provide information about when Adams graduated from college.', 'Jefferson, Franklin, Livingston, and Sherman.', 'Jefferson called John Adams "the Colossus of that Congress" and "the ablest advocate and champion on the floor of the House."', 'Federalist.', 'No.', 'No.', 'Yes.', 'No.', 'Yes.', '1789-1797.', 'Federalist Party.', 'Quincy.', 'Adams.', 'Adams was opposed by Andrew Hamilton on the army issue.', 'Little.', 'John Adams.', 'John Adams.', 'Quincy', 'Yes.', 'No.', 'John Adams.', 'No.', 'Books.', "Adams' opponents were Thomas Jefferson and Aaron Burr.", 'Yes.', 'Not mentioned.', 'No.', 'Yes.', 'No.']
[k=3 | Batch 13] Time for generation = 5.50 seconds
[k=3 | Batch 14] Started generation...
["<s>[INST] Give an answer to the question using one word if possible.\nKangaroo soon became adopted into standard English where it has come to mean any member of the family of kangaroos and wallabies.  Male kangaroos are called bucks, boomers, jack

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Mob.', 'Pouch.', 'Kangaroos do not use "crawl-walking." They are the only large animals that use hopping as a means of locomotion.', 'Specialized teeth.', 'Bacteria.', 'Yes.', 'Yes.', 'Few.', '46 years.', '1994.', 'Roo bars.', 'Yes.', 'Extinct.', 'Yes.', 'Yes.', 'No.', 'Hopping.', 'Mob.', 'Kangaroo.', 'Protective measures such as windscreens, headlights, and engine noise are used to protect vehicles from kangaroos.', 'Hopping.', 'Unknown.', 'Four.', 'Yes.', 'No.', 'Veterinarians.', 'Different.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'Yes.']
[k=3 | Batch 14] Time for generation = 3.85 seconds
[k=3 | Batch 15] Started generation...
['<s>[INST] Give an answer to the question using one word if possible.\nKangaroos have large, powerful hind legs, large feet adapted for leaping, a long muscular tail for balance, and a small head. Like all marsupials, female kangaroos have a pouch called a marsupium in which joeys complete postnatal development.\n* The Eastern Grey Kangaroo (Macropus giganteus) i

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Efficiency.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'Cheetah or Jaguar.', 'Pumapard.', 'Trees.', 'Tail length.', 'Black Panthers.', 'Circular.', 'Yes.', 'Yes.', 'Yes.', 'Africa, Asia.', "The length of a leopard's tail is between 60 to 110cm.", 'The weight of a leopard can vary, but typically ranges from 28 to 60 kg for females and 37 to 90 kg for males.', 'Yes.', 'Spotted.', 'Largest.', 'Leopard.', 'Felis pardus.', 'Leopards.', 'No.', 'Yes.', 'The question does not provide enough context or information to accurately determine what is being referred to as "centered in Sierra." Could you please provide more information or clarify the question?', "Felis pardus was described in Linnaeus's Systema Naturae.", 'Yes.', 'Yes.', 'No.', 'Yes.', 'Yes.']
[k=3 | Batch 15] Time for generation = 5.84 seconds
[k=3 | Batch 16] Started generation...
['<s>[INST] Give an answer to the question using one word if possible.\nA panther can be any of several species of large felid; in North America, the term refers

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Yes.', 'Yes.', 'No.', 'No.', 'Yes.', '1836.', '11', 'The national currency of Liechtenstein is the Swiss franc.', 'Yes.', 'Austrian Federal Railways.', 'Two-thirds.', 'No.', 'No.', 'Yes.', 'No.', 'October 1921.', 'German.', 'Switzerland and Austria.', 'Raetia.', 'Five.', 'Liechtenstein.', '1861', 'Multiple countries enjoy high standards of living, including Finland, Singapore, Canada, and many others.', 'Yes.', 'Both are companies that have expanded regionally and globally.', 'No.', 'Book.', 'Constitution.', 'Preserves.', 'Yes.', 'Yes.', 'Yes.']
[k=3 | Batch 16] Time for generation = 3.45 seconds
[k=3 | Batch 17] Started generation...
["<s>[INST] Give an answer to the question using one word if possible.\nAlthough many tourists visit for the ideal weather during the summer, winter also attracts hundreds of thousands for its Christmas festivities and winter sports and activities such as skiing, dog sledding and Nordic walking. Finland is regarded as the home of Saint Nicholas or Santa

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['No.', 'Yes.', 'No.', 'These territories were held in fief under other more senior feudal lords.', 'No.', 'Yes.', 'Yes.', 'Weed.', 'Japan', 'Indentured servant.', 'After.', 'Yes.', '1858.', 'Yes.', 'No.', 'No.', 'Summerhill, New York.', 'Nathaniel Millard Fillmore', 'Abigail Powers.', 'Slavery.', 'Inheritance.', 'Angry.', 'Fillmore', '1846', 'Abigail Powers.', 'Yes.', 'Yes.', 'No.', 'Yes.', 'Yes.', 'Yes.', 'Yes.']
[k=3 | Batch 17] Time for generation = 3.46 seconds
[k=3 | Batch 18] Started generation...
['<s>[INST] Give an answer to the question using one word if possible.\nAt the Pan-American Exposition in Buffalo, New York President McKinley was shot by Leon Czolgosz (Zol-gash), on September 6, 1901.  Roosevelt had been giving a speech in Vermont when he heard of the shooting. He rushed to Buffalo but after being assured the President would recover, he went on a planned family camping and hiking trip to Mount Marcy. In the mountains a runner notified him McKinley was on his death be

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['The question does not mention anything happening in a log cabin in Summerhill, New York.', 'Nikola Tesla', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'No.', '13.', 'Fish.', 'Sea otters weigh around 30 kg (about 65 pounds).', 'Air.', 'Hunting.', 'Roadkill deaths.', 'Yes.', '13 species in 7 genera.', 'No.', 'Sea.', 'Old English.', 'Give birth.', 'Hunting', 'Culture.', 'Yes.', 'Yes.', 'No.', 'Fish.', 'Holt.', 'Poaching, habitat loss, and use of mercury in gold mining.', 'Air.', 'Playful.', 'Mustelidae.', 'Guard hairs.', 'Elephants.']
[k=3 | Batch 18] Time for generation = 4.34 seconds
[k=3 | Batch 19] Started generation...
['<s>[INST] Give an answer to the question using one word if possible.\nMost otters have fish as the primary item in their diet, supplemented by frogs, crayfish and crabs.  Some are expert at opening shellfish, and others will take any available small mammals or birds.  This prey-dependence leaves otters very vulnerable to prey depletion.\nOtters do not depend on their specializ

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Yes.', 'Yes.', 'Yes.', 'Yes.', 'Holt.', 'Sea otters eat shellfish and other invertebrates.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'Emperor Penguin.', 'Yes.', 'Sliding.', 'Camouflage.', 'No.', 'Little.', 'The number of penguin species is debated, with estimates ranging from 17 to 20.', 'Isabelline penguins are different from most penguins in that they are born with brown rather than black plumage.', 'Yes.', 'No.', 'Yes.', 'Sphenisciformes.', 'Gigantic.', 'Adaptation.', 'Conserve energy.', 'Turtles.', '3 meters', 'Flippers.', 'Steal.', 'Yes.', 'Yes.']
[k=3 | Batch 19] Time for generation = 3.91 seconds
[k=3 | Batch 20] Started generation...
['<s>[INST] Give an answer to the question using one word if possible.\nPenguins are superbly adapted to an aquatic life. Their wings have become flippers, useless for flight in the air. In the water, however, penguins are astonishingly agile. Within the smooth plumage a layer of air is preserved, ensuring buoyancy. The air layer also helps insul

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Yes.', 'No.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'No.', 'Yes.', 'Yes.', 'The Antarctic Circumpolar Current started as a continuous circumpolar flow around 30 mya, which forced the Antarctic cooling and facilitated the eastward expansion of Spheniscus to South America and beyond.', '"Web."', 'Yes.', 'Yes.', '1002 kg.', 'Black.', 'Long.', 'Global warming.', 'Endangered.', 'Fossil evidence tells us that penguins were not entirely flightless by the end of the Cretaceous and that gigantic penguins had disappeared by the end of the Paleogene. DNA evidence is not mentioned in the given text.', 'Yes.', 'Yes.', 'Yes.', 'Heavy.', '1002 kg.', 'Camouflage and insulation.', 'No.', 'Female polar bears gain up to double their weight during pregnancy.', 'Yes', 'Black is not a color.', 'Insulation.', 'Database entry.', 'They are both environmental organizations that have filed a lawsuit against the United States Department of the Interior to protect polar bears and their critical habitat.']
[k=3 | Batch

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['No.', 'Yes.', 'Fur.', 'Consumable.', 'Hibernate.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'No.', 'Yes.', 'Yes.', '1971', 'Doha.', 'Doha.', 'No.', 'The Arabic word for municipalities is "baladiyah".', 'Carnegie Mellon, Georgetown, Texas A&M, Virginia Commonwealth, Cornell, Northwestern, College of the North Atlantic, University of Calgary.', 'Recognition.', '1971', 'Desert', 'Qatar is in the wider region.', 'Yes.', 'Yes.', 'Saudi Arabia.', 'Qatar', 'Yes.', 'Yes.', 'No.', 'Yes.', 'Yes.', 'Conflict.']
[k=3 | Batch 21] Time for generation = 5.52 seconds
[k=3 | Batch 22] Started generation...
['<s>[INST] Give an answer to the question using one word if possible.\nIn terms of English phonemics, the vowels sound halfway between short u   and broad a  . The q and the t have no direct counterparts, but are closest to the unaspirated allophones of English k and t. However, since these allophones cannot occur in these positions in English, in this context they will sound more like English g and d. So 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Yes.', 'Yes.', 'Yes.', 'Yes.', 'No.', 'Forty-one.', 'Moldoveanu Peak.', 'Romanian', 'Yes.', 'Yes.', 'Four.', 'Yes.', 'Yes.', 'Yes.', 'Nadia ComÄ\x83neci', 'Bucharest', 'Hungarians.', 'Twelve.', 'Romania is a secular state.', 'Pressure', '2007', 'Homo sapiens sapiens remains.', 'Yes.', 'Regulation.', 'Investments.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'No.']
[k=3 | Batch 22] Time for generation = 6.51 seconds
[k=3 | Batch 23] Started generation...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['No.', 'No.', 'More than 2.78 million people use the bus network daily.', 'Kampong Glam.', 'Mass Rapid Transit (MRT) is a heavy rail passenger transport system in Singapore.', 'Tropical.', '3%.', 'Six.', 'No.', 'Yes.', 'No.', 'Lion.', 'The first record of S08_settlement in Singapore is not mentioned in the given text.', 'Japan', 'Defeat.', 'Yes.', '1990.', 'Rainfall.', 'Malay.', 'Singapore', 'Important.', 'Monaco', 'British East India Company', '1959-1990', 'Capital punishment.', 'British East India Company', 'British East India Company', 'Singapore', 'Wilson', 'Reservoirs are used to store water.', 'Yes.', 'Yes.']
[k=3 | Batch 23] Time for generation = 3.54 seconds
[k=3 | Batch 24] Started generation...
['<s>[INST] Give an answer to the question using one word if possible.\nSingapore has a network of reservoirs and water catchment areas. By 2001, there were 19 raw water reservoirs, 9 treatment works and 14 storage or service reservoirs locally to serve domestic needs. Marina Barrage 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Yes.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'New York City.', 'Roosevelt contracted malaria and a serious infection resulting from a minor leg wound during his South America expedition, which significantly weakened him and contributed to his declining health.', 'To combat bullies.', 'Bamie.', "Roosevelt's justification for supporting desegregation of schools in New York was that it was a matter of fairness and equality.", 'Not provided.', 'No.', 'Yes.', 'Yes.', 'New York City.', 'Lincoln.', 'January 6, 1919.', 'Carnegie and the Smithsonian Institution.', 'No.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'Spherical.', 'Infrequently.', 'Air.', 'Reproduction.', 'Researchers have discovered that turtle organs do not gradually break down or become less efficient over time.', 'Pleurodira']
[k=3 | Batch 24] Time for generation = 5.83 seconds
[k=3 | Batch 25] Started generation...
['<s>[INST] Give an answer to the question using one word if possible.\nSea turtles lay their eggs on dr

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['Yes.', 'Yes.', 'No.', 'Three.', 'Turtle.', '1. Encyclopedia of Turtles by Pritchard, Pether C H (1979)\n2. Turtles and Crocodiles of Insular Southeast Asia and New Guinea by Iskandar, DT (2000)\n3. An', 'Female.', 'Papillae.', 'Reference', 'No.', 'Yes.', 'Yes.', 'Paracryptodira.', 'Yolk.', 'Beaches.', 'Pleurodira.', 'Sand.', 'Reptiles.', 'Water.', 'All turtles are ectothermic.', 'Yes.', 'Yes.', 'Papillae are not mentioned in the provided text.', 'Ostrich.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'Yes.']
[k=3 | Batch 25] Time for generation = 6.15 seconds
[k=3 | Batch 26] Started generation...
["<s>[INST] Give an answer to the question using one word if possible.\nUlysses S. Grant in his postbellum.\nLt. Gen. Ulysses S. Grant\nPresident Grant with his wife, Julia, and son, Jesse, in 1872.\nQuestion: Who was Grant's brother in law? [/INST]", '<s>[INST] Give an answer to the question using one word if possible.\nUlysses Grant Birthplace, Point Pleasant, Ohio\nGrant was 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['None.', 'Point Pleasant.', 'Republican', 'Unclear.', 'Yes.', 'No answer provided as there is no information given in the question that suggests Ulysses S. Grant thought war was unjust.', 'Ulysses S. Grant', '1868', 'Grant', 'No information provided.', 'Grant', 'Yes.', 'Pennsylvania', 'No.', 'Yes.', 'No.', 'No.', 'Yes.', 'The American Civil War ended in 1865.', 'Not specified.', 'No.', 'Yes.', 'No.', 'Spanish.', 'San Carlos.', 'Europeans.', 'Cantegriles.', 'No.', 'Catholic.', 'Yes.', 'Yes.', 'Yes.']
[k=3 | Batch 26] Time for generation = 4.13 seconds
[k=3 | Batch 27] Started generation...
['<s>[INST] Give an answer to the question using one word if possible.\nAt 176,214 square kilometres (68,036 square miles) of continental land and 142,199 square kilometres (54,903 sq mi) of jurisdictional waters and small river islands,  Instituto Nacional Estadistica  Uruguay is the second smallest sovereign nation in South America (after Suriname) and the third smallest territory (French Guiana is

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['176,214', '88%', 'Founded.', 'Two.', '65%.', 'Yes.', 'No.', 'Yes.', 'Plebiscite.', 'Catholicism, Protestantism, Judaism.', 'Uruguay has been competitive in agriculture by focusing on low inputs of labor, technology, and capital, resulting in comparatively lower yields per hectare but opening the door for marketing products as "natural" or "ecological."', 'Partido Colorado, Partido Blanco, Frente Amplio.', 'No.', 'Montevideo', 'Language.', 'Diverse.', 'Independence.', 'No.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'Yes.', 'No.', 'No.', 'Yes.', 'No.', 'Paris', 'December 28, 1856.', 'Wilson was President when he finished writing Congressional Government.', 'History and political science.']
[k=3 | Batch 27] Time for generation = 6.60 seconds
[k=3 | Batch 28] Started generation...
['<s>[INST] Give an answer to the question using one word if possible.\nWoodrow Wilson had decided by then that the war had become a real threat to humanity. Unless the U.S. threw its weight into the war, as he 

## Evaluation

In [26]:
questions_val = questions_split["validation"].to_pandas()
df_export = questions_val[['id', 'question', 'answer']].copy()
df_export['generated_answer'] = generated_responses[k]
df_export.to_csv(f"generated_responses_simple_HyDE_v2.csv", index=False, encoding='utf-8')