In [1]:
import asyncio
import aiohttp
import os
from langchain import hub
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_fireworks import FireworksEmbeddings, ChatFireworks
from langchain_community.utilities import GoogleSerperAPIWrapper
from langchain.prompts import PromptTemplate
from langchain.schema import Document
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize
import re
import io
import time
import sys
import gradio as gr
import asyncio
from typing import List, Tuple, Any
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
import numpy as np
from functools import lru_cache
import faiss
import httpx
from urllib.parse import urlparse
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from flashrank import Ranker, RerankRequest
from pathlib import Path
import traceback
from typing import List, Dict
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# print(f"CUDA is available: {torch.cuda.is_available()}")

# Set up API clients
os.environ['FIREWORKS_API_KEY'] = 'API'
# os.environ["SERPER_API_KEY"] = 'API'
os.environ["SERPER_API_KEY"] = 'API'

# Initialize components
search = GoogleSerperAPIWrapper(k=3)
embeddings = FireworksEmbeddings(model="nomic-ai/nomic-embed-text-v1.5")
llm = ChatFireworks(model="accounts/fireworks/models/llama-v3p1-8b-instruct", temperature=0)
llm_8b = ChatFireworks(model="accounts/fireworks/models/llama-v3p1-8b-instruct", temperature=0)
llm_70b = ChatFireworks(model="accounts/fireworks/models/llama-v3p1-70b-instruct", temperature=0)

# Create a directory for caching in the user's home folder
cache_dir = Path.home() / ".flashrank_cache"
cache_dir.mkdir(parents=True, exist_ok=True)

# Check if CUDA is available
if torch.cuda.is_available():
    print("CUDA is available. GPU will be used automatically by FlashRank.")
else:
    print("CUDA is not available. CPU will be used.")

# Initialize FlashRank rerankers
ranker_nano = Ranker(cache_dir=str(cache_dir))
ranker_small = Ranker(model_name="ms-marco-MiniLM-L-12-v2", cache_dir=str(cache_dir))
ranker_medium_t5 = Ranker(model_name="rank-T5-flan", cache_dir=str(cache_dir))
ranker_medium_multilang = Ranker(model_name="ms-marco-MultiBERT-L-12", cache_dir=str(cache_dir))
ranker_large = Ranker(model_name="rank_zephyr_7b_v1_full", max_length=1024, cache_dir=str(cache_dir))

# Ensure models are on GPU if available
for ranker in [ranker_nano, ranker_small, ranker_medium_t5, ranker_medium_multilang, ranker_large]:
    if hasattr(ranker, 'model') and hasattr(ranker.model, 'to'):
        ranker.model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Download NLTK data
# nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

CUDA is available. GPU will be used automatically by FlashRank.


llama_model_loader: loaded meta data with 23 key-value pairs and 291 tensors from /home/ubuntu/.flashrank_cache/rank_zephyr_7b_v1_full/rank_zephyr_7b_v1_full.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = hub
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   

True

In [3]:
async def scrape_webpage(client, url):
    try:
        response = await client.get(url, timeout=3.0)
        response.raise_for_status()
        text = response.text
        soup = BeautifulSoup(text, 'lxml')
        content = ' '.join(soup.stripped_strings)
        return content[:5000], len(content[:5000])
    except (httpx.RequestError, httpx.TimeoutException) as exc:
        print(f"An error occurred while requesting {url}: {exc}")
    except httpx.HTTPStatusError as exc:
        print(f"Error response {exc.response.status_code} while requesting {url}")
    except Exception as e:
        print(f"Error scraping {url}: {e}")
    return "", 0

async def search_and_scrape(query, num_urls):
    search_results = search.results(query)
    scraped_urls = set()
    full_texts = []

    async with httpx.AsyncClient(timeout=httpx.Timeout(10.0, connect=3.0)) as client:
        tasks = []
        if 'organic' in search_results:
            for result in search_results['organic']:
                url = result.get('link')
                domain = urlparse(url).netloc if url else None
                if url and domain not in scraped_urls and len(tasks) < num_urls:
                    tasks.append(scrape_webpage(client, url))
                    scraped_urls.add(domain)

        results = await asyncio.gather(*tasks, return_exceptions=True)
        for result in results:
            if isinstance(result, tuple) and result[1] > 0:
                full_texts.append(result[0])

    return " ".join(full_texts)

def query_expansion(query, num_expansions):
    expansion_prompt = f"""
    Given the following search query, generate {num_expansions} additional related queries that could help find more comprehensive information on the topic. The queries should be different from each other and explore various aspects of the main query. Provide only the additional queries, numbered 1-{num_expansions}.

    Main query: {query}

    Additional queries:
    """

    response = llm.invoke(expansion_prompt)
    response_text = response.content if hasattr(response, 'content') else str(response)

    expanded_queries = [query]
    for line in response_text.split('\n'):
        if line.strip() and line[0].isdigit():
            expanded_queries.append(line.split('. ', 1)[1].strip())

    return expanded_queries[:num_expansions + 1]

def create_sentence_windows(text, window_size=3):
    sentences = sent_tokenize(text)
    windows = []
    for i in range(len(sentences)):
        window = " ".join(sentences[max(0, i-window_size):min(len(sentences), i+window_size+1)])
        windows.append(window)
    return windows

def generate_hypothetical_document(query):
    hyde_prompt = f"""
    Given the search query below, generate a hypothetical document that would be a perfect match for this query. The document should be concise, containing only 3 sentences of relevant information that directly addresses the query.

    Query: {query}

    Hypothetical Document (3 sentences):
    """

    response = llm.invoke(hyde_prompt)
    return response.content if hasattr(response, 'content') else str(response)

def llm_rerank(query, documents):
    rerank_prompt = """
    Given the following query and a list of document excerpts, rank the documents based on their relevance to the query. Provide the rankings as a list of numbers from 1 to {}, where 1 is the most relevant. Ensure you provide a ranking for every document.

    Query: {}

    Documents:
    {}

    Rankings (1 to {}):
    """.format(len(documents), query, "\n".join([f"{i+1}. {doc.page_content[:200]}..." for i, doc in enumerate(documents)]), len(documents))

    response = llm.invoke(rerank_prompt)
    rankings = [int(x) for x in response.content.split() if x.isdigit()]

    if len(rankings) < len(documents):
        remaining = set(range(1, len(documents) + 1)) - set(rankings)
        rankings.extend(remaining)

    sorted_docs = sorted(zip(documents, rankings), key=lambda x: x[1])
    return sorted_docs

def flashrank_rerank(query, documents, ranker):
    rerank_request = RerankRequest(
        query=query,
        passages=[{"text": doc.page_content} for doc in documents]
    )
    reranked = ranker.rerank(rerank_request)
    
    if isinstance(reranked, list) and isinstance(reranked[0], dict):
        sorted_results = sorted(reranked, key=lambda x: x.get('score', 0), reverse=True)
        return [(documents[i], result.get('score', 0)) for i, result in enumerate(sorted_results)]
    
    elif isinstance(reranked, list) and hasattr(reranked[0], 'score'):
        sorted_results = sorted(reranked, key=lambda x: x.score, reverse=True)
        return [(documents[i], result.score) for i, result in enumerate(sorted_results)]
    
    else:
        print(f"Unexpected reranked result type. Using original document order.")
        return [(doc, 1.0) for doc in documents]

def get_hyde_retriever(vectorstores, hyde_embedding, num_docs, num_rerank, rerank_method):
    def retriever(query):
        all_docs = []
        for vectorstore in vectorstores:
            docs = vectorstore.similarity_search_by_vector(hyde_embedding, k=num_docs)
            all_docs.extend(docs)

        unique_docs = []
        seen_content = set()
        for doc in all_docs:
            content = doc.page_content
            if content not in seen_content:
                unique_docs.append(Document(page_content=content))
                seen_content.add(content)

        try:
            if rerank_method == "none":
                return unique_docs[:num_rerank]
            elif rerank_method == "llm":
                reranked_docs = llm_rerank(query, unique_docs)
            elif rerank_method in ["nano", "small", "medium_t5", "medium_multilang", "large"]:
                ranker = globals()[f"ranker_{rerank_method}"]
                reranked_docs = flashrank_rerank(query, unique_docs, ranker)
            else:
                raise ValueError(f"Unknown rerank method: {rerank_method}")

            return [doc for doc, _ in reranked_docs[:num_rerank]]
        except Exception as e:
            print(f"Error during reranking with method {rerank_method}: {str(e)}")
            print("Traceback:", traceback.format_exc())
            print("Falling back to no reranking.")
            return unique_docs[:num_rerank]

    return retriever

def batch_embed_documents(documents, batch_size=512):
    batched_embeddings = []
    for i in range(0, len(documents), batch_size):
        batch = documents[i:i + batch_size]
        texts = [doc.page_content for doc in batch]
        embeddings_batch = embeddings.embed_documents(texts)
        batched_embeddings.extend(embeddings_batch)
    return batched_embeddings

async def process_query(query, num_expansions, num_urls, num_docs, num_rerank, rerank_method, use_70b_model):
    try:
        start_time = time.time()

        hyde_start = time.time()
        hypothetical_doc = generate_hypothetical_document(query)
        hyde_time = time.time() - hyde_start
        print(f"hypothetical_doc length: {len(hypothetical_doc)}")
        print(f"-----HyDE generation time: {hyde_time:.2f} seconds")

        embed_start = time.time()
        hyde_embedding = embeddings.embed_query(hypothetical_doc)
        embed_time = time.time() - embed_start
        print(f"-----Embedding time: {embed_time:.2f} seconds")

        ext_start = time.time()
        expanded_queries = query_expansion(query, num_expansions)
        ext_time = time.time() - embed_start
        print(f"-----Query expansion time: {embed_time:.2f} seconds")

        scrape_start = time.time()
        all_texts = await asyncio.gather(*[search_and_scrape(eq, num_urls) for eq in expanded_queries])
        scrape_time = time.time() - scrape_start
        print(f"-----Web scraping time: {scrape_time:.2f} seconds")

        combined_text = " ".join(all_texts)
        print(f"Combined text length: {len(combined_text)} characters")

        sentence_windows = create_sentence_windows(combined_text)
        print(f"Number of sentence windows: {len(sentence_windows)}")

        index_documents = [Document(page_content=window) for window in sentence_windows]

        vectorstore_start = time.time()
        vectorstores = []
        for i in range(0, len(index_documents), 256):
            batch = index_documents[i:i + 256]

            batch_embeddings = batch_embed_documents(batch)

            texts = [doc.page_content for doc in batch]

            vectorstore = FAISS.from_embeddings(
                embedding=embeddings,
                text_embeddings=list(zip(texts, batch_embeddings))
            )
            vectorstores.append(vectorstore)

        vectorstore_time = time.time() - vectorstore_start
        print(f"-----Vectorstore creation time: {vectorstore_time:.2f} seconds")

        retrieval_start = time.time()
        retriever = get_hyde_retriever(vectorstores, hyde_embedding, num_docs, num_rerank, rerank_method)
        retrieved_docs = retriever(query)
        retrieval_time = time.time() - retrieval_start
        print(f"-----Retrieval and reranking time: {retrieval_time:.2f} seconds")

        print(f"Number of retrieved and reranked documents: {len(retrieved_docs)}")

        context_docs = [doc.page_content for doc in retrieved_docs]
        context = "\n\n".join(context_docs)

        total_processing_time = hyde_time + embed_time + scrape_time + vectorstore_time + retrieval_time
        print(f"-----Total processing time before answer generation: {total_processing_time:.2f} seconds")

        answer_start = time.time()
        prompt_template = """
        Use the following context to answer the question. Before answering the question generate a reasoning step. then answer.
        If you cannot answer based on the context, say "I don't have enough information to answer that question."

        Context:
        {context}

        Question: {question}

        Answer:
        """
        prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

        chosen_llm = llm_70b if use_70b_model else llm_8b

        rag_chain = prompt | chosen_llm | StrOutputParser()
        answer = rag_chain.invoke({"context": context, "question": query})
        answer_time = time.time() - answer_start
        print(f"-----Answer generation time: {answer_time:.2f} seconds")

        print("\n")
        print("-"*120)
        print("Final Answer:\n", answer)
        print("-"*120)

        return answer, context_docs

    except Exception as e:
        print(f"An error occurred: {e}")
        import traceback
        traceback.print_exc()
        return "I'm sorry, but I encountered an error while processing your query. Please try again.", []

def gradio_interface(query, num_expansions, num_urls, num_docs, num_rerank, rerank_method, use_70b_model):
    old_stdout = sys.stdout
    sys.stdout = buffer = io.StringIO()

    answer, context_docs = asyncio.run(process_query(query, num_expansions, num_urls, num_docs, num_rerank, rerank_method, use_70b_model))

    sys.stdout = old_stdout
    captured_output = buffer.getvalue()

    truncated_docs = [f"Document {i+1}: {doc[:150]}..." for i, doc in enumerate(context_docs)]
    truncated_context = "\n\n".join(truncated_docs)

    captured_output += f"\n\nContext used for answer generation (first 150 characters of each document, {len(context_docs)} documents in total):\n" + truncated_context

    return captured_output

# Create Gradio interface
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Enter your query"),
        gr.Slider(minimum=0, maximum=3, value=1, step=1, label="Number of query expansions"),
        gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of URLs to scrape per extended query"),
        gr.Slider(minimum=20, maximum=80, value=80, step=1, label="Number of documents to retrieve with HyDE"),
        gr.Slider(minimum=10, maximum=80, value=50, step=1, label="Number of documents to keep after retrieval/reranking"),
        gr.Radio(["none", "llm", "nano", "small", "medium_t5", "medium_multilang"], label="Reranking method", value="none"),
        gr.Checkbox(label="Use 70B model for QA (unchecked uses 8B)", value=False)
    ],
    outputs="text",
    title="Advanced RAG Query Processing",
    description="Enter a query and adjust parameters to get a detailed answer based on web search and document analysis.",
    examples=[
        ["How can I take care of my eyes?", 1, 3, 80, 50, "llm", False],
        ["How can I take care of my eyes?", 1, 3, 80, 50, "nano", False]
    ]
)

# if __name__ == "__main__":
#     iface.launch(share=True, debug=True)

INFO:httpx:HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://checkip.amazonaws.com/ "HTTP/1.1 200 "


In [10]:
#### evaluation 

# LLM for generating questions
llm_generator = ChatFireworks(model_name="accounts/fireworks/models/llama-v3p1-70b-instruct", temperature=0.6)

# Question generation prompt
question_gen_template = """Generate exactly {num_questions} diverse and challenging questions that would require complex web searches to answer. The questions should:

1. Cover a wide range of topics (e.g., science, history, current events, technology, arts)
2. Avoid long questions
3. Ensure there is only one question per query. Query should NOT be multiple questions

Please provide the questions as a numbered list, starting from 1 and ending at {num_questions}.

Generated Questions:"""

question_gen_prompt = PromptTemplate.from_template(question_gen_template)

def generate_questions(num_questions, max_attempts=3):
    for attempt in range(max_attempts):
        question_gen_chain = question_gen_prompt | llm_generator | StrOutputParser()
        questions_text = question_gen_chain.invoke({"num_questions": num_questions})

        questions = []
        for line in questions_text.split('\n'):
            match = re.match(r'^\s*\d+\.\s*(.+)$', line)
            if match:
                question = match.group(1).strip()
                questions.append(question)

        if len(questions) == num_questions:
            return questions

        print(f"Attempt {attempt + 1}: Generated {len(questions)} questions instead of {num_questions}. Retrying...")

    raise ValueError(f"Failed to generate exactly {num_questions} questions after {max_attempts} attempts.")

# Generate questions
num_questions = 100

evaluation_questions = generate_questions(num_questions)
print(f"Successfully generated {len(evaluation_questions)} questions:")
for i, question in enumerate(evaluation_questions, 1):
    print(f"{i}. {question}")

INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"


Successfully generated 100 questions:
1. What is the average airspeed velocity of an unladen swallow?
2. Who is the oldest known living person with a verified birth certificate?
3. What is the chemical composition of the Great Wall of China?
4. Which country has the highest number of languages spoken?
5. What is the largest living organism in the world?
6. Who was the first person to walk on the moon?
7. What is the most widely spoken language in the world?
8. What is the highest mountain peak in the solar system?
9. Who is the author of the oldest known surviving work of literature?
10. What is the deepest part of the ocean?
11. Which city has the most museums in the world?
12. What is the largest waterfall in the world by volume?
13. Who was the first woman to win a Nobel Prize?
14. What is the longest word in the English language?
15. What is the smallest country in the world?
16. Who was the first person to fly solo around the world?
17. What is the largest desert in the world?
18.

In [11]:
len(evaluation_questions)
evaluation_questions[:10]

['What is the average airspeed velocity of an unladen swallow?',
 'Who is the oldest known living person with a verified birth certificate?',
 'What is the chemical composition of the Great Wall of China?',
 'Which country has the highest number of languages spoken?',
 'What is the largest living organism in the world?',
 'Who was the first person to walk on the moon?',
 'What is the most widely spoken language in the world?',
 'What is the highest mountain peak in the solar system?',
 'Who is the author of the oldest known surviving work of literature?',
 'What is the deepest part of the ocean?']

In [15]:
from langchain_fireworks import ChatFireworks
from typing import List, Dict, Any

# Initialize the judge model (405B LLaMA)
judge_model = ChatFireworks(model="accounts/fireworks/models/llama-v3p1-405b-instruct", temperature=0)

def evaluate_answer_quality(question: str, answer: str, judge_model: Any) -> int:
    """
    Evaluate if the answer completely addresses the question.
    Returns 1 if yes, 0 if no.
    """
    prompt = f"""
    You are an expert evaluator. Your task is to determine if the given answer completely addresses the question.
    
    Question: {question}
    Answer: {answer}
    
    Does the answer completely address the question?
    Respond with only 'Yes' or 'No'.
    
    Response:
    """
    
    response = judge_model.invoke(prompt)
    return 1 if response.content.strip().lower() == 'yes' else 0

def evaluate_document_selection(question: str, all_docs: List[str], selected_docs: List[str], judge_model: Any) -> int:
    """
    Evaluate if the selected documents are the best 20 out of the 150 to answer the question.
    Returns 1 if yes, 0 if no.
    """
    all_docs_text = "\n".join([f"{i+1}. {doc}..." for i, doc in enumerate(all_docs)])
    selected_indices = [all_docs.index(doc) + 1 for doc in selected_docs]
    
    prompt = f"""
    You are an expert information retrieval system. Your task is to determine if the selected documents are the best 20 out of the given 150 for answering the question completely.
    
    Question: {question}
    
    Here are all 150 retrieved documents:
    {all_docs_text}
    
    The system selected the following documents (by index): {', '.join(map(str, selected_indices))}
    
    Are these selected documents the best 20 out of the 150 for answering the question completely?
    Respond with only 'Yes' or 'No'.
    
    Response:
    """
    
    response = judge_model.invoke(prompt)
    return 1 if response.content.strip().lower() == 'yes' else 0

async def evaluate_rag_system(question: str, answer: str, all_docs: List[str], selected_docs: List[str]) -> Dict[str, int]:
    """
    Evaluate the RAG system's performance.
    """
    answer_correct = evaluate_answer_quality(question, answer, judge_model)
    docs_correct = evaluate_document_selection(question, all_docs, selected_docs, judge_model)
    
    return {
        "answer_correct": answer_correct,
        "docs_correct": docs_correct
    }

async def run_evaluation():
    question = "What are the main causes of climate change?"
    answer, context_docs = await process_query(question, num_expansions=1, num_urls=5, num_docs=150, num_rerank=20, rerank_method="nano", use_70b_model=False)
    
    all_docs = context_docs[:150]  # All 80 retrieved documents
    selected_docs = context_docs[:20]  # Top 10 after reranking

    random.shuffle(all_docs)
    
    evaluation_results = await evaluate_rag_system(question, answer, all_docs, selected_docs)
    
    print(f"Evaluation Results:")
    print(f"Answer Correctness: {evaluation_results['answer_correct']}")
    print(f"Top 20 Documents Correctness: {evaluation_results['docs_correct']}")

# Run the evaluation
await run_evaluation()

INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://science.nasa.gov/climate-change/causes/ "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://www.epa.gov/ghgemissions/overview-greenhouse-gases "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://www.un.org/en/climatechange/science/causes-effects-climate-change "HTTP/1.1 403 Forbidden"
INFO:httpx:HTTP Request: GET https://www.nrdc.org/stories/greenhouse-effect-101 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://www.nrdc.org/stories/what-are-causes-climate-change "HTTP/1.1 200 OK"


Error response 403 while requesting https://www.un.org/en/climatechange/science/causes-effects-climate-change


INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"


-----Answer generation time: 0.87 seconds


------------------------------------------------------------------------------------------------------------------------
Final Answer:
 Reasoning step: To answer this question, we need to identify the primary drivers of climate change based on the provided context. The context mentions human activities as the main cause of climate change, specifically the expansion of the "greenhouse effect" due to human-made emissions in the atmosphere.

Answer: The main causes of climate change are human activities, including the burning of fossil fuels, deforestation, and other industrial processes that release greenhouse gases such as carbon dioxide, methane, and nitrous oxide into the atmosphere. These gases trap heat and slow heat loss to space, leading to global warming and climate change.
------------------------------------------------------------------------------------------------------------------------


ValueError: too many values to unpack (expected 2)

In [27]:
# import json
# from langchain_fireworks import ChatFireworks

# # Initialize the judge model (405B LLaMA)
# judge_model = ChatFireworks(model="accounts/fireworks/models/llama-v3p1-405b-instruct", temperature=0)

# def evaluate_answer_quality(question: str, answer: str) -> int:
#     prompt = f"""
#     You are an expert evaluator. Your task is to determine if the given answer completely addresses the question.
    
#     Question: {question}
#     Answer: {answer}
    
#     Does the answer completely address the question?
#     Respond with only 'Yes' or 'No'.
    
#     Response:
#     """
    
#     response = judge_model.invoke(prompt)
#     return 1 if response.content.strip().lower() == 'yes' else 0

# def evaluate_document_selection(question: str, all_docs: list, selected_docs: list) -> int:
#     all_docs_text = "\n".join([f"{i+1}. {doc}..." for i, doc in enumerate(all_docs)])
#     selected_indices = [all_docs.index(doc) + 1 for doc in selected_docs]
    
#     prompt = f"""
#     You are an expert information retrieval system. Your task is to determine if the selected documents are the best 10 out of the given 80 for answering the question completely.
    
#     Question: {question}
    
#     Here are the 80 retrieved documents:
#     {all_docs_text}
    
#     The system selected the following documents (by index): {', '.join(map(str, selected_indices))}
    
#     Are these selected documents the best 10 out of the 80 for answering the question?
#     Respond with only 'Yes' or 'No'.
    
#     Response:
#     """
    
#     response = judge_model.invoke(prompt)
#     return 1 if response.content.strip().lower() == 'yes' else 0

# async def run_evaluation(num_questions: int = 100):
#     questions = evaluation_questions
    
#     results = []
#     total_answer_correct = 0
#     total_docs_correct = 0
    
#     for question in questions[:num_questions]:
#         answer, context_docs = await process_query(question, num_expansions=1, num_urls=3, num_docs=80, num_rerank=10, rerank_method="nano", use_70b_model=False)
        
#         all_docs = context_docs[:80]  # All 80 retrieved documents
#         selected_docs = context_docs[:10]  # Top 10 after reranking

#         random.shuffle(all_docs)
        
#         answer_correct = evaluate_answer_quality(question, answer)
#         docs_correct = evaluate_document_selection(question, all_docs, selected_docs)
        
#         total_answer_correct += answer_correct
#         total_docs_correct += docs_correct
        
#         result = {
#             "question": question,
#             "answer": answer,
#             "answer_correctness": answer_correct,
#             "top_10_docs_correctness": docs_correct,
#             "all_docs": all_docs,
#             "selected_docs": selected_docs
#         }
#         results.append(result)
        
#         print(f"Question: {question}")
#         print(f"Answer Correctness: {answer_correct}")
#         print(f"Top 10 Documents Correctness: {docs_correct}")
#         print("---")
    
#     avg_answer_correct = total_answer_correct / num_questions
#     avg_docs_correct = total_docs_correct / num_questions
#     print(f"\nAverage Results over {num_questions} questions:")
#     print(f"Average Answer Correctness: {avg_answer_correct:.2f}")
#     print(f"Average Top 10 Documents Correctness: {avg_docs_correct:.2f}")
    
#     # Save results to a JSON file
#     output = {
#         "results": results,
#         "average_answer_correctness": avg_answer_correct,
#         "average_top_10_docs_correctness": avg_docs_correct
#     }
    
#     with open('evaluation_results.json', 'w') as f:
#         json.dump(output, f, indent=2)
    
#     print("\nResults have been saved to 'evaluation_results.json'")

# # To run the evaluation, use:
# # await run_evaluation()

In [16]:
async def process_query(query, num_expansions, num_urls, num_docs, num_rerank, rerank_method, use_70b_model):
    try:
        start_time = time.time()

        hyde_start = time.time()
        hypothetical_doc = generate_hypothetical_document(query)
        hyde_time = time.time() - hyde_start
        # print(f"hypothetical_doc length: {len(hypothetical_doc)}")
        # print(f"-----HyDE generation time: {hyde_time:.2f} seconds")

        embed_start = time.time()
        hyde_embedding = embeddings.embed_query(hypothetical_doc)
        embed_time = time.time() - embed_start
        # print(f"-----Embedding time: {embed_time:.2f} seconds")

        ext_start = time.time()
        expanded_queries = query_expansion(query, num_expansions)
        ext_time = time.time() - embed_start
        # print(f"-----Query expansion time: {embed_time:.2f} seconds")

        scrape_start = time.time()
        all_texts = await asyncio.gather(*[search_and_scrape(eq, num_urls) for eq in expanded_queries])
        scrape_time = time.time() - scrape_start
        # print(f"-----Web scraping time: {scrape_time:.2f} seconds")

        combined_text = " ".join(all_texts)
        # print(f"Combined text length: {len(combined_text)} characters")

        sentence_windows = create_sentence_windows(combined_text)
        # print(f"Number of sentence windows: {len(sentence_windows)}")

        index_documents = [Document(page_content=window) for window in sentence_windows]

        vectorstore_start = time.time()
        vectorstores = []
        for i in range(0, len(index_documents), 256):
            batch = index_documents[i:i + 256]

            batch_embeddings = batch_embed_documents(batch)

            texts = [doc.page_content for doc in batch]

            vectorstore = FAISS.from_embeddings(
                embedding=embeddings,
                text_embeddings=list(zip(texts, batch_embeddings))
            )
            vectorstores.append(vectorstore)

        vectorstore_time = time.time() - vectorstore_start
        # print(f"-----Vectorstore creation time: {vectorstore_time:.2f} seconds")

        retrieval_start = time.time()
        retriever = get_hyde_retriever(vectorstores, hyde_embedding, num_docs, num_rerank, rerank_method)
        retrieved_docs = retriever(query)
        retrieval_time = time.time() - retrieval_start
        # print(f"-----Retrieval and reranking time: {retrieval_time:.2f} seconds")

        # print(f"Number of retrieved and reranked documents: {len(retrieved_docs)}")

        context_docs = [doc.page_content for doc in retrieved_docs]
        context = "\n\n".join(context_docs)

        total_processing_time = hyde_time + embed_time + scrape_time + vectorstore_time + retrieval_time
        # print(f"-----Total processing time before answer generation: {total_processing_time:.2f} seconds")

        answer_start = time.time()
        prompt_template = """
        Use the following context to answer the question. Before answering the question generate a reasoning step. then answer.
        If you cannot answer based on the context, say "I don't have enough information to answer that question."

        Context:
        {context}

        Question: {question}

        Answer:
        """
        prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

        chosen_llm = llm_70b if use_70b_model else llm_8b

        rag_chain = prompt | chosen_llm | StrOutputParser()
        answer = rag_chain.invoke({"context": context, "question": query})
        answer_time = time.time() - answer_start
        print(f"-----Answer generation time: {answer_time:.2f} seconds")

        print("\n")
        print("-"*120)
        print("Final Answer:\n", answer)
        print("-"*120)

        return answer, context_docs, [hyde_time, embed_time, ext_time, scrape_time, vectorstore_time, retrieval_time, total_processing_time, answer_time]

    except Exception as e:
        print(f"An error occurred: {e}")
        import traceback
        traceback.print_exc()
        return "I'm sorry, but I encountered an error while processing your query. Please try again.", []

In [None]:
from tqdm import tqdm
import json
import asyncio

async def run_evaluation(num_questions: int = 100):
    questions = evaluation_questions
    rerankers = ["nano", "small", "medium_t5", "medium_multilang"]#["none", "llm", "nano", "small", "medium_t5", "medium_multilang"]
    
    for reranker in rerankers:
        results = []
        total_answer_correct = 0
        total_docs_correct = 0
        total_times = [0] * 8  # For the 8 time measurements
        
        progress_bar = tqdm(total=num_questions, desc=f"Evaluating {reranker} reranker")
        
        for question in questions[:num_questions]:
            answer, context_docs, times = await process_query(question, num_expansions=1, num_urls=5, num_docs=150, num_rerank=20, rerank_method=reranker, use_70b_model=False)
            
            all_docs = context_docs[:150]
            selected_docs = context_docs[:20]
            
            answer_correct = evaluate_answer_quality(question, answer, judge_model)
            docs_correct = evaluate_document_selection(question, all_docs, selected_docs, judge_model)
            
            total_answer_correct += answer_correct
            total_docs_correct += docs_correct
            total_times = [total + t for total, t in zip(total_times, times)]
            
            result = {
                "question": question,
                "answer": answer,
                "answer_correctness": answer_correct,
                "top_10_docs_correctness": docs_correct,
                "all_docs": all_docs,
                "selected_docs": selected_docs,
                "times": times
            }
            results.append(result)
            
            progress_bar.update(1)
        
        progress_bar.close()
        
        avg_answer_correct = total_answer_correct / num_questions
        avg_docs_correct = total_docs_correct / num_questions
        avg_times = [t / num_questions for t in total_times]
        
        print(f"\nAverage Results for {reranker} reranker over {num_questions} questions:")
        print(f"Average Answer Correctness: {avg_answer_correct:.2f}")
        print(f"Average Top 10 Documents Correctness: {avg_docs_correct:.2f}")
        print(f"Average HyDE Time: {avg_times[0]:.2f} seconds")
        print(f"Average Embedding Time: {avg_times[1]:.2f} seconds")
        print(f"Average Query Expansion Time: {avg_times[2]:.2f} seconds")
        print(f"Average Web Scraping Time: {avg_times[3]:.2f} seconds")
        print(f"Average Vectorstore Creation Time: {avg_times[4]:.2f} seconds")
        print(f"Average Retrieval and Reranking Time: {avg_times[5]:.2f} seconds")
        print(f"Average Total Processing Time: {avg_times[6]:.2f} seconds")
        print(f"Average Answer Generation Time: {avg_times[7]:.2f} seconds")
        
        output = {
            "results": results,
            "average_answer_correctness": avg_answer_correct,
            "average_top_10_docs_correctness": avg_docs_correct,
            "average_times": {
                "hyde_time": avg_times[0],
                "embedding_time": avg_times[1],
                "query_expansion_time": avg_times[2],
                "web_scraping_time": avg_times[3],
                "vectorstore_creation_time": avg_times[4],
                "retrieval_and_reranking_time": avg_times[5],
                "total_processing_time": avg_times[6],
                "answer_generation_time": avg_times[7]
            }
        }
        
        filename = f'/home/ubuntu/maziar/11_efficient_eval_ranking/evaluation/{reranker}.json'
        with open(filename, 'w') as f:
            json.dump(output, f, indent=2)
        
        print(f"\nResults have been saved to '{filename}'")

# To run the evaluation, use:
await run_evaluation()


Evaluating nano reranker:   0%|          | 0/100 [00:00<?, ?it/s][AINFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://medium.com/human-nature-group/what-is-the-air-speed-velocity-of-an-unladen-swallow-4c17087bbf33 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://medium.com/human-nature-group/what-is-the-air-speed-velocity-of-an-unladen-swallow-4c17087bbf33 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://www.youtube.com/watch?v=pJS4QDUtzzI "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://www.quora.com/What-is-the-airspeed-velocity-of-an-unladen-swallow-1 "HTTP/1.1 429 Too Many Requests"


Error response 429 while requesting https://www.quora.com/What-is-the-airspeed-velocity-of-an-unladen-swallow-1


INFO:httpx:HTTP Request: GET https://interestingengineering.com/science/monty-python-and-the-holy-grail-airspeed-velocity-of-an-unladen-swallow "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://interestingengineering.com/science/monty-python-and-the-holy-grail-airspeed-velocity-of-an-unladen-swallow "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"


-----Answer generation time: 1.42 seconds


------------------------------------------------------------------------------------------------------------------------
Final Answer:
 To answer the question, we need to follow the reasoning steps outlined in the context. Here's the reasoning step:

1. Identify the species of swallow: The article mentions that the European (or 'Barn') swallow is the most studied species and will be used for the calculation.
2. Determine the average mass and wing-length of the European swallow: The article states that the average mass is 20.3g and the wing-length is 0.122m (12.2cm).
3. Estimate the amplitude and frequency of the swallow's wingbeats: The article mentions that the amplitude and frequency of the swallow's wingbeats have not been extensively studied, so estimates will be made based on similar birds.
4. Use the Strouhal ratio to estimate the airspeed velocity: The Strouhal ratio is an equation that estimates the airspeed velocity of a bird based o

INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"

Evaluating nano reranker:   1%|          | 1/100 [00:11<19:25, 11.78s/it][AINFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://guinnessworldrecords.com/world-records/84549-oldest-person-living "HTTP/1.1 301 Moved Permanently"
INFO:httpx:HTTP Request: GET https://guinnessworldrecords.com/world-records/84549-oldest-person-living "HTTP/1.1 301 Moved Permanently"
INFO:httpx:HTTP Request: GET https://en.wikipedia.org/wiki/Oldest_people "HTTP/1.1 200 OK"


Error response 301 while requesting https://guinnessworldrecords.com/world-records/84549-oldest-person-living
Error response 301 while requesting https://guinnessworldrecords.com/world-records/84549-oldest-person-living


INFO:httpx:HTTP Request: GET https://en.wikipedia.org/wiki/List_of_the_verified_oldest_people "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://www.demogr.mpg.de/books/odense/6/10.htm "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"


-----Answer generation time: 1.47 seconds


------------------------------------------------------------------------------------------------------------------------
Final Answer:
 To answer the question, I will first generate a reasoning step.

Reasoning step: The text mentions that the oldest known living person is Maria Branyas of Spain, who is 117 years old. However, it does not explicitly state that she has a verified birth certificate. To determine if she has a verified birth certificate, I will look for information in the text that suggests she has undergone age verification.

After reviewing the text, I found that Maria Branyas is listed as the oldest known living person, but I did not find any information that explicitly states she has a verified birth certificate. However, the text does mention that the list includes supercentenarians validated by organisations specialising in extreme age verification such as the Gerontology Research Group (GRG), which suggests that Maria Bran

INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"

Evaluating nano reranker:   2%|▏         | 2/100 [00:23<18:42, 11.45s/it][AINFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://education.nationalgeographic.org/resource/great-wall-china/ "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://www.travelchinaguide.com/china_great_wall/construction/material.htm "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://en.wikipedia.org/wiki/Great_Wall_of_China "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://www.chinahighlights.com/greatwall/construc

-----Answer generation time: 0.66 seconds


------------------------------------------------------------------------------------------------------------------------
Final Answer:
 I don't have enough information to answer that question.

Reasoning step: The context provided discusses the physical materials used to build the Great Wall of China, such as earth, stone, brick, lime, and wood, but it does not provide information on the chemical composition of the materials used. To answer the question, I would need information on the specific chemical compounds present in the materials used to build the Great Wall, which is not provided in the context.
------------------------------------------------------------------------------------------------------------------------


INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"

Evaluating nano reranker:   3%|▎         | 3/100 [00:32<16:48, 10.40s/it][AINFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://www.weforum.org/agenda/2023/04/worlds-most-multilingual-countries/ "HTTP/1.1 403 Forbidden"
INFO:httpx:HTTP Request: GET https://www.weforum.org/agenda/2021/03/these-are-the-top-ten-countries-for-linguistic-diversity/ "HTTP/1.1 403 Forbidden"
INFO:httpx:HTTP Request: GET https://en.wikipedia.org/wiki/Number_of_languages_by_country "HTTP/1.1 200 OK"


Error response 403 while requesting https://www.weforum.org/agenda/2023/04/worlds-most-multilingual-countries/
Error response 403 while requesting https://www.weforum.org/agenda/2021/03/these-are-the-top-ten-countries-for-linguistic-diversity/


Evaluating none reranker:   1%|          | 1/100 [7:51:21<777:44:40, 28281.62s/it]
INFO:httpx:HTTP Request: GET https://www.statista.com/statistics/1224629/the-most-linguistically-diverse-countries-worldwide-by-number-of-languages/ "HTTP/1.1 200 OK"


An error occurred while requesting https://www.visualcapitalist.com/the-countries-with-the-most-linguistic-diversity/: 
An error occurred while requesting https://www.visualcapitalist.com/the-countries-with-the-most-linguistic-diversity/: 


INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"


-----Answer generation time: 0.63 seconds


------------------------------------------------------------------------------------------------------------------------
Final Answer:
 Based on the context, the answer is Papua New Guinea, with 840 different languages spoken across the country as of 2021.
------------------------------------------------------------------------------------------------------------------------


INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"

Evaluating nano reranker:   4%|▍         | 4/100 [00:51<22:15, 13.91s/it][AINFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://en.wikipedia.org/wiki/Largest_organisms "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://en.wikipedia.org/wiki/Largest_organisms "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://www.scientificamerican.com/article/strange-but-true-largest-organism-is-fungus/ "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://frontenacarchbiosphere.ca/worlds-largest-organism/ "

-----Answer generation time: 0.78 seconds


------------------------------------------------------------------------------------------------------------------------
Final Answer:
 Reasoning step: The question asks for the largest living organism in the world, and the context provides information about various contenders for this title, including a fungus known as Armillaria solidipes (Honey fungus) and an Aspen Tree colony known as Pando. To answer the question, I need to identify which of these contenders is considered the largest living organism.

Answer: The largest living organism in the world is the Armillaria solidipes (Honey fungus), which spans 5.5 kilometres across and covers about 2,384 acres.
------------------------------------------------------------------------------------------------------------------------


INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"

Evaluating nano reranker:   5%|▌         | 5/100 [01:00<19:21, 12.23s/it][AINFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://en.wikipedia.org/wiki/Apollo_11 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://www.nasa.gov/history/apollo-11-mission-overview/ "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://www.nasa.gov/learning-resources/for-kids-and-students/who-was-neil-armstrong-grades-5-8/ "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://www.rmg.co.uk/stories/topics/how-many-peop

-----Answer generation time: 0.73 seconds


------------------------------------------------------------------------------------------------------------------------
Final Answer:
 Before answering the question, I will generate a reasoning step. 

The question asks for the identity of the first person to walk on the moon. The context provided describes the Apollo 11 mission and the events surrounding the first moon landing. It mentions Neil Armstrong as the first person to step out of the lunar module and onto the moon's surface. 

Based on this information, I can conclude that Neil Armstrong was the first person to walk on the moon.

Answer: Neil Armstrong.
------------------------------------------------------------------------------------------------------------------------


INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"

Evaluating nano reranker:   6%|▌         | 6/100 [01:12<18:43, 11.95s/it][AINFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://www.visualcapitalist.com/top-languages-spoken-in-the-world/ "HTTP/1.1 403 Forbidden"
INFO:httpx:HTTP Request: GET https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers "HTTP/1.1 200 OK"


Error response 403 while requesting https://www.visualcapitalist.com/top-languages-spoken-in-the-world/


INFO:httpx:HTTP Request: GET https://www.berlitz.com/blog/most-spoken-languages-world "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://www.britannica.com/topic/languages-by-number-of-native-speakers-2228882 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://www.babbel.com/en/magazine/the-10-most-spoken-languages-in-the-world "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"


-----Answer generation time: 0.50 seconds


------------------------------------------------------------------------------------------------------------------------
Final Answer:
 Based on the context, the most widely spoken language in the world is English, with 1.4+ billion speakers, including both native and non-native speakers.
------------------------------------------------------------------------------------------------------------------------


INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"

Evaluating nano reranker:   7%|▋         | 7/100 [01:21<17:26, 11.25s/it][AINFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://en.wikipedia.org/wiki/List_of_tallest_mountains_in_the_Solar_System "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://en.wikipedia.org/wiki/List_of_tallest_mountains_in_the_Solar_System "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://science.howstuffworks.com/tallest-mountain-in-solar-system.htm "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://www.facebook.

-----Answer generation time: 0.66 seconds


------------------------------------------------------------------------------------------------------------------------
Final Answer:
 Reasoning step: The question asks for the highest mountain peak in the solar system, which requires identifying the tallest mountain among all the planets and moons in the solar system.

Answer: The highest mountain peak in the solar system is Olympus Mons on Mars, with a height of 21.9 to 26 km (13.6 to 16.2 miles) above the Martian surface.
------------------------------------------------------------------------------------------------------------------------


INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"

Evaluating nano reranker:   8%|▊         | 8/100 [01:30<15:50, 10.34s/it][AINFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.fireworks.ai/inference/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://www.history.com/news/what-is-the-oldest-known-piece-of-literature "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://www.history.com/news/what-is-the-oldest-known-piece-of-literature "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://www.britannica.com/story/what-was-the-first-book-ever-written "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://www.britannica.com