In [None]:
import json
import os
import sqlite3
from tabulate import tabulate
from typing import List, Dict, Tuple
import numpy as np
from tqdm import tqdm
from openai import OpenAI
from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from pydantic import BaseModel
import nltk
import pickle

# Download the NLTK 'punkt' resource; quiet=True suppresses the download output.
# NOTE(review): presumably this is the tokenizer data word_tokenize needs for the
# BM25 tokenization below — confirm against the installed NLTK version.
nltk.download('punkt', quiet=True)

# Load environment variables
# NOTE(review): presumably this makes credentials from a local .env file visible
# to the OpenAI client created below — confirm.
from dotenv import load_dotenv
load_dotenv()

# Initialize the OpenAI client shared by embed_text() and generate_text()
client = OpenAI()

# Database setup: path of the SQLite file used by create_database(),
# save_embeddings() and load_embeddings() to cache chunk embeddings
DB_NAME = 'embeddings.db'



def create_database():
    """Create the ``embeddings`` table in the SQLite file ``DB_NAME`` if it is missing.

    The table has four columns: ``id`` (TEXT primary key), ``pre_embedding``
    and ``post_embedding`` (BLOB) and ``content`` (TEXT). Calling this when the
    table already exists is a no-op thanks to ``IF NOT EXISTS``.

    Raises:
        sqlite3.Error: if the database cannot be opened or the statement fails.
    """
    conn = sqlite3.connect(DB_NAME)
    try:
        conn.execute('''CREATE TABLE IF NOT EXISTS embeddings
                 (id TEXT PRIMARY KEY, pre_embedding BLOB, post_embedding BLOB, content TEXT)''')
        conn.commit()
    finally:
        # Always release the file handle, even when the statement fails.
        conn.close()

def save_embeddings(embeddings_data):
    """Insert or overwrite rows in the ``embeddings`` table of ``DB_NAME``.

    Args:
        embeddings_data: iterable of 4-tuples
            ``(id, pre_embedding_blob, post_embedding_blob, content)`` matching
            the column order of the ``embeddings`` table. A row whose ``id``
            already exists is replaced.

    Raises:
        sqlite3.Error: if the database cannot be opened or an insert fails; in
            that case nothing from this call is committed.
    """
    conn = sqlite3.connect(DB_NAME)
    try:
        # `with conn` commits on success and rolls back if any insert raises.
        with conn:
            conn.executemany(
                "INSERT OR REPLACE INTO embeddings VALUES (?, ?, ?, ?)",
                embeddings_data,
            )
    finally:
        # The connection context manager does not close; do it explicitly.
        conn.close()

def load_embeddings():
    """Read every cached row from the ``embeddings`` table of ``DB_NAME``.

    Returns:
        A list of ``(id, pre_embedding, post_embedding, content)`` tuples in
        which the two embedding BLOBs have been unpickled back into Python
        objects.

    Raises:
        sqlite3.Error: if the database cannot be opened or the table is missing.
    """
    conn = sqlite3.connect(DB_NAME)
    try:
        # Name the columns explicitly so the tuple layout does not depend on
        # the physical column order of the table.
        rows = conn.execute(
            "SELECT id, pre_embedding, post_embedding, content FROM embeddings"
        ).fetchall()
    finally:
        conn.close()

    # SECURITY NOTE: pickle.loads can execute arbitrary code. This is only safe
    # while the database file is produced by save_embeddings() on a trusted
    # machine; do not point DB_NAME at a file obtained from an untrusted source.
    return [
        (doc_id, pickle.loads(pre_blob), pickle.loads(post_blob), content)
        for doc_id, pre_blob, post_blob, content in rows
    ]

def embed_text(text: str) -> List[float]:
    """Return the embedding of ``text`` from the ``text-embedding-ada-002`` model.

    Sends one request through the module-level ``client`` and returns the
    ``embedding`` attribute of the first entry in the response's ``data``.
    """
    result = client.embeddings.create(
        input=text,
        model="text-embedding-ada-002",
    )
    first_entry = result.data[0]
    return first_entry.embedding

def generate_text(prompt: str) -> str:
    """Send ``prompt`` as a single user message to ``gpt-4o-mini`` and return the reply.

    The request goes through the module-level ``client`` with ``temperature=0``;
    the content of the first returned choice is handed back unchanged.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[user_message],
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def load_data(file_path: str) -> List[Dict]:
    """Parse the JSON file at ``file_path`` and return its content.

    The file is read as UTF-8 explicitly, so the result does not depend on the
    platform's default text encoding.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def create_embeddings_and_bm25(data: List[Dict]) -> Tuple[List[np.ndarray], List[np.ndarray], List[Document], BM25Okapi]:
    """Chunk, embed and index every item in ``data``, caching the result in SQLite.

    For each item the ``pre_text`` and ``post_text`` lists are joined into one
    document, split into overlapping 500-word chunks (100-word overlap), and
    the item's formatted table is appended to every chunk. Each chunk is
    embedded, stored under the id ``"<item id>-<chunk index>"`` and added to
    the BM25 corpus. All rows are written to the database in one batch at the
    end.

    Args:
        data: items providing the keys ``'id'``, ``'pre_text'``,
            ``'post_text'`` (lists of strings) and ``'table'`` (list of rows).

    Returns:
        ``(pre_embeddings, post_embeddings, documents, bm25)``. The first three
        are parallel lists with one entry per chunk. The embeddings are the
        values returned by ``embed_text`` (lists of floats, despite the
        ``np.ndarray`` annotation).
    """
    create_database()

    pre_embeddings = []
    post_embeddings = []
    documents = []
    corpus = []
    embeddings_data = []

    print("Creating embeddings and BM25 index...")
    for item in tqdm(data):
        pre_text = ' '.join(item['pre_text'])
        post_text = ' '.join(item['post_text'])
        table_text = format_table(item['table'])
        full_doc = f"{pre_text}\n\n{post_text}"

        # Split document into smaller chunks
        chunks = split_into_chunks(full_doc, chunk_size=500, overlap=100)

        for i, chunk in enumerate(chunks):
            # Add table to each chunk
            chunk_with_table = f"{chunk}\n\nTable:\n{table_text}"

            # The "pre" and "post" slots hold an embedding of the very same
            # text, so embed once and reuse it instead of paying for two
            # identical API calls per chunk.
            embedding = embed_text(chunk_with_table)
            embedding_blob = pickle.dumps(embedding)

            doc_id = f"{item['id']}-{i}"
            pre_embeddings.append(embedding)
            post_embeddings.append(embedding)
            documents.append(Document(page_content=chunk_with_table, metadata={"id": doc_id}))
            corpus.append(chunk_with_table)

            embeddings_data.append((doc_id, embedding_blob, embedding_blob, chunk_with_table))

    # Save embeddings to database
    save_embeddings(embeddings_data)

    # Create BM25 index
    tokenized_corpus = [word_tokenize(doc.lower()) for doc in corpus]
    bm25 = BM25Okapi(tokenized_corpus)

    return pre_embeddings, post_embeddings, documents, bm25

def load_or_create_embeddings_and_bm25(data: List[Dict]) -> Tuple[List[np.ndarray], List[np.ndarray], List[Document], BM25Okapi]:
    """Return the retrieval indexes, reusing the SQLite cache when it has content.

    If the file ``DB_NAME`` exists and contains rows, the embeddings and chunk
    texts are loaded from it and only the BM25 index is rebuilt in memory.
    Otherwise everything is built from ``data`` via
    ``create_embeddings_and_bm25``.

    Args:
        data: raw items; only used when the cache is missing or empty.

    Returns:
        ``(pre_embeddings, post_embeddings, documents, bm25)`` with one entry
        per cached chunk in the first three lists.
    """
    if os.path.exists(DB_NAME):
        print("Loading embeddings from database...")
        loaded_data = load_embeddings()

        if loaded_data:
            pre_embeddings = [item[1] for item in loaded_data]
            post_embeddings = [item[2] for item in loaded_data]
            documents = [Document(page_content=item[3], metadata={"id": item[0]}) for item in loaded_data]
            corpus = [item[3] for item in loaded_data]

            # Create BM25 index
            tokenized_corpus = [word_tokenize(doc.lower()) for doc in corpus]
            bm25 = BM25Okapi(tokenized_corpus)

            return pre_embeddings, post_embeddings, documents, bm25

        # create_embeddings_and_bm25() creates the table before it starts
        # embedding and only saves rows at the very end, so an interrupted run
        # leaves a database file with no rows. An empty corpus is useless for
        # retrieval, so rebuild instead of returning empty indexes.
        print("Embeddings database is empty; rebuilding...")

    return create_embeddings_and_bm25(data)


def format_table(table: List[List[str]]) -> str:
    """Render ``table`` as text: cells joined by ``" | "``, rows joined by newlines.

    Cells are converted with ``str()`` first, so numeric or ``None`` cells do
    not raise ``TypeError``; string cells are emitted unchanged. An empty table
    yields an empty string.
    """
    return "\n".join(" | ".join(str(cell) for cell in row) for row in table)

def split_into_chunks(text: str, chunk_size: int, overlap: int) -> List[str]:
    """Split ``text`` into windows of whitespace-separated words.

    Consecutive windows start ``chunk_size - overlap`` words apart, so each one
    shares its first ``overlap`` words with the end of the previous window.
    Words inside a chunk are re-joined with single spaces.

    Args:
        text: text to split; any run of whitespace separates words.
        chunk_size: maximum number of words per chunk; must be positive.
        overlap: number of words shared by neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size`` so the window always advances.

    Returns:
        The chunks in order; an empty list when ``text`` contains no words.

    Raises:
        ValueError: if ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any further window would only
        # repeat words already covered by this one, so stop here.
        if start + chunk_size >= len(words):
            break
    return chunks


def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Both arguments may be NumPy arrays or plain sequences of numbers. When
    either vector has zero length the similarity is undefined; ``0.0`` is
    returned in that case instead of NaN.
    """
    a_vec = np.asarray(a, dtype=float)
    b_vec = np.asarray(b, dtype=float)
    denominator = np.linalg.norm(a_vec) * np.linalg.norm(b_vec)
    if denominator == 0:
        # Avoid a 0/0 division, which would yield NaN and a runtime warning.
        return 0.0
    return float(np.dot(a_vec, b_vec) / denominator)

def _scale_by_max(scores: np.ndarray) -> np.ndarray:
    """Divide ``scores`` by their maximum; return zeros when that maximum is not positive."""
    if scores.size == 0:
        return scores
    peak = scores.max()
    if not peak > 0:
        # Covers 0 (nothing matched), negative maxima and NaN: dividing by any
        # of these would produce NaN/inf or flip the ranking.
        return np.zeros_like(scores)
    return scores / peak


def hybrid_retrieve_documents(query: str, pre_embeddings: List[np.ndarray], post_embeddings: List[np.ndarray], documents: List[Document], bm25: BM25Okapi, k: int = 10) -> List[Document]:
    """Return the ``k`` documents that best match ``query`` under a hybrid score.

    Every document gets two signals: the larger of the cosine similarities
    between the query embedding and its pre/post embedding, and its BM25 score
    for the lower-cased, tokenized query. Each signal is divided by its own
    maximum and the two are mixed as ``0.7 * embedding + 0.3 * bm25``.

    Args:
        query: the search text.
        pre_embeddings, post_embeddings: embeddings parallel to ``documents``.
        documents: candidate documents, in the same order as the BM25 corpus.
        bm25: BM25 index built over the documents' text.
        k: maximum number of documents to return.

    Returns:
        Up to ``k`` documents, best first. Empty when there are no documents or
        ``k`` is not positive. A signal whose maximum is not positive (for
        example BM25 when no query term occurs in the corpus) contributes
        nothing to the ranking.
    """
    if not documents or k <= 0:
        return []

    query_embedding = embed_text(query)

    # Embedding-based similarity
    similarities = np.array(
        [
            max(
                cosine_similarity(query_embedding, pre_emb),
                cosine_similarity(query_embedding, post_emb),
            )
            for pre_emb, post_emb in zip(pre_embeddings, post_embeddings)
        ],
        dtype=float,
    )

    # BM25 scores
    tokenized_query = word_tokenize(query.lower())
    bm25_scores = np.asarray(bm25.get_scores(tokenized_query), dtype=float)

    # Score only the documents covered by both signals.
    n_scored = min(len(similarities), len(bm25_scores))
    if n_scored == 0:
        return []

    # Combine scores (you can adjust the weights)
    combined_scores = (
        0.7 * _scale_by_max(similarities[:n_scored])
        + 0.3 * _scale_by_max(bm25_scores[:n_scored])
    )

    top_k_indices = np.argsort(combined_scores)[::-1][:k]
    return [documents[i] for i in top_k_indices]

def _parse_relevance_score(response):
    """Extract a 0-10 rating and an explanation from the reranker's free-text reply."""
    import re  # local import: this part of the file does not import re at module level

    text = (response or "").strip()
    first_line, _, remainder = text.partition('\n')
    explanation = remainder.strip() or text

    # Drop an echoed "(0-10)" scale label so its digits are not read as the rating.
    cleaned = re.sub(r'\(?\s*0\s*-\s*10\s*\)?', ' ', text)
    number = r'(\d+(?:\.\d+)?)'
    # Prefer a number that follows the word "score"/"rating" on the same line,
    # then fall back to any number on the first line of the reply.
    labelled = re.findall(r'(?:score|rating)[^0-9\n]*' + number, cleaned, flags=re.IGNORECASE)
    leading = re.findall(number, cleaned.partition('\n')[0])

    for raw in labelled + leading:
        value = float(raw)
        if 0 <= value <= 10:  # ignore years, amounts, etc.
            return value, explanation
    return 0, "Failed to parse score"


def rerank_documents(query: str, documents: List[Document], k: int = 5) -> List[Document]:
    """Ask the LLM to rate each document against ``query`` and keep the best ``k``.

    One ``generate_text`` call is made per document. The rating is parsed
    leniently from the reply (for example ``"Relevance score: 8"``, ``"8/10"``
    or a bare ``"8"``). A reply with no usable rating between 0 and 10 is
    scored 0 rather than raising.

    Args:
        query: the user's question.
        documents: candidates to rate.
        k: number of documents to return.

    Returns:
        The ``k`` highest-rated documents, best first; documents with equal
        ratings keep their input order.
    """
    rerank_prompt = """
    Given the following query and document, rate the relevance of the document to the query on a scale of 0 to 10, where 0 is completely irrelevant and 10 is highly relevant. Provide a brief explanation for your rating.

    Query: {query}

    Document:
    {document}

    Relevance score (0-10) and explanation:
    """

    scored_docs = []
    for doc in documents:
        prompt = rerank_prompt.format(query=query, document=doc.page_content)
        response = generate_text(prompt)
        score, explanation = _parse_relevance_score(response)
        scored_docs.append((score, explanation, doc))

    # Sort documents by score and return top k
    reranked_docs = [doc for score, explanation, doc in sorted(scored_docs, key=lambda x: x[0], reverse=True)]
    return reranked_docs[:k]

def generate_answer(query: str, context: str) -> str:
    """Ask the LLM to answer ``query`` using only the supplied ``context``.

    The prompt instructs the model to reason from the context, state a
    confidence level, and put its reasoning under ``Explanation:`` and the bare
    result under ``Answer:``. The model's raw reply is returned unparsed.

    Args:
        query: the user's question.
        context: retrieved text the answer must be based on.

    Returns:
        The text returned by ``generate_text`` for the filled-in prompt.
    """
    # The section label in step 6 must match the "Explanation:" heading at the
    # end of the template so the model is not given two different labels.
    prompt_template = """
    You are an AI assistant tasked with answering questions based on the given context. Follow these steps:

    1. Carefully read the context and the question.
    2. Identify the key information in the context that is relevant to the question.
    3. Formulate a clear and concise answer based on the relevant information.
    4. If the context doesn't contain enough information to answer the question confidently, say "I don't have enough information to answer this question accurately."
    5. Provide your confidence level in the answer (low, medium, or high).
    6. Provide the explanation for the answer in Explanation: part and the final result in Answer: part (do not add any other text other than the answer in Answer: part).

    Context:
    {context}

    Question: {query}

    Explanation:
    
    
    Answer:
    """

    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "query"])
    full_prompt = prompt.format(context=context, query=query)

    return generate_text(full_prompt)

def query_expansion(query: str) -> List[str]:
    """Return ``query`` followed by LLM-generated alternative phrasings.

    The model is asked for two numbered alternatives. Every reply line that
    starts with a number followed by ``.`` or ``)`` and some text is taken as
    one alternative; headers, blank lines and other unnumbered lines are
    ignored, so an unexpected reply format yields fewer alternatives instead of
    an exception.

    Args:
        query: the original user question.

    Returns:
        A list whose first element is ``query`` itself, followed by the parsed
        alternatives in the order the model produced them.
    """
    import re  # local import: this part of the file does not import re at module level

    expansion_prompt = f"""
    Given the following query, generate 2 alternative phrasings or related queries that might help in retrieving relevant information:

    Original query: {query}

    Alternative queries:
    1.
    2.
    """
    response = generate_text(expansion_prompt) or ""

    numbered_item = re.compile(r'^\s*\d+\s*[.)]\s*(\S.*?)\s*$')
    expanded_queries = []
    for line in response.splitlines():
        match = numbered_item.match(line)
        if match:
            expanded_queries.append(match.group(1))
    return [query] + expanded_queries

def rag_system(query: str, pre_embeddings: List[np.ndarray], post_embeddings: List[np.ndarray], documents: List[Document], bm25: BM25Okapi) -> str:
    """Answer ``query`` end to end and return the generated answer text.

    Steps: expand the query into several phrasings, retrieve the top 10
    documents for each phrasing, drop duplicates by the ``'id'`` entry of each
    document's metadata, rerank the survivors against the original query
    (keeping 5), and generate an answer from their concatenated text.
    """
    # Gather the hits of every query variant first ...
    gathered = []
    for variant in query_expansion(query):
        gathered += hybrid_retrieve_documents(
            variant, pre_embeddings, post_embeddings, documents, bm25, k=10
        )

    # ... then keep one document per id, ordered by first appearance.
    docs_by_id = {}
    for doc in gathered:
        docs_by_id[doc.metadata['id']] = doc
    candidates = list(docs_by_id.values())

    best_docs = rerank_documents(query, candidates, k=5)
    context = "\n\n".join(doc.page_content for doc in best_docs)
    return generate_answer(query, context)


if __name__ == "__main__":
    # Script entry point: build (or load) the retrieval indexes, then answer
    # one sample question and print the result.

    # Load data
    data = load_data("convfinqa_rag/data/train.json")

    # Load or create embeddings and BM25 index
    # (the cached database file is reused when it exists; otherwise every item
    # in `data` is embedded)
    pre_embeddings, post_embeddings, documents, bm25 = load_or_create_embeddings_and_bm25(data)

    # Run the whole pipeline (query expansion, hybrid retrieval, reranking,
    # answer generation) on a single sample query
    query = "What was the percentage change in net cash from operating activities from 2008 to 2009?"
    answer = rag_system(query, pre_embeddings, post_embeddings, documents, bm25)
    print(f"Question: {query}")
    print(f"Answer: {answer}")
       

    

In [None]:
import json
import os
import sqlite3
from typing import List, Dict, Tuple, Optional
import numpy as np
from tqdm import tqdm
from openai import OpenAI
from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
import nltk
import pickle
import re
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall
)
from datasets import Dataset

# Download the NLTK 'punkt' resource; quiet=True suppresses the download output.
# NOTE(review): presumably this is the tokenizer data word_tokenize relies on — confirm.
nltk.download('punkt', quiet=True)

# Load environment variables
# NOTE(review): presumably this makes credentials from a local .env file visible
# to the OpenAI client created below — confirm.
from dotenv import load_dotenv
load_dotenv()

# Initialize the OpenAI client used by the embedding and text-generation helpers
client = OpenAI()

# Database setup: path of the SQLite file that caches chunk embeddings
DB_NAME = 'embeddings.db'

# ... [Previous functions remain the same up to rag_system] ...

def rag_system(query: str, pre_embeddings: List[np.ndarray], post_embeddings: List[np.ndarray], documents: List[Document], bm25: BM25Okapi) -> Dict[str, str]:
    """Answer ``query`` end to end and return the answer together with its evidence.

    The query is expanded into several phrasings; the top 10 documents are
    retrieved for each phrasing, de-duplicated by the ``'id'`` entry of their
    metadata, and reranked against the original query (keeping 5). The answer
    is generated from the text of those documents.

    Returns:
        A dict with ``"question"`` (the input query), ``"answer"`` (the
        generated text) and ``"contexts"`` (list of the reranked documents'
        text, best first).
    """
    # Gather the hits of every query variant first ...
    gathered = []
    for variant in query_expansion(query):
        gathered += hybrid_retrieve_documents(
            variant, pre_embeddings, post_embeddings, documents, bm25, k=10
        )

    # ... then keep one document per id, ordered by first appearance.
    docs_by_id = {}
    for doc in gathered:
        docs_by_id[doc.metadata['id']] = doc
    candidates = list(docs_by_id.values())

    best_docs = rerank_documents(query, candidates, k=5)
    answer = generate_answer(query, "\n\n".join(doc.page_content for doc in best_docs))

    result = {}
    result["question"] = query
    result["answer"] = answer
    result["contexts"] = [doc.page_content for doc in best_docs]
    return result

def benchmark_rag_system(benchmarking_data: List[Dict], pre_embeddings: List[np.ndarray], post_embeddings: List[np.ndarray], documents: List[Document], bm25: BM25Okapi) -> Dict[str, float]:
    """Run the RAG pipeline over a benchmark set and score the outputs with RAGAS.

    Each benchmark item must provide ``'question'`` and ``'answer'``. The
    pipeline result for a question is extended with a ``'ground_truths'`` list
    holding the reference answer. A question whose pipeline run raises is
    reported on stdout and left out of the evaluation.

    Returns:
        Whatever ``ragas.evaluate`` returns for the ``faithfulness`` and
        ``answer_relevancy`` metrics over the collected records.
    """
    records = []

    for entry in tqdm(benchmarking_data):
        query = entry['question']
        ground_truth = entry['answer']

        try:
            record = rag_system(query, pre_embeddings, post_embeddings, documents, bm25)
            # RAGAS expects a list of ground truths
            record['ground_truths'] = [ground_truth]
        except Exception as e:
            # Best effort: report the failure and move on to the next question.
            print(f"Error processing query: {query}")
            print(f"Error message: {str(e)}")
        else:
            records.append(record)

    # Only these two metrics are evaluated; context_precision and
    # context_recall are not enabled here.
    active_metrics = [faithfulness, answer_relevancy]

    # Evaluate using RAGAS on a Dataset built from the collected records
    return evaluate(Dataset.from_list(records), metrics=active_metrics)

if __name__ == "__main__":
    # Load data
    data = load_data("convfinqa_rag/data/train.json")

    # Load or create embeddings and BM25 index
    pre_embeddings, post_embeddings, documents, bm25 = load_or_create_embeddings_and_bm25(data)

    # Load benchmarking data
    # from convfinqa_rag.data.benchmarking import benchmarking_data
    
   
    benchmarking_data = [
    {
        "question": "what was the percentage change in the net cash from operating activities from 2008 to 2009",
        "answer": "14.1%"
    },
    {
        "question": "what was the percent of the growth in the revenues from 2007 to 2008",
        "answer": "1.3%"
    },
    {
        "question": "what was the percentage change in net sales from 2000 to 2001?",
        "answer": "-32%"
    },
    {
        "question": "what was the difference in percentage cumulative return on investment for united parcel service inc . compared to the s&p 500 index for the five year period ended 12/31/09?",
        "answer": "-26.16%"
    },
    {
        "question": "what portion of the total shares subject to outstanding awards is under the 2009 global incentive plan?",
        "answer": "70.1%"
    },
    {
        "question": "what was the percentage increase in litigation reserves in 2012?",
        "answer": "15.6%"
    },
    {
        "question": "what portion of total obligations are due within the next 3 years?",
        "answer": "22.99%"
    },
    {
        "question": "for the years ended december 31 , 2013 , 2012 and 2011 , what was the total in millions capitalized to assets associated with compensation expense related to long-term compensation plans , restricted stock and stock options?\\\\n",
        "answer": "12"
    },
    {
        "question": "what is the percent change in net revenue between 2007 and 2008?",
        "answer": "2.4%"
    },
    {
        "question": "what portion of the authorized shares of class b common stock is outstanding as of december 31 , 2017?",
        "answer": "56.6%"
    },
    {
        "question": "what was the 5 year return of the kbw bank index?",
        "answer": "158.82%"
    },
    {
        "question": "what is the percentage change in the balance of asset allocation from 2016 to 2017?",
        "answer": "11.2%"
    },
    {
        "question": "between 2015 and 2013 what was the average compensation expense related to the issuing of the stock award in millions",
        "answer": "40.3"
    },
    {
        "question": "what is the total expected payments for principal of long- term debt , including capital leases in the next 36 months?",
        "answer": "594840"
    },
    {
        "question": "what is the percent change in net loss on disposal of assets between 2004 and 2005?",
        "answer": "700%"
    },
    {
        "question": "how much higher are the returns of the s&p 500 in the same period ( 2008-2013 ) ? as a percentage .",
        "answer": "30%"
    },
    {
        "question": "what is the percentage change in standardized rwas in 2014?",
        "answer": "-1.2%"
    },
    {
        "question": "what percentage did the balance increase from 2007 to 2009?",
        "answer": "83.6%"
    },
    {
        "question": "what is the percentual decrease observed in the future minimum rental payments during 2008 and 2009?",
        "answer": "13.25%"
    },
    {
        "question": "what was the percentage change of total debt from 2001 to 2002?",
        "answer": "-57%"
    },
    {
        "question": "what is the total revenue for the fiscal year of 2015?",
        "answer": "7020"
    },
    {
        "question": "what was the percentage change in revenues for investments in 50% ( 50 % ) or less owned investments accounted for using the equity method between 2000 and 2001?",
        "answer": "-2%"
    },
    {
        "question": "how many square feet are owned by the company?",
        "answer": "377000"
    },
    {
        "question": "considering the years 2017 and 2018 , what is the percentual increase observed in capital expenditures used for continuing operations?",
        "answer": "31.14%"
    },
    {
        "question": "what was the percentage decline in the total restricted cash and marketable securities from 2011 to 2012",
        "answer": "-13.4%"
    },
    {
        "question": "what was the percent change in the value of commercial paper outstanding between 2010 and 2011?",
        "answer": "18%"
    },
    {
        "question": "what was the growth rate of the s&p 500 index from 2011 to 2016",
        "answer": "98.2%"
    },
    {
        "question": "what was the firm's average sum of contractual principal , interest and fees in 2008 and 2009?",
        "answer": "$ 110774.5 million"
    },
    {
        "question": "what is the percentage change in staff number in 2013?",
        "answer": "1.5%"
    },
    {
        "question": "in 2019 what was the net change in cash in millions",
        "answer": "-489.2"
    },
    {
        "question": "what is the growth rate in the average price of the purchased shares from october to november 2014?",
        "answer": "6.1%"
    },
    {
        "question": "what is the roi of an investment in the o'reilly automotive inc . from 2010 to 2011?",
        "answer": "32%"
    },
    {
        "question": "what is the percent increase in cash and cash equivalents from year 2009 to 2010?",
        "answer": "24.4%"
    },
    {
        "question": "by what percent did effects of foreign operations decrease from 2002 to 2004?",
        "answer": "-76.8%"
    },
    {
        "question": "what was the percentage change of the net revenue in 2010",
        "answer": "7.61%"
    },
    {
        "question": "what was the percentage change in industry segment operating profits from 2006 to 2007?",
        "answer": "17%"
    },
    {
        "question": "what was the percentage increase in the cash and cash equivalents from 2005 to 2006",
        "answer": "-16.6%"
    },
    {
        "question": "by how much did asset retirement obligations decrease from 2007 to 2008?",
        "answer": "-14.9%"
    },
    {
        "question": "north american printing papers net sales where what percent of total printing paper sales in 2009?",
        "answer": "49%"
    },
    {
        "question": "by how much did total other income and expense decrease from 2008 to 2009?",
        "answer": "47.4%"
    },
    {
        "question": "what was the percentage chaning in the total fair value of restricted stock and performance awards vested from 2016 to 2017?",
        "answer": "69%"
    },
    {
        "question": "what are the total pre-tax catastrophe losses in the last three years?",
        "answer": "1905.4"
    },
    {
        "question": "what percentage where north american consumer packaging net sales of total consumer packaging sales in 2011?",
        "answer": "67%"
    },
    {
        "question": "what was the percent of the growth in the revenues from 2005 to 2006",
        "answer": "45.3%"
    },
    {
        "question": "what was the percentage change in rental expenses from 2017 to 2018?",
        "answer": "21%"
    },
    {
        "question": "what was the percent of the growth in the recurring tenant improvements from 2006 to 2007",
        "answer": "8.1%"
    },
    {
        "question": "what was the percentage cumulative total shareholder return on disca common stock from september 18 , 2008 to december 31 , 2012?",
        "answer": "359.67%"
    },
    {
        "question": "what was the percentage change in the net cash used in investing activities from 2006 to 2007",
        "answer": "-6.4%"
    },
    {
        "question": "if the 2003 growth rate is the same as 2002 , what would 2003 gas transmission throughput be in bcf?\\\\n",
        "answer": "645"
    },
    {
        "question": "what was the percentage change in revenues between 2005 and 2006?",
        "answer": "45%"
    },
    {
        "question": "by how much did total revenues increase from 2010 to 2011?",
        "answer": "25.4%"
    },
    {
        "question": "what is the percentage change in total dividends paid per share from 2017 to 2018?",
        "answer": "9.9%"
    },
    {
        "question": "what was the percentage change in the research and development costs from 2015 to 2016",
        "answer": "-34.4%"
    },
    {
        "question": "what is the percentage change in net reserves from 2011 to 2012?",
        "answer": "-11.3%"
    },
    {
        "question": "what is the growth rate in the net income from 2011 to 2012?",
        "answer": "-78.8%"
    },
    {
        "question": "what is the percentage change in the weight of smokeless products in operating income from 2015 to 2016?",
        "answer": "2.3%"
    },
    {
        "question": "what is the percentage change in the average total short-duration advances from 2013 to 2014?",
        "answer": "3.1%"
    },
    {
        "question": "by what percentage did asset retirement obligations increase from 2008 to 2009?",
        "answer": "14.2%"
    },
    {
        "question": "what percent of printing papers sales in 2006 was from north american printing papers net sales?",
        "answer": "66%"
    },
    {
        "question": "by how much did american airlines group inc . common stock out preform the s&p 500 index over the 4 year period?",
        "answer": "48%"
    },
    {
        "question": "what is the short-term debt as a percent of long-term debt , in 2019?",
        "answer": "1.80%"
    },
    {
        "question": "how much , in billions , was spent purchasing common stock under the programs from 2016-2018?",
        "answer": "4.1"
    },
    {
        "question": "what is the percentage increase in the balance of goodwill from 2017 to 2018?",
        "answer": "-0.6%"
    },
    {
        "question": "what is actual operating cash flow reported for 2011?",
        "answer": "3.129"
    },
    {
        "question": "what is the percentage change in benefits obligations from 2018 to 2019?",
        "answer": "19.4%"
    },
    {
        "question": "what is the growth rate of interest income from 2013 to 2014?",
        "answer": "10.9%"
    },
    {
        "question": "what is the percentage change in the cash dividends received by the company in 2012 compare to 2011?",
        "answer": "6.4%"
    },
    {
        "question": "what is the growth rate in the share-based compensation expense from 2014 to 2015?",
        "answer": "-29.2%"
    },
    {
        "question": "what percentage of contractual obligations for future payments under existing debt and lease commitments and purchase obligations at december 31 , 2012 is short term for the year 2014?",
        "answer": "58%"
    },
    {
        "question": "did 2015 adjusted ebitda increase more than 2015 actual ebitda?",
        "answer": "no"
    },
    {
        "question": "what is the growth rate in net sales for is&gs in 2011?",
        "answer": "-5.4%"
    },
    {
        "question": "what is the percentage change in nonrecurring losses from 2012 to 2013?",
        "answer": "11.8%"
    },
    {
        "question": "what is the growth rate in net revenue in 2003 for entergy corporation?",
        "answer": "0.1%"
    },
    {
        "question": "what is the percent change in net revenue from 2003 to 2004?",
        "answer": "14.7%"
    },
    {
        "question": "what was the percentage gain on the sale of starter brand business?",
        "answer": "91%"
    },
    {
        "question": "what was the difference in percentage cumulative total shareowners 2019 returns for united parcel service inc . versus the standard & poor 2019s 500 index for the five years ended 12/31/10?",
        "answer": "-1.42%"
    },
    {
        "question": "what was the percentage cumulative total shareholder return on disca for the five year period ended december 21 , 2013?",
        "answer": "502.08%"
    },
    {
        "question": "what is the percentage difference in the number of shares to be issued if the stock price closes at $ 11 compared to if it closes at $ 20?",
        "answer": "278%"
    },
    {
        "question": "what is the growth rate in operating profit for mst in 2012?",
        "answer": "14.3%"
    },
    {
        "question": "what was the percentage change in rent expenses included in selling , general and administrative expense from 2014 to 2015?",
        "answer": "41%"
    },
    {
        "question": "in billions , what would 2018 total operating revenues have been without the mexico business?",
        "answer": "20.332"
    },
    {
        "question": "how much did the company 2019s valuation allowance decrease from 2010 to 2012?",
        "answer": "-23.8%"
    },
    {
        "question": "what is the amount of cash raised from the issuance of shares during 2015 , in millions?",
        "answer": "4.5"
    },
    {
        "question": "what was the percentage change in the net cash used in financing activities from 2004 to 2005",
        "answer": "45.2%"
    },
    {
        "question": "what is the growth rate in sublease revenues from 2007 to 2008?",
        "answer": "-7.8%"
    },
    {
        "question": "what is the percentage of total debt from 2014-2015 that was long-term debt?",
        "answer": "92.5%"
    },
    {
        "question": "what is the percentage change in the balance of alternative assets from 2011 to 2012?",
        "answer": "4.6%"
    },
    {
        "question": "what percentage of total debt is from long-term debt , from 2013-2014?",
        "answer": "81.09"
    },
    {
        "question": "what was the average net interest margin for 2008 and for 2007?",
        "answer": "3.2%"
    },
    {
        "question": "what percentage of total shares purchased was purchased in december?",
        "answer": "36.55%"
    },
    {
        "question": "in 2014 what was the combined total experience loss",
        "answer": "3633"
    },
    {
        "question": "what is the total value of repurchased shares during october 2012 , in millions?",
        "answer": "78.7"
    },
    {
        "question": "what was the average net interest margin in% ( in % ) for 2009 and 2008.?",
        "answer": "3.6"
    },
    {
        "question": "how is the treasury stock affected after the stock repurchases in the last three months of 2016 , ( in millions ) ?",
        "answer": "110.3"
    },
    {
        "question": "what is the net income margin for 2018?",
        "answer": "3.3%"
    },
    {
        "question": "what were 2001 total segment revenues in billions?",
        "answer": "9.3"
    },
    {
        "question": "what is the rate of return of an investment in nasdaq composite from the end of the year in 2015 to the end of the year in 2016?",
        "answer": "6.2%"
    },
    {
        "question": "what was the percentage change in the reserve for product warranties from december 31 2006 to december 30 2007?",
        "answer": "273%"
    },
    {
        "question": "with 2014 closing stock price , what is the total value of the award for the additional shares , ( in millions ) ?",
        "answer": "56.5"
    },
    {
        "question": "what was the percentage cumulative total shareholder return on disca common stock from september 18 , 2008 to december 31 , 2008?",
        "answer": "2.53%"
    },
    {
        "question": "the 2005 charge for asset impairments in the optical and specialty materials segment represented what percent of pre-impairment earnings for the segment?",
        "answer": "14.6%"
    },
    {
        "question": "assuming the revolver is undrawn , what would the annual fee for the revolver be?",
        "answer": "2250000"
    },
    {
        "question": "what was the total number , in mmboe , of 2014 proved developed reserves?",
        "answer": "700.7"
    },
    {
        "question": "what was the difference in percentage cumulative total return between e*trade financial corporation and s&p super cap diversified financials for the five years ended 12/07?",
        "answer": "103.57%"
    },
    {
        "question": "what was the percentage change in the allowance for loan losses from 2007 to 2008?",
        "answer": "80%"
    },
    {
        "question": "what is the expected growth rate in amortization expense in 2017?",
        "answer": "-0.9%"
    },
    {
        "question": "what is the percent change in asset purchase agreements between 2008 and 2009?",
        "answer": "-74%"
    },
    {
        "question": "what is the percentage change in held-to-maturity securities at cost and at fair value as of january 30 , 2009?",
        "answer": "-8.0%"
    },
    {
        "question": "what is the percentage change in aggregate rent expense from 2003 to 2004?",
        "answer": "4.2%"
    },
    {
        "question": "what is the approximate customer penetration for the total regulated businesses?",
        "answer": "28%"
    },
    {
        "question": "what is the growth rate in the total expense related to the defined contribution plan for non-u.s.employees in 2011?",
        "answer": "82.9%"
    },
    {
        "question": "as of december 31 , 2017 what was the percent of the system energy credit facility utilization",
        "answer": "56.7%"
    },
    {
        "question": "what was the percentage change in the repurchase reserve between 2008 and 2009 , in millions?",
        "answer": "543%"
    },
    {
        "question": "what was the percentage change in the number of rsus outstanding from 2012 to 2013?",
        "answer": "-20%"
    },
    {
        "question": "how many combined shares are available under the 2014 incentive plan , the 2009 incentive plan and the 2006 employee stock purchase plan combined?",
        "answer": "41661517"
    },
    {
        "question": "what is the growth rate in cash dividends received in 2010 compare to 2009?",
        "answer": "26.8%"
    },
    {
        "question": "what is the average interest income for 2013 and 2014 , in millions?",
        "answer": "26.05"
    },
    {
        "question": "what percentage of total proved undeveloped resources as of dec 31 , 2014 does extensions and discoveries and proved undeveloped resources as of dec 31 , 2013 account for?",
        "answer": "125.1%"
    },
    {
        "question": "what is the percent change in net revenue from 2016 to 2017?",
        "answer": "2.25%"
    },
    {
        "question": "what is the percentage change in the balance of asset purchase agreements from 2012 to 2013?",
        "answer": "-7.5%"
    },
    {
        "question": "what was the growth in allowance for other funds used during construction from 2013 to 2014",
        "answer": "-44.4%"
    },
    {
        "question": "what were total r&e expenses in millions for 2017 , 2016 and in 2015?",
        "answer": "581"
    },
    {
        "question": "what portion of the securities approved by the security holders is issued?",
        "answer": "29.1%"
    },
    {
        "question": "considering the years 2006-2009 , what is the value of the average additions?",
        "answer": "100491"
    },
    {
        "question": "assuming that intangible asset will be sold , what will be the accumulated deprecation at the end of 2006 , in millions?",
        "answer": "830.2"
    },
    {
        "question": "what is the percentage change in the weighted average discount rate for pensions from 2010 to 2011?",
        "answer": "-10.4%"
    },
    {
        "question": "what was the percentage decrease the credit card lines from 2008 to 2009",
        "answer": "-21.6%"
    },
    {
        "question": "what is the 2017 growth rate in the amount of the unused commitments and lines of credit for dealers?",
        "answer": "-14%"
    },
    {
        "question": "what was the percent of the increase in the revenue from 2010 to 2011",
        "answer": "6.1%"
    },
    {
        "question": "what was the percentage change in the gross unrecognized tax benefits between 2010 and 2011?",
        "answer": "46%"
    },
    {
        "question": "what is the growth rate of net amount from 2007 to 2008?",
        "answer": "7.4%"
    },
    {
        "question": "what percentage where north american consumer packaging net sales of total consumer packaging sales in 2012?",
        "answer": "63%"
    },
    {
        "question": "what was the percentage change in the employee total matching contributions from 2015 to 2016",
        "answer": "57%"
    },
    {
        "question": "what is the growth rate in net revenues in 2018?",
        "answer": "11.9%"
    },
    {
        "question": "what percentage of the total purchase price was intangible assets?",
        "answer": "27%"
    },
    {
        "question": "by what percentage did total goodwill decline from 2007 to 2008 year end?",
        "answer": "-50.1%"
    },
    {
        "question": "what is the mark-to-market as a percentage of the decrease in net revenue from 2012 to 2013?",
        "answer": "112%"
    },
    {
        "question": "what is the percent change in net revenue between 2007 and 2008?",
        "answer": "0.7%"
    },
    {
        "question": "what is the percentage change in the balance of gross liability for unrecognized tax benefits from 2016 to 2017?",
        "answer": "-19.0%"
    },
    {
        "question": "what portion of the total net operating loss carryforwards is state related?",
        "answer": "47.8%"
    },
    {
        "question": "did the annual interest savings on the redemption of the 6.65% ( 6.65 % ) notes exceed the cost of the early extinguishment?",
        "answer": "yes"
    },
    {
        "question": "what was the average yearly decline in international traffic in 2008 and in 2009?",
        "answer": "17.5%"
    },
    {
        "question": "what is the growth rate in the r&d in 2019?",
        "answer": "-7.5%"
    },
    {
        "question": "what is the percentual increase in the resulting change in provision for income taxes caused by errors during 2002 and 2003?",
        "answer": "189%"
    },
    {
        "question": "what was the rate of growth from 2013 to 2014 in the fair value per share",
        "answer": "13.3%"
    },
    {
        "question": "what was the percentage change in capital expenditures between 2017 and 2018?",
        "answer": "30%"
    },
    {
        "question": "what percentage of debt maturity was there in 2010 , relative to 2006?",
        "answer": "114%"
    },
    {
        "question": "what is the growth rate in net sales for walmart u.s . segment from 2017 to 2018?",
        "answer": "3.5%"
    },
    {
        "question": "what is the percent change in net revenue between 2006 and 2007?",
        "answer": "3.3%"
    },
    {
        "question": "what was the percentage change in unrecognized tax benefits for 2007?",
        "answer": "41%"
    },
    {
        "question": "for how many common stock shares did the company pay dividends in 2012 , ( in millions ) ?",
        "answer": "1251.4"
    },
    {
        "question": "what was the average potential maximum exposure under the loss share arrangements in december 31 , 2013 and december 31 , 2012 in billions?",
        "answer": "3.8"
    },
    {
        "question": "what was the percentage change in open claims ending balance at december 31 from 2004 to 2005?",
        "answer": "4%"
    }
]

    

    # Perform benchmarking: hand the question/answer pairs together with both
    # embedding sets, the documents, and the bm25 object to benchmark_rag_system.
    # NOTE(review): `benchmarking_data` is presumably the list literal that
    # closes just above; its assignment is outside this view -- confirm.
    benchmark_results = benchmark_rag_system(benchmarking_data, pre_embeddings, post_embeddings, documents, bm25)

    # Print the benchmark output as-is under a heading (no extra formatting here).
    print("Benchmarking Results:")
    print(benchmark_results)

    # Example query (optional): send one ad-hoc question through rag_system
    # using the same embeddings, documents, and bm25 object as the benchmark.
    query = "What was the percentage change in net cash from operating activities from 2008 to 2009?"
    answer = rag_system(query, pre_embeddings, post_embeddings, documents, bm25)
    print(f"\nExample Question: {query}")
    # The result is indexed with the 'answer' and 'contexts' keys, so rag_system
    # must return a mapping that provides both.
    print(f"Answer: {answer['answer']}")
    print(f"Contexts: {answer['contexts']}")