#Installation and Dependencies

In [None]:
! pip install -qU langchain openai datasets PyPDF2 langchain-core faiss-gpu python-dotenv==1.0.0 langchain_community sentence-transformers ragas

In [None]:
import os
import time
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import faiss
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from datasets import Dataset, Features, Sequence, Value
from ragas import evaluate
from ragas.metrics import Faithfulness, AnswerRelevancy, ContextRecall, ContextPrecision
from openai import OpenAIError, RateLimitError
import pandas as pd
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)


#Chunking and Storing text into Vector Store

In [None]:
def get_pdf_text(docs):
    """Extract and concatenate the text of every page in the given PDFs.

    Args:
        docs: Iterable of PDF sources accepted by ``PdfReader`` (the
            script passes file paths).

    Returns:
        A single string holding each page's text followed by a newline,
        in document and page order. Empty string when ``docs`` is empty.
    """
    page_texts = []
    for pdf in docs:
        for page in PdfReader(pdf).pages:
            # A page without extractable text may give an empty (or None)
            # result; treat it as "" so the newline separator is still
            # emitted and concatenation cannot fail.
            page_texts.append((page.extract_text() or "") + "\n")
    # Join once instead of growing a string with += on every page.
    return "".join(page_texts)

def get_chunks(raw_text):
    """Split raw text into overlapping chunks for embedding.

    The text is split on newlines into pieces of up to 1000 characters
    (measured with ``len``) with a 200-character overlap between
    neighbouring chunks.

    Args:
        raw_text: The full text to split.

    Returns:
        The list of chunk strings produced by the splitter.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(raw_text)


def get_vectorstore(chunks):
    """Embed the text chunks and build a FAISS vector store from them.

    Embeddings come from the ``sentence-transformers/all-MiniLM-L6-v2``
    model, configured to run on the CPU.

    Args:
        chunks: Text chunks to index.

    Returns:
        The FAISS vector store built from ``chunks``.
    """
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
    )
    return faiss.FAISS.from_texts(texts=chunks, embedding=embedding_model)

#Generating Questions and Ground Truth

In [None]:
# Optionally paste a key between the quotes to override the environment.
# Prefer exporting OPENAI_API_KEY instead, and never commit a real key.
OPENAI_API_KEY = ''
if not OPENAI_API_KEY:
    # Fall back to a key that is already exported, so an empty placeholder
    # does not overwrite a correctly configured environment.
    OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', '')
# Re-export the resolved key; presumably other OpenAI-backed components
# (e.g. the evaluation step) read it from the environment -- TODO confirm.
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
llm = ChatOpenAI(api_key=OPENAI_API_KEY, model="gpt-3.5-turbo")


# Prompt for the question-answering chain: expects a `question` and the
# retrieved `context`, and asks for a concise answer of at most two sentences.
template = """You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Use two sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:
"""
prompt = ChatPromptTemplate.from_template(template)


def setup_chain(retriever):
    """Build the retrieval-augmented question-answering chain.

    The chain is invoked with ``{"question": <str>}``. It fills the prompt's
    ``question`` slot with that string and its ``context`` slot with the text
    of the documents retrieved for it, sends the prompt to the LLM and
    returns the answer as a plain string.

    Args:
        retriever: Retriever exposing ``get_relevant_documents(question)``
            whose results carry a ``page_content`` attribute.

    Returns:
        A runnable chain: input mapping -> prompt -> llm -> string parser.
    """
    def question_text(inputs):
        # Forward only the question string. Passing the whole input dict
        # through would render its repr into the prompt's question slot.
        return inputs["question"]

    def retrieved_context(inputs):
        # Return the retrieved passages as one text block, so the prompt's
        # context slot holds readable text rather than a nested dict/list.
        results = retriever.get_relevant_documents(inputs["question"])
        return "\n\n".join(result.page_content for result in results)

    chain = (
        {"question": question_text, "context": retrieved_context}
        | prompt
        | llm
        | StrOutputParser()
    )
    return chain
def handle_question(question, retriever, max_retries=5, delay=5):
    """Answer a question with the RAG chain, retrying on rate-limit errors.

    Args:
        question: The question to answer.
        retriever: Retriever handed to ``setup_chain``.
        max_retries: Maximum number of attempts before giving up.
        delay: Seconds to wait before the second attempt; doubled after
            every further rate-limit error.

    Returns:
        The chain's response for ``question``.

    Raises:
        Exception: If every attempt hit a rate-limit error. The last
            ``RateLimitError`` is attached as the cause.
    """
    # The chain does not depend on the attempt, so build it once.
    chain = setup_chain(retriever)
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            return chain.invoke({"question": question})
        except RateLimitError as e:
            last_error = e
            if attempt == max_retries:
                # No attempt left: fail now instead of sleeping for nothing.
                break
            print(f"Rate limit error: {e}. Retrying in {delay} seconds...")
            time.sleep(delay)
            delay *= 2  # Exponential backoff
    raise Exception("Max retries exceeded for OpenAI API rate limit") from last_error


def handle_question_with_retry(question, retriever, retries=5, backoff_factor=2):
    """Call ``handle_question``, retrying on rate-limit flavoured API errors.

    An ``OpenAIError`` whose message contains ``"rate_limit_exceeded"`` is
    retried after ``backoff_factor ** attempt`` seconds; any other
    ``OpenAIError`` is re-raised immediately.

    Args:
        question: The question to answer.
        retriever: Retriever passed through to ``handle_question``.
        retries: Maximum number of attempts.
        backoff_factor: Base of the exponential wait between attempts.

    Returns:
        The response returned by ``handle_question``.

    Raises:
        OpenAIError: For API errors that are not rate-limit related.
        Exception: If all attempts failed with rate-limit errors. The last
            such error is attached as the cause.
    """
    last_error = None
    for attempt in range(retries):
        try:
            return handle_question(question, retriever)
        except OpenAIError as e:
            if "rate_limit_exceeded" not in str(e):
                # Bare raise keeps the original traceback intact.
                raise
            last_error = e
            if attempt == retries - 1:
                # Final attempt failed: skip the pointless wait.
                break
            wait_time = backoff_factor ** attempt
            print(f"Rate limit exceeded. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
    raise Exception(f"Failed after {retries} retries due to rate limit issues") from last_error

  warn_deprecated(


In [None]:
    # Evaluation questions put to the RAG chain.
    questions = [
        "What does Section 3: Courtesy car cover?",
        "What does Section 4: Accidental damage cover?",
        "What does Section 5: Windscreen damage cover?",
        "What does Section 6: Personal benefits cover?",
        "Who is covered under Motor Legal Cover?",
        "How does Guaranteed Hire Car Plus work?",
        "How can I make a complaint?"
    ]

    # One reference answer per question, in the same order. Kept as plain
    # strings because the evaluation dataset declares ground_truth as a
    # string column; wrapping each answer in a list gets it stored as the
    # list's repr ("['...']") instead of the answer text.
    ground_truths = [
        "Section 3: Courtesy car covers providing a courtesy car while your car is being repaired after an accident, fire, or theft.",
        "Section 4: Accidental damage covers damage to your car caused by an accident.",
        "Section 5: Windscreen damage covers repair or replacement of your car's windscreen.",
        "Section 6: Personal benefits cover includes personal accident benefits, medical expenses, and personal belongings.",
        "Motor Legal Cover includes the policyholder, named drivers, the registered keeper of the car, and passengers for motoring offense or road traffic accident cases.",
        "Guaranteed Hire Car Plus provides you with a hire car of a similar size to yours while your car is being repaired, written off, or stolen.",
        "To make a complaint, call 0800 051 0198 or write to Customer Relations Manager, Churchill Court, Westmoreland Road, Bromley BR1 1DP.",
    ]

#Evaluation with RAGAS

In [None]:
if __name__ == "__main__":
    # Index the policy booklet: extract text, chunk it, embed the chunks
    # and expose the vector store as a retriever.
    docs = ["/content/policy-booklet-0923.pdf"]
    raw_text = get_pdf_text(docs)
    text_chunks = get_chunks(raw_text)
    vectorstore = get_vectorstore(text_chunks)
    retriever = vectorstore.as_retriever()

    # For every question collect the generated answer and the retrieved
    # passages, which the evaluation below consumes as `contexts`.
    answers = []
    contexts = []

    for question in questions:
        response = handle_question_with_retry(question, retriever)
        answers.append(response)
        contexts.append([doc.page_content for doc in retriever.get_relevant_documents(question)])

    # The schema below declares ground_truth as a plain string. Flatten any
    # reference supplied as a list of strings so the real text is stored
    # rather than the list's repr ("['...']").
    reference_answers = [
        truth if isinstance(truth, str) else " ".join(truth)
        for truth in ground_truths
    ]

    features = Features({
        "question": Value("string"),
        "answer": Value("string"),
        "ground_truth": Value("string"),
        "contexts": Sequence(Value("string"))
    })

    dataset = Dataset.from_dict(
        {
            "question": questions,
            "answer": answers,
            "ground_truth": reference_answers,
            "contexts": contexts,
        },
        features=features,
    )

    # Score the answers and retrieved contexts with the four RAGAS metrics.
    result = evaluate(
        dataset=dataset,
        metrics=[
            context_precision,
            context_recall,
            faithfulness,
            answer_relevancy,
        ],
    )

    df = result.to_pandas()
    print(df)

    # Persist the per-question scores next to the input document.
    output_csv_path = "/content/evaluation_results.csv"
    df.to_csv(output_csv_path, index=False)
    print(f"Evaluation results saved to {output_csv_path}")



Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

                                        question  \
0       What does Section 3: Courtesy car cover?   
1  What does Section 4: Accidental damage cover?   
2  What does Section 5: Windscreen damage cover?   
3  What does Section 6: Personal benefits cover?   
4        Who is covered under Motor Legal Cover?   
5        How does Guaranteed Hire Car Plus work?   
6                    How can I make a complaint?   

                                              answer  \
0  Section 3: Courtesy car cover provides a small...   
1  Section 4: Accidental Damage cover includes da...   
2  Section 5: Windscreen damage covers the replac...   
3  Section 6: Personal benefits cover the additio...   
4  The policyholder, named drivers, registered ke...   
5  Guaranteed Hire Car Plus provides you with a s...   
6  To make a complaint, you can call the provided...   

                                        ground_truth  \
0  ['Section 3: Courtesy car covers providing a c...   
1  ['Section 4: Accide