In [32]:
import base64
import glob
import os
import re
import uuid
from urllib.parse import parse_qs, urlparse

import chromadb
import fitz  # PyMuPDF
import openai
import pandas as pd
import requests
from bert_score import score
from chromadb.config import Settings
from chromadb.utils import embedding_functions
from datasets import load_dataset
from dotenv import load_dotenv
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.chat_models import ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_experimental.text_splitter import SemanticChunker
# The openai client classes, uuid and OpenAIEmbeddings were originally imported
# further down in Hersh's chunking.ipynb.
from langchain_openai.embeddings import OpenAIEmbeddings
from openai import OpenAI, ChatCompletion
from rich import print
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load variables from a local .env file into the process environment.
# NOTE(review): the recorded output of this cell is False, which suggests no
# .env file was loaded -- confirm OPENAI_API_KEY is available before running on.
load_dotenv()

False

In [33]:
def extract_pdf_url(url):
    """Resolve a document link to the URL of the underlying PDF.

    Supported link shapes, checked in this order:

    1. the whole link ends in ``.pdf`` (case-insensitive): returned unchanged;
    2. a viewer link carrying the real location as a base64-encoded
       ``pdfTarget`` query parameter: the decoded value is returned;
    3. the *path* of the link ends in ``.pdf`` even though a query string
       follows (e.g. ``report.pdf?download=1``): returned unchanged.

    Args:
        url: Link to resolve.

    Returns:
        The PDF URL as a string.

    Raises:
        ValueError: If none of the shapes above match, or the ``pdfTarget``
            payload cannot be decoded (``binascii.Error`` and
            ``UnicodeDecodeError`` are both ``ValueError`` subclasses).
    """
    if url.lower().endswith('.pdf'):
        return url  # Direct PDF URL

    parsed_url = urlparse(url)
    pdf_target = parse_qs(parsed_url.query).get('pdfTarget', [None])[0]

    if pdf_target:
        # parse_qs decodes '+' as a space, but '+' is a legal base64 character;
        # b64decode would silently drop the space and corrupt the result.
        encoded = pdf_target.replace(' ', '+')
        # Restore '=' padding, which is frequently stripped from query values.
        encoded += '=' * (-len(encoded) % 4)
        return base64.b64decode(encoded).decode('utf-8')

    if parsed_url.path.lower().endswith('.pdf'):
        return url  # Direct PDF URL followed by a query string / fragment

    raise ValueError("No valid PDF URL found in the provided URL")

In [34]:
def download_pdf(url, save_path, timeout=30):
    """Download the PDF behind ``url`` to ``save_path`` unless it already exists.

    The download is best-effort: any failure is reported with ``print`` and
    swallowed, so callers must check that ``save_path`` exists afterwards.

    Args:
        url: Direct PDF link or viewer link understood by ``extract_pdf_url``.
        save_path: Destination file path.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None.
    """
    # Check the local copy first so a cached file costs no network request.
    if os.path.exists(save_path):
        return

    # Stream into a sibling temp file and rename at the end, so an interrupted
    # download never leaves a truncated PDF that later runs would treat as cached.
    partial_path = f"{save_path}.part"
    try:
        pdf_url = extract_pdf_url(url)
        with requests.get(pdf_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Ensure the request was successful
            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        os.replace(partial_path, save_path)
        print(f"Downloaded PDF from: {pdf_url} to {save_path}")
    except Exception as e:
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"Error downloading PDF: {e}")

In [35]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string holding the text of all pages.
    """
    # The context manager closes the document (and its file handle) even if a
    # page fails to render; join avoids repeated string re-allocation.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

In [36]:
# Take the key from the environment rather than hard-coding it in the notebook;
# falls back to an empty string when the variable is not set.
openai_api_key = os.getenv("OPENAI_API_KEY", "")


def create_embeddings(documents, openai_api_key, batch_size=256):
    """Embed the text of each document with OpenAI's ``text-embedding-ada-002``.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string.
        openai_api_key: API key passed to the embedding function.
        batch_size: Maximum number of texts sent per embedding request, so a
            large document is not submitted as one oversized request.

    Returns:
        A list with one embedding vector per document, in input order.
        An empty input yields an empty list without any API call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    sentences = [doc.page_content for doc in documents]
    if not sentences:
        return []

    openai_ef = embedding_functions.OpenAIEmbeddingFunction(
        api_key=openai_api_key,
        model_name="text-embedding-ada-002"
    )
    vectors = []
    for start in range(0, len(sentences), batch_size):
        vectors.extend(openai_ef(sentences[start:start + batch_size]))
    return vectors

In [37]:
class Document:
    """Minimal container pairing a piece of text with optional metadata.

    NOTE(review): this definition shadows the ``Document`` class imported from
    ``langchain.docstore.document`` at the top of the notebook.
    """

    def __init__(self, page_content, metadata=None):
        # A falsy ``metadata`` (None, empty dict, ...) is replaced by a fresh dict.
        self.metadata = metadata or {}
        self.page_content = page_content

class AgenticChunker:
    """Group propositions into topical chunks, using an LLM to route and label them.

    Each chunk is a dict with the keys ``chunk_id``, ``propositions``, ``title``,
    ``summary`` and ``chunk_index``; chunks are stored in ``self.chunks`` keyed
    by ``chunk_id``.
    """

    def __init__(self, llm=None):
        """Create an empty chunker.

        Args:
            llm: Optional LangChain chat model to use. When omitted, a
                ``ChatOpenAI`` (``gpt-3.5-turbo``, temperature 0) is created.
        """
        self.chunks = {}
        self.id_truncate_limit = 5
        self.generate_new_metadata_ind = True
        self.print_logging = True
        if llm is None:
            # ``openai.ChatCompletion`` is not a LangChain runnable and cannot be
            # piped after a ChatPromptTemplate, so the LangChain chat wrapper is
            # used. An empty key becomes None so the OPENAI_API_KEY environment
            # variable is consulted instead.
            llm = ChatOpenAI(openai_api_key=openai_api_key or None, model='gpt-3.5-turbo', temperature=0)
        self.llm = llm

    def _ask_llm(self, prompt, variables):
        """Run ``prompt`` through the LLM and return the text of the reply."""
        return (prompt | self.llm).invoke(variables).content

    def add_propositions(self, propositions):
        """Add every proposition in ``propositions``, in order."""
        for proposition in propositions:
            self.add_proposition(proposition)

    def add_proposition(self, proposition):
        """Route one proposition to an existing chunk, or open a new chunk for it.

        Blank propositions are ignored so they do not cost LLM calls or create
        empty chunks.
        """
        if not proposition or not proposition.strip():
            return
        if self.print_logging:
            print (f"\nAdding: '{proposition}'")
        if len(self.chunks) == 0:
            if self.print_logging:
                print ("No chunks, creating a new one")
            self._create_new_chunk(proposition)
            return
        chunk_id = self._find_relevant_chunk(proposition)
        if chunk_id:
            if self.print_logging:
                print (f"Chunk Found ({self.chunks[chunk_id]['chunk_id']}), adding to: {self.chunks[chunk_id]['title']}")
            self.add_proposition_to_chunk(chunk_id, proposition)
        else:
            if self.print_logging:
                print ("No chunks found")
            self._create_new_chunk(proposition)

    def add_proposition_to_chunk(self, chunk_id, proposition):
        """Append ``proposition`` to a chunk and, if enabled, refresh its summary and title."""
        self.chunks[chunk_id]['propositions'].append(proposition)
        if self.generate_new_metadata_ind:
            self.chunks[chunk_id]['summary'] = self._update_chunk_summary(self.chunks[chunk_id])
            self.chunks[chunk_id]['title'] = self._update_chunk_title(self.chunks[chunk_id])

    def _update_chunk_summary(self, chunk):
        """Ask the LLM for a new one-sentence summary of ``chunk``."""
        PROMPT = ChatPromptTemplate.from_messages(
            [
                ("system", """
                You are the steward of a group of chunks which represent groups of sentences that talk about a similar topic.
                A new proposition was just added to one of your chunks, generate a brief 1-sentence summary for the chunk.
                """),
                ("user", "Chunk's propositions:\n{proposition}\n\nCurrent chunk summary:\n{current_summary}"),
            ]
        )
        return self._ask_llm(PROMPT, {
            "proposition": "\n".join(chunk['propositions']),
            "current_summary": chunk['summary']
        })

    def _update_chunk_title(self, chunk):
        """Ask the LLM for an updated title for ``chunk``."""
        PROMPT = ChatPromptTemplate.from_messages(
            [
                ("system", """
                You are the steward of a group of chunks which represent groups of sentences that talk about a similar topic.
                A new proposition was just added to one of your chunks, generate a brief updated chunk title.
                """),
                ("user", "Chunk's propositions:\n{proposition}\n\nChunk summary:\n{current_summary}\n\nCurrent chunk title:\n{current_title}"),
            ]
        )
        return self._ask_llm(PROMPT, {
            "proposition": "\n".join(chunk['propositions']),
            "current_summary": chunk['summary'],
            "current_title": chunk['title']
        })

    def _get_new_chunk_summary(self, proposition):
        """Ask the LLM for a one-sentence summary of a chunk seeded by ``proposition``."""
        PROMPT = ChatPromptTemplate.from_messages(
            [
                ("system", """
                You are the steward of a group of chunks which represent groups of sentences that talk about a similar topic.
                Generate a brief 1-sentence summary which will inform viewers what a new chunk is about.
                Only respond with the new chunk summary, nothing else.
                """),
                ("user", "Determine the summary of the new chunk that this proposition will go into:\n{proposition}"),
            ]
        )
        return self._ask_llm(PROMPT, {"proposition": proposition})

    def _get_new_chunk_title(self, summary):
        """Ask the LLM for a short title matching a chunk ``summary``."""
        PROMPT = ChatPromptTemplate.from_messages(
            [
                ("system", """
                You are the steward of a group of chunks which represent groups of sentences that talk about a similar topic.
                Generate a brief, few-word chunk title which will inform viewers what a chunk is about.
                Only respond with the new chunk title, nothing else.
                """),
                ("user", "Determine the title of the chunk that this summary belongs to:\n{summary}"),
            ]
        )
        return self._ask_llm(PROMPT, {"summary": summary})

    def get_chunk_outline(self):
        """Return a plain-text listing of every chunk's id, title and summary."""
        return "".join(
            f"Chunk ID: {chunk['chunk_id']}\nChunk Name: {chunk['title']}\nChunk Summary: {chunk['summary']}\n\n"
            for chunk in self.chunks.values()
        )

    def _find_relevant_chunk(self, proposition):
        """Return the id of the chunk ``proposition`` belongs to, or None.

        The LLM is shown the current chunk outline; its reply is accepted only
        if it contains the id of an existing chunk as a stand-alone token.
        """
        PROMPT = ChatPromptTemplate.from_messages(
            [
                ("system", """
                Determine whether or not the "Proposition" should belong to any of the existing chunks.
                A proposition should belong to a chunk if their meaning, direction, or intention are similar.
                If you think the proposition should be joined with a chunk, return only the chunk id.
                If you do not think the proposition should be combined with any existing chunk, return only "No chunks".
                """),
                ("user", "Current Chunks:\n--Start of current chunks--\n{current_chunk_outline}\n--End of current chunks--"),
                ("user", "Determine if the following statement should belong to one of the chunks outlined:\n{proposition}"),
            ]
        )
        response = self._ask_llm(PROMPT, {
            "proposition": proposition,
            "current_chunk_outline": self.get_chunk_outline()
        })
        # Compare whole tokens (minus surrounding punctuation) so the model may
        # wrap the id in quotes or a short sentence without breaking the match.
        tokens = {token.strip("\"'`.,:;()[]") for token in response.split()}
        for chunk_id in self.chunks:
            if chunk_id in tokens:
                return chunk_id
        return None

    def _create_new_chunk(self, proposition):
        """Open a new chunk containing ``proposition`` with an LLM-generated summary and title."""
        new_chunk_id = str(uuid.uuid4())[:self.id_truncate_limit]
        # Truncated ids can collide; never overwrite an existing chunk.
        while new_chunk_id in self.chunks:
            new_chunk_id = str(uuid.uuid4())[:self.id_truncate_limit]
        new_chunk_summary = self._get_new_chunk_summary(proposition)
        new_chunk_title = self._get_new_chunk_title(new_chunk_summary)
        self.chunks[new_chunk_id] = {
            'chunk_id': new_chunk_id,
            'propositions': [proposition],
            'title': new_chunk_title,
            'summary': new_chunk_summary,
            'chunk_index': len(self.chunks)
        }
        if self.print_logging:
            print(f"Created new chunk ({new_chunk_id}): {new_chunk_title}")

    def get_chunks(self, get_type='dict'):
        """Return the chunks in the requested representation.

        Args:
            get_type: ``'dict'`` for the internal id -> chunk mapping, or
                ``'list_of_strings'`` for one space-joined string per chunk.

        Raises:
            ValueError: If ``get_type`` is not one of the supported values.
        """
        if get_type == 'dict':
            return self.chunks
        if get_type == 'list_of_strings':
            return [" ".join(chunk['propositions']) for chunk in self.chunks.values()]
        raise ValueError(f"Unknown get_type: {get_type!r}")

    def pretty_print_chunks(self):
        """Print every chunk's index, id, summary and propositions."""
        print(f"\nYou have {len(self.chunks)} chunks\n")
        for chunk_id, chunk in self.chunks.items():
            print(f"Chunk #{chunk['chunk_index']}")
            print(f"Chunk ID: {chunk_id}")
            print(f"Summary: {chunk['summary']}")
            print("Propositions:")
            for prop in chunk['propositions']:
                print(f"    -{prop}")
            print("\n\n")

In [38]:
def chunk_text(text, method="character", chunk_size=100, chunk_overlap=0):
    """Split ``text`` into documents using the requested chunking strategy.

    Args:
        text: Text to split.
        method: ``"character"``, ``"recursive"``, ``"semantic"`` or ``"agentic"``.
        chunk_size: Target chunk size; used by the character and recursive
            splitters only.
        chunk_overlap: Overlap between consecutive chunks; used by the
            character and recursive splitters only.

    Returns:
        A list of document objects exposing ``page_content``.

    Raises:
        ValueError: If ``method`` is not one of the supported strategies.
    """
    if method == "character":
        text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        return text_splitter.create_documents([text])

    if method == "recursive":
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        return text_splitter.create_documents([text])

    if method == "semantic":
        text_splitter = SemanticChunker(OpenAIEmbeddings(), breakpoint_threshold_type="percentile")
        return text_splitter.create_documents([text])

    if method == "agentic":
        # Split after sentence-ending punctuation that is followed by whitespace.
        # A bare split on '.' would cut decimals such as "3.5" in half and hand
        # blank fragments to the (LLM-backed, per-proposition) chunker.
        sentences = [
            sentence.strip()
            for sentence in re.split(r'(?<=[.!?])\s+', text)
            if sentence.strip()
        ]
        ac = AgenticChunker()
        ac.add_propositions(sentences)
        chunks = ac.get_chunks(get_type='list_of_strings')
        return [Document(page_content=chunk) for chunk in chunks]

    raise ValueError("Unknown chunking method")

In [39]:
def store_embeddings_in_chroma(documents, vectors, collection_name="Finance_bench_documents", batch_size=1000):
    """Write documents and their embedding vectors to a Chroma collection.

    Each item is stored under the id ``id_<position>`` together with its text
    (as the Chroma document and under the ``"sentence"`` metadata key). Items
    are upserted, so calling this again for the same collection replaces the
    entries with matching ids instead of failing on duplicates.

    Args:
        documents: Sequence of objects exposing ``page_content``.
        vectors: Sequence of embedding vectors, one per document, same order.
        collection_name: Name of the collection to create or reuse.
        batch_size: Maximum number of items written per Chroma call.

    Raises:
        ValueError: If ``documents`` and ``vectors`` differ in length, or
            ``batch_size`` is smaller than 1.
    """
    documents = list(documents)
    vectors = list(vectors)
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if len(documents) != len(vectors):
        # zip() would silently drop the unmatched tail.
        raise ValueError(
            f"Got {len(documents)} documents but {len(vectors)} vectors; they must match one-to-one"
        )

    # The client lives in the chromadb package; LangChain's ``Chroma``
    # vector-store wrapper has no ``Client`` attribute.
    client = chromadb.Client(Settings())
    collection = client.get_or_create_collection(name=collection_name)

    for start in range(0, len(documents), batch_size):
        texts = [doc.page_content for doc in documents[start:start + batch_size]]
        collection.upsert(
            ids=[f"id_{i}" for i in range(start, start + len(texts))],
            embeddings=vectors[start:start + len(texts)],
            documents=texts,  # stored so retrievers can return the chunk text
            metadatas=[{"sentence": sentence} for sentence in texts],
        )

    print(f"Stored {len(documents)} vectors in the Chroma vector database.")

# ``text-davinci-003`` is a retired completions model and is not served by the
# chat endpoint that ChatOpenAI calls; use the same chat model as AgenticChunker.
# Temperature 0 keeps answers and evaluation scores as repeatable as possible.
# ChatOpenAI needs OPENAI_API_KEY in the environment (or ``openai_api_key=...``).
local_llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

ValidationError: 1 validation error for ChatOpenAI
__root__
  Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter. (type=value_error)

In [None]:
def rag(question, collection_name="Finance_bench_documents"):
    """Answer ``question`` from the chunks stored in a Chroma collection.

    The question is embedded with ``text-embedding-ada-002`` (the model used
    by ``create_embeddings``), the closest chunks are retrieved, and both the
    retrieved text and the question are passed to ``local_llm``.

    Args:
        question: Question to answer.
        collection_name: Name of an existing, populated Chroma collection.

    Returns:
        The model's answer as a string.
    """
    client = chromadb.Client(Settings())
    # Fail fast if the collection was never populated (chromadb raises here).
    client.get_collection(collection_name)

    # A raw chromadb collection has no ``as_retriever``; wrap it in LangChain's
    # vector store, which embeds the query and returns the stored chunk text.
    vector_store = Chroma(
        client=client,
        collection_name=collection_name,
        embedding_function=OpenAIEmbeddings(model="text-embedding-ada-002"),
    )
    retriever = vector_store.as_retriever()

    # The {context} and {question} placeholders are required: without them the
    # retrieved chunks and the question itself never reach the model.
    template = """You are a financial chatbot trained to answer questions based on the information provided in 10-K
    documents. Your responses should be directly sourced from the content of these documents. When asked
    a question, ensure that your answer is explicitly supported by the text in the 10-K filing, and do not
    include any external information, interpretations, or assumptions not clearly stated in the document. If
    a question pertains to financial data or analysis that is not explicitly covered in the 10-K filing provided,
    respond by stating that the information is not available in the document. Your primary focus should
    be on accuracy, specificity, and adherence to the information in 10-K documents, particularly regarding
    financial statements, company performance, and market position.

    Context:
    {context}

    Question: {question}"""

    prompt_template = ChatPromptTemplate.from_template(template)

    def _format_docs(docs):
        # Feed the model plain chunk text instead of the repr of Document objects.
        return "\n\n".join(doc.page_content for doc in docs)

    chain = (
        {"context": retriever | _format_docs, "question": RunnablePassthrough()}
        | prompt_template
        | local_llm
        | StrOutputParser()
    )
    return chain.invoke(question)

In [None]:
def calculate_cosine_similarity(text1, text2):
    """Return the TF-IDF cosine similarity between two texts.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A float; 0.0 when either text is missing or blank, or when the texts
        contain no tokens the vectorizer can use.
    """
    if not text1 or not text2 or not text1.strip() or not text2.strip():
        return 0.0

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    except ValueError:
        # Raised by the vectorizer when the vocabulary ends up empty.
        return 0.0

    cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return float(cosine_sim[0][0])

In [None]:
def calculate_bertscore(candidate, reference):
    """Return the BERTScore precision of ``candidate`` against ``reference``.

    NOTE(review): only the precision component is returned; recall and F1 are
    computed but discarded -- confirm that precision is the intended metric.
    """
    precision, _recall, _f1 = score([candidate], [reference], lang="en", verbose=True)
    mean_precision = precision.mean()
    return mean_precision.item()

In [None]:
def _parse_correctness_score(evaluation_text):
    """Extract a score from an evaluator reply, clamped to [0, 1]; None if absent."""
    number = r'([0-9]*\.?[0-9]+)'
    # Prefer a number that follows a "Score ...:" label anywhere in the reply;
    # otherwise fall back to the first number that appears at all.
    match = re.search(r'score[^:\n]*:\s*' + number, evaluation_text, flags=re.IGNORECASE)
    if match is None:
        match = re.search(number, evaluation_text)
    if match is None:
        return None
    return min(1.0, max(0.0, float(match.group(1))))


def evaluate_llm_responses(question, model_answer, reference_answer, llm=None):
    """Have an LLM grade ``model_answer`` against ``reference_answer``.

    Args:
        question: The question that was asked.
        model_answer: The answer to grade.
        reference_answer: The text the answer is graded against.
        llm: Optional chat model exposing ``invoke``; defaults to ``local_llm``.

    Returns:
        The correctness score as a float in [0, 1]; 0.0 when no score can be
        parsed from the evaluator's reply.
    """
    evaluator = local_llm if llm is None else llm
    # (role, content) pairs are passed to ``invoke``; the reply is a message
    # object whose text lives in ``.content``.
    messages = [
        ("system", "You are an evaluator that scores responses based on correctness."),
        ("user", f"""
        Evaluate the following response against the reference answer. Assign a score between 0 and 1 based on correctness and provide a brief justification.

        Question: {question}
        Response: {model_answer}
        Reference Answer: {reference_answer}

        Score (0 to 1):
        Justification:
        """),
    ]
    evaluation_text = evaluator.invoke(messages).content.strip()

    correctness = _parse_correctness_score(evaluation_text)
    if correctness is None:
        print(f"Error parsing score: no numeric score found in {evaluation_text!r}")
        correctness = 0.0

    print(f'Average Correctness Score: {correctness:.2f}')
    return correctness

In [None]:
def seed_chroma():
    """Populate the persistent ``FinanceBench_Embeddings`` collection from stored PDFs.

    Does nothing (beyond printing a notice) when the collection already
    exists. Otherwise every ``*.pdf`` under ``path_to_pdf_storage`` whose path
    does not contain ``.pdf.pdf`` is split into pages and each page is added
    with the id ``<file index>_<page index>`` and its source path as metadata.

    NOTE(review): ``path_to_pdf_storage`` and ``get_pages_from_pdf`` are not
    defined in the visible part of this notebook -- confirm where they come from.
    """
    client = chromadb.PersistentClient(path=f'{path_to_pdf_storage}/chromadb.db')
    collection_name = "FinanceBench_Embeddings"

    existing_names = [c.name for c in client.list_collections()]
    if collection_name in existing_names:
        print('already exists - returning')
        return
    collection = client.create_collection(name=collection_name)

    files = [
        pdf_path
        for pdf_path in glob.glob(f'{path_to_pdf_storage}/*.pdf')
        if ".pdf.pdf" not in pdf_path
    ]
    print('files is:', files)

    for idx, path in enumerate(files):
        pages = get_pages_from_pdf(path)
        my_ids = [f'{idx}_{page_no}' for page_no, _ in enumerate(pages)]
        page_metadata = [{'doc_path': path}] * len(my_ids)
        collection.add(documents=pages, ids=my_ids, metadatas=page_metadata)

In [None]:
def _collection_name_for(method, doc_name):
    """Build a Chroma-safe collection name unique to one (method, document) pair."""
    safe_doc = re.sub(r'[^A-Za-z0-9_-]', '_', str(doc_name))
    # Keep the name short and ending in an alphanumeric character.
    # NOTE(review): very long document names are truncated and could collide.
    return f"Finance_bench_{method}_{safe_doc}"[:63].rstrip('_-')


def evaluate_chunking_techniques(df, openai_api_key=None):
    """Run the RAG pipeline with each chunking method and score the answers.

    For every chunking method and every row of ``df`` the source PDF is
    downloaded, chunked, embedded and stored, then the row's question is
    answered and scored. Each (method, document) pair gets its own collection
    and is indexed once, so chunks of different documents never mix and a
    document shared by several questions is not re-embedded.

    Args:
        df: DataFrame with the columns ``doc_link``, ``doc_name``,
            ``question``, ``answer`` and ``evidence_text``.
        openai_api_key: Key used for embeddings; when omitted or empty, the
            ``OPENAI_API_KEY`` environment variable is used.

    Returns:
        A DataFrame with one row per (method, question) holding the answers
        and the ``cosine_similarity``, ``bert_score`` and ``llm_eval`` scores.
        Rows whose PDF could not be downloaded are skipped.
    """
    api_key = openai_api_key or os.getenv("OPENAI_API_KEY", "")
    chunking_methods = ["character", "recursive", "semantic", "agentic"]
    download_dir = "pdf_documents"
    indexed_collections = set()
    results = []

    for method in chunking_methods:
        print(f"Evaluating chunking method: {method}")

        for _, row in df.iterrows():
            pdf_url = row['doc_link']
            doc_name = row['doc_name']
            question = row['question']
            ref_answer = row['answer']
            ref_context = row['evidence_text']

            doc_path = os.path.join(download_dir, f"{doc_name}.pdf")
            collection_name = _collection_name_for(method, doc_name)

            if collection_name not in indexed_collections:
                os.makedirs(download_dir, exist_ok=True)
                download_pdf(pdf_url, doc_path)
                # download_pdf only reports failures, so check the file itself.
                if not os.path.exists(doc_path):
                    print(f"Skipping {doc_name}: PDF is not available at {doc_path}")
                    continue

                text = extract_text_from_pdf(doc_path)
                documents = chunk_text(text, method=method)
                vectors = create_embeddings(documents, api_key)
                store_embeddings_in_chroma(documents, vectors, collection_name=collection_name)
                indexed_collections.add(collection_name)

            model_answer = rag(question, collection_name=collection_name)

            # NOTE(review): all three metrics compare against the evidence text,
            # not ``ref_answer`` -- confirm this is intended for the LLM grade.
            cosine_similarity_score = calculate_cosine_similarity(model_answer, ref_context)
            bert_score_value = calculate_bertscore(model_answer, ref_context)
            llm_eval = evaluate_llm_responses(question, model_answer, ref_context)

            results.append({
                "doc_name" : doc_name,
                "method": method,
                "question": question,
                "ref_answer": ref_answer,
                "model_answer": model_answer,
                "cosine_similarity": cosine_similarity_score,
                "bert_score": bert_score_value,
                "llm_eval": llm_eval
            })

    return pd.DataFrame(results)

In [None]:
def main(n_rows=5):
    """Load FinanceBench and return its first ``n_rows`` rows as the test sample.

    Also makes sure the ``documents_QE`` directory exists.

    Args:
        n_rows: Number of leading rows of the ``train`` split to return.

    Returns:
        A DataFrame with the first ``n_rows`` rows of the dataset.
    """
    dataset = load_dataset("PatronusAI/financebench")
    df = pd.DataFrame(dataset['train'])
    test = df.head(n_rows)
    download_dir = "documents_QE"
    if not(os.path.exists(download_dir)):
        os.makedirs(download_dir, exist_ok=True)
    return test


    
test_data = main()
# evaluate_chunking_techniques takes the API key as its second argument; prefer
# the environment variable and fall back to the notebook-level value.
results_df = evaluate_chunking_techniques(test_data, os.getenv("OPENAI_API_KEY", openai_api_key))
# Save results to a CSV file
results_df.to_csv("chunking_evaluation_results.csv", index=False)

# # Calculate metrics
# def calculate_metrics(results_df):
#     metrics = results_df.groupby('method').agg({
#         'cosine_similarity': 'mean',
#         'bert_score': 'mean',
#         'llm_eval': 'mean'
#     }).reset_index()
#     return metrics

# metrics = calculate_metrics(results_df)
# print(metrics)

