In [1]:
import json
import os
import numpy as np
import pandas as pd
from datasets import load_dataset
from urllib.parse import parse_qs, urlparse
import requests
import chromadb
import openai
from chromadb.config import Settings
import os
import openai
from openai import OpenAI
import chromadb
import fitz  # PyMuPDF
from chromadb.utils import embedding_functions
import chromadb.utils.embedding_functions as embedding_functions
from chromadb.config import Settings
from pypdf import PdfReader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def extract_pdf_url(url):
    """
    Extracts the actual PDF URL from the given URL.
    Decodes it from base64 if necessary.
    """
    if url.lower().endswith('.pdf'):
        return url  # Direct PDF URL
    else:
        parsed_url = urlparse(url)
        query_params = parse_qs(parsed_url.query)
        pdf_target = query_params.get('pdfTarget', [None])[0]

        if pdf_target:
            pdf_url = base64.b64decode(pdf_target).decode('utf-8')
            return pdf_url
        else:
            raise ValueError("No valid PDF URL found in the provided URL")


def download_pdf(url, save_path):
    """
    Downloads a PDF from a given URL.
    """
    try:
        pdf_url = extract_pdf_url(url)
        response = requests.get(pdf_url, stream=True)
        response.raise_for_status()  # Ensure the request was successful
        if not(os.path.exists(save_path)):
            with open(save_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

            print(f"Downloaded PDF from: {pdf_url} to {save_path}")
    except Exception as e:
        print(f"Error downloading PDF: {e}")

def get_pages_from_pdf(path):
    reader = PdfReader(path)
    pages_text = []
    for idx, page in enumerate(reader.pages):
        pages_text.append(page.extract_text())
    return pages_text

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text

In [17]:
def display_chat_history(messages):
    for message in messages:
        print(f"{message['role'].capitalize()}: {message['content']}")

def get_assistant_response(messages, openai_api_key):
  client = OpenAI(api_key = openai_api_key)
  response = client.chat.completions.create(
      model="gpt-4o",
      messages=[{"role": m["role"], "content": m["content"]} for m in messages],
  )

  return response.choices[0].message.content

In [15]:
def query_openai_with_context(query, context, openai_api_key):
    
    template = """You are a financial chatbot trained to answer questions based on the information provided in 10-K
        documents. Your responses should be directly sourced from the content of these documents."""

    
    prompt = f"\nContext:\n{context}\n\nQuery: {query}\n\nAnswer:"

    messages = [
            {"role": "system", "content": template},
            {"role": "user", "content": prompt}
        ]
    response = get_assistant_response(messages, openai_api_key)
    
    return response

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bert_score import score

def calculate_cosine_similarity(text1, text2):
    # Create a TfidfVectorizer
    vectorizer = TfidfVectorizer()

    # Fit and transform the two texts
    tfidf_matrix = vectorizer.fit_transform([text1, text2])

    # Calculate cosine similarity
    cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])

    # Extract the similarity score
    similarity_score = cosine_sim[0][0]
    
    return similarity_score


def calculate_bertscore(candidate, reference):
    P, R, F1 = score([candidate], [reference], lang="en", verbose=True)
    return P.mean().item()

In [21]:
def evaluate_llm_responses(question, model_answer, refrence_answer, openai_api_key):
    for question in question:
        messages = [
            {"role": "system", "content": "You are an assistant that provides concise and accurate answers."},
            {"role": "user", "content": question}
        ]
        response = model_answer

    evaluation_scores = []
    for i in range(len(response)):
        evaluation_prompt = f"""
        Evaluate the following response against the reference answer. Assign a score between 0 and 1 based on correctness and provide a brief justification.

        Question: {question}
        Response: {model_answer}
        Reference Answer: {refrence_answer}

        Score (0 to 1):
        Justification:
        """
        messages = [
            {"role": "system", "content": "You are an evaluator that scores responses based on correctness."},
            {"role": "user", "content": evaluation_prompt}
        ]
        evaluation_response = get_assistant_response(messages, openai_api_key)

        evaluation_text = evaluation_response.strip()
        try:
            score_line = evaluation_text.split('\n')[0]
            score = float(score_line.split(':')[1].strip())
            evaluation_scores.append(score)
        except Exception as e:
            print(f"Error parsing score: {e}")
            evaluation_scores.append(0.0)

    average_score = sum(evaluation_scores) / len(evaluation_scores) if evaluation_scores else 0
    print(f'Average Correctness Score: {average_score:.2f}')
    return average_score

In [23]:
def main(): 
    results_list = []
    dataset = load_dataset("PatronusAI/financebench")
    df = pd.DataFrame(dataset['train'])
    # change to be the whole dataset when ready
    test = df[:5]

    for index, row in test.iterrows():
        download_dir = "pdf_documents"
        os.makedirs(download_dir, exist_ok=True)
        doc_link = row['doc_link']
        doc_name = row['doc_name']
        question = row['question']
        ref_answer = row['answer']
        ref_context = row['evidence_text']
        # doc_path = os.path.join(download_dir, f"{doc_name}.pdf")

        # download_pdf(doc_link, doc_path)
        
        model_answer = query_openai_with_context(question, ref_context, openai_api_key)
        print(model_answer)

        # Evaluation for structured QA 
        cosine_similarity_score = calculate_cosine_similarity(model_answer, ref_answer)
        bert_score = calculate_bertscore(model_answer, ref_answer)
        llm_eval = evaluate_llm_responses(question, model_answer, ref_answer, openai_api_key)

        # Append results to the list
        results_list.append({
            'doc_name': doc_name,
            'question': question,
            'ref_answer': ref_answer,
            'model_answer': model_answer,
            'cosine_similarity': cosine_similarity_score,
            'bert_score': bert_score,
            'llm_eval': llm_eval
        })

    # Convert the list of dictionaries to a DataFrame
    results_df = pd.DataFrame(results_list)

    # Save results to CSV
    results_df.to_csv('fake_rag_results.csv', index=False)

In [24]:
main()

The FY2018 capital expenditure amount for 3M, which corresponds to the "Purchases of property, plant and equipment (PP&E)" line item, is $1,577 million.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  3.73it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 190.00it/s]


done in 0.27 seconds, 3.71 sentences/sec
Average Correctness Score: 1.00
The year-end FY2018 net Property, Plant, and Equipment (PP&E) for 3M is $8.738 billion.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  4.76it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 500.04it/s]


done in 0.21 seconds, 4.70 sentences/sec
Error parsing score: list index out of range
Average Correctness Score: 0.73
Based on the FY2022 data available in the 3M Company's 10-K filing, 3M appears to be a capital-intensive business. This can be inferred from the substantial investments in purchases of property, plant, and equipment (PP&E), which amounted to $1,749 million in FY2022. Additionally, the net value of property, plant, and equipment was $9,178 million as of December 31, 2022, indicating significant capital assets.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  1.68it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 333.44it/s]


done in 0.60 seconds, 1.68 sentences/sec
Error parsing score: could not convert string to float: '0**'
Error parsing score: could not convert string to float: '** 0.5'
Error parsing score: could not convert string to float: '0.5**'
Error parsing score: could not convert string to float: '0.5**'
Error parsing score: could not convert string to float: '** 0.5'
Error parsing score: could not convert string to float: '0.5**'
Error parsing score: could not convert string to float: '** 0.5'
Average Correctness Score: 0.43
Operating margin change for 3M as of FY2022 was driven by several factors:

1. **Increased SG&A Expenses**: Special item costs such as significant litigation expenses related to the Combat Arms Earplugs litigation, impairment costs due to exiting PFAS manufacturing, costs associated with exiting Russia, and divestiture-related restructuring charges.
2. **Investments in Growth Initiatives**: Continued investment in key growth initiatives contributed to higher SG&A as a perce

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:01<00:00,  1.07s/it]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 188.23it/s]


done in 1.08 seconds, 0.93 sentences/sec
Error parsing score: could not convert string to float: ''
Error parsing score: could not convert string to float: '0.8**'
Error parsing score: could not convert string to float: '0.8**'
Error parsing score: could not convert string to float: '** 0.8'
Error parsing score: could not convert string to float: '** 0.5'
Error parsing score: could not convert string to float: '** 0.8'
Error parsing score: could not convert string to float: '** 0.8'
Error parsing score: could not convert string to float: '0.75**'
Error parsing score: could not convert string to float: '0.8**'
Error parsing score: could not convert string to float: '0.7**'
Error parsing score: could not convert string to float: '0.8**'
Error parsing score: could not convert string to float: '0.8**'
Error parsing score: could not convert string to float: ''
Error parsing score: could not convert string to float: '** 0.8'
Error parsing score: could not convert string to float: '0.8**'
Err

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}