In [1]:
import json
import os
import numpy as np
import pandas as pd
from datasets import load_dataset
from urllib.parse import parse_qs, urlparse
import requests
import chromadb
import openai
from chromadb.config import Settings
import os
import openai
from openai import OpenAI
import chromadb
import fitz  # PyMuPDF
from chromadb.utils import embedding_functions
import chromadb.utils.embedding_functions as embedding_functions
from chromadb.config import Settings
import base64

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

In [2]:
import base64


def extract_pdf_url(url):
    """
    Extracts the actual PDF URL from the given URL.
    Decodes it from base64 if necessary.
    """
    if url.lower().endswith('.pdf'):
        return url  # Direct PDF URL
    else:
        parsed_url = urlparse(url)
        query_params = parse_qs(parsed_url.query)
        pdf_target = query_params.get('pdfTarget', [None])[0]

        if pdf_target:
            pdf_url = base64.b64decode(pdf_target).decode('utf-8')
            return pdf_url
        else:
            raise ValueError("No valid PDF URL found in the provided URL")

def download_pdf(url, save_path):
    """
    Downloads a PDF from a given URL.
    """
    try:
        pdf_url = extract_pdf_url(url)
        response = requests.get(pdf_url, stream=True)
        response.raise_for_status()  # Ensure the request was successful

        with open(save_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

        print(f"Downloaded PDF from: {pdf_url} to {save_path}")
    except Exception as e:
        print(f"Error downloading PDF: {e}")
        
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text

In [8]:

def create_embeddings(documents, openai_api_key):
    openai_ef = embedding_functions.OpenAIEmbeddingFunction(
        api_key=openai_api_key,
        model_name="text-embedding-ada-002"
    )
    sentences = [doc.page_content for doc in documents]

    openai_ef = embedding_functions.OpenAIEmbeddingFunction(
        api_key=openai_api_key,
        model_name="text-embedding-ada-002"
    )
    
    # Batch processing for embeddings
    vectors = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        if len(batch) > 0:  # Ensure batch is not empty
            batch_vectors = openai_ef(batch)
            vectors.extend(batch_vectors)
    vectors = openai_ef(sentences)
    return vectors

In [14]:
def chunk_text(text, method="character", chunk_size=1000, chunk_overlap=0):
    if method == "character":
        text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    elif method == "recursive":
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        documents = text_splitter.create_documents([text])
        return documents

In [9]:
def store_embeddings_in_chroma(documents, vectors, collection_name="Finance_bench_documents"):
    client = chromadb.Client()
    collection = client.get_or_create_collection(name=collection_name)

    for i, (doc, vector) in enumerate(zip(documents, vectors)):
        collection.upsert(f"id_{i}", vector, {"sentence": doc.page_content})

    print(f"Stored {len(documents)} vectors in the Chroma vector database.")

In [5]:
def display_chat_history(messages):
    for message in messages:
        print(f"{message['role'].capitalize()}: {message['content']}")

def get_assistant_response(messages, openai_api_key):
  client = OpenAI(api_key = openai_api_key)
  response = client.chat.completions.create(
      model="gpt-4o",
      messages=[{"role": m["role"], "content": m["content"]} for m in messages],
  )

  return response.choices[0].message.content

In [7]:
def query_openai_with_context(query, collection_name="Finance_bench_documents", top_k=2):
    # Initialize Chroma client
    client = chromadb.Client()

    collection = client.get_collection(collection_name)

    results = collection.query(
    query_text =[query], # Chroma will embed this for you
    n_results=top_k # how many results to return
    )
    print(results)
    
    # Formulate the prompt for OpenAI with context
    template = """You are a financial chatbot trained to answer questions based on the information provided in 10-K
    documents. Your responses should be directly sourced from the content of these documents. When asked
    a question, ensure that your answer is explicitly supported by the text in the 10-K filing, and do not
    include any external information, interpretations, or assumptions not clearly stated in the document. If
    a question pertains to financial data or analysis that is not explicitly covered in the 10-K filing provided,
    respond by stating that the information is not available in the document. Your primary focus should
    be on accuracy, specificity, and adherence to the information in 10-K documents, particularly regarding
    financial statements, company performance, and market position."""
    
    prompt = f"\nContext:\n{results}\n\nQuery: {query}\n\nAnswer:"
    
    # Query the OpenAI model
    #openai.api_key = openai_api_key
    # client = OpenAI(api_key = openai_api_key)
    # response = client.chat.completions.create(
    #     model="gpt-3.5-turbo",  # Choose the appropriate model
    #     messages=[
    #         {"role": "system", "content": template},
    #         {"role": "user", "content": f"Context:\n{results}\n\nQuery: {query}\n\nAnswer:"}
    #     ],
    #     max_tokens=150
    # )

    messages = [
            {"role": "system", "content": template},
            {"role": "user", "content": prompt}
        ]
    response = get_assistant_response(messages)
    
    return response

In [15]:
def main(): 
    results_list = []
    dataset = load_dataset("PatronusAI/financebench")
    df = pd.DataFrame(dataset['train'])
    test = df[:5]
    for index, row in test.iterrows():
        download_dir = "pdf_documents"
        os.makedirs(download_dir, exist_ok=True)
        doc_link = row['doc_link']
        doc_name = row['doc_name']
        question = row['question']
        ref_answer = row['answer']
        ref_context = row['evidence_text']
        doc_path = os.path.join(download_dir, f"{doc_name}.pdf")

            #save_path = f"downloads/{row['financebench_id']}.pdf"
        download_pdf(doc_link, doc_path)

        text = extract_text_from_pdf(doc_path)
        documents = chunk_text(text, method='character')
        vectors = create_embeddings(documents, openai_api_key)

        store_embeddings_in_chroma(documents, vectors, collection_name=f"Finance_bench_documents")
        
        print("Querying Model now")
        model_answer = query_openai_with_context(question, collection_name=f"Finance_bench_documents")
        print(model_answer)

        # Evaluation for structured QA 
        # cosine_similarity_score = calculate_cosine_similarity(model_answer, ref_answer)
        # bert_score = calculate_bertscore(model_answer, ref_answer)
        # llm_eval = evaluate_llm_responses(question, model_answer, ref_answer)

        # Append results to the list
        results_list.append({
            'doc_name': doc_name,
            'question': question,
            'ref_answer': ref_answer,
            'model_answer': model_answer,
            # 'cosine_similarity': cosine_similarity_score,
            # 'bert_score': bert_score,
            # 'llm_eval': llm_eval
        })

    # Convert the list of dictionaries to a DataFrame
    results_df = pd.DataFrame(results_list)

    # Save results to CSV
    results_df.to_csv('results_test.csv', index=False)

In [16]:
main()

Downloaded PDF from: https://investors.3m.com/financials/sec-filings/content/0001558370-19-000470/0001558370-19-000470.pdf to pdf_documents\3M_2018_10K.pdf


TypeError: 'NoneType' object is not iterable

In [None]:
from sentence_transformers import CrossEncoder

cross_encoder = CrossEncoder(
    "cross-encoder/ms-marco-TinyBERT-L-2-v2", max_length=512, device="cpu"
)

In [None]:
reranked_docs = cross_encoder.rank(
    question,
    [doc.page_content for doc in retrieved_docs],
    top_k=3,
    return_documents=True,
)

In [None]:
def create_chroma_vectordb_from_pdf(pdf_path, openai_api_key, batch_size=100):
# Extract text from PDF
    text = extract_text_from_pdf(pdf_path)
    
    # Split text into sentences
    sentences = text.split('\n')
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]  # Remove empty sentences
    
    # Initialize OpenAI Embedding Function
    openai_ef = embedding_functions.OpenAIEmbeddingFunction(
        api_key=openai_api_key,
        model_name="text-embedding-ada-002"
    )
    
    # Batch processing for embeddings
    vectors = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        if len(batch) > 0:  # Ensure batch is not empty
            batch_vectors = openai_ef(batch)
            vectors.extend(batch_vectors)
    
    # Store vectors in Chroma vector database
    client = chromadb.Client(Settings())
    collection_name = "Finance_bench_documents"
    collection = client.get_or_create_collection(name= collection_name) 

    for i, (sentence, vector) in enumerate(zip(sentences, vectors)):
        collection.add(f"id_{i}", vector, {"sentence": sentence})
    
    print(f"Stored {len(sentences)} vectors in the Chroma vector database.")

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

loader = PyPDFLoader(sec_filing_pdf)

# Load the PDF document
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L12-v2"),
    chunk_size=256,
    chunk_overlap=16,
    strip_whitespace=True,
)

docs = text_splitter.split_documents(pages)

text_splitter = CharacterTextSplitter(
            chunk_size=1300, 
            chunk_overlap=5,
            #separators=["\n\n", "\n", " ", ""],
            length_function=len)
docs = text_splitter.split_documents(documents)