<p style="font-size:20px; color:turquoise">Setup</p>

<p style="font-size:16px; color:lightblue">Import Modules</p>

In [72]:
import pymupdf4llm
import pathlib
from llama_index.readers.file import MarkdownReader
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Document, StorageContext
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import re
from rapidfuzz import process, fuzz 
import pandas as pd
import os
from pathlib import Path
import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer, util
from openai import OpenAI
import csv
import os
from collections import deque
from dotenv import load_dotenv

load_dotenv()

True

<p style="font-size:18px; color:lightblue">Define document source directory</p>

In [32]:
pdffolder = Path("./sourcePDF/") 

<p style="font-size:16px; color:turquoise">Function to look up metadata from source file</p>

In [33]:
def lookupMetaData(municipality):
    # Load the metadata CSV file
    df = pd.read_csv('./sourceData/Municipality_MetaData.csv')
    name_list = df['Name'].dropna().tolist()

    # Find best match using fuzzy logic
    best_match, score, index = process.extractOne(
        municipality, name_list, scorer=fuzz.token_sort_ratio
    )
    if score >= 50:
        matched_row = df[df['Name'] == best_match].iloc[0]
        output = matched_row.to_dict()
        return output
    else:
        return f"No good match found for '{municipality}' (best guess: '{best_match}', score: {score})"

<p style="font-size:16px; color:turquoise">Function to create metadata from lookup in source file</p>

In [34]:
def createMetaData(fname):
    fnameSeg = re.findall(r'(\d+)(\_)(\w+)', fname)
    year = fnameSeg[0][0]
    municipality = fnameSeg[0][2]
    metadata = lookupMetaData(municipality)
    docData = {"File Name": fname,
                "Name" : metadata["Name"], 
                "year" : year, 
                "Municipal status" : metadata["Municipal status"], 
                "Geographic Area" : metadata["Geographic Area"],
                "Upper Tier" : metadata["Upper Tier"],
                "website" : metadata["website"]
                }
    print(docData)
    return docData

<p style="font-size:18px; color:lightblue">Initialize Chromadb Collection</p>


In [53]:
chromaClient = chromadb.PersistentClient(path="./chroma_db")

def initializeChromaCollection(collection_name):
    try:
        collection = chromaClient.get_collection(name=collection_name)
        print(f"Collection {collection_name} found with {collection.count()} documents")
    except Exception as e:
        print(f"Error accessing collection: {e}")
        print("Creating new collection...")
        collection = chromaClient.create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine"}
        )
        print(f"Created collection {collection_name} with {collection.count()} documents")

<p style="font-size:18px; color:lightblue">Add documents to ChromaDB</p>

In [67]:
def addDocumentsToChromaCollection(collection_name, documents, embedding_function):
    initializeChromaCollection(collection_name)
    collection = chromaClient.get_collection(name=collection_name)
    x=0
    for doc in documents:
        x+=1
        text= doc.text
        id= doc.doc_id
        metadata = doc.metadata
        embed_model = embedding_function(text)

    collection.add(
        documents=text,
        embeddings=embed_model,
        ids=id,
        metadatas=[metadata]
    )
    print(f"There are {x} documents added")

<p style="font-size:18px; color:lightblue">Define Scoring Functions</p>

In [55]:
def exact_match_score(expected, response):
    return int(expected.strip().lower() == response.strip().lower())

def keyword_overlap_score(expected, response):
    expected_keywords = set(expected.lower().split())
    response_keywords = set(response.lower().split())
    if not expected_keywords:
        return 0.0
    overlap = expected_keywords & response_keywords
    return len(overlap) / len(expected_keywords)

def embedding_similarity_score(expected, response, embedding_function):
    expected_emb = embedding_function(expected)
    response_emb = embedding_function(response)
    return util.cos_sim(expected_emb, response_emb).item()

<p style="font-size:18px; color:lightblue">Load Test Cases</p>

In [85]:
test_cases = []

with open("RAG_tests.csv", newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        test_cases.append(row)

<p style="font-size:18px; color:lightblue">OpenAI: LLM Client and API Call</p>

In [90]:
# Create OpenAI Client
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'], organization=os.environ['ORGANIZATION'], project=os.environ['PROJECT'])
# chat_history = deque(maxlen=10)

def generate_response(question: str, context, chat_history):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": question}, 
                  {"role": "system", "content": context},
                  {"role": "system", "content":chat_history 
                   }],
        stream=True,
    )
    chat_history.append({"role": "user", "content": question})
    full_response = ""
    for chunk in response:
        if chunk.choices[0].delta.content is not None:
            ans = chunk.choices[0].delta.content
            # print(ans, end="")
            # yield ans
            full_response += ans
    chat_history.append({"role": "system", "content": full_response})
    max_history = 10
    if len(chat_history) > max_history:
        chat_history = chat_history[-max_history:] 
    return full_response 


<p style="font-size:18px; color:lightblue">Test Model</p>


NOTE: The variable result['documents'][0] contains the n_results which in this case is 3
    print(len(result['documents'][0])) # =3
    if you turn the results into a string that you can evaluate them together, 
    no need to loop through and then combine the results.

In [76]:
# Run test cases and compute scores
def test_model(test_cases, results_fname, collection_name, embedding_function, chat_history):
    test_results = []
    for case in test_cases:
        query = case["Query"]
        source_file = case["Source File"]
        expected = case["Expected Result"]
        print(f"Running query: {query}")
        
        # response = str(query_engine.query(query))
        # query_vec = embed_model.get_text_embedding(query)
        collection = chromaClient.get_collection(name=collection_name)
        query_vec = embedding_function(query)
        result = collection.query(
            query_embeddings=[query_vec],
            n_results=3
        )
        
        vec_response = str(result['documents'][0]) 
        metadata = str(result['metadatas'][0])
        response_fname = str(result['metadatas'][0][0]['File Name'])    
        exact = exact_match_score(expected, vec_response)
        keyword = keyword_overlap_score(expected, vec_response)
        vec_similarity = embedding_similarity_score(expected, vec_response, embedding_function) 
        file_similartiy = embedding_similarity_score(source_file,response_fname, embedding_function)   
        
        llm_response = generate_response(question=query, context=vec_response, chat_history=chat_history)
        llm_similarity = embedding_similarity_score(expected, llm_response, embedding_function) 
        
        
        test_results.append({
            "Query": query,
            "Expected": expected,
            "Response": vec_response,
            "Metadata": metadata,
            "Exact Match": exact,
            "Keyword Overlap": keyword,
            "Embedding Similarity": vec_similarity,
            "Source File Similarity": file_similartiy,
            "LLM Response": llm_response,
            "LLM Embedding Similarity": llm_similarity
        })

    # Save results to CSV
    with open(results_fname, "w", newline='') as csvfile:
        fieldnames = ["Query", "Expected", "Response", "Metadata", "Exact Match", "Keyword Overlap", "Embedding Similarity", 
                      "Source File Similarity", "LLM Response", "LLM Embedding Similarity"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for r in test_results:
            writer.writerow(r)

    print(f"Test results saved to {results_fname}")

In [66]:
llama_reader = pymupdf4llm.LlamaMarkdownReader()
all_llama_docs = []
remove_list = ['format', 'author', 'creator', 'producer', 'creationDate', 'modDate', 'trapped', 'encryption', 'file_path']

for pdf in pdffolder.iterdir():
    fname = str(pdf)
    print(fname)
    docData = createMetaData(fname)
    
    llama_docs = llama_reader.load_data(pdf)
    for i, doc in enumerate(llama_docs):
        doc.metadata = {k: v for k, v in doc.metadata.items() if k not in remove_list and v is not None}
        for key, value in docData.items():
            doc.metadata.update({
            key:value
        })   

    #print(doc.metadata)
    all_llama_docs.extend(llama_docs) 
    
print(len(all_llama_docs))

sourcePDF/2024_CityofWaterloo.pdf
{'File Name': 'sourcePDF/2024_CityofWaterloo.pdf', 'Name': ' City of Waterloo', 'year': '2024', 'Municipal status': 'Lower Tier', 'Geographic Area': 'Waterloo', 'Upper Tier': ' Regional Municipality of Waterloo', 'website': 'http://www.city.waterloo.on.ca/'}
sourcePDF/2002_GuelphEramosa.pdf
{'File Name': 'sourcePDF/2002_GuelphEramosa.pdf', 'Name': ' Township of Guelph/Eramosa', 'year': '2002', 'Municipal status': 'Lower Tier', 'Geographic Area': 'Wellington', 'Upper Tier': ' County of Wellington', 'website': 'http://www.get.on.ca/'}
sourcePDF/2024_CityofGuelph.pdf
{'File Name': 'sourcePDF/2024_CityofGuelph.pdf', 'Name': ' City of Guelph', 'year': '2024', 'Municipal status': 'Single Tier', 'Geographic Area': 'Wellington', 'Upper Tier': nan, 'website': 'http://www.guelph.ca/'}
872


In [None]:
results_fname = "llm_test_results.csv"
collection_name = "amPlan_context"
chat_history = deque(maxlen=10)
test_model(test_cases, results_fname, collection_name, chat_history)

Running query: What is the estimated total value of the City of Waterloo's infrastructure?
Running query: Which regulation does the City of Waterloo's Asset Management Plan comply with?
Running query: Who contributed to the development of the Asset Management Plan?
Running query: What is the forecasted decline in performance of tax-base funded assets over 25 years?
Running query: What is the primary funding source mentioned for infrastructure renewal?
Running query: What is the Waterloo Decision Support System (DSS)?
Running query: What is the city's current target for overall road PQI?
Running query: What is the estimated cost of replacing the city's transportation assets?
Running query: What funding increase was approved by the council for 2024-2026?
Running query: What are some of the key asset classes identified in the plan?
Test results saved to test_results.csv


<p style="font-size:20px; color:turquoise">TEST CASE 1: all-MiniLM-L6-v2 Embedding Model</p>

In [86]:
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

def embedding_function(text):
    embedding = embed_model.encode(text)
    return embedding

In [78]:
collection_name = "all-MiniLM-L6-v2_TEST2"
documents = all_llama_docs
addDocumentsToChromaCollection(collection_name, documents, embedding_function)

Collection all-MiniLM-L6-v2_TEST2 found with 1 documents
There are 872 documents added


In [91]:
collection_name = "all-MiniLM-L6-v2"
fname = "llama_all-MiniLM-L6-v2_results_TEST2_CHAT_HISTORY.csv"
chat_history = []
# chat_history = [{"role":"System", "content":"None"}]
test_model(test_cases=test_cases, results_fname=fname, collection_name=collection_name, embedding_function=embedding_function, chat_history=chat_history)

Running query: What is the estimated total value of the City of Waterloo's infrastructure?


BadRequestError: Error code: 400 - {'error': {'message': "Missing required parameter: 'messages[2].content[0].type'.", 'type': 'invalid_request_error', 'param': 'messages[2].content[0].type', 'code': 'missing_required_parameter'}}

<p style="font-size:20px; color:turquoise">TEST CASE 2: all-mpnet-base-v2</p>

<p style="font-size:18px; color:lightblue">1: Set Embedding Model and Function</p>

In [44]:
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-mpnet-base-v2")

def embedding_function(text):
    embedding = embed_model.get_text_embedding(text)
    return embedding

<p style="font-size:18px; color:lightblue">2: Create Collection</p>

In [45]:
collection_name = "amPlan_context"
documents = all_llama_docs
addDocumentsToChromaCollection(collection_name, documents, embedding_function)

Collection amPlan_context found with 872 documents


<p style="font-size:18px; color:lightblue">3: Run Test Model</p>

In [49]:
collection_name = "amPlan_context"
fname = "llama_all-mpnet-base-v2_results.csv"
test_model(test_cases=test_cases, results_fname=fname, collection_name=collection_name, embedding_function=embedding_function)

Running query: What is the estimated total value of the City of Waterloo's infrastructure?
Running query: Which regulation does the City of Waterloo's Asset Management Plan comply with?
Running query: Who contributed to the development of the Asset Management Plan?
Running query: What is the forecasted decline in performance of tax-base funded assets over 25 years?
Running query: What is the primary funding source mentioned for infrastructure renewal?
Running query: What is the Waterloo Decision Support System (DSS)?
Running query: What is the city's current target for overall road PQI?
Running query: What is the estimated cost of replacing the city's transportation assets?
Running query: What funding increase was approved by the council for 2024-2026?
Running query: What are some of the key asset classes identified in the plan?
Test results saved to test_results.csv
