In [2]:
import dspy
from dspy.evaluate import Evaluate
from dspy.teleprompt import BootstrapFewShot, BootstrapFewShotWithRandomSearch, BootstrapFinetune
from dspy.primitives.assertions import assert_transform_module, backtrack_handler

In [3]:
import psycopg2
from psycopg2 import Error

def get_records(DB_HOST, POSTGRES_DB, POSTGRES_USER, POSTGRES_PASSWORD):
    """Fetch up to 25 lease-agreement summaries from the ``documents`` table.

    Selects ``id`` and the ``summary`` entry of the ``metadata`` JSON column for
    rows whose ``doctype`` entry is ``'Lease Agreement'`` and whose summary is
    not NULL.

    Args:
        DB_HOST: Host passed to ``psycopg2.connect``.
        POSTGRES_DB: Database name.
        POSTGRES_USER: Database user.
        POSTGRES_PASSWORD: Password for ``POSTGRES_USER``.

    Returns:
        A list of ``(id, summary)`` tuples, or ``None`` when psycopg2 raised an
        error (the error is printed, not re-raised).
    """
    # NOTE(review): the query has no ORDER BY, so row order is not guaranteed by
    # PostgreSQL, while get_examples() pairs rows with labels by position —
    # confirm the ordering is stable for this table.
    connection = None
    try:
        connection = psycopg2.connect(database=POSTGRES_DB, user=POSTGRES_USER, password=POSTGRES_PASSWORD, host=DB_HOST)
        # The cursor context manager closes the cursor when the block exits.
        with connection.cursor() as cursor:
            cursor.execute("SELECT id, (metadata->>'summary') as summary FROM documents WHERE ((metadata->>'doctype') = 'Lease Agreement') AND ((metadata->>'summary') IS NOT NULL) LIMIT 25")
            return cursor.fetchall()
    except Error as e:
        print(f"Connection error: {e}")
        return None
    finally:
        # Always release the connection, on success and on failure alike.
        if connection is not None:
            connection.close()

In [17]:
def get_examples(records, answers=None):
    """Pair each record's summary with a labelled answer.

    Args:
        records: Iterable of rows whose element at index 1 is the lease
            abstract text (as returned by ``get_records``).
        answers: Optional sequence of expected answers, positionally aligned
            with ``records``. Defaults to the hand-written labels below; the
            last five of those are empty strings.

    Returns:
        A list of ``(question, answer)`` tuples, one per record, where
        ``question`` is the instruction template followed by the abstract.

    Raises:
        IndexError: If there are more records than answers.
    """
    # A sentence that was dropped from the prompt: "The data would be used for
    # locating the premises on a map so be precise and standard."
    template = "Extract the data from the following lease abstract.  The extracted data should contain information about the premises to be occupied by the tenant i.e., address1 (Street address), address2 (Apartment or Unit Number (if applicable)), city, state (Two letter abbreviation) and postcode. The output information should be separated by commas. If the address1 does not contain relevant information or is not very specific, return N/A. \n"

    if answers is None:
        answers = [
            "Gnoss Field Airport, Marin County, CA, 94945",
            "920 Grand Avenue, San Rafael, CA, 94901",
            "125 Belvedere Avenue, Belvedere, CA",
            "800 San Anselmo Avenue, San Anselmo, CA, 94960",
            "351 Airport Road, Marin County, CA",
            "135 Belvedere Avenue, Belvedere, CA, 94920",
            "N/A",
            "N/A",
            "N/A",
            "30 Joseph Court, San Rafael, CA, 94903",
            " Corte Madera Creek, Marin County, CA, 94925",
            "2521 Shoreline Highway, Stinson Beach, CA, 94970",
            "310 Harbor Drive, Sausalito, CA, 94965",
            "Marin County Civic Center, San Rafael, CA, 94903",
            "N/A",
            "N/A",
            "N/A",
            "N/A",
            "N/A",
            "N/A",
            "",
            "",
            "",
            "",
            "",
        ]

    # Materialise once so the length check also works for generators.
    rows = list(records)
    if len(rows) > len(answers):
        raise IndexError(
            f"got {len(rows)} records but only {len(answers)} answers; "
            "every record needs a label"
        )

    # zip stops at the shorter input, i.e. at the number of records.
    return [(template + row[1], answer) for row, answer in zip(rows, answers)]

In [18]:
from pathlib import Path
from dotenv import load_dotenv
import os
import random

# Location of the .env file holding the Azure OpenAI and Postgres settings.
# Set the DOTENV_PATH environment variable to override the default below,
# which is an absolute path on the original author's machine.
dotenv_path = Path(os.getenv('DOTENV_PATH', r'C:\Users\Aashrith\CoE Internship\real-assistant\app\.env'))
load_dotenv(dotenv_path=dotenv_path)

AZURE_OPENAI_API_KEY = os.getenv('AZURE_OPENAI_API_KEY')
AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
DB_HOST=os.getenv("DB_HOST")
POSTGRES_DB=os.getenv("POSTGRES_DB")
POSTGRES_USER=os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD=os.getenv("POSTGRES_PASSWORD")

# Seed for the train shuffle so the train/dev split is the same on every run.
SPLIT_SEED = 0

records = get_records(DB_HOST, POSTGRES_DB, POSTGRES_USER, POSTGRES_PASSWORD)
if records is None:
    # get_records prints the psycopg2 error and returns None on failure; stop
    # here with a clear message instead of failing later with a TypeError.
    raise RuntimeError("get_records() returned no data; check the database settings in the .env file")

examples = get_examples(records)
# The first 20 examples are used for training/dev; the remainder is the test split.
train_examples = examples[:20]
test_examples = examples[20:]
# A private Random instance keeps the shuffle reproducible without reseeding
# the global `random` state that other libraries may rely on.
random.Random(SPLIT_SEED).shuffle(train_examples)

In [6]:
# Azure OpenAI client, registered as the default language model for DSPy.
gpt4_mini = dspy.AzureOpenAI(
    api_base=AZURE_OPENAI_ENDPOINT,
    api_version='2024-02-15-preview',
    model='gpt-4o-mini-global-128k',
    api_key=AZURE_OPENAI_API_KEY,
)
dspy.configure(lm=gpt4_mini)

In [19]:
def _as_dspy_examples(pairs):
    """Wrap (question, answer) tuples as dspy.Example objects with `question` as the input."""
    return [
        dspy.Example(question=q, answer=a).with_inputs('question')
        for q, a in pairs
    ]


# 15 shuffled examples for bootstrapping, the next 5 for dev evaluation,
# and everything in test_examples for the test split.
train = _as_dspy_examples(train_examples[:15])
dev = _as_dspy_examples(train_examples[15:20])
test = _as_dspy_examples(test_examples)

In [8]:
class BasicQA(dspy.Signature):
    """Answer questions."""
    # NOTE(review): DSPy presumably uses the docstring above as the task
    # instruction in the prompt, so its wording is part of the program's
    # behaviour — confirm before editing it.
    # The CoT modules visible in this notebook section use the inline
    # 'question -> answer' signature string rather than this class.

    # Single input field: the full question text.
    question = dspy.InputField()
    # Single output field; `desc` is the hint attached to the field.
    answer = dspy.OutputField(desc="only contains the answer")

In [9]:
def check_postcode_in_abstract(pred, abstract):
    """Check that the last field of the predicted address occurs in the abstract.

    The answer is expected to be a comma-separated address whose final field is
    the postcode. Surrounding whitespace is ignored, so ``"CA, 94901"`` matches
    an abstract containing ``94901`` whatever character precedes it.

    Args:
        pred: Object with an ``answer`` attribute (string, or ``None``).
        abstract: Lease abstract text to search.

    Returns:
        ``True`` if the answer is ``"N/A"`` (nothing to verify) or if the
        stripped last comma-separated field is a substring of ``abstract``;
        ``False`` otherwise. An empty answer yields ``True`` because the empty
        string is a substring of any text.
    """
    answer = (pred.answer or "").strip()
    if answer == "N/A":
        return True
    # Only the text after the final comma matters; strip the space that
    # follows the comma so the lookup does not depend on it.
    last_field = answer.rsplit(",", 1)[-1].strip()
    return last_field in abstract

In [10]:
class CoT(dspy.Module):
    """Chain-of-thought program with a single ``question -> answer`` predictor."""

    def __init__(self):
        super().__init__()
        self.generate_answer = dspy.ChainOfThought("question -> answer")

    def forward(self, question):
        """Run the predictor on ``question`` and return its prediction."""
        prediction = self.generate_answer(question=question)
        return prediction

In [11]:
class CoT_with_Assertions(dspy.Module):
    """Chain-of-thought program that also suggests the postcode must appear in the abstract."""

    def __init__(self):
        super().__init__()
        self.generate_answer = dspy.ChainOfThought('question -> answer')

    def forward(self, question):
        """Answer ``question`` and register a postcode suggestion on the result.

        Args:
            question: Prompt text. Questions built by ``get_examples`` consist
                of an instruction line, a newline, then the lease abstract.

        Returns:
            The prediction produced by the chain-of-thought predictor.
        """
        pred = self.generate_answer(question=question)
        # Everything after the first newline is the abstract. If the question
        # has no newline, fall back to the whole text instead of raising
        # ValueError as str.index would.
        _instructions, newline, remainder = question.partition("\n")
        abstract = remainder if newline else question
        dspy.Suggest(
            check_postcode_in_abstract(pred, abstract),
            f"Check if postcode is present in the abstract and only then provide it in the output \n Abstract:{abstract}",
        )
        return pred

In [20]:
# Student program that receives the bootstrapped demonstrations.
cot = CoT()

# Metric used both for bootstrapping and for evaluation.
metric_EM = dspy.evaluate.answer_exact_match

teleprompter = BootstrapFewShot(metric=metric_EM, max_bootstrapped_demos=4)

# The teacher is the assertion-enabled program wrapped with the backtrack handler.
teacher_with_assertions = assert_transform_module(CoT_with_Assertions(), backtrack_handler)
cot_compiled_with_assertions = teleprompter.compile(
    student=cot,
    teacher=teacher_with_assertions,
    trainset=train,
)

# longformqa_with_assertions = assert_transform_module(LongFormQAWithAssertions().map_named_predictors(Retry), backtrack_handler) 

 87%|███████████████████████████████████████████████████████████████████████           | 13/15 [00:48<00:07,  3.72s/it]

Bootstrapped 4 full traces after 14 examples in round 0.





In [21]:
# Score the compiled program on the dev split with the exact-match metric.
NUM_THREADS = 32
evaluate_hotpot = Evaluate(
    devset=dev,
    metric=metric_EM,
    num_threads=NUM_THREADS,
    display_progress=True,
    display_table=15,
)

evaluate_hotpot(cot_compiled_with_assertions)

Average Metric: 2 / 5  (40.0): 100%|█████████████████████████████████████████████████████| 5/5 [00:04<00:00,  1.21it/s]

Average Metric: 2 / 5  (40.0%)





Unnamed: 0,question,example_answer,rationale,pred_answer,answer_exact_match
0,"Extract the data from the following lease abstract. The extracted data should contain information about the premises to be occupied by the tenant i.e., address1...","Gnoss Field Airport, Marin County, CA, 94945","extract the relevant address information from the lease abstract. The property address is stated as ""Gnoss Field Airport, Marin County, California."" However, it does not...",,False
1,"Extract the data from the following lease abstract. The extracted data should contain information about the premises to be occupied by the tenant i.e., address1...","30 Joseph Court, San Rafael, CA, 94903","extract the relevant address information from the lease abstract. The property address is clearly stated as ""30 Joseph Court, San Rafael, CA."" We need to...","30 Joseph Court, N/A, San Rafael, CA, 94901",False
2,"Extract the data from the following lease abstract. The extracted data should contain information about the premises to be occupied by the tenant i.e., address1...",,"extract the relevant address information from the lease abstract. The property address is stated as ""Novato, California,"" but it lacks a specific street address. Since...",,✔️ [True]
3,"Extract the data from the following lease abstract. The extracted data should contain information about the premises to be occupied by the tenant i.e., address1...","310 Harbor Drive, Sausalito, CA, 94965","extract the relevant address information from the lease abstract. The property address is clearly stated as ""310 Harbor Drive, Sausalito, CA."" We need to format...",,False
4,"Extract the data from the following lease abstract. The extracted data should contain information about the premises to be occupied by the tenant i.e., address1...",,"extract the relevant address information from the lease abstract. The property address is stated as ""Mt. Barnabe Communications Site, Assessor Parcel No. 168-240-01, Lagunitas, Marin...",,✔️ [True]


40.0

In [22]:
# Spot-check one test example (index 1). To print every test prediction
# instead, call the compiled program on each item of `test` in a loop.
sample_question = test[1].question
pred = cot_compiled_with_assertions(sample_question)
print(pred.answer)

3270 Kerner Blvd., Suite B, San Rafael, CA, 94901


In [23]:
# Show the last 5 prompt/completion exchanges recorded on the LM client, to
# inspect the prompts built by the compiled program.
# NOTE(review): this output contains lease abstract text; clear it before
# sharing the notebook if that data is sensitive.
gpt4_mini.inspect_history(n=5)





Given the fields `question`, produce the fields `answer`.

---

Follow the following format.

Question: ${question}
Reasoning: Let's think step by step in order to ${produce the answer}. We ...
Answer: ${answer}

---

Question: Extract the data from the following lease abstract. The extracted data should contain information about the premises to be occupied by the tenant i.e., address1 (Street address), address2 (Apartment or Unit Number (if applicable)), city, state (Two letter abbreviation) and postcode. The output information should be separated by commas. If the address1 does not contain relevant information or is not very specific, return N/A. # Lease Abstract Summary ## Key Information - **Asset Class:** Tidelands Lease (Special Use) - **Tenant/Lessee:** Seaplane Investment, LLC **Landlord/Lessor:** County of Marin - **Property Address:** Assessor Parcel No. 052-247-03, Richardson Bay, Sausalito, County of Marin, State of California - **Parts of the Property Occupied by Tenan