In [20]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema import StrOutputParser
from operator import itemgetter
from dotenv import load_dotenv
import os
from qdrant_client import QdrantClient
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_qdrant import QdrantVectorStore
from langchain.schema.runnable import RunnablePassthrough
from langsmith import Client


# Load environment variables
load_dotenv()

tavily_api_key = os.getenv("TAVILY_API_KEY")
qdrant_url = os.getenv("QDRANT_CLOUD_URL")
qdrant_api_key = os.getenv("QDRANT_CLOUD_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")

# Define LLMs
# large_llm = ChatOpenAI(model="gpt-4o", temperature=0.2, api_key=openai_api_key)
base_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.8, api_key=openai_api_key)

# Define Embeddings
huggingface_embeddings = HuggingFaceEmbeddings(
    model_name="Snowflake/snowflake-arctic-embed-l", model_kwargs={'trust_remote_code': True}
)


In [26]:
# Define Embeddings
huggingface_embeddings = HuggingFaceEmbeddings(
    model_name="mel00/legal-ft-midterm", model_kwargs={'trust_remote_code': True}
)

Some weights of BertModel were not initialized from the model checkpoint at mel00/legal-ft-midterm and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
from langchain_core.prompts import PromptTemplate
# Define RAG components
rag_prompt_template = """
You are a helpful legal assistant that answers questions based on the provided legal, by-law, and guidance context.
You must only use the provided context and avoid adding any extra knowledge.
Use simple and clear English, avoiding difficult legal terms.

## Question:
{question}
## Context:
{context}
"""

rag_prompt = PromptTemplate.from_template(rag_prompt_template)

In [22]:
# Connect to Qdrant
qdrant_client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
collection_name = "ontario_ltb_toronto_guidance"


In [23]:
vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name=collection_name,
    embedding=huggingface_embeddings,
)
retriever = vector_store.as_retriever(search_kwargs={"k": 5})

In [None]:
rag_chain = (
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | base_llm | StrOutputParser(), "context": itemgetter("context")}
)

In [24]:
rag_chain = (
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    |  rag_prompt | base_llm | StrOutputParser()
)

In [91]:
response = rag_chain.invoke({"question" : "How are LLM agents useful?"})

In [92]:
response

'The provided context does not contain any information about LLM agents or their usefulness. It focuses on specific rules and regulations related to the Residential Tenancies Act in Ontario. If you have questions about that act or related topics, feel free to ask!'

In [35]:
rag_chain.invoke({"question" : "Can tenant have pets?"})

{'response': "Whether a tenant can have pets depends on the specific rules set by the landlord or the rental agreement. The law allows landlords to restrict pets if they believe the animal could cause problems, such as being dangerous or causing allergies. However, if a tenant's pet does not cause any issues, the landlord cannot evict the tenant just because they have a pet. It's important for tenants to check their rental agreement and talk to their landlord about pet policies.",
 'context': [Document(metadata={'producer': 'Skia/PDF m133', 'creator': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36', 'creationdate': '2025-02-21T00:14:15+00:00', 'title': 'Residential Tenancies Act, 2006, S.O. 2006, c. 17 | ontario.ca', 'moddate': '2025-02-21T00:14:15+00:00', 'source': 'data/Residential Tenancies Act, 2006, S.O. 2006, c. 17 _ ontario.ca.pdf', 'total_pages': 247, 'page': 5, 'page_label': '6', '_id': 'ce1cc42b-65c5-41df

## LangSmith

In [11]:
from langsmith import Client

client = Client()

# Define the examples for the dataset
examples = [
    (
        "Can my landlord enter my apartment without my permission?",
        "The landlord need to give written 24 hour notice",
    ),
    (
        "What can I do if my landlord refuses to make necessary repairs?",
        "If the tenant tells the landlord about a problem and the landlord doesn't fix it, doesn't fix it properly or doesn't fix it quickly enough, a tenant can file a Tenant Application about Maintenance (Form T6) with the LTB. When the application is filed, a hearing will be scheduled.",
    ),
    (
        "Does my lease automatically renew when it expires?",
        "The end of a fixed term tenancy or lease does not mean that the tenant has to move out or sign a renewal or new lease in order to stay. The lease is renewed automatically on a month-to-month basis (if the rent is paid monthly) or week-to-week basis (if rent is paid weekly).",
    ),
    (
        "Can my landlord help with noisy neighbors?",
        "Sometimes it is other tenants living in the same building who are causing the disturbance, not the landlord. However, landlords have a positive obligation to provide quiet enjoyment and take reasonable action to address another tenant's conduct that disturbs the complaining tenant.",
    ),
    (
        "How much can my landlord increase my rent, and with how much notice?",
        "In Ontario, the landlord can increase the rent by a maximum of rate set by the rent increase guideline. Need to give 90 day notice",
    )
]

# Create the dataset and examples in LangSmith
dataset_name = "Ontario_LTB"
dataset = client.create_dataset(dataset_name=dataset_name)
client.create_examples(
    inputs=[{"question": q} for q, _ in examples], 
    outputs=[{"answer": a} for _, a in examples], 
    dataset_id=dataset.id,
)

LangSmithConflictError: Conflict for /datasets. HTTPError('409 Client Error: Conflict for url: https://api.smith.langchain.com/datasets', '{"detail":"Dataset with this name already exists."}')

## LangSmith Evaluation Set-up

We'll use OpenAI's GPT-4o as our evaluation LLM for our base Evaluators.

In [12]:
eval_llm = ChatOpenAI(model="gpt-4o-mini")

In [13]:
from typing_extensions import Annotated, TypedDict

# Grade output schema
class CorrectnessGrade(TypedDict):
    # Note that the order in the fields are defined is the order in which the model will generate them.
    # It is useful to put explanations before responses because it forces the model to think through
    # its final response before generating it:
    explanation: Annotated[str, ..., "Explain your reasoning for the score"]
    correct: Annotated[bool, ..., "True if the answer is correct, False otherwise."]

# Grade prompt
correctness_instructions = """You are a teacher grading a quiz. 

You will be given a QUESTION, the GROUND TRUTH (correct) ANSWER, and the STUDENT ANSWER. 

Here is the grade criteria to follow:
(1) Grade the student answers based ONLY on their factual accuracy relative to the ground truth answer. 
(2) Ensure that the student answer does not contain any conflicting statements.
(3) It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the  ground truth answer.

Correctness:
A correctness value of True means that the student's answer meets all of the criteria.
A correctness value of False means that the student's answer does not meet all of the criteria.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset."""

# Grader LLM
grader_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0).with_structured_output(CorrectnessGrade, method="json_schema", strict=True)

def correctness(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    """An evaluator for RAG answer accuracy"""
    answers = f"""      QUESTION: {inputs['question']}
    GROUND TRUTH ANSWER: {reference_outputs['answer']}
    STUDENT ANSWER: {outputs["output"]}"""

    # Run evaluator
    grade = grader_llm.invoke([{"role": "system", "content": correctness_instructions}, {"role": "user", "content": answers}])
    return grade["correct"]

# Grade output schema
class RelevanceGrade(TypedDict):
    explanation: Annotated[str, ..., "Explain your reasoning for the score"]
    relevant: Annotated[bool, ..., "Provide the score on whether the answer addresses the question"]

# Grade prompt
relevance_instructions="""You are a teacher grading a quiz. 

You will be given a QUESTION and a STUDENT ANSWER. 

Here is the grade criteria to follow:
(1) Ensure the STUDENT ANSWER is concise and relevant to the QUESTION
(2) Ensure the STUDENT ANSWER helps to answer the QUESTION

Relevance:
A relevance value of True means that the student's answer meets all of the criteria.
A relevance value of False means that the student's answer does not meet all of the criteria.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset."""

# Grader LLM
relevance_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0).with_structured_output(RelevanceGrade, method="json_schema", strict=True)

# Evaluator
def relevance(inputs: dict, outputs: dict) -> bool:
    """A simple evaluator for RAG answer helpfulness."""
    answer = f"""      QUESTION: {inputs['question']}
    STUDENT ANSWER: {outputs["output"]}"""
    grade = relevance_llm.invoke([{"role": "system", "content": relevance_instructions}, {"role": "user", "content": answer}])
    return grade["relevant"]


We'll be using a number of evaluators - from LangSmith provided evaluators, to a few custom evaluators!

In [14]:
from langsmith.evaluation import LangChainStringEvaluator, evaluate

qa_evaluator = LangChainStringEvaluator("qa", config={"llm" : eval_llm})

labeled_helpfulness_evaluator = LangChainStringEvaluator(
    "labeled_criteria",
    config={
        "criteria": {
            "helpfulness": (
                "Is this submission helpful to the user,"
                " taking into account the correct reference answer?"
            )
        },
        "llm" : eval_llm
    },
    prepare_data=lambda run, example: {
        "prediction": run.outputs["output"],
        "reference": example.outputs["answer"],
        "input": example.inputs["question"],
    }
)

In [15]:
from langsmith import traceable

@traceable()
def rag_bot(question: str) -> dict:
    # LangChain retriever will be automatically traced
    response = rag_chain.invoke(question)

    return response

In [16]:
def target(inputs: dict) -> dict:
    return rag_bot(inputs["question"])

## LangSmith Evaluation

In [25]:
evaluate(
    rag_chain.invoke,
    data=dataset_name,
    evaluators=[
        qa_evaluator,
        labeled_helpfulness_evaluator,
        correctness, 
        relevance
    ],
    experiment_prefix="aie5_midterm_prototype",
    metadata={"revision_id": "default_chain_init"},
)

View the evaluation results for experiment: 'aie5_midterm_prototype-a964051d' at:
https://smith.langchain.com/o/d0d88943-eeab-4eae-90cc-8ae8385f454e/datasets/60d7ce11-9d68-44b8-838a-d6f34fa50b11/compare?selectedSessions=c7400381-62aa-40c0-9996-d6e7e8b83b91




0it [00:00, ?it/s]

Unnamed: 0,inputs.question,outputs.output,error,reference.answer,feedback.correctness,feedback.helpfulness,feedback.relevance,execution_time,example_id,id
0,Can my landlord enter my apartment without my ...,"No, your landlord cannot enter your apartment ...",,The landlord need to give written 24 hour notice,True,0,True,2.049954,3778a8b0-2088-4d7e-82b3-c4044b494503,4bdc1f21-f474-4229-9f8c-6a45b5750d1d
1,Can my landlord help with noisy neighbors?,"Yes, your landlord can help with noisy neighbo...",,Sometimes it is other tenants living in the sa...,True,1,True,2.27061,48549b41-709f-4076-ae63-3cc981c9ba31,b9fd36ce-55b0-46f7-ab78-078f78319e5a
2,"How much can my landlord increase my rent, and...","Your landlord can increase your rent, but ther...",,"In Ontario, the landlord can increase the rent...",False,0,True,4.354469,a3b2b2df-809f-4245-857b-170d1ba0ba8e,e331a85c-0e68-48a1-ac90-ada5cf051f0e
3,Does my lease automatically renew when it expi...,"Yes, your lease can automatically renew when i...",,The end of a fixed term tenancy or lease does ...,True,1,True,3.512074,208d576b-b8b8-4eef-a6ab-c302b0b22c2f,8ac243ec-f7c0-4a05-a2cd-1e51dfc9b7fb
4,What can I do if my landlord refuses to make n...,If your landlord refuses to make necessary rep...,,If the tenant tells the landlord about a probl...,True,0,True,3.52713,eeaaa017-f8f8-4f4f-b1cb-8123f3ec6595,01015f47-2bb4-41e8-98d9-24e95a31371d


In [28]:
evaluate(
    rag_chain.invoke,
    data=dataset_name,
    evaluators=[
        qa_evaluator,
        labeled_helpfulness_evaluator,
        correctness, 
        relevance
    ],
    experiment_prefix="aie5_midterm_ft",
    metadata={"revision_id": "default_chain_init"},
)

View the evaluation results for experiment: 'aie5_midterm_ft-5e89610b' at:
https://smith.langchain.com/o/d0d88943-eeab-4eae-90cc-8ae8385f454e/datasets/60d7ce11-9d68-44b8-838a-d6f34fa50b11/compare?selectedSessions=e261dcd1-f72e-441c-b676-a3fad49a9363




0it [00:00, ?it/s]

Unnamed: 0,inputs.question,outputs.output,error,reference.answer,feedback.correctness,feedback.helpfulness,feedback.relevance,execution_time,example_id,id
0,Can my landlord enter my apartment without my ...,"No, your landlord cannot enter your apartment ...",,The landlord need to give written 24 hour notice,True,0,True,2.294301,3778a8b0-2088-4d7e-82b3-c4044b494503,edbded78-b421-483f-99c7-f5a11d32ebd0
1,"How much can my landlord increase my rent, and...",Your landlord can only increase your rent by a...,,"In Ontario, the landlord can increase the rent...",True,0,False,2.277027,a3b2b2df-809f-4245-857b-170d1ba0ba8e,8c621ff2-5131-4f89-861c-c28926c4037d
2,Can my landlord help with noisy neighbors?,"Yes, your landlord can help with noisy neighbo...",,Sometimes it is other tenants living in the sa...,True,1,True,4.323969,48549b41-709f-4076-ae63-3cc981c9ba31,0421a81e-10ae-486d-b013-d14e561dcf90
3,What can I do if my landlord refuses to make n...,If your landlord refuses to make necessary rep...,,If the tenant tells the landlord about a probl...,True,1,True,4.677866,eeaaa017-f8f8-4f4f-b1cb-8123f3ec6595,3cdbe493-ad2a-4455-a96c-4fda1e083654
4,Does my lease automatically renew when it expi...,"Yes, your lease will automatically renew when ...",,The end of a fixed term tenancy or lease does ...,True,1,True,2.487856,208d576b-b8b8-4eef-a6ab-c302b0b22c2f,f5efd2b2-1aeb-4c41-ae59-e8e164e429e4


## Evaluate RAG chain using RAGAS

In [19]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-3-small"))

In [20]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
path="data/"
loader = PyPDFDirectoryLoader(path, glob="*.pdf")
pdf_docs = loader.load()
print("Loaded PDFs")

Loaded PDFs


In [9]:
len(pdf_docs)

247

In [30]:
pdf_docs[10]

Document(metadata={'producer': 'Skia/PDF m133', 'creator': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36', 'creationdate': '2025-02-21T00:14:15+00:00', 'title': 'Residential Tenancies Act, 2006, S.O. 2006, c. 17 | ontario.ca', 'moddate': '2025-02-21T00:14:15+00:00', 'source': 'data/Residential Tenancies Act, 2006, S.O. 2006, c. 17 _ ontario.ca.pdf', 'total_pages': 247, 'page': 10, 'page_label': '11'}, page_content='(iii)\xa0 the living accommodation or residential complex is forfeited corporate\nproperty to which the Forfeited Corporate Property Act, 2015 applies; and\n(n)\xa0 any other prescribed class of accommodation. \xa02006, c.\xa017, s.\xa05; 2007, c.\xa08, s.\xa0226;\n2007, c.\xa013, s.\xa048; 2008, c.\xa014, s.\xa058 (2, 4); 2009, c.\xa033, Sched.\xa018, s.\xa030; 2013, c.\xa03, s.\xa024;\n2015, c. 38, Sched. 7, s. 60; 2017, c. 14, Sched. 4, s. 33; 2021, c. 4, Sched. 11, s. 31 (2);\n2021, c. 39, Sched. 2,

In [23]:
mini_set = pdf_docs[3:10]

In [24]:
from ragas.testset import TestsetGenerator

generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(mini_set, testset_size=10)

Applying HeadlinesExtractor:   0%|          | 0/4 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/7 [00:00<?, ?it/s]

unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node


Applying SummaryExtractor:   0%|          | 0/8 [00:00<?, ?it/s]

Property 'summary' already exists in node 'a0d369'. Skipping!
Property 'summary' already exists in node 'df06d1'. Skipping!
Property 'summary' already exists in node '19c732'. Skipping!
Property 'summary' already exists in node '35312b'. Skipping!


Applying CustomNodeFilter: 0it [00:00, ?it/s]

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/8 [00:00<?, ?it/s]

Property 'summary_embedding' already exists in node 'df06d1'. Skipping!
Property 'summary_embedding' already exists in node '35312b'. Skipping!
Property 'summary_embedding' already exists in node '19c732'. Skipping!
Property 'summary_embedding' already exists in node 'a0d369'. Skipping!


Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/1 [00:00<?, ?it/s]

Generating Samples: 0it [00:00, ?it/s]

In [26]:
dataset.to_pandas()

In [None]:
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
from langchain_core.documents import Document

class State(TypedDict):
  question: str
  context: List[Document]
  response: str

In [None]:
def retrieve(state):
  retrieved_docs = retriever.invoke(state["question"])
  return {"context" : retrieved_docs}

In [None]:
def generate(state):
  docs_content = "\n\n".join(doc.page_content for doc in state["context"])
  messages = rag_prompt.format_messages(question=state["question"], context=docs_content)
  response = base_llm.invoke(messages)
  return {"response" : response.content}

In [None]:
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [None]:
for test_row in dataset:
  response = graph.invoke({"question" : test_row.eval_sample.user_input})
  test_row.eval_sample.response = response["response"]
  test_row.eval_sample.retrieved_contexts = [context.page_content for context in response["context"]]

In [None]:
from ragas import EvaluationDataset

evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())

In [None]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))

In [None]:
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity
from ragas import evaluate, RunConfig

custom_run_config = RunConfig(timeout=360)

result = evaluate(
    dataset=evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],
    llm=evaluator_llm,
    run_config=custom_run_config
)
result