# Challenge three: Testing and evaluation
Goal: Demonstrate your ability to write tests and evaluate responses from large language models.

### Install required packages

In [1]:
!pip install langchain-google-vertexai --quiet

### Import required packages

In [2]:
from google import genai
from langchain_google_vertexai import ChatVertexAI
from langchain_core.messages import SystemMessage, HumanMessage

### Setup variables

In [3]:
# Vertex AI model, project, and region settings for this notebook.
MODEL = "gemini-2.0-flash"
PROJECT_ID = "qwiklabs-gcp-03-7a8bdf6e2e2c"
LOCATION = "us-central1"

### Create model

In [5]:
# Chat model shared by the classification, generation, and review helpers below.
# The model, project, and region are taken from the setup constants so that a
# change in the setup cell actually takes effect here (the model name was
# previously hard-coded and the project/region fell back to ambient defaults).
llm = ChatVertexAI(
    model_name=MODEL,
    project=PROJECT_ID,
    location=LOCATION,
    temperature=0,  # minimise run-to-run variation in the model's answers
    max_tokens=None,
    max_retries=2,
    stop=None,
)

### Create function to classify category

In [6]:
from pydantic import BaseModel, Field
# Structured-output schema passed to llm.with_structured_output() in classify().
# NOTE(review): the class docstring and the Field description are presumably
# forwarded to the model as part of the schema, so treat them as prompt text —
# confirm before rewording either of them.
class ResponseFormatter(BaseModel):
    """Always use this tool to structure your response to the user."""
    category: str = Field(description="The category of the user's question")


def classify(question: str) -> str:
    """Classify a user question into one of the four supported categories.

    The question is sent to the shared ``llm`` together with a system prompt
    listing the categories, and the reply is parsed into a
    ``ResponseFormatter`` via structured output.

    Args:
        question: The user's question in free text.

    Returns:
        The ``category`` field of the structured reply.

    Raises:
        ValueError: If the model returns no structured result at all.
    """
    categories = ["Employment", "General Information", "Emergency Services", "Tax Related"]
    system_prompt = (
          f"""You are a helpful classifier. Your task is to classify the user's question into one of the following categories: {', '.join(categories)}.\n
          Respond ONLY in the following JSON format:
          {{
          "category": "<one of the four categories above>"
          }}"""
    )
    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(
            content=f"question: {question}"
        ),
    ]
    response = llm.with_structured_output(ResponseFormatter).invoke(messages)
    # The structured-output parser can yield None when the model does not
    # produce a parsable result; fail with a clear message instead of an
    # AttributeError on `.category`.
    if response is None:
        raise ValueError(
            f"Model returned no structured classification for question: {question!r}"
        )
    return response.category

In [7]:
# Try the classifier on a sample question and show the returned category.
print(classify("How many leaves are allowed?"))

Employment


### Unit test classification

In [8]:
import unittest
class test_classify(unittest.TestCase):
    """Checks that classify() returns the expected category for one sample question each."""

    # Test methods deliberately carry no docstrings: unittest's verbose output
    # would print the docstring instead of the method name.

    def _assert_category(self, question, expected):
        # Shared assertion: classify the question and compare with the expected label.
        self.assertEqual(classify(question), expected)

    def test_classify_Employment(self):
        self._assert_category("How do I apply for a government job?", "Employment")

    def test_classify_General_Information(self):
        self._assert_category(
            "Is the public library open on national holidays?",
            "General Information",
        )

    def test_classify_Emergency_Services(self):
        self._assert_category(
            "Is there any emergency number to call during floods?",
            "Emergency Services",
        )

    def test_classify_Tax_Related(self):
        self._assert_category(
            "When will the tax filing deadline be this year?",
            "Tax Related",
        )


# argv=[''] keeps unittest from parsing the notebook kernel's own arguments;
# exit=False stops it from calling sys.exit() when the run finishes.
unittest.main(argv=[''], verbosity=2, exit=False)

test_classify_Emergency_Services (__main__.test_classify.test_classify_Emergency_Services) ... ok
test_classify_Employment (__main__.test_classify.test_classify_Employment) ... ok
test_classify_General_Information (__main__.test_classify.test_classify_General_Information) ... ok
test_classify_Tax_Related (__main__.test_classify.test_classify_Tax_Related) ... ok

----------------------------------------------------------------------
Ran 4 tests in 1.924s

OK


<unittest.main.TestProgram at 0x7fb9d44809d0>

### Create function to generate social media post

In [9]:
# System prompt used by generate_post() and reused as the "instruction" column
# of the evaluation dataset. It is a plain string (no f-prefix) because it has
# no placeholders; this way a literal brace added later cannot be mistaken for
# a format field.
generate_post_system_prompt = """You are a responsible and professional government communications assistant.

Your task is to generate short, clear, and informative social media posts (such as for Twitter, Facebook, or Instagram) based on official government announcements.

Follow these principles:

1. Be accurate, calm, and authoritative — avoid exaggeration or speculation.
2. Use accessible, inclusive language suitable for the general public.
3. Prioritize clarity over style. Keep the message easy to understand.
4. Use proper hashtags and @mentions if applicable.
5. For emergencies or weather alerts, emphasize action, location, and time clearly.
6. For holidays or school closings, use a respectful and informative tone.
7. Avoid political or promotional language.
8. Keep posts under 280 characters when possible (for Twitter/X compatibility).
9. Include emojis if appropriate, but never at the expense of clarity or seriousness.

Format:
- Output: A single social media post.
- Optionally add hashtags or links at the end if relevant.
- Never include the original announcement text in the post.

Your audience is the general public — students, families, commuters, and citizens.

Generate a suitable post for the following announcement:
"""

def generate_post(announcement: str):
    """Draft a social media post for an official announcement.

    The announcement is sent to the shared ``llm`` as the human message,
    with ``generate_post_system_prompt`` as the system message.

    Args:
        announcement: Text of the announcement to turn into a post.

    Returns:
        The ``content`` of the model's reply.
    """
    system_message = SystemMessage(content=generate_post_system_prompt)
    human_message = HumanMessage(content=f"announcement: {announcement}")
    reply = llm.invoke([system_message, human_message])
    return reply.content

In [10]:
# Try the generator on a short sample announcement and show the resulting post.
print(generate_post("weather emergency tomorrow because of rain"))

Stay safe tomorrow! 🌧️ Heavy rain is expected. Avoid unnecessary travel. Check local news for updates. #WeatherAlert #SafetyFirst



### Create function to evaluate post using LLM

In [11]:
def eval_generate_post(post: str):
    """Ask the LLM to review a social media post against a fixed checklist.

    The checklist (clarity, tone, accuracy, urgency, length, hashtag use,
    language suitability, formatting) lives in the system prompt; the post
    is sent as the human message.

    Args:
        post: Text of the social media post to review.

    Returns:
        ``"Yes"`` if the reviewer approves the post, ``"No"`` if it rejects
        it. Replies that differ from these only in letter case or in
        surrounding punctuation/quotes are normalised to the canonical
        spelling. Any other reply is returned stripped of surrounding
        whitespace but otherwise unchanged.
    """
    # Plain string (no f-prefix): the prompt has no placeholders.
    system_prompt = (
          """
          You are a professional communications reviewer for a government agency.
Your task is to evaluate social media posts intended for public announcements, such as weather emergencies, school closings, public holidays, or civic updates.
Assess each post across the following criteria:

1. **Clarity** – Is the message understandable by the general public (including people with low literacy or non-native speakers)?
2. **Tone** – Is the tone appropriate for the subject (e.g., serious for emergencies, respectful for closures, neutral for general info)?
3. **Accuracy** – Is the information factually correct and free of speculation or misleading language?
4. **Urgency (if applicable)** – Does the post convey a clear call to action in urgent or emergency contexts?
5. **Length** – Is the post concise (preferably under 280 characters for X/Twitter)?
6. **Hashtag/Tag Use** – Are hashtags relevant and not excessive? Are accounts tagged appropriately (if mentioned)?
7. **Language Suitability** – Avoids jargon, political commentary, slang, or marketing language.
8. **Formatting** – Uses proper capitalization, punctuation, and optional emoji appropriately.

Return Yes if all of them are passed otherwise return No
ONLY RETURN Yes or NO
"""
    )
    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(
            content=f"post: {post}"
        ),
    ]
    response = llm.invoke(messages)
    verdict = response.content.strip()
    # The prompt spells the negative verdict both "No" and "NO", while callers
    # compare against "Yes"/"No". Map case and punctuation variants onto the
    # canonical spelling so an equivalent reply does not fail the comparison.
    canonical = {"yes": "Yes", "no": "No"}
    return canonical.get(verdict.strip(" .!\"'*`").casefold(), verdict)

### Unit test generated post

In [12]:
import unittest
class test_generate_post(unittest.TestCase):
    """Checks the LLM reviewer's verdict on generated posts and on hand-written ones."""

    # Test methods deliberately carry no docstrings: unittest's verbose output
    # would print the docstring instead of the method name.

    def _assert_verdict(self, post, expected):
        # Shared assertion: review the post and compare the verdict.
        self.assertEqual(eval_generate_post(post), expected)

    # Posts produced by generate_post() are expected to pass the review.
    def test_generate_post_1(self):
        self._assert_verdict(
            generate_post("Heavy rainfall is expected across Chennai on July 12. The Meteorological Department has issued a red alert."),
            "Yes",
        )

    def test_generate_post_2(self):
        self._assert_verdict(
            generate_post("All government offices will remain closed on August 15 in observance of Independence Day."),
            "Yes",
        )

    # Hand-written posts below are expected to be rejected.
    def test_generate_post_3(self):
        self._assert_verdict("Transport won’t be running that day. Figure it out. 🤷", "No")

    def test_generate_post_4(self):
        self._assert_verdict(
            "There’s a thing happening in the east, might be a cyclone. Stay safe or whatever.",
            "No",
        )

    def test_generate_post_5(self):
        self._assert_verdict(
            "NO WATER IN SECTOR 10 & 11!!! STOCK UP NOW!!! 😱😱😱 #WATERCRISIS #PANIC",
            "No",
        )

    def test_generate_post_6(self):
        self._assert_verdict(
            "Woohoo! No work on Oct 2 🎉 Celebrate Gandhi Day like a boss 😎🔥🔥🔥 #holiday #fun #GandhiJayanti #coolday #nooffice",
            "No",
        )


# Runs every TestCase defined so far in the notebook session, not only this one.
unittest.main(argv=[''], verbosity=2, exit=False)

test_classify_Emergency_Services (__main__.test_classify.test_classify_Emergency_Services) ... ok
test_classify_Employment (__main__.test_classify.test_classify_Employment) ... ok
test_classify_General_Information (__main__.test_classify.test_classify_General_Information) ... ok
test_classify_Tax_Related (__main__.test_classify.test_classify_Tax_Related) ... ok
test_generate_post_1 (__main__.test_generate_post.test_generate_post_1) ... ok
test_generate_post_2 (__main__.test_generate_post.test_generate_post_2) ... ok
test_generate_post_3 (__main__.test_generate_post.test_generate_post_3) ... ok
test_generate_post_4 (__main__.test_generate_post.test_generate_post_4) ... ok
test_generate_post_5 (__main__.test_generate_post.test_generate_post_5) ... ok
test_generate_post_6 (__main__.test_generate_post.test_generate_post_6) ... ok

----------------------------------------------------------------------
Ran 10 tests in 4.849s

OK


<unittest.main.TestProgram at 0x7fb9d042bb90>

### Evaluation Task

In [13]:
import vertexai
from vertexai.evaluation import (
    EvalTask,
    PointwiseMetric,
    PairwiseMetric,
    PointwiseMetricPromptTemplate,
    PairwiseMetricPromptTemplate,
    MetricPromptTemplateExamples,
)
from vertexai.generative_models import (
    GenerativeModel,
    HarmCategory,
    HarmBlockThreshold,
)
import pandas as pd
import plotly.graph_objects as go
from IPython.display import HTML, Markdown, display
import datetime

# Initialise the Vertex AI SDK with the project and region from the setup cell.
vertexai.init(project=PROJECT_ID, location=LOCATION)

### Dataset from the response and prompt

In [24]:
# Evaluation fixtures: each entry pairs an announcement ("prompt") with a
# candidate social media post ("post"). The last two posts reuse texts that the
# unit tests earlier in this notebook expect the reviewer to reject ("No").
posts = [
    {
        "prompt": "Heavy rainfall is expected across Chennai on July 12. The Meteorological Department has issued a red alert.",
        "post": "Red alert issued for Chennai on July 12. Expect heavy rainfall. Stay indoors and avoid unnecessary travel. Stay safe. #ChennaiRains #WeatherAlert"
    },
    {
        "prompt": "All government offices will remain closed on August 15 in observance of Independence Day.",
        "post": "Government offices will be closed on August 15 for Independence Day. #IndependenceDay"
    },
    {
        "prompt": "The main flyover at MG Road in Bengaluru will be closed for maintenance work from June 20 to June 22. Commuters should plan alternate routes.",
        "post": "MG Road flyover in Bengaluru will be closed for maintenance from June 20-22. Please use alternate routes. Plan your commute accordingly. #BengaluruTraffic #RoadClosure"
    },
    {
        "prompt": "An outbreak of dengue has been reported in several parts of Delhi. Citizens are advised to avoid stagnant water, wear full sleeves, and use mosquito repellents.",
        "post": "Delhi residents: Take precautions against dengue fever. Avoid stagnant water, wear long sleeves, and use mosquito repellent. Stay safe! #Delhi #DengueAlert #PublicHealth"
    },
    {
        "prompt": "Due to extreme heatwave conditions, all schools in Rajasthan will remain closed from May 10 to May 14.",
        "post": "All schools in Rajasthan will be closed from May 10 to May 14 due to the extreme heatwave. Stay safe and hydrated. #Rajasthan #Heatwave #SchoolClosure"
    },
    {
        "prompt": "The Municipal Corporation will carry out a water supply maintenance shutdown in Sector 10 and 11 from 10 AM to 6 PM on June 21.",
        "post": "NO WATER IN SECTOR 10 & 11!!! STOCK UP NOW!!! 😱😱😱 #WATERCRISIS #PANIC"
    },
    {
        "prompt": "Public transport services will be partially suspended in Delhi on August 15 for Independence Day security arrangements.",
        "post": "Transport won’t be running that day. Figure it out. 🤷"
    }
]

### Creating evaluation dataset

In [25]:
def _to_eval_row(item):
    # One evaluation row: the shared system prompt as the instruction, the
    # announcement as context, and the candidate post as the response.
    return {
        "instruction": generate_post_system_prompt,
        "context": f"announcement: {item['prompt']}",
        "response": item["post"],
    }


# One DataFrame row per entry in `posts`, in the same order.
eval_dataset = pd.DataFrame([_to_eval_row(item) for item in posts])

In [26]:
# Timestamp (local time, one-second resolution) used to name the experiment
# and, in the next cell, the experiment run.
run_ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Built-in pointwise metrics computed for every row of the dataset.
pointwise_metrics = [
    MetricPromptTemplateExamples.Pointwise.GROUNDEDNESS,
    MetricPromptTemplateExamples.Pointwise.VERBOSITY,
    MetricPromptTemplateExamples.Pointwise.INSTRUCTION_FOLLOWING,
    MetricPromptTemplateExamples.Pointwise.SAFETY,
]

eval_task = EvalTask(
    dataset=eval_dataset,
    metrics=pointwise_metrics,
    experiment="social-media-post-" + run_ts,
)

### Running evaluation

In [27]:
# Template whose placeholders are filled from the dataset columns of the same
# name to assemble the `prompt` column of the results.
prompt_template = "Instruction: {instruction}. Prompt: {context}. Post: {response}"

# Score the pre-written responses in the dataset; the run name reuses the
# timestamp from the experiment name.
result = eval_task.evaluate(
    prompt_template=prompt_template,
    experiment_run_name="social-media-post-" + run_ts,
)

# Results are kept in a list.
evaluation_results = [result]

INFO:vertexai.evaluation.eval_task:Logging Eval Experiment metadata: {'prompt_template': 'Instruction: {instruction}. Prompt: {context}. Post: {response}'}
INFO:vertexai.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 28 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 28/28 [00:03<00:00,  8.00it/s]
INFO:vertexai.evaluation._evaluation:All 28 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:3.5102648080028303 seconds


### Comparing results

In [28]:
# Render the summary and per-row metric tables for the evaluation result.
from vertexai.preview.evaluation import notebook_utils
notebook_utils.display_eval_result(eval_result=result)

### Summary Metrics

Unnamed: 0,row_count,groundedness/mean,groundedness/std,verbosity/mean,verbosity/std,instruction_following/mean,instruction_following/std,safety/mean,safety/std
0,7.0,1.0,0.0,-0.285714,0.755929,3.857143,1.9518,0.857143,0.377964


### Row-based Metrics

Unnamed: 0,instruction,context,response,prompt,groundedness/explanation,groundedness/score,verbosity/explanation,verbosity/score,instruction_following/explanation,instruction_following/score,safety/explanation,safety/score
0,You are a responsible and professional governm...,announcement: Heavy rainfall is expected acros...,Red alert issued for Chennai on July 12. Expec...,Instruction: You are a responsible and profess...,"The response is fully grounded in the prompt, ...",1.0,The response is perfectly concise and provides...,0.0,"The response adheres to all instructions, prov...",5.0,The response is safe as it does not contain an...,1.0
1,You are a responsible and professional governm...,announcement: All government offices will rema...,Government offices will be closed on August 15...,Instruction: You are a responsible and profess...,The response is fully grounded in the provided...,1.0,The response is perfectly concise and provides...,0.0,The response completely fulfills the prompt's ...,5.0,The response is safe because it does not conta...,1.0
2,You are a responsible and professional governm...,announcement: The main flyover at MG Road in B...,MG Road flyover in Bengaluru will be closed fo...,Instruction: You are a responsible and profess...,The response is completely grounded in the pro...,1.0,"The response is perfectly concise, providing a...",0.0,The response accurately summarizes the announc...,5.0,The response is safe as it does not contain an...,1.0
3,You are a responsible and professional governm...,announcement: An outbreak of dengue has been r...,Delhi residents: Take precautions against deng...,Instruction: You are a responsible and profess...,The response is fully grounded as it only uses...,1.0,The response is appropriately concise and prov...,0.0,The response perfectly follows the instruction...,5.0,The response is safe as it does not contain ha...,1.0
4,You are a responsible and professional governm...,announcement: Due to extreme heatwave conditio...,All schools in Rajasthan will be closed from M...,Instruction: You are a responsible and profess...,The response accurately reflects the informati...,1.0,The response is appropriately concise and prov...,0.0,"The response followed the instructions, provid...",5.0,The response is safe as it does not contain an...,1.0
5,You are a responsible and professional governm...,announcement: The Municipal Corporation will c...,NO WATER IN SECTOR 10 & 11!!! STOCK UP NOW!!! ...,Instruction: You are a responsible and profess...,The response completely contains the informati...,1.0,"The response is perfectly concise, providing a...",0.0,The response fails to adhere to almost all of ...,1.0,The response uses alarming language and emojis...,0.0
6,You are a responsible and professional governm...,announcement: Public transport services will b...,Transport won’t be running that day. Figure it...,Instruction: You are a responsible and profess...,"The response is fully grounded, with all infor...",1.0,The response is excessively brief and lacks cr...,-2.0,"The response fails to adopt the tone, clarity,...",1.0,"The response is sarcastic and unhelpful, but d...",1.0
