# Challenge Three: Testing and Evaluation

In [1]:
# Import all the related libraries
from google import genai
from google.genai import types
import pytest
import re
import json
import pandas as pd

from vertexai.evaluation import (
    EvalTask,
    MetricPromptTemplateExamples,
)
from vertexai.preview.evaluation import notebook_utils
from datetime import datetime

In [2]:
# ---------------- Constants ----------------
PROJECT_ID = "qwiklabs-gcp-02-682b5eee4362"

# Latest stable Gemini model version

MODEL = "gemini-2.0-flash-001"

question_classifier_system_prompt = """
You are an intelligent classification assistant that categorizes user questions into one of the following categories:

1. Employment
2. General Information
3. Emergency Services
4. Tax Related

## Output Format:
Respond only with the category name as a single word or phrase: "Employment", "General Information", "Emergency Services", or "Tax Related".

Do not include explanations, markdown, or additional text.

## Example:
Input: Is there any vaccancy available in post office?
Output: Employment

Input: Any tax rate changes in upcoming financial year?
Output: Tax Related
"""

gov_announcement_system_prompt = """
You are a social media manager for a government agency, responsible for writing clear, concise, and informative social media posts for official government announcements.

## Rules:
- Keep the message under 500 characters where possible.
- The social media post should be like official statement

## Output Format:
Respond only with the social media post text. Do not include explanations or markdown formatting.

## Example:
Announcement: Severe weather warning for Chennai on June 18.
Post: Weather Alert: Heavy rain and strong winds expected in Chennai on June 18. Stay indoors and follow local advisories. #ChennaiWeather

Announcement: Income tax filing deadline extended to August 31.
Post: The deadline for income tax filing has been extended to August 31. Visit the official portal for more details. #TaxUpdate #GovNotice
"""

test_gov_announcement_system_prompt = """
You are a quality control assistant responsible for verifying the quality of a social media post generated for a government announcement.

You will be given:
- The original announcement content
- The generated social media post

Evaluate the social media post using the following criteria:

1. **Relevance**: Does the post accurately reflect the key information in the original announcement (e.g., event, date, location, type of alert)?
2. **Clarity**: Is the message easy to understand for the general public?
3. **Brevity**: Is the post concise and ideally under 280 characters (suitable for Twitter)?
4. **Tone**: Is the tone appropriate for an official government post (clear, calm, and informative)?

ONLY RAW JSON.
Example Output:
{
  "result": "Yes",
  "reason": "Follow all the rules"
}
"""

In [3]:
# Initialize the Gemini AI client to interact with the Vertex AI service.
# Parameters:
# - vertexai=True : specifies that the client should use Google Cloud Vertex AI as the backend.
# - project : GCP project ID where the Vertex AI resources are located.
# - location : Regional endpoint to connect to; "global" refers to a global endpoint.
gemini_client = genai.Client(
    vertexai=True,
    project=PROJECT_ID,
    location="global",
)

In [4]:
def extract_json(text):
    """
    Extracts the JSON object found in the input text string.

    Parameters:
        text (str): The string containing a JSON object.

    Returns:
        dict or None: Parsed JSON object as a Python dictionary if successful,
                      None if no valid JSON is found or parsing fails.
    """
    # Regular expression to find the first JSON object in the text
    match = re.search(r'\{.*?\}', text, re.DOTALL)
    if match:
        try:
            return json.loads(match.group())
        except json.JSONDecodeError:
            print("Invalid JSON format")
            return None
    else:
        print("No JSON found")
        return None

## Functions to Classify questions and Generate a social media post for Government announcement

In [5]:
# Function to classify the user's question
def classify_question(user_input, system_prompt=question_classifier_system_prompt):
  response = gemini_client.models.generate_content(
    model = MODEL,
    contents = [f"Input Prompt: ${user_input}"],
    config = types.GenerateContentConfig(
        temperature = 1,
        top_p = 1,
        system_instruction=[types.Part.from_text(text=system_prompt)]
    )
  )
  return response.text


In [6]:
# Function to create a social media post for government announcement
def create_social_media_post(user_input, system_prompt=gov_announcement_system_prompt):
  response = gemini_client.models.generate_content(
    model = MODEL,
    contents = [f"Input Prompt: ${user_input}"],
    config = types.GenerateContentConfig(
        temperature = 1,
        top_p = 1,
        system_instruction=[types.Part.from_text(text=system_prompt)]
    )
  )
  return response.text

## Test Question classifier AI and Government social media post generator

In [7]:
# Pytest for question classifier AI
# List of test cases
test_cases = [
    ("Are there job openings available in the finance department?", "Employment"),
    ("How do I pay my taxes online?", "Tax Related"),
    ("What should I do if there's an earthquake?", "Emergency Services"),
    ("Where is the city library located?", "General Information"),
]

# Run test cases
for user_input, expected_category in test_cases:
    result = classify_question(user_input)
    print(f"Query: {user_input}")
    print(f"Expected: {expected_category}, Actual: {result.strip()}\n")
    assert result.strip() == expected_category, f"Failed for input: {user_input}"

Query: Are there job openings available in the finance department?
Expected: Employment, Actual: Employment

Query: How do I pay my taxes online?
Expected: Tax Related, Actual: Tax Related

Query: What should I do if there's an earthquake?
Expected: Emergency Services, Actual: Emergency Services

Query: Where is the city library located?
Expected: General Information, Actual: General Information



In [8]:
# Pytest for social media post generator AI
# Sample government announcements and what you'd expect in the post
test_announcements = [
    "Severe weather warning for Mumbai on June 20",
    "Government offices will be closed on August 15 for Independence Day",
    "Schools in Delhi will remain closed tomorrow due to heavy rain",
    "Income tax deadline extended to October 31"
]

def test_create_social_media_post(announcement_content, generated_post):
  response = gemini_client.models.generate_content(
    model = MODEL,
    contents = [f"Announcement: {announcement_content}\n Generated Post: {generated_post}"],
    config = types.GenerateContentConfig(
        temperature = 0,
        top_p = 1,
        system_instruction=[types.Part.from_text(text=test_gov_announcement_system_prompt)]
    )
  )
  clean_json = extract_json(response.text)
  print(f"Announcement: {announcement_content}, Response: {clean_json['result']}, Reason: {clean_json['reason']}")
  assert clean_json['result'] == 'Yes', f"Failed for announcement: {announcement_content} because {clean_json['reason']}"

for announcement in test_announcements:
  post = create_social_media_post(announcement)
  test_create_social_media_post(announcement, post)



Announcement: Government offices will be closed on August 15 for Independence Day, Response: Yes, Reason: The post accurately reflects the announcement, is clear, brief, and maintains an appropriate tone.
Announcement: Schools in Delhi will remain closed tomorrow due to heavy rain, Response: Yes, Reason: The post accurately reflects the announcement, is clear, brief, and maintains an appropriate tone.
Announcement: Income tax deadline extended to October 31, Response: Yes, Reason: The post accurately reflects the announcement, is clear, brief, and maintains an appropriate tone.


## Evaluate Government social media post generator

## Create a evaluation dataset

In [9]:
prompts = [
    (
        "Severe weather warning for Mumbai on June 20"
    ),
    (
        "Government offices will be closed on August 15 for Independence Day"
    ),
    (
        "Income tax deadline extended to October 31"
    )
]

responses = [
    (
        create_social_media_post(prompts[0])
    ),
    (
        create_social_media_post(prompts[1])
    ),
    (
        create_social_media_post(prompts[2])
    )
]

eval_dataset = pd.DataFrame({
    "prompt": prompts,
    "response": responses
})

## Create evaluation task and execute

In [10]:
ts = datetime.now().strftime("%Y%m%d%H%M%S")
eval_task = EvalTask(
    dataset = eval_dataset,
    metrics=[
        MetricPromptTemplateExamples.Pointwise.GROUNDEDNESS,
        MetricPromptTemplateExamples.Pointwise.VERBOSITY,
        MetricPromptTemplateExamples.Pointwise.INSTRUCTION_FOLLOWING,
        MetricPromptTemplateExamples.Pointwise.SAFETY
    ],
    experiment=f"social-media-post-generator-{ts}"
)

result = eval_task.evaluate(
    prompt_template="Prompt: {prompt}. response: {response}",
    experiment_run_name=f"social-media-post-generator-{ts}"
)
eval_results = []
eval_results.append(result)

INFO:vertexai.evaluation.eval_task:Logging Eval Experiment metadata: {'prompt_template': 'Prompt: {prompt}. response: {response}'}
INFO:vertexai.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 12 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 12/12 [00:01<00:00,  6.99it/s]
INFO:vertexai.evaluation._evaluation:All 12 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:1.727107247999811 seconds


## Display the evaluation result

In [11]:
notebook_utils.display_eval_result(eval_result=result)

### Summary Metrics

Unnamed: 0,row_count,groundedness/mean,groundedness/std,verbosity/mean,verbosity/std,instruction_following/mean,instruction_following/std,safety/mean,safety/std
0,3.0,1.0,0.0,0.0,0.0,5.0,0.0,1.0,0.0


### Row-based Metrics

Unnamed: 0,prompt,response,groundedness/explanation,groundedness/score,verbosity/explanation,verbosity/score,instruction_following/explanation,instruction_following/score,safety/explanation,safety/score
0,Prompt: Severe weather warning for Mumbai on J...,Weather Alert: Heavy rainfall and strong winds...,All aspects of the response are attributable t...,1.0,The response is appropriately concise and prov...,0.0,The response accurately and completely fulfill...,5.0,"The response contains no hate speech, harassme...",1.0
1,Prompt: Government offices will be closed on A...,Government offices will be closed on August 15...,"The response is fully grounded, as it only rep...",1.0,"The response is perfectly concise, providing a...",0.0,The response accurately and completely fulfill...,5.0,The response is safe because it does not conta...,1.0
2,Prompt: Income tax deadline extended to Octobe...,Income tax filing deadline extended to October...,The response is fully grounded in the user pro...,1.0,The response is perfectly concise and provides...,0.0,The response successfully repeated the user pr...,5.0,The response is safe because it does not conta...,1.0
