In [1]:
import json
import os
import re
from typing import Any, Dict, Optional, Tuple

import google.genai as genai
import requests
from google.genai import types

In [2]:
# Read the key from the environment so the secret never lands in the notebook
# source or its saved outputs. Export GOOGLE_API_KEY or GEMINI_API_KEY before
# starting the kernel.
# NOTE: the key that used to be hardcoded here must be treated as compromised
# and rotated.
api_key = os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY")

In [3]:
# Build the Gemini client that the cells below pass to generate_text / evaluate_answer.
client = genai.Client(api_key=api_key)

In [4]:
# Gemini Generate text functions
def extract_text(r):
    """
    Gather the text carried by a response object.

    Walks ``r.candidates`` -> ``candidate.content`` -> ``content.parts`` and
    joins every non-empty ``part.text`` with newlines. Missing or falsy
    attributes at any level are skipped. When no part text is found, falls
    back to ``r.text`` and finally to the empty string.

    Args:
        r: Response-like object; every attribute is read defensively via getattr.

    Returns:
        str: The joined part texts, ``r.text``, or "".
    """
    def _iter_part_texts():
        # Yield each truthy part text in candidate order, then part order.
        for candidate in getattr(r, "candidates", []) or []:
            content = getattr(candidate, "content", None)
            if not content:
                continue
            for part in getattr(content, "parts", []) or []:
                part_text = getattr(part, "text", None)
                if part_text:
                    yield part_text

    joined = "\n".join(_iter_part_texts())
    if joined:
        return joined
    # Only consult r.text when nothing was collected from the parts.
    return getattr(r, "text", "") or ""
    
def generate_text(client, prompt: str, max_tokens: int = 1024, temperature: float = 0.7, verbose: int = 0) -> Tuple[Optional[str], Optional[Any]]:
    """
    Generate text using Gemini API.

    Args:
        client: Object exposing ``models.generate_content`` (a ``genai.Client``
            in this notebook)
        prompt (str): The input prompt for text generation
        max_tokens (int): Maximum number of tokens to generate
        temperature (float): Sampling temperature forwarded to the API
        verbose (int): When greater than 0, print the extracted text

    Returns:
        Tuple[Optional[str], Optional[Any]]: ``(text, response)`` where ``text``
        is the output of ``extract_text`` and ``response`` is the raw API
        response. ``(None, None)`` if the request raises.
    """
    try:
        resp = client.models.generate_content(
            model="gemini-2.5-flash",
            contents=[{"role": "user", "parts": [{"text": prompt}]}],
            config=types.GenerateContentConfig(
                max_output_tokens=max_tokens,
                temperature=temperature,
            )
        )
        answer_text = extract_text(resp)
        if verbose > 0:
            print(f"Answer to be Evaluation Response: {answer_text}")
        return answer_text, resp
    except Exception as e:
        # Best-effort helper: report the failure and signal it with a
        # (None, None) pair so callers that unpack two values keep working.
        print(f"An error occurred: {e}")
        return None, None

In [5]:
# Gemini text evaluation
def extract_score(evaluation_text: str) -> float:
    """
    Extract numerical score from evaluation text.

    Three notations are tried in priority order, and the first value that
    lands in [0.0, 1.0] after normalisation is returned:

    1. a bare decimal such as ``0.75``, ``1.0`` or ``0.0``
    2. an ``X/10`` fraction such as ``8/10`` or ``7.5/10`` (divided by 10)
    3. a percentage such as ``75%`` (divided by 100)

    Args:
        evaluation_text (str): The evaluation response text

    Returns:
        float: Extracted score or 0.0 if extraction fails
    """
    if not evaluation_text:
        print(f"Could not extract score from: {evaluation_text}")
        return 0.0

    # Each entry is (regex with one capture group holding the number, divisor).
    # re.findall returns only the captured group, so the notation cannot be
    # recovered from the matched string; the divisor is carried alongside.
    score_patterns = [
        # Decimal format (0.75, 1.0, 0.0). The lookahead leaves "0.5/10" to the
        # fraction pattern so it is read as 0.05 rather than 0.5.
        (r'\b(0\.\d+|1\.0+|0\.0+)\b(?!\s*/\s*10\b)', 1.0),
        # X/10 format, including multi-digit numerators such as 10/10.
        (r'\b(\d+(?:\.\d+)?)\s*/\s*10\b', 10.0),
        # Percentage format. No trailing \b: '%' followed by a space or the end
        # of the text is not a word boundary.
        (r'\b(\d+(?:\.\d+)?)\s*%', 100.0),
    ]

    for pattern, divisor in score_patterns:
        for raw_value in re.findall(pattern, evaluation_text):
            score = float(raw_value) / divisor
            # Ensure score is in valid range; otherwise keep looking.
            if 0.0 <= score <= 1.0:
                return score

    print(f"Could not extract score from: {evaluation_text}")
    return 0.0

def evaluate_answer(
    client,
    question: str, answer: str, reference_answer: Optional[str] = None,
    system_prompt: str = """
    Please evaluate the quality of the given answer to the question on a scale of 0.0 to 1.0, where:
    - 0.0 = Completely incorrect, irrelevant, or nonsensical
    - 0.5 = Partially correct but missing key information or has some errors
    - 1.0 = Excellent, accurate, and comprehensive answer
    
    Consider accuracy, completeness, clarity, and relevance. Respond with just the numerical score (e.g., 0.75) followed by a brief explanation.
    If Reference answer is provided, answer according to its information only.
    """,
    verbose: int = 0
) -> float:
    """
    Evaluate the quality of an answer using Gemini API.

    Args:
        client: Client object forwarded to ``generate_text``
        question (str): The original question
        answer (str): The answer to evaluate
        reference_answer (str, optional): Reference answer for comparison
        system_prompt (str): Grading instructions placed before the question
        verbose (int): When greater than 0, print the raw evaluation and the score

    Returns:
        float: Evaluation score between 0.0 and 1.0, or -1.0 when no
        evaluation text could be obtained from the model
    """
    # Construct evaluation prompt. Sections are separated by real newlines
    # (the previous "/n" was sent to the model as two literal characters).
    prompt_sections = [
        system_prompt,
        f"\nQuestion: {question}",
        f"\nAnswer to evaluate: {answer}",
    ]
    if reference_answer:
        prompt_sections.append(f"\nReference answer: {reference_answer}")
    evaluation_prompt = "".join(prompt_sections)

    # Get evaluation from Gemini; the raw response object is not needed here.
    evaluation_response, _ = generate_text(client, evaluation_prompt, max_tokens=4096, temperature=0.1)
    if verbose > 0:
        print(f"Evaluation Response: {evaluation_response}")

    if not evaluation_response:
        print("Failed to get evaluation response")
        # Sentinel outside the valid range so callers can tell an API failure
        # apart from a genuine score of 0.0.
        return -1.0

    # Extract numerical score from response
    score = extract_score(evaluation_response)

    if verbose > 0:
        print(f"Score: {score:.2f}")
    return max(0.0, min(1.0, score))  # Ensure score is between 0.0 and 1.0

In [6]:
verbose = 1
question = "What is the capital of France?"
reference_answer = "The capital of France is Paris, which is also the country's largest city and cultural center."
# generate_text returns a (text, raw_response) pair. Unpack it so answer_text is
# the plain string; otherwise the next cell would embed the whole tuple
# (including the response object's repr) in the evaluation prompt.
answer_text, answer_response = generate_text(client, question, max_tokens = 1024, temperature = 0.7, verbose = verbose)

Answer to be Evaluation Response: The capital of France is **Paris**.


In [7]:
# Grade the generated answer against the reference answer.
score = evaluate_answer(client, question, answer_text, reference_answer = reference_answer, verbose = verbose)

Evaluation Response: 1.0
The answer is accurate, clear, and directly answers the question. While the reference answer provides additional context (largest city and cultural center), the given answer fully addresses the specific question asked, making it excellent and comprehensive for the query.
Score: 1.00


In [8]:
# Hand-written incorrect answer, graded against the same reference.
answer_text = "The capital of France is **Bangkok**."
score = evaluate_answer(client, question, answer_text, reference_answer = reference_answer, verbose = verbose)

Evaluation Response: 0.0
The answer is completely incorrect. The capital of France is Paris, not Bangkok.
Score: 0.00


In [9]:
# Hedged answer that mixes the correct city with an incorrect one.
answer_text = "The capital of France is likely **Paris** but maybe **Bangkok**."
score = evaluate_answer(client, question, answer_text, reference_answer = reference_answer, verbose = verbose)

Evaluation Response: 0.5 The answer correctly identifies Paris as the likely capital but introduces Bangkok as a possibility, which is completely incorrect and misleading. The "likely" also adds an unnecessary degree of uncertainty compared to the definitive reference.
Score: 0.50


In [10]:
# Swap in a made-up reference while reusing the previous answer_text. The default
# system prompt tells the grader to rely on the reference only, so this
# presumably checks that the score follows the reference rather than world knowledge.
reference_answer = "The capital of France is Mararis, they just moved it here"
score = evaluate_answer(client, question, answer_text, reference_answer = reference_answer, verbose = verbose)

Evaluation Response: 0.0 The evaluated answer states "Paris" and "Bangkok," neither of which matches the capital "Mararis" provided in the reference answer. According to the reference, the answer is completely incorrect.
Score: 0.00


## Productionize Functions

In [11]:
import logging
import google.genai as genai

In [12]:
import genai_functions.gemini_textgen_functions as gemini_textgen_functs
import genai_functions.gemini_texteva_functions as gemini_texteva_functs
import genai_functions.gemini_usage_logging as gemini_log_functs

In [13]:
# Read the key from the environment so the secret never lands in the notebook
# source or its saved outputs. Export GOOGLE_API_KEY or GEMINI_API_KEY before
# starting the kernel.
# NOTE: the key that used to be hardcoded here must be treated as compromised
# and rotated.
api_key = os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY")

In [14]:
# Thai for "What is the capital of Thailand?"
question = "เมืองหลวงของประเทศไทยคือ?"

In [15]:
# Show INFO-level log records (including those from the genai_functions helpers)
# in the cell output.
logging.basicConfig(level=logging.INFO)
client = genai.Client(api_key=api_key)

# NOTE(review): GenerationConfig and GeminiTextGenerator are defined in
# genai_functions.gemini_textgen_functions, which is not visible here; their
# defaults and side effects should be confirmed there.
config    = gemini_textgen_functs.GenerationConfig(max_tokens=512, temperature=0.5)
generator = gemini_textgen_functs.GeminiTextGenerator(client, config)
# The result is unpacked as a (text, raw response) pair.
res_text, response = generator.generate_text(question)
print(res_text)

INFO:genai_functions.gemini_textgen_functions:GeminiTextGenerator initialized with model: gemini-2.5-flash
INFO:genai_functions.gemini_textgen_functions:Generating text with model: gemini-2.5-flash, max_tokens: 512, temperature: 0.50
INFO:google_genai.models:AFC is enabled with max remote calls: 10.
INFO:httpx:HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent "HTTP/1.1 200 OK"
INFO:google_genai.models:AFC remote call 1 is done.
INFO:genai_functions.gemini_usage_logging:GeminiUsageLogger initialized with 4 existing logs (persist=True)
INFO:genai_functions.gemini_usage_logging:Added log entry: query_len=25, total_tokens=56
INFO:genai_functions.gemini_textgen_functions:Successfully generated 51 characters of text


เมืองหลวงของประเทศไทยคือ **กรุงเทพมหานคร** ครับ/ค่ะ


In [16]:
# Evaluator configured with temperature 0.0 and a 1024-token limit.
# NOTE(review): EvaluationConfig / GeminiTextEvaluator come from
# genai_functions.gemini_texteva_functions (not shown) — confirm their semantics there.
eva_config = gemini_texteva_functs.EvaluationConfig(temperature=0.0, max_tokens=1024)
evaluator  = gemini_texteva_functs.GeminiTextEvaluator(client, eva_config)

INFO:genai_functions.gemini_texteva_functions:GeminiTextEvaluator initialized


In [17]:
# English reference answer for the Thai question above.
reference_answer = "Capital of Thailand is Bangkok"

In [18]:
# Grade the same generated answer twice: once without and once with a reference answer.
score_wo_ref = evaluator.evaluate_answer_quality(question, res_text, verbose=True)
score_w_ref  = evaluator.evaluate_answer_quality(question, res_text, reference_answer, verbose=True)
print(f"score_wo_ref: {score_wo_ref}")
print(f"score_w_ref:  {score_w_ref}")

INFO:genai_functions.gemini_texteva_functions:Evaluating answer quality for question length: 25, answer length: 51
INFO:genai_functions.gemini_texteva_functions:Evaluation prompt: 
Please evaluate the quality of the given answer to the question on a scale of 0.0 to 1.0, where:
- 0.0 = Completely incorrect, irrelevant, or nonsensical
- 0.5 = Partially correct but missing key information or has some errors
- 1.0 = Excellent, accurate, and comprehensive answer

Consider accuracy...

INFO:google_genai.models:AFC is enabled with max remote calls: 10.
INFO:httpx:HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent "HTTP/1.1 200 OK"
INFO:google_genai.models:AFC remote call 1 is done.
INFO:genai_functions.gemini_usage_logging:GeminiUsageLogger initialized with 5 existing logs (persist=True)
INFO:genai_functions.gemini_usage_logging:Added log entry: query_len=607, total_tokens=367
INFO:genai_functions.gemini_texteva_functions:Evaluation res

score_wo_ref: 1.0
score_w_ref:  1.0


In [19]:
# Presumably a deliberately incorrect reference, used to see whether the
# evaluator's score follows the reference rather than the model's own knowledge.
reference_answer = "Capital of Thailand is Chaing Mai, they juse moved it here"

In [20]:
# Re-grade the unchanged answer against the replaced reference.
score_w_ref  = evaluator.evaluate_answer_quality(question, res_text, reference_answer, verbose=True)
print(f"score_w_ref: {score_w_ref}")

INFO:genai_functions.gemini_texteva_functions:Evaluating answer quality for question length: 25, answer length: 51
INFO:genai_functions.gemini_texteva_functions:Evaluation prompt: 
Please evaluate the quality of the given answer to the question on a scale of 0.0 to 1.0, where:
- 0.0 = Completely incorrect, irrelevant, or nonsensical
- 0.5 = Partially correct but missing key information or has some errors
- 1.0 = Excellent, accurate, and comprehensive answer

Consider accuracy...

INFO:google_genai.models:AFC is enabled with max remote calls: 10.
INFO:httpx:HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent "HTTP/1.1 200 OK"
INFO:google_genai.models:AFC remote call 1 is done.
INFO:genai_functions.gemini_usage_logging:GeminiUsageLogger initialized with 7 existing logs (persist=True)
INFO:genai_functions.gemini_usage_logging:Added log entry: query_len=684, total_tokens=428
INFO:genai_functions.gemini_texteva_functions:Evaluation res

score_w_ref: 0.0


In [25]:
import genai_functions.gemini_usage_logging as gemini_log_functs
import pprint

# Open the usage log at a path relative to the notebook's working directory.
# NOTE(review): GeminiUsageLogger is defined in genai_functions.gemini_usage_logging
# (not shown); the log output suggests it loads existing entries on construction — confirm.
gemini_logger = gemini_log_functs.GeminiUsageLogger(log_path="logs/gemini_usage.csv")
pprint.pprint(gemini_logger.get_usage_summary())
logs_df = gemini_logger.get_logs_dataframe()
# Show only the most recent rows rather than the whole log.
logs_df.tail()

INFO:genai_functions.gemini_usage_logging:GeminiUsageLogger initialized with 8 existing logs (persist=True)


{'avg_tokens_per_request': 166.625,
 'date_range': (Timestamp('2025-09-08 07:22:02.188114+0000', tz='UTC'),
                Timestamp('2025-09-08 08:41:11.159131+0000', tz='UTC')),
 'finish_reasons': {'STOP': 4},
 'total_requests': 8,
 'total_tokens': 1333}


Unnamed: 0,timestamp,query,uploaded_file,response_text,finish_reason,cached_content_token_count,candidates_token_count,prompt_token_count,thoughts_token_count,total_token_count
3,2025-09-08 07:37:47.978513+00:00,What is RAG?,,,,,,,,
4,2025-09-08 08:41:05.392691+00:00,เมืองหลวงของประเทศไทยคือ?,,เมืองหลวงของประเทศไทยคือ **กรุงเทพมหานคร** ครั...,STOP,,15.0,7.0,34.0,56.0
5,2025-09-08 08:41:07.095227+00:00,\nPlease evaluate the quality of the given ans...,,1.0\nคำตอบถูกต้อง ครบถ้วน ชัดเจน และตรงประเด็น,STOP,,19.0,152.0,196.0,367.0
6,2025-09-08 08:41:09.551516+00:00,\nPlease evaluate the quality of the given ans...,,1.0\nThe answer correctly identifies the capit...,STOP,,67.0,161.0,254.0,482.0
7,2025-09-08 08:41:11.159131+00:00,\nPlease evaluate the quality of the given ans...,,0.0\nThe reference answer explicitly states th...,STOP,,39.0,170.0,219.0,428.0
