In [1]:
from evaluator import LLMJudge, ReportProcessor
import os

#### LLM Setup


In [2]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini judge model configuration.
# The API key is read from the environment so it is never stored in the notebook.
api_key = os.environ.get("GEMINI_API_KEY")
if api_key is None or api_key == "":
    # Fail fast with a clear message instead of a late authentication error.
    raise ValueError("GEMINI_API_KEY not set in environment.")

llm = ChatGoogleGenerativeAI(
    google_api_key=api_key,
    model="gemini-2.0-flash",  # pinned model name; chosen as the latest reliable one as of mid-2025
    temperature=0,  # zero sampling temperature, presumably to keep scores repeatable
    max_retries=2,
    max_tokens=2048,  # upper bound on the length of each generated response
)

#### Process sample report


In [3]:
# Locate the sample report relative to the current working directory, so the
# notebook must be started from the directory that contains `reports/`.
example_path = os.path.join(os.getcwd(), "reports/report_sudan_20250607.md")
# Hand the report file to ReportProcessor (imported from the local `evaluator` module).
example = ReportProcessor(example_path)

In [4]:
# Split the report into its sections; the result is a dict whose keys are the
# section headings. Print a label followed by those keys, one item per line.
example_sections = example.get_sections()
print('SECTION NAMES:', example_sections.keys(), sep='\n')

SECTION NAMES:
dict_keys(['Summary of Recent Developments', 'Key Alliance Shifts', 'Security Implications for UN Operations', 'Forward Outlook', 'Key Trends', 'Hotspots', 'Broader Conflict Context'])


#### Evaluate all sections


In [5]:
# Build the judge around the Gemini chat model configured earlier in this notebook.
evaluator = LLMJudge(model=llm)

# Score every report section. The call prints one "Evaluating section: ..."
# progress line per section, and the result is keyed by section name.
response = evaluator.evaluate_all_sections(example_sections)

Evaluating section: Summary of Recent Developments
Evaluating section: Key Alliance Shifts
Evaluating section: Security Implications for UN Operations
Evaluating section: Forward Outlook
Evaluating section: Key Trends
Evaluating section: Hotspots
Evaluating section: Broader Conflict Context


In [6]:
# Show the nested results: section name -> criterion id (e.g. 'accuracy_1')
# -> {'score': ..., 'comment': ...}.
response

{'Summary of Recent Developments': {'accuracy_1': {'score': 3,
   'comment': "The summary states that violence dropped to its lowest levels in April 2025, but it's unclear if this is accurate without further context or comparison to specific data. To improve, provide a quantifiable comparison (e.g., 'X% lower than the average of the previous six months') and cite the source of the data."},
  'accuracy_2': {'score': 4,
   'comment': 'The claims about territorial control seem plausible, but to reach a score of 5, the report should explicitly cite sources or data that confirm these claims (e.g., specific reports from monitoring groups or satellite imagery analysis).'},
  'accuracy_3': {'score': 4,
   'comment': 'No data points are explicitly missing or misstated, but the lack of specific figures (e.g., casualty numbers) makes it difficult to fully assess accuracy. Including key figures with sources would improve this.'},
  'relevance_1': {'score': 4,
   'comment': 'The summary captures cr