In [11]:
# Import core libraries

import json             # Read/write JSON files
import os               # File and directory operations
import re               # Strip boilerplate lead-ins from model replies
import time

from anthropic import Anthropic



In [None]:
# Initialize Anthropic client.
# The key is read from the ANTHROPIC_API_KEY environment variable only. Never embed
# a key in the notebook as a fallback: any key that was committed here is exposed
# and must be revoked and rotated.
_api_key = os.getenv("ANTHROPIC_API_KEY")
if not _api_key:
    raise RuntimeError(
        "ANTHROPIC_API_KEY is not set; export it before running this notebook."
    )
client = Anthropic(api_key=_api_key)

In [13]:
# Define base path for all input/output data.
# Set the WB_PRIOR_ACTIONS_PATH environment variable to run on another machine;
# when it is unset the original location below is used unchanged.
path = os.getenv(
    "WB_PRIOR_ACTIONS_PATH",
    '/Users/pastudilloe/Library/CloudStorage/Dropbox/01 CONSULTING/WB_PriorActions_Poverty',
)


In [14]:
# Input and output locations, each resolved underneath the base `path`
universe_file = f"{path}/Helpers/Prior_Actions_PROCESSED.txt"
processed_reports_folder = f"{path}/Datasets/Processed/policy_reports_test"
classification_output_file = f"{path}/Datasets/Processed/classification_results_test_CLAUDE1.json"


In [15]:
# Load the universe JSON into a Python dict

def load_universe(file_path: str) -> dict:
    """Parse the pre-processed universe file (UTF-8 JSON) and return its content."""
    with open(file_path, mode='r', encoding='utf-8') as universe_handle:
        return json.load(universe_handle)

In [16]:
# Read in all processed report texts into a dict

def load_reports(folder_path: str) -> dict:
    """
    Load every .txt file located directly inside folder_path.

    Args:
        folder_path: Directory holding the processed report files.

    Returns:
        { filename: full_text, ... } with entries inserted in sorted filename
        order, so downstream processing and the saved output are reproducible.

    Raises:
        OSError: if folder_path cannot be listed or a file cannot be read.
        UnicodeDecodeError: if a report is not valid UTF-8.
    """
    reports = {}
    # os.listdir returns names in arbitrary order; sort for deterministic runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, filename)
        # A directory whose name happens to end in ".txt" cannot be opened as a file.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            reports[filename] = f.read()
    return reports


In [17]:
# Build a short summary of the universe for prompting

def build_universe_summary(universe_data: dict) -> str:
    """
    Render the universe as a compact text outline for use in a prompt.

    Every topic gets a "Topic:" line, followed by one entry per policy area
    showing the first 100 characters of its description plus a trailing "...",
    and a blank line closes each topic.
    """
    pieces = []
    for topic_name, policy_map in universe_data.items():
        pieces.append(f"Topic: {topic_name}\n")
        pieces.extend(
            f"  Policy Area: {area}\n    {description[:100]}...\n"
            for area, description in policy_map.items()
        )
        pieces.append("\n")
    return "".join(pieces)

In [18]:
# Call Anthropic Claude Haiku to classify a single report against the universe

def _extract_json_object(answer: str):
    """Return the JSON object contained in `answer` as a dict, or None if there is none."""
    candidates = [answer]
    # Replies are sometimes wrapped in markdown fences or prefaced by a sentence,
    # so also try the span from the first "{" to the last "}".
    first_brace, last_brace = answer.find("{"), answer.rfind("}")
    if 0 <= first_brace < last_brace:
        candidates.append(answer[first_brace:last_brace + 1])
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Callers merge metadata into the result, so only a JSON object is usable.
        if isinstance(parsed, dict):
            return parsed
    return None


def classify_report(report_text: str, universe_summary: str, max_report_length: int = 2000) -> dict:
    """
    Use the Claude API to classify a policy report against the universe.

    Args:
        report_text: Body of the report; only the first `max_report_length`
            characters are sent to the model.
        universe_summary: Text outline of topics and policy areas.
        max_report_length: Character budget for the report excerpt (default 2000).

    Returns:
        The parsed classification dict on success. If the reply contains no JSON
        object, a dict with "error" and "raw_response" keys, plus "stop_reason"
        when the API reports one.

    Raises:
        Whatever `client.messages.create` raises (network / API errors).
    """
    # Truncate if too long
    if len(report_text) > max_report_length:
        report_text = report_text[:max_report_length] + "..."

    prompt_body = (
        "You are a World Bank economist and expert in policy analysis. Below is the universe description that defines topics and their policy areas:\n\n"
        f"{universe_summary}\n\n"
        "Now, consider the following policy report text:\n\n"
        f"{report_text}\n\n"
        "Your task is to classify this report by identifying its primary topic and assessing its relevance to each policy area. For each policy area, provide:\n"
        "  1. A binary score: **0** for no or only minimal relevance; **1** for very clear and very meaningful relevance. Exclude if the only reference to the policy area is just an institution.\n"
        "  2. Specific sentences or excerpts from the report that are directly relevant to that policy area. Include only evidence that clearly supports the classification.\n"
        "  3. A detailed explanation of why that score was assigned, explicitly referencing the evidence.\n\n"
        "If the report does not clearly address a policy area, assign a score of 0 and explain that no clear evidence was found.\n\n"
        "Return your answer as a JSON object in the following structure:\n"
        '{\n'
        '  "matched_topic": "Name of topic",\n'
        '  "policy_scores": {\n'
        '      "Policy Area Name": score, ...\n'
        '  },\n'
        '  "evidence": {\n'
        '      "Policy Area Name": ["relevant excerpt 1", "relevant excerpt 2", ...],\n'
        '      ...\n'
        '  },\n'
        '  "explanation": "Detailed explanation with references to the evidence for each policy area."\n'
        '}\n'
        "Ensure the JSON is properly formatted.\n"
        "***IMPORTANT:*** Respond **with only the JSON object**, no additional text or markdown."
    )

    response = client.messages.create(
        model="claude-3-5-haiku-20241022",
        system="You are a World Bank economist and expert in policy analysis.",
        messages=[
            {"role": "user", "content": prompt_body}
        ],
        max_tokens=800,
        temperature=0.2,
    )

    # Join the text of all returned blocks; an empty reply yields "" instead of an IndexError.
    answer = "".join(
        getattr(block, "text", "") or "" for block in (response.content or [])
    ).strip()

    classification = _extract_json_object(answer)
    if classification is None:
        classification = {"error": "Failed to parse JSON response", "raw_response": answer}
        # A stop_reason of "max_tokens" means the reply was cut off at the token limit.
        stop_reason = getattr(response, "stop_reason", None)
        if stop_reason:
            classification["stop_reason"] = stop_reason

    return classification

In [19]:
# Call Anthropic Claude Haiku to generate a one-paragraph summary of the report

def summarize_document(text: str) -> str:
    """
    Ask Claude to produce a brief, one-paragraph summary of `text`.

    Args:
        text: The document text; it is sent to the model in full.

    Returns:
        The summary with any leading "Here is/Here's ... summary ...:" boilerplate
        removed, or "" if the model returned no text.

    Raises:
        Whatever `client.messages.create` raises (network / API errors).
    """
    prompt_body = "In exactly one paragraph, **with no intro**, summarize the following text:\n\n" + text

    response = client.messages.create(
        model="claude-3-5-haiku-20241022",
        system="You are a World Bank economist and expert in policy analysis.",
        messages=[
            {"role": "user", "content": prompt_body}
        ],
        max_tokens=150,
        temperature=0.2,
    )

    # Join the text of all returned blocks; an empty reply yields "" instead of an IndexError.
    answer = "".join(
        getattr(block, "text", "") or "" for block in (response.content or [])
    ).strip()

    # Strip a leading lead-in such as "Here's a one-paragraph summary of the document:"
    # or "Sure, here is a brief summary:". The pattern only matches at the very start,
    # on the first line, and requires the word "summary" before the colon.
    lead_in = re.compile(
        r"^(?:sure[,.!]?\s+)?here(?:'s|’s|\s+is)\s+[^:\n]*?summary[^:\n]*:\s*",
        re.IGNORECASE,
    )
    return lead_in.sub("", answer, count=1).strip()



In [20]:
# Main workflow orchestration

def _split_report_metadata(filename: str, full_text: str):
    """Separate one report's metadata headers from its body; returns (metadata, body)."""
    project_name = "unknown"
    link = "unknown"
    filtered_lines = []
    # "Project Name:" and "Link:" lines are pulled out wherever they occur;
    # every other line is kept as body text.
    for line in full_text.splitlines():
        if line.startswith("Project Name:"):
            project_name = line.split("Project Name:", 1)[1].strip()
        elif line.startswith("Link:"):
            link = line.split("Link:", 1)[1].strip()
        else:
            filtered_lines.append(line)

    # A first line of the form "ID <operation_id> <file_name>" carries the remaining metadata.
    operation_id = "unknown"
    file_name = filename
    if filtered_lines and filtered_lines[0].startswith("ID "):
        header_parts = filtered_lines[0].split()
        if len(header_parts) >= 3:
            operation_id = header_parts[1]
            file_name = header_parts[2].replace('.txt', '')
        filtered_lines = filtered_lines[1:]

    metadata = {
        "Operation ID": operation_id,
        "File Name": file_name,
        "Project Name": project_name,
        "Link": link,
    }
    return metadata, "\n".join(filtered_lines)


def main():
    """
    Run the full workflow: load the universe and the processed reports, classify
    and summarize each report, then write all results to
    `classification_output_file` as JSON.

    A failure while classifying or summarizing one report is recorded in that
    report's entry instead of aborting the run, so results already obtained for
    other reports are still saved.
    """
    # 1) Load & summarize universe
    if not os.path.exists(universe_file):
        print(f"Universe file not found at: {universe_file}")
        return
    universe_data = load_universe(universe_file)
    universe_summary = build_universe_summary(universe_data)

    # 2) Load processed report texts
    if not os.path.exists(processed_reports_folder):
        print(f"Processed reports folder not found at: {processed_reports_folder}")
        return
    reports_data = load_reports(processed_reports_folder)
    print(f"Loaded {len(reports_data)} processed policy reports.")

    classification_results = {}
    # 3) Loop over each report
    for filename, full_text in reports_data.items():
        metadata, report_body = _split_report_metadata(filename, full_text)

        # 4) Classify & summarize the report
        print(f"Classifying report: {filename}")
        try:
            result = classify_report(report_body, universe_summary)
        except Exception as exc:  # keep going; the failure is stored with the report
            result = {"error": f"Classification failed: {type(exc).__name__}: {exc}"}
        if not isinstance(result, dict):
            # Metadata is merged in with dict.update below, so wrap anything else.
            result = {"error": "Classifier returned a non-dict result", "raw_response": result}

        try:
            summary = summarize_document(report_body)
        except Exception as exc:  # keep the classification even if the summary fails
            summary = ""
            result["summary_error"] = f"{type(exc).__name__}: {exc}"
        # Pause between reports to space out API calls.
        time.sleep(0.9)

        # 5) Attach metadata & summary
        result.update({**metadata, "Description": summary})

        classification_results[filename] = result
        print(f"Result for {filename}:\n{json.dumps(result, indent=2)}\n")

    # 6) Save all classifications to JSON
    output_dir = os.path.dirname(classification_output_file)
    if output_dir:
        # Create the destination folder if this is the first run on this machine.
        os.makedirs(output_dir, exist_ok=True)
    with open(classification_output_file, 'w', encoding='utf-8') as f:
        json.dump(classification_results, f, indent=2)
    print(f"Classification results saved to: {classification_output_file}")
    print("All Classification Results:")
    print(json.dumps(classification_results, indent=2))

# Entry point: run the whole workflow when this code is executed directly,
# but not when it is imported as a module.
if __name__ == "__main__":
    main()

Loaded 213 processed policy reports.
Classifying report: ID_P_UNKNOWN_59910.txt
Result for ID_P_UNKNOWN_59910.txt:
{
  "matched_topic": "Poverty Reduction",
  "policy_scores": {
    "Improving access to education": 0,
    "Enhancing education quality": 0,
    "Fostering labor market transitions": 0,
    "Access to quality care": 0,
    "Food security and maternal and child nutrition services": 0,
    "Environmental health determinants and pollution": 0
  },
  "evidence": {},
  "explanation": "This document appears to be a World Bank program document for a poverty reduction support grant in Burkina Faso. While it touches on development issues, it does not provide specific, substantive evidence for any of the listed policy areas. The document seems to be primarily an administrative and financial overview of a development support program, with many technical abbreviations and institutional references. Without more context or detailed text, it is not possible to definitively link the docum