# Citation Evaluation

This notebook evaluates how well citations support scientific claims by:
1. Loading citation data and references from `citation_analysis/citations.json`
2. Using an LLM to determine whether the referenced content supports each claim
3. Evaluating multiple citations holistically when a claim cites more than one reference
4. Saving the evaluation results to `citation_analysis/evaluation.json`

In [None]:
import json
import os
from typing import List, Dict, Optional, Any
from pathlib import Path
from datetime import datetime
from pydantic import BaseModel, Field
from openai import OpenAI

# Read the OpenAI API key from the environment (None when OPENAI_API_KEY is unset)
openai_api_key = os.getenv("OPENAI_API_KEY")

## Define Evaluation Models

In [None]:
class Reference(BaseModel):
    """Reference information"""

    # Integer identifier of the reference (required).
    reference_id: int = Field(
        ...,
        description="The ID of the reference",
    )
    # Text of the reference entry (required).
    reference_text: str = Field(
        ...,
        description="The full text of the reference",
    )

class ClaimEvaluation(BaseModel):
    """Evaluation of whether citations support a claim"""

    # --- The claim and the citations attached to it (all required) ---
    claim_text: str = Field(
        ...,
        description="The text of the claim",
    )
    citation_keys: List[int] = Field(
        ...,
        description="Citation keys supporting this claim",
    )
    citation_text: str = Field(
        ...,
        description="Original citation text (e.g., '$^{1,2}$')",
    )
    references: List[Reference] = Field(
        ...,
        description="Full reference information for citations",
    )

    # --- The verdict (required) ---
    is_adequately_supported: bool = Field(
        ...,
        description="Whether the claim is adequately supported by its citations",
    )
    explanation: str = Field(
        ...,
        description="Explanation of why the citations do or don't support the claim",
    )

    # Optional; defaults to None when no suggestions are supplied.
    suggestions: Optional[List[str]] = Field(
        default=None,
        description="Suggestions for improving the citation support",
    )

class EvaluationResults(BaseModel):
    """Overall evaluation results for a document"""

    # Identifier of the evaluated document (required).
    document_id: str = Field(
        ...,
        description="ID of the document being evaluated",
    )
    # One ClaimEvaluation per evaluated claim (required).
    evaluations: List[ClaimEvaluation] = Field(
        ...,
        description="List of claim evaluations",
    )
    # Timestamp string for when the evaluation was produced (required).
    processed_date: str = Field(
        ...,
        description="Date and time of evaluation",
    )

## Load Citation Data and References

In [None]:
# Parse the citations JSON file into a dictionary
citation_json_path = "citation_analysis/citations.json"

data = json.loads(Path(citation_json_path).read_text(encoding='utf-8'))

# 'document_id' and 'citations' are mandatory keys; 'references' may be absent
document_id = data['document_id']
citations = data['citations']
references = data.get('references', [])

# Map each reference ID to its text for direct lookup
reference_dict = {entry['reference_id']: entry['reference_text'] for entry in references}

print(f"Loaded {len(citations)} citations and {len(references)} references from document: {document_id}")

# Tally claims by how many citation keys each one carries
keys_per_claim = [len(claim['citation_keys']) for claim in citations]
single_citations = sum(count == 1 for count in keys_per_claim)
multi_citations = sum(count > 1 for count in keys_per_claim)
print(f"Single citation claims: {single_citations}")
print(f"Multiple citation claims: {multi_citations}")

## Evaluate Citations

In [None]:
def evaluate_claim(
    claim_data: Dict[str, Any],
    reference_dict: Dict[int, str],
    references_list: List[Dict[str, Any]],
    client: "OpenAI",
    model: str = "gpt-4.1",
) -> "ClaimEvaluation":
    """
    Evaluate if citations support a claim.

    The claim and the text of every cited reference are sent to the LLM in a
    single request, and the structured answer is wrapped in a ClaimEvaluation.
    If the request fails, or the model returns no parsed answer, the error is
    printed and a placeholder evaluation is returned instead of raising, so a
    batch run can continue. The placeholder is marked as NOT adequately
    supported: a claim that could not be evaluated must not be counted as
    supported in downstream statistics.

    Args:
        claim_data: Dictionary with the keys 'claim', 'citation_text' and
            'citation_keys'
        reference_dict: Dictionary mapping reference IDs to reference text
        references_list: List of all reference objects (dictionaries with
            'reference_id' and 'reference_text')
        client: OpenAI client
        model: Model to use for evaluation

    Returns:
        ClaimEvaluation object

    Raises:
        KeyError: If claim_data lacks one of the required keys.
    """
    # Extract claim information
    claim_text = claim_data['claim']
    citation_text = claim_data['citation_text']
    citation_keys = claim_data['citation_keys']

    # Text of every cited reference, numbered in citation order. Keys without
    # an entry in reference_dict get a placeholder so the model can see the gap.
    reference_content = "".join(
        f"Reference {position} (#{key}): "
        f"{reference_dict.get(key, 'Reference content not found')}\n\n"
        for position, key in enumerate(citation_keys, start=1)
    )

    # Full reference objects for the cited keys; keys with no matching entry
    # in references_list are skipped.
    claim_references = []
    for key in citation_keys:
        ref_obj = next((r for r in references_list if r['reference_id'] == key), None)
        if ref_obj:
            claim_references.append(Reference(
                reference_id=ref_obj['reference_id'],
                reference_text=ref_obj['reference_text']
            ))

    # Fields shared by the successful result and the error placeholder
    common_fields = dict(
        claim_text=claim_text,
        citation_keys=citation_keys,
        citation_text=citation_text,
        references=claim_references,
    )

    # Response model for the LLM (passed as the structured-output format)
    class EvaluationResponse(BaseModel):
        is_adequately_supported: bool = Field(..., description="Whether the claim is adequately supported by its citations")
        explanation: str = Field(..., description="Explanation of why the citations do or don't support the claim")
        suggestions: Optional[List[str]] = Field(None, description="Suggestions for improving the citation support")

    # System prompt for claim evaluation
    system_prompt = """
    You are a citation evaluation expert with extensive knowledge of scientific literature and academic standards.
    Your task is to assess whether a scientific claim is adequately supported by its citations.
    
    IMPORTANT: When a claim has multiple citations, evaluate them HOLISTICALLY as a group,
    not just individually. Consider how they work together to support the overall claim.
    """

    # User prompt for evaluation
    user_prompt = f"""
    Evaluate whether this scientific claim is adequately supported by its citations.
    
    CLAIM:
    "{claim_text}"
    
    This claim is supported by {len(citation_keys)} citation(s) appearing as: {citation_text}
    
    CITED REFERENCES:
    {reference_content}
    
    Based on the content of these references, provide an evaluation with:
    1. Is this claim adequately supported by these references? (Yes/No)
    2. Explain your reasoning in detail
    3. If support is inadequate, provide 1-2 suggestions for improvement
    
    If there are multiple references, remember to evaluate them HOLISTICALLY, considering
    how they work together rather than just evaluating them individually.
    """

    try:
        # Use structured output to get evaluation
        completion = client.beta.chat.completions.parse(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            response_format=EvaluationResponse
        )

        message = completion.choices[0].message
        response = message.parsed
        if response is None:
            # No parsed object (e.g. the model refused): report it explicitly
            # rather than failing later with an opaque AttributeError.
            refusal = getattr(message, "refusal", None)
            raise ValueError(f"Model returned no parsed evaluation (refusal: {refusal})")
    except Exception as e:
        # Deliberately broad: one failing claim must not abort the whole run.
        print(f"Error evaluating claim: {e}")

        # Placeholder result; flagged as unsupported so the failure stays
        # visible instead of inflating the adequately-supported count.
        return ClaimEvaluation(
            is_adequately_supported=False,
            explanation=f"Error evaluating claim: {str(e)}",
            **common_fields,
        )

    # Create the full evaluation object from the LLM verdict
    return ClaimEvaluation(
        is_adequately_supported=response.is_adequately_supported,
        explanation=response.explanation,
        suggestions=response.suggestions,
        **common_fields,
    )

def evaluate_all_claims(
    citations: List[Dict[str, Any]],
    reference_dict: Dict[int, str],
    references_list: List[Dict[str, Any]],
    api_key: str,
    output_path: str = "citation_analysis/evaluation.json",
    document_id: Optional[str] = None,
) -> "EvaluationResults":
    """
    Evaluate all claims and save results to JSON.

    Claims are evaluated one at a time with evaluate_claim. After every 10th
    claim the evaluations collected so far are written to a sibling file whose
    name is the output file name with "_partial" inserted before the suffix.
    The complete results are written to output_path at the end.

    Args:
        citations: List of citation dictionaries (each needs at least 'claim')
        reference_dict: Dictionary mapping reference IDs to reference text
        references_list: List of all reference objects
        api_key: OpenAI API key
        output_path: Path to save evaluation results
        document_id: ID recorded in the saved results. When omitted, the
            module-level `document_id` variable is used, which keeps existing
            callers working.

    Returns:
        EvaluationResults object with all evaluations

    Raises:
        NameError: If document_id is omitted and no module-level
            `document_id` exists.
    """
    # Resolve the document ID up front so a missing value fails before any
    # API calls are made, not at the first save.
    if document_id is None:
        try:
            document_id = globals()["document_id"]
        except KeyError:
            raise NameError("name 'document_id' is not defined") from None

    # Derive the partial-results path from the file name only, so ".json"
    # elsewhere in the path is untouched and a non-.json suffix still yields
    # a distinct file.
    final_path = Path(output_path)
    partial_path = final_path.with_name(f"{final_path.stem}_partial{final_path.suffix}")

    # Make sure the destination directory exists before spending API calls
    final_path.parent.mkdir(parents=True, exist_ok=True)

    def save_results(target, collected):
        """Write `collected` to `target` as an EvaluationResults JSON document."""
        snapshot = EvaluationResults(
            document_id=document_id,
            evaluations=collected,
            processed_date=datetime.now().isoformat()
        )
        target.write_text(snapshot.model_dump_json(indent=2), encoding='utf-8')
        return snapshot

    # Create OpenAI client
    client = OpenAI(api_key=api_key)

    # Evaluate each claim
    evaluations = []
    total_claims = len(citations)

    for index, claim in enumerate(citations, start=1):
        # Show at most the first 80 characters of the claim in the progress line
        claim_text = claim['claim']
        preview = f"{claim_text[:80]}..." if len(claim_text) > 80 else claim_text
        print(f"Evaluating claim {index}/{total_claims}: {preview}")

        # Evaluate the claim
        evaluation = evaluate_claim(claim, reference_dict, references_list, client)
        evaluations.append(evaluation)

        # Show abbreviated result
        result = "Adequately supported" if evaluation.is_adequately_supported else "Inadequately supported"
        print(f"Result: {result}\n")

        # Save intermediate results every 10 claims so an interrupted run
        # keeps the work done so far
        if index % 10 == 0:
            save_results(partial_path, evaluations)
            print(f"Saved intermediate results ({index}/{total_claims} claims) to {partial_path}")

    # Create and save the final results object
    results = save_results(final_path, evaluations)

    print(f"\nEvaluation complete. All results saved to {output_path}")
    return results

## Run Evaluation on All Claims

In [None]:
# Run the evaluation only when an OpenAI API key is available
if openai_api_key:
    # Make sure the output directory is present before results are written
    Path("citation_analysis").mkdir(exist_ok=True)

    # Evaluate every claim; the results are also written to disk
    results = evaluate_all_claims(citations, reference_dict, references, openai_api_key)
else:
    print("Warning: No OpenAI API key provided. Set the OPENAI_API_KEY environment variable.")

## Analyze Results

In [None]:
def analyze_results(results: "EvaluationResults") -> None:
    """
    Print summary statistics for a set of claim evaluations.

    Reports the overall share of adequately supported claims, the same share
    split by single-citation and multi-citation claims, and how many
    improvement suggestions were produced. Every rate is reported as 0.0%
    when its group is empty, so an empty result set does not raise.

    Args:
        results: Object whose `evaluations` attribute is a list of items
            exposing `is_adequately_supported`, `citation_keys` and
            `suggestions`.
    """
    evaluations = results.evaluations

    def fraction(part: int, whole: int) -> float:
        """Return part / whole, or 0.0 for an empty group."""
        return part / whole if whole else 0.0

    def supported_count(items) -> int:
        """Count the items marked as adequately supported."""
        return sum(1 for item in items if item.is_adequately_supported)

    # Basic statistics
    total = len(evaluations)
    adequately_supported = supported_count(evaluations)
    inadequately_supported = total - adequately_supported

    print(f"Total claims evaluated: {total}")
    print(f"Adequately supported: {adequately_supported} ({fraction(adequately_supported, total):.1%})")
    print(f"Inadequately supported: {inadequately_supported} ({fraction(inadequately_supported, total):.1%})")

    # Single vs multiple citation comparison
    single_citation_claims = [e for e in evaluations if len(e.citation_keys) == 1]
    multi_citation_claims = [e for e in evaluations if len(e.citation_keys) > 1]

    # Calculate adequacy rates
    single_citation_adequacy = fraction(supported_count(single_citation_claims), len(single_citation_claims))
    multi_citation_adequacy = fraction(supported_count(multi_citation_claims), len(multi_citation_claims))

    print(f"\nSingle citation claims: {len(single_citation_claims)}")
    print(f"Adequately supported: {single_citation_adequacy:.1%}")

    print(f"\nMultiple citation claims: {len(multi_citation_claims)}")
    print(f"Adequately supported: {multi_citation_adequacy:.1%}")

    # Suggestions count: None and empty lists both count as "no suggestions"
    suggestion_counts = [len(e.suggestions) for e in evaluations if e.suggestions]
    claims_with_suggestions = len(suggestion_counts)
    total_suggestions = sum(suggestion_counts)

    print(f"\nClaims with improvement suggestions: {claims_with_suggestions}")
    print(f"Total improvement suggestions: {total_suggestions}")

# When a saved evaluation file is present, parse it and print the summary
evaluation_path = "citation_analysis/evaluation.json"
if Path(evaluation_path).exists():
    saved_results = json.loads(Path(evaluation_path).read_text(encoding='utf-8'))

    # Rebuild the typed results object from the parsed JSON before analyzing it
    results = EvaluationResults.model_validate(saved_results)
    analyze_results(results)