# **Install dependencies**

In [None]:
pip install jsonlines 'smolagents[litellm]'

# **Input API keys**

In [None]:
# `os` is imported here because this cell runs before the main import cell;
# without it, executing the notebook top to bottom raises NameError.
import os

# Placeholder credentials read from the environment by the LLM client.
# Replace each value with your own key before running, and never commit real keys.
os.environ["FIREWORKS_API_KEY"] = "YOUR-FIREWORKS-API-KEY"
os.environ["OPENAI_API_KEY"] = "YOUR-OPENAI-API-KEY"
os.environ["ANTHROPIC_API_KEY"] = "ANTHROPIC-API-KEY"

# **Run Evaluation on LLM of choice**

In [None]:
import json
import pandas as pd
import jsonlines
from tqdm import tqdm
import os
import time
from litellm import completion
from litellm.exceptions import RateLimitError

# Characters stripped from around the first word of a reply before matching it,
# so that answers such as "Yes.", "'no'" or "**yes**" are still recognised.
_ANSWER_STRIP_CHARS = "\"'`.,;:!?*()[]{}<>"

# Column order of the results DataFrame (kept even when no rows were produced).
_RESULT_COLUMNS = [
    'charter_id',
    'question',
    'expected_answer',
    'model_response',
    'prediction',
    'is_correct',
    'token_usage',
    'retries',
]


def _parse_yes_no(model_response):
    """Map a stripped, lower-cased model reply to 'Yes' or 'No' (default 'No')."""
    words = model_response.split()
    if not words:
        # Empty reply: nothing to interpret, fall back to the default answer.
        return 'No'
    first_word = words[0].strip(_ANSWER_STRIP_CHARS).capitalize()
    return first_word if first_word in ('Yes', 'No') else 'No'


def _build_prompt(question, charter_text):
    """Build the single-turn Yes/No prompt for one question and its charter text."""
    return f"""Based on the charter text below, please answer the following question with ONLY 'Yes' or 'No'.

Question: {question}

Charter Text: {charter_text}

Your response must be EXACTLY ONE WORD: either 'Yes' or 'No'. No additional text or explanation is allowed."""


def evaluate_chancery(jsonl_path, csv_path, model="fireworks_ai/llama-v3p3-70b-instruct",
                                 max_items=502, max_tokens=10, temperature=0.0, max_retries=5):
    """
    Evaluates an LLM on the CHANCERY benchmark by matching the charter id with the
    corresponding charter text and querying an LLM via litellm.
    Includes handling for rate limits and time estimation.

    Each JSONL item must provide 'question', 'answer' and 'charter_id'; the CSV must
    provide 'charter_id' and 'text' columns. Items whose charter_id has no matching
    CSV row are reported and skipped (they produce no result row).

    Scoring rules:
    - The first word of the reply, with surrounding punctuation removed, is the
      prediction; anything other than 'Yes'/'No' (including an empty reply)
      defaults to 'No'.
    - If the request fails (rate limit retries exhausted, or any other exception),
      the item is recorded with prediction 'No' and scored like any other
      prediction, so the returned accuracy always agrees with the 'is_correct'
      column of the returned DataFrame.

    Args:
    - jsonl_path (str): Path to the JSONL file with questions and answers
    - csv_path (str): Path to the CSV file with charter text
    - model (str): LLM model to use (default: "fireworks_ai/llama-v3p3-70b-instruct")
    - max_items (int): Maximum number of items to process
    - max_tokens (int): Maximum tokens in model response
    - temperature (float): Temperature setting for model (0.0 = more deterministic)
    - max_retries (int): Maximum number of retry attempts on rate limit errors

    Returns:
    - tuple: (results dataframe, accuracy percentage)
    """
    # Read the CSV file
    print(f"Loading charter texts from {csv_path}...")
    charter_texts = pd.read_csv(csv_path, on_bad_lines='skip')
    print(f"Loaded {len(charter_texts)} charter entries")

    results = []

    print(f"Using model: {model}")

    # Count total items for progress bar (up to max_items)
    with jsonlines.open(jsonl_path) as reader:
        total_items = sum(1 for _ in reader if _ is not None)
    total_items = min(total_items, max_items)
    print(f"Processing up to {total_items} questions from {jsonl_path}")

    # Both the reader and the progress bar are context-managed so they are
    # closed even if an unexpected exception escapes the loop.
    with jsonlines.open(jsonl_path) as reader, \
            tqdm(total=total_items, desc="Processing Charters", unit="charter") as pbar:
        for i, item in enumerate(reader):
            if i >= max_items:
                break

            question = item['question']
            expected_answer = item['answer']
            charter_id = item['charter_id']

            # Find matching charter text
            matching_text = charter_texts[charter_texts['charter_id'] == charter_id]['text'].values

            if len(matching_text) == 0:
                print(f"No matching text found for charter_id: {charter_id}")
                # Advance the bar so it can reach its total; no result row is added.
                pbar.update(1)
                continue

            prompt = _build_prompt(question, matching_text[0])

            # Query the model with retry logic for rate limits
            retry_count = 0
            backoff_time = 2  # starting backoff time in seconds

            while True:
                try:
                    response = completion(
                        model=model,
                        messages=[{"content": prompt, "role": "user"}],
                        max_tokens=max_tokens,
                        temperature=temperature
                    )

                    # A missing (None) reply is treated like an empty one.
                    content = response.choices[0].message.content
                    model_response = (content or "").strip().lower()
                    prediction = _parse_yes_no(model_response)
                    token_usage = response.usage
                    break

                except RateLimitError as e:
                    retry_count += 1
                    if retry_count > max_retries:
                        print(f"Failed after {max_retries} retries for charter {charter_id}: {e}")
                        model_response = "RATE_LIMIT_ERROR"
                        prediction = "No"  # Default to No on failure
                        token_usage = None
                        break

                    wait_time = backoff_time * (2 ** (retry_count - 1))  # Exponential backoff
                    print(f"Rate limit hit for charter {charter_id}. Retrying in {wait_time} seconds... (Attempt {retry_count}/{max_retries})")
                    time.sleep(wait_time)

                except Exception as e:
                    # Best-effort evaluation: record the failure and move on
                    # rather than aborting the whole run.
                    print(f"Error processing charter {charter_id}: {e}")
                    print(f"Exception details: {type(e).__name__}: {e}")
                    model_response = f"ERROR: {type(e).__name__}"
                    prediction = "No"  # Default to No on failure
                    token_usage = None
                    break

            results.append({
                'charter_id': charter_id,
                'question': question,
                'expected_answer': expected_answer,
                'model_response': model_response,
                'prediction': prediction,
                'is_correct': prediction == expected_answer,
                'token_usage': token_usage,
                'retries': retry_count
            })
            pbar.update(1)

    # Convert results to DataFrame; explicit columns keep the schema when empty
    results_df = pd.DataFrame(results, columns=_RESULT_COLUMNS)

    # Derive the counters from the recorded rows so the summary, the returned
    # accuracy and the DataFrame can never disagree.
    total_processed = len(results)
    correct_predictions = sum(1 for row in results if row['is_correct'])
    total_retries = sum(row['retries'] for row in results)

    # Calculate accuracy
    accuracy = (correct_predictions / total_processed) * 100 if total_processed > 0 else 0

    # Print summary statistics
    print("\nResults Summary:")
    print(f"Model used: {model}")
    print(f"Processed {total_processed} charters with {correct_predictions} correct predictions")
    print(f"Accuracy: {accuracy:.2f}%")
    print(f"Total retries due to rate limits: {total_retries}")

    return results_df, accuracy

def setup_api_keys(fireworks_key=None, openai_key=None, anthropic_key=None):
    """Export the given provider API keys as environment variables.

    Each truthy key is written to its provider's environment variable
    (FIREWORKS_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY); falsy values
    (None, empty string) leave the environment untouched. A one-line summary
    of what was configured is printed. Returns None.
    """
    # (display name, environment variable, supplied value), in reporting order.
    providers = (
        ("Fireworks", "FIREWORKS_API_KEY", fireworks_key),
        ("OpenAI", "OPENAI_API_KEY", openai_key),
        ("Anthropic", "ANTHROPIC_API_KEY", anthropic_key),
    )

    configured = []
    for label, env_var, value in providers:
        if not value:
            continue
        os.environ[env_var] = value
        configured.append(label)

    if not configured:
        print("No API keys provided. Please ensure keys are set in your environment variables.")
        return

    print(f"API keys set for: {', '.join(configured)}")

if __name__ == "__main__":
    # Run settings for a quick trial; edit here or replace with command-line arguments.
    config = dict(
        jsonl_path='CHANCERY benchmark.jsonl',
        csv_path='charters.csv',
        # Any model identifier supported by LiteLLM can be used here.
        model='fireworks_ai/llama-v3p3-70b-instruct',
        # Kept small for a smoke test; raise it for a full benchmark run.
        max_items=10,
        output_file='charter_analysis_results.csv',
    )

    # If the API keys are not already in the environment, set them first, e.g.:
    # setup_api_keys(
    #     fireworks_key="your_fireworks_key",
    #     openai_key="your_openai_key",
    #     anthropic_key="your_anthropic_key"
    # )

    # Everything except the output path is an argument of the evaluation itself.
    eval_args = {key: value for key, value in config.items() if key != 'output_file'}
    results_df, accuracy = evaluate_chancery(**eval_args)

    # Persist the per-question results as CSV (without the index column).
    results_df.to_csv(config['output_file'], index=False)
    print(f"Results saved to {config['output_file']}")