# Constitutional Law LLM Evaluation

This notebook provides comprehensive evaluation tools for the trained Constitutional Law LLM.

## Overview
- Model performance evaluation
- Generation quality analysis
- Legal reasoning assessment
- Comparative analysis across parameters

## Setup

In [None]:
import os
import sys
import json
import torch
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Make the project's source directory importable. The entry is a relative
# path, so it is resolved against the kernel's current working directory
# (presumably the directory containing this notebook - verify if imports fail).
sys.path.append('../src')

# Project-local modules; these are expected to resolve via the '../src'
# entry added above, so the append must run before these imports.
from config import config
from model_utils import ModelManager
from model_training import ConstitutionalLawTrainer

# Same approach for the evaluation helpers, which are imported from
# the sibling '../evaluation' directory.
sys.path.append('../evaluation')
from generation_analysis import evaluate_model, run_generation_analysis, load_test_cases

# Report the runtime environment so GPU availability is visible before
# any model is loaded.
print("Evaluation setup complete!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

## Load Test Cases

Load and inspect the test cases for evaluation:

In [None]:
# Read the evaluation test cases from disk
test_file = "../evaluation/test_cases.json"
test_cases = load_test_cases(test_file)

print(f"Loaded {len(test_cases)} test cases")

# Preview the first case so the expected record layout is visible.
# Missing fields fall back to a placeholder instead of raising.
if test_cases:
    first_case = test_cases[0]
    print("\nExample test case:")
    print(f"ID: {first_case.get('id', 'N/A')}")

    # (label, record key, maximum number of characters shown)
    preview_fields = (
        ("Facts", "facts", 150),
        ("Question", "question", 100),
        ("Reference", "reference", 150),
    )
    for label, key, limit in preview_fields:
        print(f"{label}: {first_case.get(key, '')[:limit]}...")

## Model Evaluation

Evaluate the trained models:

In [None]:
# Candidate model directories, relative to the notebook's working directory
model_paths = [
    "../models/constitutional_law_trained",
    "../models/lora_finetuned2",
    "../models/lora_finetuned3",
]

# Keep only the candidates that are present on disk, reporting each one
available_models = []
for candidate in model_paths:
    is_present = os.path.exists(candidate)
    if is_present:
        available_models.append(candidate)
    print(f"✓ Found model: {candidate}" if is_present else f"✗ Model not found: {candidate}")

print(f"\nEvaluating {len(available_models)} available models...")

In [None]:
# Evaluate each available model.
# evaluation_results maps model path -> result dict from evaluate_model,
# or -> {"error": "<message>"} when the evaluation raised.
evaluation_results = {}

for model_path in available_models:
    print(f"\nEvaluating model: {model_path}")

    try:
        # Run basic evaluation
        results = evaluate_model(
            model_path=model_path,
            test_file=test_file,
            base_model_name=config.model.base_model_name
        )

        evaluation_results[model_path] = results

        # Print summary
        print(f"  Total cases: {results['summary']['total_cases']}")
        print(f"  Overall score: {results['summary']['overall_score']:.3f}")
        print(f"  Average F1: {results['aggregate_metrics'].get('avg_f1', 0):.3f}")
        print(f"  Average accuracy: {results['aggregate_metrics'].get('avg_accuracy', 0):.3f}")

    except Exception as e:
        # Deliberately broad: one failing model must not abort the remaining
        # evaluations. Note that a failure while printing the summary above
        # also lands here and replaces the stored results with an error entry.
        print(f"  Error evaluating {model_path}: {e}")
        evaluation_results[model_path] = {"error": str(e)}

# Failed runs are stored as {"error": ...}, so report successes separately
# instead of counting every attempted model as completed.
failed_models = [name for name, outcome in evaluation_results.items() if "error" in outcome]
succeeded_count = len(evaluation_results) - len(failed_models)

print(f"\nEvaluation completed for {succeeded_count} of {len(evaluation_results)} models")
if failed_models:
    print(f"Failed models: {', '.join(failed_models)}")

## Results Analysis

Analyze and visualize the evaluation results:

In [None]:
# Build one comparison row per successfully evaluated model
comparison_data = []

# Display column -> key in the result's 'aggregate_metrics' dict
metric_columns = {
    'F1 Score': 'avg_f1',
    'Accuracy': 'avg_accuracy',
    'Legal Term Coverage': 'avg_legal_term_coverage',
    'Constitutional Framework': 'avg_constitutional_framework',
    'Reasoning Structure': 'avg_reasoning_structure',
}

for model_path, results in evaluation_results.items():
    # Failed evaluations carry only an "error" entry and are skipped
    if "error" in results:
        continue

    aggregate = results['aggregate_metrics']
    row = {
        'Model': os.path.basename(model_path),
        'Overall Score': results['summary']['overall_score'],
    }
    # Metrics missing from the aggregate are shown as 0
    row.update({column: aggregate.get(key, 0) for column, key in metric_columns.items()})
    comparison_data.append(row)

if not comparison_data:
    print("No successful evaluations to compare")
else:
    df_comparison = pd.DataFrame(comparison_data)
    print("Model Comparison:")
    print(df_comparison.round(3))

    model_labels = df_comparison['Model']

    # 2x2 grid of bar charts, drawn in panel order
    plt.figure(figsize=(14, 8))

    # Panel 1: overall score per model
    plt.subplot(2, 2, 1)
    plt.bar(model_labels, df_comparison['Overall Score'])
    plt.title('Overall Model Performance')
    plt.ylabel('Score')
    plt.xticks(rotation=45)

    # Panel 2: F1 and accuracy side by side, offset by half a bar width
    plt.subplot(2, 2, 2)
    positions = range(len(df_comparison))
    bar_width = 0.35
    left_edges = [pos - bar_width/2 for pos in positions]
    right_edges = [pos + bar_width/2 for pos in positions]
    plt.bar(left_edges, df_comparison['F1 Score'], bar_width, label='F1 Score')
    plt.bar(right_edges, df_comparison['Accuracy'], bar_width, label='Accuracy')
    plt.title('F1 Score vs Accuracy')
    plt.ylabel('Score')
    plt.xticks(positions, model_labels, rotation=45)
    plt.legend()

    # Panels 3 and 4: single-metric charts for the legal-specific scores
    single_metric_panels = (
        (3, 'Legal Term Coverage', 'Legal Term Coverage'),
        (4, 'Constitutional Framework', 'Constitutional Framework Score'),
    )
    for slot, column, panel_title in single_metric_panels:
        plt.subplot(2, 2, slot)
        plt.bar(model_labels, df_comparison[column])
        plt.title(panel_title)
        plt.ylabel('Score')
        plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

    # The best model is the row with the highest overall score
    best_row_index = df_comparison['Overall Score'].idxmax()
    best_model = df_comparison.loc[best_row_index]
    print(f"\nBest performing model: {best_model['Model']}")
    print(f"Overall score: {best_model['Overall Score']:.3f}")

## Sample Responses

Show sample responses from the best model:

In [None]:
# Print a few generated answers from the top-scoring model
if not comparison_data:
    print("No models to show responses from")
else:
    top_row_index = df_comparison['Overall Score'].idxmax()
    best_model_name = df_comparison.loc[top_row_index, 'Model']

    # The comparison table stores only the directory name, so map it back
    # to the first full path with that basename
    best_model_path = next(
        (candidate for candidate in available_models
         if os.path.basename(candidate) == best_model_name),
        None,
    )

    if not best_model_path:
        print("Best model path not found")
    else:
        divider = "-" * 60
        print(f"Sample responses from best model: {best_model_name}")
        print("=" * 60)

        # Per-case records produced by the evaluation run
        detailed_results = evaluation_results[best_model_path]['detailed_results']

        # Limit the output to the first three cases, truncating long text
        for number, record in enumerate(detailed_results[:3], start=1):
            print(f"\nExample {number}:")
            print(f"Case ID: {record['case_id']}")
            print(f"Facts: {record['facts'][:200]}...")
            print(f"Question: {record['question']}")
            print(f"Generated Response: {record['generated'][:300]}...")
            print(f"Reference Response: {record['reference'][:300]}...")
            print(f"F1 Score: {record['quality_metrics'].get('f1', 0):.3f}")
            print(divider)

## Generation Parameter Analysis

Analyze how different generation parameters affect output quality:

In [None]:
# Run generation analysis on the best model (if available)
if comparison_data and available_models:
    # Derive the best model's name from the comparison table here rather than
    # relying on a variable set by another cell, so this cell runs correctly
    # whenever df_comparison exists.
    best_model_name = df_comparison.loc[df_comparison['Overall Score'].idxmax(), 'Model']

    # The table stores only the directory name; map it back to the full path
    best_model_path = next(
        (path for path in available_models if os.path.basename(path) == best_model_name),
        None,
    )

    if best_model_path:
        print(f"Running generation analysis on: {best_model_name}")

        # NOTE: run_generation_analysis is given the test file path, so it is
        # handed the complete test file; no subset of cases is selected here.
        try:
            generation_results = run_generation_analysis(
                model_path=best_model_path,
                test_file=test_file
            )

            print("\nGeneration Analysis Results:")
            print(f"Total parameter combinations tested: {generation_results['summary']['total_combinations']}")
            print(f"Best score: {generation_results['summary']['best_score']:.3f}")
            print(f"Average score: {generation_results['summary']['average_score']:.3f}")

            print(f"\nBest generation parameters:")
            best_params = generation_results['best_params']['params']
            for param, value in best_params.items():
                print(f"  {param}: {value}")

        except Exception as e:
            # Best-effort step: a failure here is reported and the notebook
            # continues to the summary.
            print(f"Error in generation analysis: {e}")
            print("Skipping generation parameter analysis")
    else:
        print("Best model path not found for generation analysis")
else:
    print("No models available for generation analysis")

## Summary and Recommendations

Provide a summary of the evaluation results and recommendations:

In [None]:
print("=== EVALUATION SUMMARY ===")

if not comparison_data:
    print("No successful evaluations completed")
    print("Check model paths and dependencies")
else:
    top_score = best_model['Overall Score']

    print(f"Models evaluated: {len(comparison_data)}")
    print(f"Best model: {best_model['Model']}")
    print(f"Best overall score: {top_score:.3f}")
    print(f"Best F1 score: {best_model['F1 Score']:.3f}")

    # Mean overall score across all successfully evaluated models
    mean_score = df_comparison['Overall Score'].mean()
    print(f"Average model score: {mean_score:.3f}")

    # Classify the best model's overall score into three bands
    if top_score > 0.7:
        verdict = "✓ Strong performance - Model shows good constitutional law reasoning"
    elif top_score > 0.5:
        verdict = "⚠ Moderate performance - Consider additional training or parameter tuning"
    else:
        verdict = "⚠ Weak performance - Significant improvements needed"
    print(verdict)

    print("\n=== RECOMMENDATIONS ===")

    # Metric-specific advice, printed when the best model scores below 0.6
    targeted_advice = (
        ('Legal Term Coverage', "- Improve legal terminology coverage in training data"),
        ('Constitutional Framework', "- Enhance constitutional framework understanding"),
        ('Reasoning Structure', "- Improve logical reasoning structure in responses"),
    )
    for column, advice in targeted_advice:
        if best_model[column] < 0.6:
            print(advice)

    # Advice that applies regardless of the scores
    general_advice = (
        "- Consider hyperparameter optimization for better performance",
        "- Evaluate on additional test cases for robustness",
        "- Monitor for bias in constitutional interpretation",
        "- Consider ensemble methods for improved accuracy",
    )
    for advice in general_advice:
        print(advice)

print("\n=== NEXT STEPS ===")
next_steps = (
    "1. Use best model for deployment or further fine-tuning",
    "2. Collect additional training data if performance is insufficient",
    "3. Implement the recommended generation parameters",
    "4. Consider specialized evaluation metrics for legal reasoning",
)
for step in next_steps:
    print(step)

print("\nEvaluation completed!")