## Effibench Dataset 

In [1]:
!pip install datasets



In [2]:
# Cell 1: Load the EffiBench Dataset
from datasets import load_dataset
import pandas as pd

# Load the EffiBench dataset
print("Loading EffiBench dataset...")
dataset = load_dataset("DONG19/EffiBench", split="train")

# Basic dataset info
print(f"Dataset size: {len(dataset)}")
print(f"Dataset features: {list(dataset.features.keys())}")

# Convert to pandas for easier exploration
df = dataset.to_pandas()
print(f"DataFrame shape: {df.shape}")
print("\nColumn names:")
for col in df.columns:
    print(f"  - {col}")

  from .autonotebook import tqdm as notebook_tqdm


Loading EffiBench dataset...
Dataset size: 1000
Dataset features: ['problem_idx', 'task_name', 'description', 'markdown_description', 'canonical_solution', 'test_case_generator', 'test_case']
DataFrame shape: (1000, 7)

Column names:
  - problem_idx
  - task_name
  - description
  - markdown_description
  - canonical_solution
  - test_case_generator
  - test_case


In [3]:
# Cell 2: Explore Test Case Format
import json

# Look at a few sample problems
print("=== SAMPLE PROBLEM EXAMINATION ===")
sample_idx = 0

print(f"\n--- Problem {sample_idx}: {df.iloc[sample_idx]['task_name']} ---")
print(f"Description: {df.iloc[sample_idx]['description'][:200]}...")

print("\n--- Canonical Solution ---")
print(df.iloc[sample_idx]['canonical_solution'][:300] + "...")

print("\n--- Test Case Generator ---")
print(df.iloc[sample_idx]['test_case_generator'][:300] + "...")

print("\n--- Test Cases ---")
test_case_raw = df.iloc[sample_idx]['test_case']
print(f"Type: {type(test_case_raw)}")
print(f"Content preview: {str(test_case_raw)[:500]}...")

# Try to parse test cases if they're JSON
try:
    if isinstance(test_case_raw, str):
        test_cases_parsed = json.loads(test_case_raw)
        print(f"\nParsed as JSON - Type: {type(test_cases_parsed)}")
        if isinstance(test_cases_parsed, list):
            print(f"Number of test cases: {len(test_cases_parsed)}")
            if len(test_cases_parsed) > 0:
                print(f"First test case: {test_cases_parsed[0]}")
                print(f"First test case keys: {list(test_cases_parsed[0].keys()) if isinstance(test_cases_parsed[0], dict) else 'Not a dict'}")
except Exception as e:
    print(f"JSON parsing failed: {e}")
    
# Let's also check a few more samples
print(f"\n=== CHECKING TEST CASE FORMATS ACROSS SAMPLES ===")
for i in range(min(3, len(df))):
    test_case = df.iloc[i]['test_case']
    print(f"Problem {i} ({df.iloc[i]['task_name']}): {type(test_case)} - {str(test_case)[:100]}...")

=== SAMPLE PROBLEM EXAMINATION ===

--- Problem 0: Longest Substring Without Repeating Characters ---
Description: 

<p>Given a string <code>s</code>, find the length of the <strong>longest</strong> <span data-keyword="substring-nonempty"><strong>substring</strong></span> without repeating characters.</p>

<p>&nbs...

--- Canonical Solution ---
class Solution:
    def lengthOfLongestSubstring(self, s: str) -> int:
        ss = set()
        i = ans = 0
        for j, c in enumerate(s):
            while c in ss:
                ss.remove(s[i])
                i += 1
            ss.add(c)
            ans = max(ans, j - i + 1)
        return...

--- Test Case Generator ---

import random

class Solution:
    def lengthOfLongestSubstring(self, s: str) -> int:
        ss = set()
        i = ans = 0
        for j, c in enumerate(s):
            while c in ss:
                ss.remove(s[i])
                i += 1
            ss.add(c)
            ans = max(ans, j - i + 1...

--- Test Cases 

In [4]:
# Cell 3: Parse Assert Statements to Extract Test Data
import re
import ast

def parse_assert_statements(test_case_string):
    """
    Parse assert statements to extract inputs and expected outputs
    Format: assert solution.method_name(args) == expected_output
    """
    test_cases = []
    
    # Split into individual assert statements
    assert_lines = [line.strip() for line in test_case_string.split('\n') if line.strip().startswith('assert')]
    
    for i, line in enumerate(assert_lines):
        try:
            # Use regex to extract method call and expected output
            # Pattern: assert solution.method_name(args) == expected_output
            pattern = r'assert solution\.(\w+)\((.*?)\) == (.+)'
            match = re.search(pattern, line)
            
            if match:
                method_name = match.group(1)
                args_str = match.group(2)
                expected_output_str = match.group(3)
                
                # Parse arguments using ast.literal_eval for safety
                try:
                    if args_str.strip() == '':
                        inputs = []
                    else:
                        # Try to parse as tuple of arguments
                        inputs = ast.literal_eval(f"({args_str},)" if ',' not in args_str else f"({args_str})")
                        if isinstance(inputs, tuple) and len(inputs) == 1:
                            inputs = inputs[0]  # Single argument
                        else:
                            inputs = list(inputs)  # Multiple arguments
                except:
                    # If literal_eval fails, treat as single string argument
                    inputs = args_str.strip("'\"")
                
                # Parse expected output
                try:
                    expected_output = ast.literal_eval(expected_output_str)
                except:
                    expected_output = expected_output_str.strip("'\"")
                
                test_cases.append({
                    'test_id': f't{i+1}',
                    'method_name': method_name,
                    'inputs': inputs,
                    'expected_output': expected_output,
                    'original_line': line
                })
                
        except Exception as e:
            print(f"Failed to parse line: {line}")
            print(f"Error: {e}")
    
    return test_cases

# Test the parser on sample problems
print("=== PARSING TEST CASES ===")
for idx in range(min(3, len(df))):
    problem_name = df.iloc[idx]['task_name']
    test_case_str = df.iloc[idx]['test_case']
    
    print(f"\n--- Problem {idx}: {problem_name} ---")
    parsed_cases = parse_assert_statements(test_case_str)
    
    print(f"Parsed {len(parsed_cases)} test cases:")
    for case in parsed_cases[:3]:  # Show first 3
        print(f"  {case['test_id']}: inputs={case['inputs']}, expected={case['expected_output']}")
    
    if len(parsed_cases) > 3:
        print(f"  ... and {len(parsed_cases) - 3} more test cases")

=== PARSING TEST CASES ===

--- Problem 0: Longest Substring Without Repeating Characters ---
Parsed 100 test cases:
  t1: inputs=, expected=0
  t2: inputs=krLKl6F, expected=7
  t3: inputs=p2Cn3Y6, expected=7
  ... and 97 more test cases

--- Problem 1: Median of Two Sorted Arrays ---
Parsed 100 test cases:
  t1: inputs=[[20, 67], [37, 85]], expected=52.0
  t2: inputs=[[1, 2, 13, 22, 34, 46, 63, 86], [59, 80]], expected=40.0
  t3: inputs=[[8, 57, 82, 87], [8, 18, 20, 23, 40, 41, 54, 63, 72, 93]], expected=47.5
  ... and 97 more test cases

--- Problem 2: Regular Expression Matching ---
Parsed 100 test cases:
  t1: inputs=['xneafi', '.asrgzjwjjxxoho'], expected=False
  t2: inputs=['ggmxwwkbebouidkhdya', 'rhe'], expected=False
  t3: inputs=['dyz', 'biragpervxyvwagor'], expected=False
  ... and 97 more test cases


In [5]:
# Cell 4: Extract Function Names and Prepare Solutions
import re

def extract_function_name_from_solution(canonical_solution):
    """Extract the main function name from the canonical solution"""
    # Look for class methods first
    class_method_pattern = r'def\s+(\w+)\s*\('
    matches = re.findall(class_method_pattern, canonical_solution)
    
    if matches:
        # Filter out __init__ and other special methods
        methods = [m for m in matches if not m.startswith('__')]
        if methods:
            return methods[0]  # Return first non-special method
    
    return None

def create_wrapper_function(canonical_solution, function_name):
    """
    Create a top-level wrapper function for jouletrace.
    Jouletrace needs a top-level function, but EffiBench has class methods.
    """
    wrapper_code = f"""# Original solution
{canonical_solution}

# Wrapper function for jouletrace
def solve(*args, **kwargs):
    solution = Solution()
    return solution.{function_name}(*args, **kwargs)
"""
    return wrapper_code

# Test solution preparation
print("=== PREPARING SOLUTIONS FOR JOULETRACE ===")

for idx in range(min(3, len(df))):
    problem_name = df.iloc[idx]['task_name']
    canonical_solution = df.iloc[idx]['canonical_solution']
    
    print(f"\n--- Problem {idx}: {problem_name} ---")
    
    # Extract function name
    func_name = extract_function_name_from_solution(canonical_solution)
    print(f"Detected function name: {func_name}")
    
    if func_name:
        # Create wrapper
        wrapped_solution = create_wrapper_function(canonical_solution, func_name)
        print("Wrapper function created:")
        print(wrapped_solution[:200] + "...")
        
        # Also show what the test case parsing found for method name
        test_case_str = df.iloc[idx]['test_case']
        parsed_cases = parse_assert_statements(test_case_str)
        if parsed_cases:
            test_method_name = parsed_cases[0]['method_name']
            print(f"Test case method name: {test_method_name}")
            print(f"Names match: {func_name == test_method_name}")
    else:
        print("Could not extract function name")

# Let's also check the distribution of function names across the dataset
print(f"\n=== FUNCTION NAME ANALYSIS ===")
function_names = []
for idx in range(len(df)):
    canonical_solution = df.iloc[idx]['canonical_solution']
    func_name = extract_function_name_from_solution(canonical_solution)
    if func_name:
        function_names.append(func_name)

from collections import Counter
name_counts = Counter(function_names)
print(f"Total problems with extractable function names: {len(function_names)}/{len(df)}")
print("Most common function names:")
for name, count in name_counts.most_common(10):
    print(f"  {name}: {count}")

=== PREPARING SOLUTIONS FOR JOULETRACE ===

--- Problem 0: Longest Substring Without Repeating Characters ---
Detected function name: lengthOfLongestSubstring
Wrapper function created:
# Original solution
class Solution:
    def lengthOfLongestSubstring(self, s: str) -> int:
        ss = set()
        i = ans = 0
        for j, c in enumerate(s):
            while c in ss:
         ...
Test case method name: lengthOfLongestSubstring
Names match: True

--- Problem 1: Median of Two Sorted Arrays ---
Detected function name: findMedianSortedArrays
Wrapper function created:
# Original solution
class Solution:
    def findMedianSortedArrays(self, nums1: List[int], nums2: List[int]) -> float:
        def f(i: int, j: int, k: int) -> int:
            if i >= m:
            ...
Test case method name: findMedianSortedArrays
Names match: True

--- Problem 2: Regular Expression Matching ---
Detected function name: isMatch
Wrapper function created:
# Original solution
class Solution:
    def isMatc

In [6]:
# Cell 5: Build JouletTrace API Request
import json

def build_jouletrace_request(problem_idx, df, max_test_cases=None):
    """
    Convert an EffiBench problem into a jouletrace API request format
    """
    problem_data = df.iloc[problem_idx]
    
    # Extract basic info
    task_name = problem_data['task_name']
    canonical_solution = problem_data['canonical_solution']
    test_case_str = problem_data['test_case']
    
    # Get function name and create wrapper
    func_name = extract_function_name_from_solution(canonical_solution)
    if not func_name:
        raise ValueError(f"Could not extract function name for problem {problem_idx}")
    
    wrapped_solution = create_wrapper_function(canonical_solution, func_name)
    
    # Parse test cases
    parsed_test_cases = parse_assert_statements(test_case_str)
    
    # Limit test cases if specified
    if max_test_cases:
        parsed_test_cases = parsed_test_cases[:max_test_cases]
    
    # Convert to jouletrace format
    jouletrace_test_cases = []
    for case in parsed_test_cases:
        # Determine input format based on jouletrace call semantics
        inputs = case['inputs']
        
        # Handle different input types for jouletrace
        if isinstance(inputs, list):
            # Multiple positional args: func(*inputs)
            jouletrace_inputs = inputs
        elif isinstance(inputs, dict):
            # Keyword args: func(**inputs)  
            jouletrace_inputs = inputs
        else:
            # Single positional arg: func(inputs)
            jouletrace_inputs = [inputs]
        
        jouletrace_test_cases.append({
            "test_id": case['test_id'],
            "inputs": jouletrace_inputs,
            "expected_output": case['expected_output']
        })
    
    # Build the complete request
    request = {
        "candidate_code": wrapped_solution,
        "function_name": "solve",  # Our wrapper function name
        "test_cases": jouletrace_test_cases,
        "timeout_seconds": 30,  # Generous timeout for efficiency problems
        "memory_limit_mb": 512,  # Reasonable memory limit
        "energy_measurement_trials": 5,
        "warmup_trials": 2,
        "candidate_id": f"effibench_{problem_idx}",
        "problem_name": task_name
    }
    
    return request

# Test the request builder on a sample problem
print("=== BUILDING JOULETRACE REQUESTS ===")

sample_idx = 0
print(f"Building request for problem {sample_idx}: {df.iloc[sample_idx]['task_name']}")

try:
    request = build_jouletrace_request(sample_idx, df, max_test_cases=5)
    
    print(f"\nRequest structure:")
    print(f"- Function name: {request['function_name']}")
    print(f"- Number of test cases: {len(request['test_cases'])}")
    print(f"- Candidate ID: {request['candidate_id']}")
    print(f"- Problem name: {request['problem_name']}")
    
    print(f"\nSample test cases:")
    for i, test_case in enumerate(request['test_cases'][:3]):
        print(f"  {test_case['test_id']}: inputs={test_case['inputs']}, expected={test_case['expected_output']}")
    
    print(f"\nCode preview:")
    print(request['candidate_code'][:300] + "...")
    
    print(f"\nFull request JSON (first 1000 chars):")
    request_json = json.dumps(request, indent=2)
    print(request_json[:1000] + "...")
    
except Exception as e:
    print(f"Error building request: {e}")

# Test on a few more problems to ensure consistency
print(f"\n=== TESTING ON MULTIPLE PROBLEMS ===")
for idx in range(1, min(4, len(df))):
    try:
        request = build_jouletrace_request(idx, df, max_test_cases=3)
        print(f"✓ Problem {idx} ({df.iloc[idx]['task_name']}): {len(request['test_cases'])} test cases")
    except Exception as e:
        print(f"✗ Problem {idx}: {e}")

=== BUILDING JOULETRACE REQUESTS ===
Building request for problem 0: Longest Substring Without Repeating Characters

Request structure:
- Function name: solve
- Number of test cases: 5
- Candidate ID: effibench_0
- Problem name: Longest Substring Without Repeating Characters

Sample test cases:
  t1: inputs=[''], expected=0
  t2: inputs=['krLKl6F'], expected=7
  t3: inputs=['p2Cn3Y6'], expected=7

Code preview:
# Original solution
class Solution:
    def lengthOfLongestSubstring(self, s: str) -> int:
        ss = set()
        i = ans = 0
        for j, c in enumerate(s):
            while c in ss:
                ss.remove(s[i])
                i += 1
            ss.add(c)
            ans = max(ans, j - i...

Full request JSON (first 1000 chars):
{
  "candidate_code": "# Original solution\nclass Solution:\n    def lengthOfLongestSubstring(self, s: str) -> int:\n        ss = set()\n        i = ans = 0\n        for j, c in enumerate(s):\n            while c in ss:\n                ss.re

In [7]:
# Cell 6: Send Request to JouletTrace API
import requests
import time
import json
from typing import Dict, Any

# Configure your JouletTrace API endpoint
JOULETRACE_BASE_URL = "http://localhost:8000"  # Replace with actual endpoint
# If you need authentication, add headers here
API_HEADERS = {
    "Content-Type": "application/json",
    # "Authorization": "Bearer your-token-here"  # Add if needed
}

def send_jouletrace_request(request_data: Dict[Any, Any]) -> Dict[Any, Any]:
    """
    Send a request to the JouletTrace API
    Returns the task queued response
    """
    url = f"{JOULETRACE_BASE_URL}/api/v1/measure"
    
    try:
        response = requests.post(url, json=request_data, headers=API_HEADERS)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        if hasattr(e, 'response') and e.response is not None:
            print(f"Response: {e.response.text}")
        raise

def poll_task_result(task_id: str, max_polls: int = 60, poll_interval: int = 5) -> Dict[Any, Any]:
    """
    Poll the JouletTrace API for task completion
    Returns the final result
    """
    url = f"{JOULETRACE_BASE_URL}/api/v1/tasks/{task_id}"
    
    for attempt in range(max_polls):
        try:
            response = requests.get(url, headers=API_HEADERS)
            response.raise_for_status()
            result = response.json()
            
            status = result.get('status')
            print(f"Poll {attempt + 1}: Status = {status}")
            
            if status == "completed":
                return result
            elif status == "failed":
                print(f"Task failed: {result.get('error_message', 'Unknown error')}")
                return result
            elif status in ["queued", "running"]:
                if status == "running":
                    stage = result.get('stage', 'Unknown')
                    progress = result.get('progress', 0)
                    print(f"  Running: {stage} ({progress*100:.1f}%)")
                time.sleep(poll_interval)
            else:
                print(f"Unknown status: {status}")
                return result
                
        except requests.exceptions.RequestException as e:
            print(f"Polling failed: {e}")
            time.sleep(poll_interval)
    
    raise TimeoutError(f"Task {task_id} did not complete within {max_polls * poll_interval} seconds")

# Test with a single problem (DRY RUN - comment out the actual API calls for now)
print("=== TESTING JOULETRACE API INTEGRATION ===")

# Build a request for the first problem
sample_idx = 0
print(f"Preparing to test problem {sample_idx}: {df.iloc[sample_idx]['task_name']}")

try:
    request = build_jouletrace_request(sample_idx, df, max_test_cases=3)  # Limit to 3 test cases for testing
    
    print(f"Request ready:")
    print(f"- Problem: {request['problem_name']}")
    print(f"- Test cases: {len(request['test_cases'])}")
    print(f"- Code length: {len(request['candidate_code'])} characters")
    
    # UNCOMMENT THESE LINES WHEN YOU'RE READY TO TEST WITH REAL API:
    print("\\nSending request to JouletTrace...")
    queued_response = send_jouletrace_request(request)
    
    print(f"Task queued successfully!")
    print(f"Task ID: {queued_response['task_id']}")
    print(f"Estimated completion: {queued_response.get('estimated_completion_seconds', 'Unknown')} seconds")
    print(f"Poll URL: {queued_response.get('poll_url', 'N/A')}")
    
    print("\\nPolling for results...")
    final_result = poll_task_result(queued_response['task_id'])
    
    print(f"\\nFinal result:")
    print(f"Status: {final_result['status']}")
    
    if final_result['status'] == 'completed':
        validation = final_result.get('validation', {})
        print(f"Correctness: {validation.get('is_correct', 'Unknown')}")
        print(f"Passed tests: {validation.get('passed_tests', 0)}/{validation.get('total_tests', 0)}")
        
        energy_metrics = final_result.get('energy_metrics', {})
        if energy_metrics:
            print(f"\\nEnergy Metrics:")
            print(f"- Total energy: {energy_metrics.get('median_total_energy_joules', 'N/A')} joules")
            print(f"- Execution time: {energy_metrics.get('median_execution_time_seconds', 'N/A')} seconds")
            print(f"- Power consumption: {energy_metrics.get('power_consumption_watts', 'N/A')} watts")
    
    # print(f"\nTO ACTUALLY RUN:")
    # print(f"1. Replace JOULETRACE_BASE_URL with your actual API endpoint")
    # print(f"2. Add authentication headers if needed")
    # print(f"3. Uncomment the API call lines above")
    # print(f"4. Run this cell again")
    
except Exception as e:
    print(f"Error in test setup: {e}")

=== TESTING JOULETRACE API INTEGRATION ===
Preparing to test problem 0: Longest Substring Without Repeating Characters
Request ready:
- Problem: Longest Substring Without Repeating Characters
- Test cases: 3
- Code length: 477 characters
\nSending request to JouletTrace...
Task queued successfully!
Task ID: f2845a57-1cac-4605-bdac-737f5cc944ce
Estimated completion: 93 seconds
Poll URL: /api/v1/tasks/f2845a57-1cac-4605-bdac-737f5cc944ce
\nPolling for results...
Poll 1: Status = running
  Running: validation (10.0%)


Poll 2: Status = completed
\nFinal result:
Status: completed
Correctness: True
Passed tests: 3/3
\nEnergy Metrics:
- Total energy: 1.54 joules
- Execution time: 1.3491997378878295e-05 seconds
- Power consumption: 114141.73578264017 watts


In [8]:

import requests

requests.get(f"{JOULETRACE_BASE_URL}/ping").json()

{'status': 'ok', 'service': 'jouletrace'}

In [9]:
import os
import time
import requests
from urllib.parse import urljoin
from pprint import pprint

# Set once per notebook (falls back to localhost if not set)
JOULETRACE_BASE_URL = os.getenv("JOULETRACE_BASE_URL", "http://127.0.0.1:8000")

def poll_from_poll_url(poll_url: str, timeout_s: int = 600, interval_s: float = 0.5):
    """
    Poll a JouleTrace task until it finishes (completed/failed).
    Accepts either a relative poll_url (e.g., /api/v1/tasks/<id>)
    or an absolute URL.
    """
    # Build full URL if needed
    url = poll_url if poll_url.startswith("http") else urljoin(JOULETRACE_BASE_URL, poll_url)
    s = requests.Session()
    start = time.time()
    last_status = None

    while True:
        try:
            r = s.get(url, timeout=30)
            r.raise_for_status()
            data = r.json()
            status = data.get("status")
            # Print status transitions or occasional updates
            if status != last_status or status == "running":
                if status == "running":
                    stage = data.get("stage") or data.get("meta", {}).get("stage")
                    progress = data.get("progress")
                    print(f"status=running stage={stage} progress={progress}")
                else:
                    print(f"status={status}")
                last_status = status

            if status in {"completed", "failed"}:
                return data

            if time.time() - start > timeout_s:
                raise TimeoutError(f"Polling timed out after {timeout_s}s: {url}")

            time.sleep(interval_s)

        except requests.RequestException as e:
            # Brief backoff on transient errors
            print(f"poll error: {e}; retrying…")
            time.sleep(1.0)

# Example usage:
# queued = requests.post(f"{JOULETRACE_BASE_URL}/api/v1/measure", json=payload).json()
# result = poll_from_poll_url(queued["poll_url"])
# pprint(result)


In [10]:
result = poll_from_poll_url(queued_response.get('poll_url', 'N/A'))

status=completed


In [11]:
# Cell 7: Batch Processing Multiple Problems
import json
import time
import os
import requests
from datetime import datetime
from typing import List, Dict, Any
import pandas as pd

class EffiBenchJouletTraceProcessor:
    def __init__(self, df: pd.DataFrame, base_url: str, headers: Dict[str, str] = None):
        self.df = df
        self.base_url = base_url
        self.headers = headers or {"Content-Type": "application/json"}
        self.results = {}
        self.failed_problems = {}
        
    def process_batch(self, 
                     problem_indices: List[int], 
                     max_test_cases_per_problem: int = 10,
                     batch_delay: float = 2.0,
                     results_file: str = None) -> Dict[str, Any]:
        """
        Process a batch of EffiBench problems through JouletTrace
        """
        if results_file is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            results_file = f"effibench_jouletrace_results_{timestamp}.json"
        
        # Load existing results if file exists
        if os.path.exists(results_file):
            with open(results_file, 'r') as f:
                existing_data = json.load(f)
                self.results = existing_data.get('results', {})
                self.failed_problems = existing_data.get('failed_problems', {})
            print(f"Loaded existing results: {len(self.results)} completed, {len(self.failed_problems)} failed")
        
        print(f"Processing {len(problem_indices)} problems...")
        print(f"Results will be saved to: {results_file}")
        
        for i, prob_idx in enumerate(problem_indices):
            prob_key = str(prob_idx)
            
            # Skip if already processed
            if prob_key in self.results or prob_key in self.failed_problems:
                print(f"Skipping problem {prob_idx} (already processed)")
                continue
                
            try:
                print(f"\n[{i+1}/{len(problem_indices)}] Processing problem {prob_idx}: {self.df.iloc[prob_idx]['task_name']}")
                
                # Build request
                request = build_jouletrace_request(prob_idx, self.df, max_test_cases_per_problem)
                
                # Send request
                queued_response = self._send_request(request)
                task_id = queued_response['task_id']
                
                print(f"  Task queued: {task_id}")
                
                # Poll for result
                result = self._poll_result(task_id)
                
                # Store result
                self.results[prob_key] = {
                    'problem_idx': prob_idx,
                    'task_name': self.df.iloc[prob_idx]['task_name'],
                    'task_id': task_id,
                    'request_timestamp': datetime.now().isoformat(),
                    'result': result
                }
                
                # Log summary
                if result['status'] == 'completed':
                    validation = result.get('validation', {})
                    energy = result.get('energy_metrics', {})
                    print(f"  ✓ Completed: {validation.get('passed_tests', 0)}/{validation.get('total_tests', 0)} tests passed")
                    if energy:
                        print(f"    Energy: {energy.get('median_total_energy_joules', 'N/A')} joules, {energy.get('median_execution_time_seconds', 'N/A')} seconds")
                else:
                    print(f"  ✗ Failed: {result.get('error_message', 'Unknown error')}")
                
                print(f"  ✓ Result stored")
                
            except Exception as e:
                print(f"  ✗ Error processing problem {prob_idx}: {e}")
                self.failed_problems[prob_key] = {
                    'problem_idx': prob_idx,
                    'task_name': self.df.iloc[prob_idx]['task_name'],
                    'error': str(e),
                    'timestamp': datetime.now().isoformat()
                }
            
            # Save progress after each problem
            self._save_results(results_file)
            
            # Rate limiting delay
            if i < len(problem_indices) - 1:  # Don't wait after last item
                time.sleep(batch_delay)
        
        print(f"\n🏁 Batch processing complete!")
        print(f"✅ Successful: {len(self.results)}")
        print(f"❌ Failed: {len(self.failed_problems)}")
        
        return {
            'results': self.results,
            'failed_problems': self.failed_problems,
            'results_file': results_file
        }
    
    def _save_results(self, results_file: str):
        """Save current results to file"""
        data = {
            'results': self.results,
            'failed_problems': self.failed_problems,
            'metadata': {
                'total_problems_processed': len(self.results) + len(self.failed_problems),
                'successful_problems': len(self.results),
                'failed_problems': len(self.failed_problems),
                'last_updated': datetime.now().isoformat()
            }
        }
        with open(results_file, 'w') as f:
            json.dump(data, f, indent=2)
    
    def _send_request(self, request_data: Dict[Any, Any]) -> Dict[Any, Any]:
        """Send a request to the JouletTrace API"""
        url = f"{self.base_url}/api/v1/measure"
        
        try:
            response = requests.post(url, json=request_data, headers=self.headers)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
            if hasattr(e, 'response') and e.response is not None:
                print(f"Response: {e.response.text}")
            raise
    
    def _poll_result(self, task_id: str, max_polls: int = 60, poll_interval: int = 5) -> Dict[Any, Any]:
        """Poll the JouletTrace API for task completion"""
        url = f"{self.base_url}/api/v1/tasks/{task_id}"
        
        for attempt in range(max_polls):
            try:
                response = requests.get(url, headers=self.headers)
                response.raise_for_status()
                result = response.json()
                
                status = result.get('status')
                if attempt % 10 == 0:  # Print every 10th poll to reduce noise
                    print(f"    Poll {attempt + 1}: Status = {status}")
                
                if status == "completed":
                    return result
                elif status == "failed":
                    print(f"    Task failed: {result.get('error_message', 'Unknown error')}")
                    return result
                elif status in ["queued", "running"]:
                    if status == "running" and attempt % 5 == 0:  # Show progress occasionally
                        stage = result.get('stage', 'Unknown')
                        progress = result.get('progress', 0)
                        print(f"    Running: {stage} ({progress*100:.1f}%)")
                    time.sleep(poll_interval)
                else:
                    print(f"    Unknown status: {status}")
                    return result
                    
            except requests.exceptions.RequestException as e:
                print(f"    Polling failed: {e}")
                time.sleep(poll_interval)
        
        raise TimeoutError(f"Task {task_id} did not complete within {max_polls * poll_interval} seconds")

# Test batch processing on a small subset
print("=== SETTING UP BATCH PROCESSING ===")

# Initialize processor
processor = EffiBenchJouletTraceProcessor(
    df=df, 
    base_url="http://127.0.0.1:8000",
    headers={"Content-Type": "application/json"}
)

# Test on first 5 problems
test_indices = list(range(5))
print(f"Test batch: problems {test_indices}")

# Show what we're about to process
print("\nProblems to process:")
for idx in test_indices:
    print(f"  {idx}: {df.iloc[idx]['task_name']}")

print(f"\n✅ READY TO RUN BATCH PROCESSING!")
print(f"JouletTrace endpoint: http://127.0.0.1:8000")
print(f"To start processing:")
print(f"results = processor.process_batch(test_indices, max_test_cases_per_problem=5)")

# Ready to run - uncomment this line when you want to start:
# results = processor.process_batch(test_indices, max_test_cases_per_problem=5)

=== SETTING UP BATCH PROCESSING ===
Test batch: problems [0, 1, 2, 3, 4]

Problems to process:
  0: Longest Substring Without Repeating Characters
  1: Median of Two Sorted Arrays
  2: Regular Expression Matching
  3: Container With Most Water
  4: Generate Parentheses

✅ READY TO RUN BATCH PROCESSING!
JouletTrace endpoint: http://127.0.0.1:8000
To start processing:
results = processor.process_batch(test_indices, max_test_cases_per_problem=5)


In [12]:
results = processor.process_batch(test_indices, max_test_cases_per_problem=5)

Processing 5 problems...
Results will be saved to: effibench_jouletrace_results_20250929_225137.json

[1/5] Processing problem 0: Longest Substring Without Repeating Characters
  Task queued: 2f02a702-030d-4d82-8713-258989d55f14
    Poll 1: Status = running
    Running: initializing (0.0%)
  ✓ Completed: 5/5 tests passed
    Energy: 1.1400000000000001 joules, 1.639200490899384e-05 seconds
  ✓ Result stored

[2/5] Processing problem 1: Median of Two Sorted Arrays
  Task queued: 4909f4d3-d29c-4e00-8d1e-8d1bff4b920d
    Poll 1: Status = running
    Running: validation (10.0%)
    Task failed: Energy measurement failed
  ✗ Failed: Energy measurement failed
  ✓ Result stored

[3/5] Processing problem 2: Regular Expression Matching
  Task queued: 0b6376fa-1742-4b0f-a290-3af1ac83cd00
    Poll 1: Status = running
    Running: validation (10.0%)
  ✓ Completed: 5/5 tests passed
    Energy: 1.47 joules, 7.670599734410644e-05 seconds
  ✓ Result stored

[4/5] Processing problem 3: Container With Mo

In [13]:
def build_wrapper_code(class_src: str, class_name: str, method_name: str,
                       public_name: str, arg_names):
    """
    Compose candidate_code that defines your class AND a top-level wrapper
    with name `public_name` that calls class method `method_name`.
    """
    args_sig = ", ".join(arg_names)
    call_sig = ", ".join(arg_names)
    wrapper = f"\n\ndef {public_name}({args_sig}):\n    return {class_name}().{method_name}({call_sig})\n"
    return class_src.rstrip() + wrapper

def normalize_tests(tests, spec):
    """
    Normalize test cases' inputs to match the public function signature.
    spec: list describing the expected argument shapes, e.g.
      - ['list']            => one positional arg that is a list
      - ['list','list']     => two positional args that are lists
      - ['str'] or ['str','str']
      - ['int'] etc.
    Returns a new list of tests with corrected 'inputs'.
    """
    norm = []
    for t in tests:
        inp = t['inputs']
        # Always work with a list of positional args
        args = inp if isinstance(inp, (list, tuple)) else [inp]

        # If spec is one-list, but args is a flat list of ints, wrap it
        if spec == ['list']:
            if len(args) == 1 and not isinstance(args[0], list):
                # single non-list → keep as is (user passed scalar)
                pass
            elif len(args) == 0:
                args = [[]]
            elif len(args) == 1 and isinstance(args[0], list):
                pass  # already correct
            else:
                # user passed a flattened sequence instead of one list: wrap
                args = [list(args)]

        elif spec == ['list', 'list']:
            # Ensure exactly two lists
            if len(args) == 2 and all(isinstance(a, list) for a in args):
                pass
            elif len(args) == 1 and isinstance(args[0], list) and len(args[0]) == 2 and all(isinstance(a, list) for a in args[0]):
                args = args[0]  # user nested [[list1, list2]]
            else:
                # Best-effort: if not two lists, keep as-is (will fail fast)
                args = list(args)

        elif spec == ['str']:
            if len(args) != 1 or not isinstance(args[0], str):
                # coerce to single string if possible
                args = ["".join(map(str, args))]
        elif spec == ['str','str']:
            if len(args) == 2 and all(isinstance(a, str) for a in args):
                pass
            elif len(args) == 1 and isinstance(args[0], (list, tuple)) and len(args[0]) == 2:
                args = list(args[0])
            else:
                # crude fallback
                s = "".join(map(str, args))
                args = [s, ""]
        # leave other specs as-is

        norm.append({**t, 'inputs': args})
    return norm


In [14]:
# Your class-based candidate (example skeleton)
container_class_src = """\
from typing import List

class Solution:
    def maxArea(self, height: List[int]) -> int:
        i, j = 0, len(height) - 1
        best = 0
        while i < j:
            best = max(best, min(height[i], height[j]) * (j - i))
            if height[i] < height[j]:
                i += 1
            else:
                j -= 1
        return best
"""

# Build wrapper: public function name must match what you send as function_name
container_code = build_wrapper_code(
    class_src=container_class_src,
    class_name="Solution",
    method_name="maxArea",
    public_name="maxArea",
    arg_names=["height"],
)

# Fix tests: spec is one list positional arg
# If you already have tests, do: container_tests_fixed = normalize_tests(container_tests, ['list'])
# Example tests:
container_tests = [
    {"test_id":"t1","inputs":[[1,8,6,2,5,4,8,3,7]],"expected_output":49},
    {"test_id":"t2","inputs":[[1,1]],"expected_output":1},
    {"test_id":"t3","inputs":[[2,3,10,5,7,8,9]],"expected_output":36},
]
container_tests_fixed = normalize_tests(container_tests, ['list'])


In [15]:
median_class_src = """\
from typing import List

class Solution:
    def findMedianSortedArrays(self, nums1: List[int], nums2: List[int]) -> float:
        A, B = nums1, nums2
        m, n = len(A), len(B)
        if m > n: A, B, m, n = B, A, n, m
        total, half = m + n, (m + n)//2
        lo, hi = 0, m
        while lo <= hi:
            i = (lo + hi)//2
            j = half - i
            Aleft = A[i-1] if i>0 else float('-inf')
            Aright = A[i] if i<m else float('inf')
            Bleft = B[j-1] if j>0 else float('-inf')
            Bright = B[j] if j<n else float('inf')
            if Aleft <= Bright and Bleft <= Aright:
                if total % 2:
                    return float(min(Aright, Bright))
                return (max(Aleft, Bleft) + min(Aright, Bright)) / 2.0
            elif Aleft > Bright:
                hi = i - 1
            else:
                lo = i + 1
        raise ValueError("invalid")
"""

median_code = build_wrapper_code(
    class_src=median_class_src,
    class_name="Solution",
    method_name="findMedianSortedArrays",
    public_name="findMedianSortedArrays",
    arg_names=["nums1","nums2"],
)

# Example tests; normalize to two list args
median_tests = [
    {"test_id":"m1","inputs":[[1,3],[2]],"expected_output":2.0},
    {"test_id":"m2","inputs":[[1,2],[3,4]],"expected_output":2.5},
]
median_tests_fixed = normalize_tests(median_tests, ['list','list'])


In [16]:
import requests, time
from pprint import pprint

BASE_URL = "http://127.0.0.1:8000"
MEASURE_URL = f"{BASE_URL}/api/v1/measure"
TASK_URL = f"{BASE_URL}/api/v1/tasks"

def queue_job(code, function_name, tests, timeout=10, trials=3, warmup=1, mem_mb=1024):
    payload = {
        "candidate_code": code,
        "function_name": function_name,
        "test_cases": tests,
        "timeout_seconds": timeout,
        "memory_limit_mb": mem_mb,
        "energy_measurement_trials": trials,
        "warmup_trials": warmup,
    }
    r = requests.post(MEASURE_URL, json=payload, timeout=60)
    r.raise_for_status()
    return r.json()

def poll_task(task_id):
    while True:
        r = requests.get(f"{TASK_URL}/{task_id}", timeout=60)
        r.raise_for_status()
        data = r.json()
        if data.get("status") in {"completed", "failed"}:
            return data
        time.sleep(0.5)

# Container With Most Water
q1 = queue_job(container_code, "maxArea", container_tests_fixed, timeout=10, trials=3, warmup=1)
r1 = poll_task(q1["task_id"]); pprint(r1)

# Median of Two Sorted Arrays
q2 = queue_job(median_code, "findMedianSortedArrays", median_tests_fixed, timeout=10, trials=3, warmup=1)
r2 = poll_task(q2["task_id"]); pprint(r2)


{'candidate_id': None,
 'energy_metrics': {'energy_efficiency_score': 0.38000000000000006,
                    'energy_per_test_case_joules': 0.38000000000000006,
                    'median_execution_time_seconds': 1.2235002941451967e-05,
                    'median_package_energy_joules': 1.06,
                    'median_ram_energy_joules': 0.08,
                    'median_total_energy_joules': 1.1400000000000001,
                    'power_consumption_watts': 93175.29431380036},
 'measurement_environment': {'cpu_model': None,
                             'measurement_core': None,
                             'meter_type': 'unknown',
                             'thermal_controlled': True,
                             'timestamp': 1759186402.717732},
 'measurement_timestamp': 1759186402.717732,
 'problem_name': None,
 'processing_time_seconds': 0.2545042037963867,
 'request_id': '70302eb8-2088-4411-82d2-bed12a0a75b9',
 'status': 'completed',
 'validation': {'error_summary': None,
 

In [44]:
# Test for rewards

In [17]:
import os
import time
import math
import requests
from urllib.parse import urljoin
from pprint import pprint

# Base URL for the running JouleTrace API
JOULETRACE_BASE_URL = os.getenv("JOULETRACE_BASE_URL", "http://127.0.0.1:8000")
MEASURE_URL = f"{JOULETRACE_BASE_URL}/api/v1/measure"

def poll_from_poll_url(poll_url: str, timeout_s: int = 600, interval_s: float = 0.5):
    url = poll_url if poll_url.startswith("http") else urljoin(JOULETRACE_BASE_URL, poll_url)
    s = requests.Session()
    start = time.time()
    last_status = None
    while True:
        r = s.get(url, timeout=30)
        r.raise_for_status()
        data = r.json()
        status = data.get("status")
        if status != last_status or status == "running":
            if status == "running":
                print(f"status=running stage={data.get('stage')} progress={data.get('progress')}")
            else:
                print(f"status={status}")
            last_status = status
        if status in {"completed", "failed"}:
            return data
        if time.time() - start > timeout_s:
            raise TimeoutError(f"Polling timed out after {timeout_s}s: {url}")
        time.sleep(interval_s)

def queue_job(candidate_code: str, function_name: str, test_cases: list,
              timeout_seconds=10, memory_limit_mb=1024,
              trials=3, warmup=1):
    payload = {
        "candidate_code": candidate_code,
        "function_name": function_name,
        "test_cases": test_cases,
        "timeout_seconds": timeout_seconds,
        "memory_limit_mb": memory_limit_mb,
        "energy_measurement_trials": trials,
        "warmup_trials": warmup,
    }
    r = requests.post(MEASURE_URL, json=payload, timeout=60)
    r.raise_for_status()
    return r.json()


In [18]:
# Pure Python local reference to produce expected outputs
def fib_ref(n: int) -> int:
    a, b = 0, 1
    for _ in range(n):
        a, b = b, a + b
    return a

# Candidate: fast iterative (O(n))
candidate_fast = """\
def solve(n):
    a, b = 0, 1
    for _ in range(n):
        a, b = b, a + b
    return a
"""

# Candidate: slow naive recursion (exponential) — keep n modest so it finishes
candidate_slow = """\
def solve(n):
    if n < 2:
        return n
    return solve(n-1) + solve(n-2)
"""

# Build the same tests for both candidates
# Choose ns so slow version finishes within timeout; adjust if needed
ns = [22, 24]  # both should complete for naive recursion with a reasonable timeout
tests = [
    {"test_id": f"t-{n}", "inputs": [n], "expected_output": fib_ref(n)}
    for n in ns
]
pprint(tests)


[{'expected_output': 17711, 'inputs': [22], 'test_id': 't-22'},
 {'expected_output': 46368, 'inputs': [24], 'test_id': 't-24'}]


In [19]:
def energy_reward(result_json: dict,
                  ref_energy_j: float = 1.0,
                  ref_time_s: float = 0.01,
                  beta_energy: float = 1.5,
                  k_energy: float = 1.2,
                  k_time: float = 1.0,
                  hard_cap_multiple: float = 50.0) -> float:
    """Reward in [0, 1] with harmonic emphasis on the worse dimension."""
    if result_json.get("status") != "completed":
        return 0.0

    validation = result_json.get("validation") or {}
    if not validation.get("is_correct", False):
        return 0.0

    em = result_json.get("energy_metrics") or {}
    E = em.get("median_total_energy_joules")
    T = em.get("median_execution_time_seconds")
    if E is None or T is None or E < 0 or T < 0:
        return 0.0

    if E > hard_cap_multiple * ref_energy_j or T > hard_cap_multiple * ref_time_s:
        return 0.0

    safe_ref_energy = max(ref_energy_j, 1e-12)
    safe_ref_time = max(ref_time_s, 1e-12)

    rE = E / safe_ref_energy
    rT = T / safe_ref_time

    Se = 1.0 / (1.0 + (rE ** k_energy))
    St = 1.0 / (1.0 + (rT ** k_time))

    beta2 = beta_energy * beta_energy
    denom = beta2 * Se + St
    if denom <= 0.0:
        return 0.0

    reward = (1.0 + beta2) * (Se * St) / denom
    if not (reward == reward):
        return 0.0

    return float(max(0.0, min(1.0, reward)))


def summarize_energy(result_json: dict) -> dict:
    em = result_json.get("energy_metrics") or {}
    return {
        "package_J": em.get("median_package_energy_joules"),
        "ram_J": em.get("median_ram_energy_joules"),
        "total_J": em.get("median_total_energy_joules"),
        "time_s": em.get("median_execution_time_seconds"),
        "power_W": em.get("power_consumption_watts"),
    }


def _format_with_units(value, thresholds_units, scale):
    if value is None:
        return "n/a"
    abs_value = abs(value)
    for threshold, unit in thresholds_units:
        if abs_value >= threshold:
            return f"{value / scale[unit]:.3f} {unit}"
    unit = thresholds_units[-1][1]
    return f"{value / scale[unit]:.3f} {unit}"


def format_energy(value):
    thresholds = ((1.0, "J"), (1e-3, "mJ"), (1e-6, "uJ"), (1e-9, "nJ"), (0.0, "pJ"))
    scale = {"J": 1.0, "mJ": 1e-3, "uJ": 1e-6, "nJ": 1e-9, "pJ": 1e-12}
    return _format_with_units(value, thresholds, scale)


def format_time(value):
    thresholds = ((1.0, "s"), (1e-3, "ms"), (1e-6, "us"), (1e-9, "ns"), (0.0, "ps"))
    scale = {"s": 1.0, "ms": 1e-3, "us": 1e-6, "ns": 1e-9, "ps": 1e-12}
    return _format_with_units(value, thresholds, scale)


def format_power(value):
    thresholds = ((1.0, "W"), (1e-3, "mW"), (1e-6, "uW"), (1e-9, "nW"), (0.0, "pW"))
    scale = {"W": 1.0, "mW": 1e-3, "uW": 1e-6, "nW": 1e-9, "pW": 1e-12}
    return _format_with_units(value, thresholds, scale)


def pad(text: str, width: int = 20) -> str:
    return f"{text:<{width}}"


In [20]:
print("Queuing slow...")
q_slow = queue_job(candidate_slow, "solve", tests, timeout_seconds=15, trials=3, warmup=1)
print("Queuing fast...")
q_fast = queue_job(candidate_fast, "solve", tests, timeout_seconds=15, trials=3, warmup=1)

print("\nPolling slow...")
r_slow = poll_from_poll_url(q_slow["poll_url"])
print("\nPolling fast...")
r_fast = poll_from_poll_url(q_fast["poll_url"])

print("\nSlow result:")
pprint(r_slow)
print("\nFast result:")
pprint(r_fast)


Queuing slow...
Queuing fast...

Polling slow...
status=running stage=validation progress=0.1
status=completed

Polling fast...
status=completed

Slow result:
{'candidate_id': None,
 'energy_metrics': {'energy_efficiency_score': 1.1099999999999999,
                    'energy_per_test_case_joules': 1.1099999999999999,
                    'median_execution_time_seconds': 0.009937327995430678,
                    'median_package_energy_joules': 1.97,
                    'median_ram_energy_joules': 0.25,
                    'median_total_energy_joules': 2.2199999999999998,
                    'power_consumption_watts': 223.40009316596843},
 'measurement_environment': {'cpu_model': None,
                             'measurement_core': None,
                             'meter_type': 'unknown',
                             'thermal_controlled': True,
                             'timestamp': 1759186417.706287},
 'measurement_timestamp': 1759186417.706287,
 'problem_name': None,
 'processin

In [44]:
slow_energy = summarize_energy(r_slow)
fast_energy = summarize_energy(r_fast)
slow_reward = energy_reward(r_slow)
fast_reward = energy_reward(r_fast)

print("Slow energy:", slow_energy, "reward:", slow_reward)
print("Fast energy:", fast_energy, "reward:", fast_reward)

rows = {
    "slow": {
        "status": r_slow.get("status"),
        "energy": slow_energy,
        "reward": slow_reward,
    },
    "fast": {
        "status": r_fast.get("status"),
        "energy": fast_energy,
        "reward": fast_reward,
    },
}

print("" + "=" * 66)
print("ENERGY EFFICIENCY SUMMARY".center(66))
print("=" * 66)
print(f"{'Candidate':<12}{'Status':<12}{'Reward':<12}{'Energy':<15}{'Time':<15}")
print("-" * 66)
for name, info in rows.items():
    energy = info["energy"]
    print(f"{name:<12}"
          f"{str(info['status']):<12}"
          f"{info['reward']:<12.4f}"
          f"{format_energy(energy.get('total_J')):<15}"
          f"{format_time(energy.get('time_s')):<15}")
print("-" * 66)
print(f"Winner: {'fast' if fast_reward > slow_reward else ('slow' if slow_reward > fast_reward else 'tie')}")


Slow energy: {'package_J': 1.96, 'ram_J': 0.25, 'total_J': 2.21, 'time_s': 0.00993454000854399, 'power_W': 222.4561980825822} reward: 0.40247187835817677
Fast energy: {'package_J': 1.25, 'ram_J': 0.17, 'total_J': 1.43, 'time_s': 5.997993866913021e-06, 'power_W': 238413.048050677} reward: 0.6788664759891463
                    ENERGY EFFICIENCY SUMMARY                     
Candidate   Status      Reward      Energy         Time           
------------------------------------------------------------------
slow        completed   0.4025      2.210 J        9.935 ms       
fast        completed   0.6789      1.430 J        5.998 us       
------------------------------------------------------------------
Winner: fast


In [21]:
def run_once():
    q1 = queue_job(candidate_slow, "solve", tests, timeout_seconds=15, trials=3, warmup=1)
    q2 = queue_job(candidate_fast, "solve", tests, timeout_seconds=15, trials=3, warmup=1)
    r1 = poll_from_poll_url(q1["poll_url"])
    r2 = poll_from_poll_url(q2["poll_url"])
    return energy_reward(r1), energy_reward(r2)

repeats = 3
slow_rs, fast_rs = [], []
for _ in range(repeats):
    sr, fr = run_once()
    slow_rs.append(sr)
    fast_rs.append(fr)
    print(f"slow={sr:.3f} fast={fr:.3f}")

print("Averaged rewards over", repeats, "runs:")
print("slow avg:", sum(slow_rs)/len(slow_rs))
print("fast avg:", sum(fast_rs)/len(fast_rs))


status=running stage=validation progress=0.1
status=completed
status=completed
slow=0.406 fast=0.666
status=running stage=validation progress=0.1
status=completed
status=completed
slow=0.377 fast=0.639
status=running stage=validation progress=0.1
status=completed
status=completed
slow=0.409 fast=0.672
Averaged rewards over 3 runs:
slow avg: 0.39727016963377926
fast avg: 0.6588473184449485
