In [15]:
import pandas as pd
import os
import subprocess
import tempfile
import shutil
import platform
import time
import re
from datetime import datetime
import sys

# ======== CUSTOMIZATION OPTIONS ========
# Set these variables to customize the evaluator
FILE_NAME = "mHumanEval-eng_Latn-cpp.csv"   # CSV file name
SOLUTION_COLUMN = "canonical_solutions"      # Column name containing solutions
# =======================================

def simple_progress_bar(iterable, desc="Processing", total=None):
    """
    A simple text-based progress bar that works in any environment
    """
    if total is None:
        total = len(iterable) if hasattr(iterable, '__len__') else None
    
    if total is None:
        # Can't show progress without total, just iterate
        for item in iterable:
            yield item
        return
        
    width = 40  # Width of the progress bar
    start_time = time.time()
    
    for i, item in enumerate(iterable):
        percent = (i + 1) / total
        filled_width = int(width * percent)
        bar = '█' * filled_width + '░' * (width - filled_width)
        
        elapsed = time.time() - start_time
        if elapsed > 0 and i > 0:
            items_per_sec = i / elapsed
            eta = (total - i) / items_per_sec if items_per_sec > 0 else 0
            eta_str = f"ETA: {int(eta)}s" if eta < 600 else f"ETA: {int(eta/60)}m {int(eta%60)}s"
        else:
            eta_str = "ETA: --"
            
        sys.stdout.write(f"\r{desc}: |{bar}| {(i+1)}/{total} [{percent:.1%}] {eta_str}")
        sys.stdout.flush()
        
        yield item
    
    sys.stdout.write("\n")
    sys.stdout.flush()

def evaluate_cpp_solutions(csv_file=None, solution_column=None):
    """
    Evaluate C++ solutions against their test cases and report Pass@1 score.
    
    Args:
        csv_file (str, optional): Path to the CSV file. Defaults to FILE_NAME.
        solution_column (str, optional): Name of the column containing solutions. 
                                       Defaults to SOLUTION_COLUMN.
        
    Returns:
        float: Pass@1 percentage
    """
    # Use defaults if not provided
    csv_file = csv_file or FILE_NAME
    solution_column = solution_column or SOLUTION_COLUMN
    
    # Determine platform-specific settings
    system = platform.system()
    if system == "Windows":
        exe_extension = '.exe'
        compiler = 'g++'
    else:  # Linux, macOS, etc.
        exe_extension = ''
        # On macOS, prefer clang++ over g++
        if platform.system() == "Darwin":
            compiler = 'clang++'
        else:
            compiler = 'g++'
    
    # Create a temporary directory for our files
    temp_dir = tempfile.mkdtemp()
    print(f"Using temporary directory: {temp_dir}")
    
    try:
        # Read the CSV file
        print(f"Reading CSV file: {csv_file}")
        try:
            df = pd.read_csv(csv_file)
        except FileNotFoundError:
            print(f"Error: File '{csv_file}' not found.")
            return None
        except pd.errors.EmptyDataError:
            print(f"Error: File '{csv_file}' is empty.")
            return None
        except Exception as e:
            print(f"Error reading file: {e}")
            return None
            
        total_problems = len(df)
        
        # Check if solution column exists
        if solution_column not in df.columns:
            print(f"Error: Column '{solution_column}' not found in the CSV file.")
            print(f"Available columns: {', '.join(df.columns)}")
            return None
        
        # Extract NL and PL from the first row
        nl = df['nl'].iloc[0] if 'nl' in df.columns else "Unknown"
        pl = df['pl'].iloc[0] if 'pl' in df.columns else "Unknown"
        
        print(f"Natural Language: {nl}, Programming Language: {pl}")
        
        # Check if compiler is available
        try:
            subprocess.run([compiler, '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
            print(f"{compiler} compiler found. Proceeding with evaluation.")
        except (subprocess.SubprocessError, FileNotFoundError):
            print(f"Error: {compiler} compiler not found. Cannot evaluate solutions.")
            return None
        
        # Initialize counters and data storage
        passed = 0
        failed = 0
        passed_tasks = []
        compilation_errors = []
        runtime_errors = []
        timeouts = []
        other_errors = []
        
        start_time = time.time()
        
        # Standard C++ headers to replace bits/stdc++.h
        std_headers = """
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <numeric>
#include <set>
#include <sstream>
#include <stack>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include <tuple>
#include <array>
#include <queue>
#include <deque>
#include <list>
#include <bitset>
"""

        print(f"\nEvaluating {total_problems} problems...")
        
        # Process each problem with our custom progress bar
        for index, row in simple_progress_bar(df.iterrows(), desc="Evaluating", total=total_problems):
            task_id = row['task_id']
            prompt = row['prompt']
            solution = row[solution_column]  # Use the specified solution column
            test = row['test']
            
            # Create filenames
            safe_name = task_id.replace('/', '_').replace('\\', '_')
            cpp_file = os.path.join(temp_dir, f"{safe_name}.cpp")
            exe_file = os.path.join(temp_dir, f"{safe_name}{exe_extension}")
            
            # Process solution - replace bits/stdc++.h if present
            solution_lines = solution.strip().split('\n')
            processed_solution_lines = []
            
            bits_stdc_replaced = False
            
            for line in solution_lines:
                # Replace bits/stdc++.h with our standard headers
                if re.search(r'#include\s*<\s*bits/stdc\+\+\.h\s*>', line):
                    # Mark that we've replaced this line
                    bits_stdc_replaced = True
                    # Don't add this line - it will be replaced by std_headers
                elif "#include" in line:
                    # Keep other includes
                    processed_solution_lines.append(line)
                else:
                    # Keep all other lines
                    processed_solution_lines.append(line)
            
            # Process the test code - remove the first closing brace if present
            test_lines = test.strip().split('\n')
            if test_lines and test_lines[0].strip() == "}":
                test_lines = test_lines[1:]  # Skip the first line
            
            # Construct the complete code
            # Always include standard headers - if bits/stdc++.h was replaced, it will add them
            complete_code = std_headers + "\n"
            
            # Add processed solution
            complete_code += "\n".join(processed_solution_lines) + "\n\n"
            
            # Add test code
            complete_code += "\n".join(test_lines)
            
            # Write the complete code to file
            with open(cpp_file, 'w') as f:
                f.write(complete_code)
            
            # For all examples, save debug files
            debug_file = os.path.join(temp_dir, f"debug_{safe_name}.cpp")
            with open(debug_file, 'w') as f:
                f.write(complete_code)
            
            try:
                # Compile 
                compilation = subprocess.run(
                    [compiler, cpp_file, '-o', exe_file, '-std=c++17'],
                    capture_output=True,
                    text=True,
                    timeout=30
                )
                
                if compilation.returncode != 0:
                    failed += 1
                    compilation_errors.append(task_id)
                    continue
                
                # Run
                execution = subprocess.run(
                    [exe_file],
                    capture_output=True,
                    text=True,
                    timeout=10
                )
                
                # Check result
                if execution.returncode == 0:
                    passed += 1
                    passed_tasks.append(task_id)
                else:
                    failed += 1
                    runtime_errors.append(task_id)
                    
            except subprocess.TimeoutExpired:
                failed += 1
                timeouts.append(task_id)
            except Exception as e:
                failed += 1
                other_errors.append(task_id)
        
        # Calculate Pass@1
        pass_at_1 = (passed / total_problems) * 100
        
        end_time = time.time()
        execution_time = end_time - start_time
        
        # Terminal colors (if supported)
        try:
            # Check if terminal supports colors
            is_color_supported = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
        except:
            is_color_supported = False
            
        if is_color_supported:
            GREEN = "\033[92m"
            RED = "\033[91m"
            YELLOW = "\033[93m"
            BLUE = "\033[94m"
            MAGENTA = "\033[95m"
            CYAN = "\033[96m"
            BOLD = "\033[1m"
            UNDERLINE = "\033[4m"
            END = "\033[0m"
        else:
            # No color support, use empty strings
            GREEN = RED = YELLOW = BLUE = MAGENTA = CYAN = BOLD = UNDERLINE = END = ""
        
        # Generate fancy summary
        print("\n" + "=" * 80)
        print(f"{CYAN}{BOLD}📊 HUMAN EVAL RESULTS SUMMARY 📊{END}")
        print("=" * 80)
        
        # Print dataset info
        print(f"\n{BOLD}Dataset Information:{END}")
        print(f"📂 File: {os.path.basename(csv_file)}")
        print(f"🔤 Natural Language: {MAGENTA}{nl}{END}")
        print(f"💻 Programming Language: {MAGENTA}{pl.upper()}{END}")
        print(f"🧩 Total Problems: {total_problems}")
        print(f"📝 Solution Column: {MAGENTA}{solution_column}{END}")
        
        # Show date and time
        current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(f"\n{BOLD}Evaluation Details:{END}")
        print(f"⏰ Completed at: {current_time}")
        print(f"⌛ Total execution time: {execution_time:.2f} seconds")
        print(f"🔧 Compiler used: {compiler}")
        
        # Show Pass@1 score with colored output
        print(f"\n{BOLD}{UNDERLINE}PASS@1 SCORE:{END}")
        if pass_at_1 >= 90:
            color = GREEN
            emoji = "🏆"
        elif pass_at_1 >= 70:
            color = YELLOW
            emoji = "🥈"
        else:
            color = RED
            emoji = "📊"
            
        print(f"{color}{BOLD}{emoji} {pass_at_1:.2f}%{END} ({passed}/{total_problems} problems passed)")
        
        # Show breakdown of results
        print(f"\n{BOLD}{UNDERLINE}RESULT BREAKDOWN:{END}")
        print(f"{GREEN}✓ Passed: {passed} ({passed/total_problems*100:.1f}%){END}")
        
        if compilation_errors:
            print(f"{RED}✗ Compilation errors: {len(compilation_errors)} ({len(compilation_errors)/total_problems*100:.1f}%){END}")
            if len(compilation_errors) <= 5:  # Show if only a few
                print(f"  Examples: {', '.join(compilation_errors[:5])}" + 
                      (" + more..." if len(compilation_errors) > 5 else ""))
                
        if runtime_errors:
            print(f"{RED}✗ Runtime errors: {len(runtime_errors)} ({len(runtime_errors)/total_problems*100:.1f}%){END}")
            if len(runtime_errors) <= 5:  # Show if only a few
                print(f"  Examples: {', '.join(runtime_errors[:5])}" + 
                      (" + more..." if len(runtime_errors) > 5 else ""))
                
        if timeouts:
            print(f"{YELLOW}⏱ Timeouts: {len(timeouts)} ({len(timeouts)/total_problems*100:.1f}%){END}")
            if timeouts:  # Always show timeouts as they're unusual
                print(f"  Tasks: {', '.join(timeouts)}")
                
        if other_errors:
            print(f"{RED}✗ Other errors: {len(other_errors)} ({len(other_errors)/total_problems*100:.1f}%){END}")
            if other_errors:  # Always show other errors
                print(f"  Tasks: {', '.join(other_errors)}")
        
        # Keep temp directory info in summary
        print(f"\n{BOLD}{UNDERLINE}DEBUG INFO:{END}")
        print(f"🔍 Debug files are available in: {BLUE}{temp_dir}{END}")
        print("📝 Remember to manually clean up this directory when done.")
        
        print("\n" + "=" * 80)
        
        return pass_at_1
    
    finally:
        # No automatic cleanup to ensure debug files are available
        pass

# Usage example
if __name__ == "__main__":
    # You can override the defaults when calling the function:
    # evaluate_cpp_solutions("other_file.csv", "other_solution_column")
    
    # Or use the defaults set at the top of the file:
    pass_at_1 = evaluate_cpp_solutions()
    if pass_at_1 is not None:
        print(f"\nThank you for using the mHumanEval \nIf found helpful, Please cite our work!")

Using temporary directory: /var/folders/lf/qtbtj7t13k190p04hkrqbp1w0000gn/T/tmpzdisofyd
Reading CSV file: mHumanEval-eng_Latn-cpp.csv
Natural Language: eng_Latn, Programming Language: cpp
clang++ compiler found. Proceeding with evaluation.

Evaluating 161 problems...
Evaluating: |████████████████████████████████████████| 161/161 [100.0%] ETA: 0s

📊 HUMAN EVAL RESULTS SUMMARY 📊

Dataset Information:
📂 File: mHumanEval-eng_Latn-cpp.csv
🔤 Natural Language: eng_Latn
💻 Programming Language: CPP
🧩 Total Problems: 161
📝 Solution Column: canonical_solutions

Evaluation Details:
⏰ Completed at: 2025-03-17 19:16:48
⌛ Total execution time: 141.19 seconds
🔧 Compiler used: clang++

PASS@1 SCORE:
🥈 85.71% (138/161 problems passed)

RESULT BREAKDOWN:
✓ Passed: 138 (85.7%)
✗ Compilation errors: 7 (4.3%)
✗ Runtime errors: 15 (9.3%)
⏱ Timeouts: 1 (0.6%)
  Tasks: HumanEval/40

DEBUG INFO:
🔍 Debug files are available in: /var/folders/lf/qtbtj7t13k190p04hkrqbp1w0000gn/T/tmpzdisofyd
📝 Remember to manually c