**Generative Models for Code** -- Midterm Report<br><br>
**Maria Gancayco (mig2131@columbia.edu)**<br>
**Stephen Wright (svw2112@columbia.edu)**<br>
*Due:* Wednesday, 13 Nov 2024 at 11:59pm ET

In [1]:
"""
###########################################
# Setup: Environment and Memory Management
###########################################
This module initializes the GPU environment and sets up memory management utilities
for efficient model handling during the evaluation process.
"""

import torch
import gc
from pathlib import Path
from dataclasses import dataclass
from typing import Optional

# Check and display GPU availability for transparency
print("CUDA available:", torch.cuda.is_available())
print("GPU device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU found")

# Memory management utilities
def clear_memory() -> None:
    """
    Clears GPU memory cache and performs garbage collection.

    This function is crucial for maintaining optimal memory usage during model evaluation,
    especially when loading and comparing multiple large language models.
    """
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # Clear CUDA cache
    gc.collect()  # Trigger Python garbage collection

def get_memory_status() -> None:
    """
    Displays current GPU memory usage statistics.

    Reports both allocated and reserved memory in megabytes (MB).
    This helps monitor memory consumption during model operations.

    Note:
        - Allocated memory: Actually used GPU memory
        - Reserved memory: Total memory reserved by PyTorch
    """
    if torch.cuda.is_available():
        # Convert bytes to MB for better readability
        allocated = torch.cuda.memory_allocated() / 1024**2
        reserved = torch.cuda.memory_reserved() / 1024**2
        print(f"GPU Memory: Allocated: {allocated:.2f}MB, Reserved: {reserved:.2f}MB")

# Initialize by checking current memory status
get_memory_status()

CUDA available: True
GPU device name: NVIDIA A100-SXM4-40GB
GPU Memory: Allocated: 0.00MB, Reserved: 0.00MB


In [2]:
"""
#################################
# Configuration and Setup
#################################
This section defines the experimental configuration parameters and initializes
the necessary directory structure for storing results.
"""

@dataclass
class ExperimentConfig:
    """
    Configuration dataclass containing all hyperparameters and settings for model evaluation.

    Attributes:
        model_name (str): Name/path of the model to be evaluated
        batch_size (int): Number of samples processed in each batch
        learning_rate (float): Learning rate for model optimization
        num_epochs (int): Number of training epochs
        max_seq_length (int): Maximum sequence length for input tokenization
        gradient_accumulation_steps (int): Number of steps to accumulate gradients
        warmup_steps (Optional[int]): Number of warmup steps for learning rate scheduler
        weight_decay (float): L2 regularization factor
        eval_steps (int): Frequency of evaluation steps
        save_steps (int): Frequency of model checkpoint saves
        logging_steps (int): Frequency of logging training metrics
    """
    model_name: str
    batch_size: int
    learning_rate: float
    num_epochs: int
    max_seq_length: int
    gradient_accumulation_steps: int
    warmup_steps: Optional[int] = None
    weight_decay: float = 0.01
    eval_steps: int = 100
    save_steps: int = 100
    logging_steps: int = 10

# Initialize configuration with DeepSeek model parameters
config = ExperimentConfig(
    model_name="deepseek-ai/deepseek-coder-6.7b-instruct",  # Using DeepSeek's 6.7B instruction-tuned model
    batch_size=1,                    # Small batch size due to model size
    learning_rate=5e-5,             # Conservative learning rate for fine-tuning
    num_epochs=3,                   # Number of training epochs
    max_seq_length=512,            # Maximum sequence length for input processing
    gradient_accumulation_steps=32, # Accumulate gradients to simulate larger batch size
    warmup_steps=100               # Warmup steps for learning rate scheduler
)

# Set up results directory for storing evaluation outputs
results_dir = Path("./results")
results_dir.mkdir(parents=True, exist_ok=True)  # Create directory if it doesn't exist

print("Configuration and directories initialized!")

Configuration and directories initialized!


In [3]:
"""
#################################
# Model Dependencies and Imports
#################################
This section installs required packages and imports essential libraries
for working with transformer-based language models.
"""

# Install core dependencies for transformer model handling and evaluation
!pip install transformers torch timeout-decorator

# Import required libraries
import torch  # PyTorch for deep learning operations
from transformers import (
    AutoTokenizer,         # For tokenization of input text
    AutoModelForCausalLM   # For loading pre-trained causal language models
)
from typing import List, Dict  # Type hints for better code documentation



In [4]:
"""
#################################
# Model Loading and Code Generation
#################################
This section implements core functionality for loading language models
and generating code from prompts with appropriate error handling and
memory optimization.
"""

def load_model_and_tokenizer(config: ExperimentConfig) -> tuple[AutoModelForCausalLM, AutoTokenizer]:
    """
    Loads and initializes the model and tokenizer with memory optimizations.

    Args:
        config (ExperimentConfig): Configuration object containing model parameters

    Returns:
        tuple: (model, tokenizer) initialized and ready for generation

    Raises:
        Exception: If model loading fails, with detailed error message
    """
    try:
        # Clear memory before loading new model to prevent OOM errors
        clear_memory()

        print(f"Loading {config.model_name}...")

        # Initialize tokenizer with remote code execution enabled
        tokenizer = AutoTokenizer.from_pretrained(
            config.model_name,
            trust_remote_code=True  # Required for custom tokenizer implementations
        )

        # Load model with memory-efficient settings
        model = AutoModelForCausalLM.from_pretrained(
            config.model_name,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,    # Use bfloat16 for memory efficiency
            device_map="auto",             # Optimize model placement across available devices
            low_cpu_mem_usage=True         # Minimize CPU memory during loading
        )

        # Enable gradient checkpointing if available
        if hasattr(model, "gradient_checkpointing_enable"):
            model.gradient_checkpointing_enable()  # Trade compute for memory savings

        print("Model loaded successfully!")
        get_memory_status()  # Display current memory usage

        return model, tokenizer

    except Exception as e:
        print(f"Error loading model: {str(e)}")
        raise

def generate_code(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    prompt: str,
    max_new_tokens: int = 512,
    temperature: float = 0.8,
    top_p: float = 0.95,
    top_k: int = 50
) -> str:
    """
    Generates code using the loaded model with specified generation parameters.

    Args:
        model: The loaded language model
        tokenizer: The model's tokenizer
        prompt: Input prompt for code generation
        max_new_tokens: Maximum number of tokens to generate
        temperature: Sampling temperature (higher = more creative)
        top_p: Nucleus sampling parameter
        top_k: Top-k sampling parameter

    Returns:
        str: Generated code or empty string if generation fails
    """
    try:
        # Format prompt as chat message
        messages = [{"role": "user", "content": prompt}]

        # Tokenize input with chat template
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        # Generate code with specified parameters
        outputs = model.generate(
            inputs,
            max_new_tokens=max_new_tokens,  # Control generation length
            do_sample=True,                 # Enable sampling-based generation
            temperature=temperature,         # Control randomness
            top_p=top_p,                    # Nucleus sampling threshold
            top_k=top_k,                    # Top-k sampling parameter
            num_return_sequences=1,         # Generate single sequence
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

        # Decode and return only the generated portion (excluding prompt)
        return tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)

    except Exception as e:
        print(f"Error in code generation: {str(e)}")
        return ""

# Initialize model and tokenizer using configuration
model, tokenizer = load_model_and_tokenizer(config)

# Test the generation pipeline with a simple prompt
test_prompt = "Write a quicksort algorithm in Python."
generated_code = generate_code(model, tokenizer, test_prompt)
print("\nGenerated Code:\n", generated_code)

Loading deepseek-ai/deepseek-coder-6.7b-instruct...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded successfully!
GPU Memory: Allocated: 12856.52MB, Reserved: 12858.00MB

Generated Code:
 Sure, here is a basic implementation of the quicksort algorithm in Python:

```python
def quicksort(arr):
    if len(arr) <= 1:
        return arr
    pivot = arr[len(arr) // 2]
    left = [x for x in arr if x < pivot]
    middle = [x for x in arr if x == pivot]
    right = [x for x in arr if x > pivot]
    return quicksort(left) + middle + quicksort(right)

print(quicksort([3,6,8,10,1,2,1]))
# Output: [1, 1, 2, 3, 6, 8, 10]
```

This program works by selecting a 'pivot' element from the array and partitioning the other elements into two sub-arrays, according to whether they are less than or greater than the pivot. The sub-arrays are then recursively sorted.



In [5]:
"""
#################################
# Code Generation Management System
#################################
This module implements a robust code generation system with retry mechanisms,
logging, and performance tracking capabilities.
"""

from datetime import datetime

class CodeGenerator:
    """
    A class to manage code generation with retry logic and generation history tracking.

    Attributes:
        model: The language model for code generation
        tokenizer: The model's tokenizer
        generation_history (list): History of all generation attempts
    """

    def __init__(self, model, tokenizer):
        """
        Initialize the code generator with a model and tokenizer.

        Args:
            model: The language model to use for generation
            tokenizer: The corresponding tokenizer
        """
        self.model = model
        self.tokenizer = tokenizer
        self.generation_history = []

    def generate_with_retry(self, prompt: str, max_attempts: int = 3) -> Dict:
        """
        Generate code with automatic retry mechanism and comprehensive logging.

        Args:
            prompt (str): The input prompt for code generation
            max_attempts (int): Maximum number of retry attempts

        Returns:
            Dict: Generation result containing:
                - prompt: Original input prompt
                - code: Generated code
                - attempt: Attempt number
                - generation_time: Time taken
                - timestamp: Generation timestamp

        Note:
            Temperature increases with each retry attempt to encourage diversity
        """
        for attempt in range(max_attempts):
            try:
                # Track generation time
                start_time = datetime.now()

                # Generate code with adaptive temperature
                generated_code = generate_code(
                    self.model,
                    self.tokenizer,
                    prompt,
                    temperature=0.8 if attempt > 0 else 0.6  # Higher temperature for retries
                )

                # Calculate generation duration
                end_time = datetime.now()
                generation_time = (end_time - start_time).total_seconds()

                # Create comprehensive result log
                result = {
                    "prompt": prompt,
                    "code": generated_code,
                    "attempt": attempt + 1,
                    "generation_time": generation_time,
                    "timestamp": end_time.isoformat()
                }

                # Update generation history
                self.generation_history.append(result)

                # Return successful generation
                if generated_code:
                    return result

            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {str(e)}")

        # Return error if all attempts fail
        return {"error": "All generation attempts failed"}

    def get_generation_stats(self) -> Dict:
        """
        Calculate and return statistics about code generation performance.

        Returns:
            Dict containing:
                - total_generations: Total number of generation attempts
                - average_generation_time: Average time per generation
                - successful_generations: Number of successful generations
        """
        if not self.generation_history:
            return {}

        total_generations = len(self.generation_history)
        avg_time = sum(g["generation_time"] for g in self.generation_history) / total_generations

        return {
            "total_generations": total_generations,
            "average_generation_time": avg_time,
            "successful_generations": sum(1 for g in self.generation_history if "code" in g)
        }

# Initialize the code generation system
code_generator = CodeGenerator(model, tokenizer)

# Test the generation system with a sample prompt
test_result = code_generator.generate_with_retry("Write a binary search function in Python.")
print("\nGeneration Result:", test_result)
print("\nGeneration Stats:", code_generator.get_generation_stats())


Generation Result: {'prompt': 'Write a binary search function in Python.', 'code': 'Sure, here is a simple implementation of a binary search function in Python:\n\n```python\ndef binary_search(arr, low, high, x):\n \n    if high >= low:\n \n        mid = (high + low) // 2\n \n        if arr[mid] == x:\n            return mid\n \n        elif arr[mid] > x:\n            return binary_search(arr, low, mid - 1, x)\n \n        else:\n            return binary_search(arr, mid + 1, high, x)\n \n    else:\n        return -1\n \n# Test array\narr = [2, 3, 4, 10, 40]\nx = 10\n \n# Function call\nresult = binary_search(arr, 0, len(arr)-1, x)\n \nif result != -1:\n    print("Element is present at index", str(result))\nelse:\n    print("Element is not present in array")\n```\n\nIn this function, `arr` is the list we\'re searching through, `low` and `high` are the indices of the first and last elements of the list, and `x` is the element we\'re searching for. The function returns the index of the e

In [6]:
"""
#################################
# SemCoder Model Setup
#################################
This section handles the installation and setup of the SemCoder model,
including Git LFS setup and repository cloning.
"""

# Clear GPU memory before new model setup
clear_memory()  # Ensure clean memory state for new model

# Install Git LFS and clone SemCoder repository
print("Installing Git LFS and cloning SemCoder...")
!git lfs install  # Initialize Git Large File Storage for model weights

# Clone SemCoder from HuggingFace repository
# Note: Using /content/SemCoder path for Google Colab compatibility
!git clone https://huggingface.co/semcoder/semcoder /content/SemCoder

# Verify successful repository cloning
import os
if os.path.exists('/content/SemCoder'):
    print("SemCoder repository cloned successfully!")
else:
    raise RuntimeError("Failed to clone SemCoder repository")  # Critical error if clone fails

Installing Git LFS and cloning SemCoder...
Git LFS initialized.
fatal: destination path '/content/SemCoder' already exists and is not an empty directory.
SemCoder repository cloned successfully!


In [7]:
"""
#################################
# SemCoder File Verification
#################################
This module verifies the integrity of the SemCoder installation by checking
for all required model files in the safetensors format.
"""

import os
from typing import List

def verify_semcoder_files() -> None:
    """
    Verifies the presence of all required SemCoder model files.

    Checks for:
        - Configuration files (config.json, tokenizer.json)
        - Model weight files in safetensors format
        - Model index file

    Raises:
        RuntimeError: If any required files are missing from the installation
    """
    # Define required files for model functionality
    required_files = [
        'config.json',           # Model configuration
        'tokenizer.json',        # Tokenizer configuration
        'model.safetensors.index.json',  # Model weights index
        # Sharded model weights in safetensors format
        'model-00001-of-00003.safetensors',
        'model-00002-of-00003.safetensors',
        'model-00003-of-00003.safetensors'
    ]
    missing_files: List[str] = []

    # Display current directory contents for debugging
    print("SemCoder directory contents:")
    files = os.listdir('/content/SemCoder')
    print("\n".join(files))

    # Check for missing files
    for file in required_files:
        if file not in files:
            missing_files.append(file)

    # Handle verification results
    if missing_files:
        raise RuntimeError(f"Missing required files: {', '.join(missing_files)}")
    else:
        print("\nAll required files present!")
        print("\nModel files verification successful!")

# Execute verification
verify_semcoder_files()

SemCoder directory contents:
trainer_state.json
tokenizer.json
special_tokens_map.json
model.safetensors.index.json
model-00002-of-00003.safetensors
model-00001-of-00003.safetensors
training_args.bin
tokenizer_config.json
model-00003-of-00003.safetensors
README.md
.git
generation_config.json
config.json
.gitattributes

All required files present!

Model files verification successful!


In [8]:
"""
#################################
# SemCoder Model Implementation
#################################
This module implements the SemCoder model class with memory-efficient loading
and code generation capabilities.
"""

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from typing import Optional

class SemCoderModel:
    """
    A class implementing the SemCoder model with optimized loading and generation.

    Attributes:
        model_path (str): Path to the local SemCoder model files
        model: The loaded language model (initialized in load())
        tokenizer: The model's tokenizer (initialized in load())
    """

    def __init__(self, model_path: str):
        """
        Initialize SemCoder model instance.

        Args:
            model_path (str): Path to the local model directory
        """
        self.model_path = model_path
        self.model: Optional[AutoModelForCausalLM] = None
        self.tokenizer: Optional[AutoTokenizer] = None

    def load(self) -> None:
        """
        Load the SemCoder model and tokenizer with memory optimizations.

        Implements:
            - Memory clearing before load
            - bfloat16 precision for efficiency
            - Automatic device mapping
            - Gradient checkpointing

        Raises:
            Exception: If model loading fails
        """
        try:
            # Ensure clean memory state
            clear_memory()

            # Load tokenizer first
            print("Loading SemCoder tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)

            # Load model with optimizations
            print("Loading SemCoder model...")
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_path,
                torch_dtype=torch.bfloat16,    # Use bfloat16 for memory efficiency
                device_map="auto",             # Automatic device placement
                low_cpu_mem_usage=True         # Minimize CPU memory usage
            )

            # Enable memory optimization
            if hasattr(self.model, "gradient_checkpointing_enable"):
                self.model.gradient_checkpointing_enable()

            print("Successfully loaded SemCoder!")
            get_memory_status()  # Display memory usage

        except Exception as e:
            print(f"Error loading SemCoder: {str(e)}")
            raise

    def generate_code(self, prompt: str, max_new_tokens: int = 512) -> str:
        """
        Generate code using the loaded SemCoder model.

        Args:
            prompt (str): Input prompt for code generation
            max_new_tokens (int): Maximum number of tokens to generate

        Returns:
            str: Generated code or empty string if generation fails

        Note:
            Uses sampling-based generation with temperature=0.7 and top_p=0.95
            for balanced creativity and coherence
        """
        try:
            # Tokenize input with proper device placement
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                padding=True,
                truncation=True
            ).to(self.model.device)

            # Generate with specified parameters
            outputs = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=max_new_tokens,
                do_sample=True,         # Enable sampling
                temperature=0.7,        # Control randomness
                top_p=0.95             # Nucleus sampling threshold
            )

            return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        except Exception as e:
            print(f"Error generating code: {str(e)}")
            return ""

# Initialize and load SemCoder model
semcoder = SemCoderModel("/content/SemCoder")
semcoder.load()

Loading SemCoder tokenizer...
Loading SemCoder model...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Successfully loaded SemCoder!
GPU Memory: Allocated: 25721.17MB, Reserved: 25734.00MB


In [9]:
"""
#################################
# SemCoder Generation Testing
#################################
This module implements a test function to verify SemCoder's code generation
capabilities using a Fibonacci sequence implementation as a test case.
"""

def test_semcoder_generation() -> None:
    """
    Tests SemCoder's code generation capabilities with a standard programming task.

    Test includes:
        1. Code generation for Fibonacci sequence
        2. Basic validation of generated code structure
        3. Memory usage monitoring

    The test uses the Fibonacci sequence as it requires:
        - Function definition
        - Loop or recursion
        - Return statement
        - Basic algorithm implementation

    Prints:
        - Input prompt
        - Generated code
        - Validation results
        - Memory status
    """
    # Define test prompt for Fibonacci sequence
    prompt = "Write a Python function to calculate the Fibonacci sequence."

    print("Testing SemCoder with Fibonacci sequence prompt...")
    print(f"Input prompt: {prompt}")

    try:
        # Generate code using SemCoder
        generated_code = semcoder.generate_code(prompt)

        # Display generation results
        print("\nGenerated Code:")
        print(generated_code)

        # Perform basic structural validation
        validation_checks = {
            "function_definition": "def" in generated_code,
            "return_statement": "return" in generated_code
        }

        if all(validation_checks.values()):
            print("\nCode generation appears successful!")
            print("✓ Found function definition")
            print("✓ Found return statement")
        else:
            print("\nWarning: Generated code might be incomplete!")
            print("Missing elements:")
            for check, passed in validation_checks.items():
                if not passed:
                    print(f"✗ Missing {check.replace('_', ' ')}")

        # Monitor memory usage after generation
        print("\nMemory status after generation:")
        get_memory_status()

    except Exception as e:
        print(f"Error in test generation: {str(e)}")
        print(f"Error type: {type(e).__name__}")

# Execute the test
print("Initiating SemCoder generation test...")
test_semcoder_generation()

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Initiating SemCoder generation test...
Testing SemCoder with Fibonacci sequence prompt...
Input prompt: Write a Python function to calculate the Fibonacci sequence.

Generated Code:
Write a Python function to calculate the Fibonacci sequence.

```python
def fibonacci(n):
    if n <= 0:
        return "Please enter a positive integer."
    elif n == 1:
        return [0]
    elif n == 2:
        return [0, 1]
    else:
        fib_sequence = [0, 1]
        for i in range(2, n):
            fib_sequence.append(fib_sequence[i - 1] + fib_sequence[i - 2])
        return fib_sequence

# Test the function with n = 10
n = 10
result = fibonacci(n)
print(result)
```

This solution defines a function `fibonacci(n)` that calculates the Fibonacci sequence up to the nth term. It handles cases where n is less than or equal to 0, n is 1, or n is 2, returning the appropriate sequence. For n greater than 2, it iterates to calculate the Fibonacci sequence up to the nth term. Finally, it tests the functio

In [10]:
"""
#################################
# Evaluation Framework Setup
#################################
This section installs required packages for implementing the model
evaluation framework, including dataset handling and progress tracking.
"""

# Install essential evaluation packages with version specifications
!pip install --upgrade pip  # Ensure pip is up to date
!pip install 'datasets>=3.1.0' 'tqdm>=4.66.0' 'fsspec==2024.10.0' --no-deps
!pip install 'gcsfs>=2024.10.0'  # Install after fsspec to ensure compatibility

"""
Package Details:
- datasets: HuggingFace's datasets library for efficient data handling
           Used for loading and managing evaluation benchmarks

- tqdm: Progress bar library for tracking long-running operations
        Used to monitor evaluation progress across multiple samples

- fsspec: Filesystem interface library
         Required by datasets and gcsfs

- gcsfs: Google Cloud Storage interface
         Requires specific fsspec version

Note:
- Version specifications are used to avoid dependency conflicts
- The --no-deps flag prevents unwanted dependency downgrades
- Packages are installed in order to maintain compatibility
"""

# Verify installations
import pkg_resources
print("\nInstalled versions:")
for package in ['datasets', 'tqdm', 'fsspec', 'gcsfs']:
    try:
        version = pkg_resources.get_distribution(package).version
        print(f"{package}: {version}")
    except pkg_resources.DistributionNotFound:
        print(f"{package}: Not found")

Collecting fsspec==2024.10.0
  Using cached fsspec-2024.10.0-py3-none-any.whl.metadata (11 kB)
Using cached fsspec-2024.10.0-py3-none-any.whl (179 kB)
Installing collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2024.9.0
    Uninstalling fsspec-2024.9.0:
      Successfully uninstalled fsspec-2024.9.0
Successfully installed fsspec-2024.10.0

Installed versions:
datasets: 3.1.0
tqdm: 4.66.6
fsspec: 2024.10.0
gcsfs: 2024.10.0


  import pkg_resources


In [11]:
"""
#################################
# Test Execution Framework
#################################
This module implements a robust test execution system for evaluating
generated code solutions against predefined test cases.
"""

# Import required libraries for code parsing and system operations
from typing import List
import ast
import sys

def run_tests(solution_code, test_code, namespace):
    """
    Executes and validates test cases against a generated solution.

    Args:
        solution_code: The code solution to be tested
        test_code: The test cases to run against the solution
        namespace: The execution environment for running tests

    Returns:
        bool: True if all tests pass, False otherwise
    """
    # Clean up input code by removing quotes and whitespace
    solution_code = solution_code.strip('"\'\n ')
    test_code = test_code.strip('"\'\n ')

    # Execute solution code in provided namespace
    try:
        exec(solution_code, namespace)
    except:
        print(f"Error occurred in solution code: {str(e)}")
        print(f"Error type: {type(e).__name__}")
        print(f"Solution code: {solution_code}")
        return False

    try:
        # Parse solution code to extract function name
        tree = ast.parse(solution_code)
        function_name = None
        for node in ast.walk(tree):
            if isinstance(node, ast.FunctionDef):
                function_name = node.name
                break

        if not function_name:
            raise ValueError("Could not find function definition in solution code")

        # Modify test code to collect results instead of using assertions
        modified_test_code = test_code.replace("def check(candidate):",
            f"def check(candidate):\n    global test_results\n    test_results = []")

        # Convert assertion statements to result collection
        test_lines = [line for line in test_code.split('\n') if line.strip().startswith('assert')]
        for i, line in enumerate(test_lines):
            modified_line = line.replace("assert ", "test_results.append((")
            modified_line = f"{modified_line}, {repr(line)}))"
            test_lines[i] = modified_line

        # Construct complete test execution code
        modified_test_code = "\n".join([
            "test_results = []",          # Initialize results list
            modified_test_code,           # Modified test function
            "\n".join(test_lines),        # Modified assertions
            f"check({function_name})"     # Execute tests
        ])

        # Execute modified test code
        exec(modified_test_code, namespace)
    except Exception as e:
        print(f"Error occurred for executing modified test code: {str(e)}")
        print(f"Error type: {type(e).__name__}")
        print(f"Modified test code: {modified_test_code}")
        return False

    # Process and display test results
    test_results = namespace.get('test_results', [])
    print(f"\nExecuting {len(test_results)} tests:\n")

    # Track test results and display each test outcome
    all_passed = True
    for i, (result, test_code) in enumerate(test_results, 1):
        if result:
            print(f"✓ Test {i} passed: {test_code}")
        else:
            print(f"✗ Test {i} failed: {test_code}")
            all_passed = False

    # Display test summary
    print(f"\nSummary: {sum(r[0] for r in test_results)}/{len(test_results)} tests passed")
    return all_passed

# Example usage demonstration
if __name__ == "__main__":
    # Initialize test environment with required imports
    setup_code = """from typing import List, Dict, Optional, Any, TypeVar, Tuple
import math
import string
import re

M = TypeVar('M')
"""
    namespace = {}
    exec(setup_code, namespace)

    # Example solution implementation
    solution_code = """def has_close_elements(numbers: List[float], threshold: float) -> bool:
    numbers.sort()
    for i in range(1, len(numbers)):
        if numbers[i] - numbers[i - 1] < threshold:
            return True
    return False"""

    # Example test cases
    test_code = '''METADATA = {
        'author': 'jt',
        'dataset': 'test'
}

def check(candidate):
    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True
    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False
    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True
    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False
    assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True
    assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True
    assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False'''

    # Execute test suite
    run_tests(solution_code, test_code, namespace)


Executing 7 tests:

✓ Test 1 passed:     assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True
✓ Test 2 passed:     assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False
✓ Test 3 passed:     assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True
✓ Test 4 passed:     assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False
✓ Test 5 passed:     assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True
✓ Test 6 passed:     assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True
✓ Test 7 passed:     assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False

Summary: 7/7 tests passed


In [12]:
"""
#################################
# Model Evaluation Framework
#################################
This module implements a comprehensive evaluation system for comparing
code generation models using the HumanEval dataset.
"""

# Import required libraries
from datasets import load_dataset
from typing import Dict, List, Any, TypeVar
import json
from tqdm import tqdm
import torch
import re

class ModelEvaluator:
    """
    A class for evaluating code generation models on the HumanEval benchmark.

    Attributes:
        human_eval: Loaded HumanEval dataset
        results: Dictionary storing evaluation results
        debug: Boolean controlling debug output level
    """

    def __init__(self):
        """Initialize evaluator with HumanEval dataset and empty results"""
        self.human_eval = load_dataset("openai_humaneval")
        self.results = {}
        self.debug = True  # Control debug output

    def format_prompt(self, prompt: str, model_type: str) -> str:
        """
        Format input prompt according to model-specific requirements.

        Args:
            prompt: Original task prompt
            model_type: Type of model ("deepseek" or "semcoder")

        Returns:
            Formatted prompt string
        """
        # Format for DeepSeek model
        if model_type == "deepseek":
            return (
                "Write a Python function that solves the following task. "
                "Provide ONLY the function implementation starting with 'def' and proper indentation. "
                "The function should be properly indented with 4 spaces. "
                "Do not include any explanations, comments, docstrings, type hints, or test code. "
                "Do not include any print statements or assertions. "
                "Only include the function definition and its implementation.\n\n"
                "Example format:\n"
                "def example_function(param1, param2):\n"
                "    result = param1 + param2\n"
                "    return result\n\n"
                "Your task:\n"
                f"{prompt}"
            )
        # Format for SemCoder model
        elif model_type == "semcoder":
            return (
                "# Task: Implement the following Python function\n"
                f"{prompt}\n"
                "# Provide only the function implementation with proper indentation.\n"
            )
        return prompt

    def clean_generated_code(self, code: str) -> str:
        """
        Clean and normalize generated code.

        Args:
            code: Raw generated code

        Returns:
            Cleaned and formatted code string
        """
        # Debug output of original code
        if self.debug:
            print("\nOriginal generated code:")
            print(code)

        # Normalize line endings and split into lines
        code = code.replace('\r\n', '\n')
        lines = code.splitlines()

        cleaned_lines = []
        target_function_found = False
        indent_level = 0
        INDENT = "    "
        has_seen_def = False

        # Process each line
        for line in lines:
            stripped = line.strip()
            if not stripped: continue

            function_def_found = stripped.startswith('def ')
            if function_def_found:
                if has_seen_def:
                    cleaned_lines = []
                    indent_level = 0
                else:
                    has_seen_def = True
            target_function_found = has_seen_def

            if not target_function_found: continue

            if function_def_found:
                # Clean function definition
                function_def = stripped
                # Remove return type hints
                function_def = re.sub(r'\s*->\s*(?:List|Dict|Tuple|Optional|Set|Union|Any|float|int|str|bool)\[?[^\]]*\]?\s*:', ':', function_def)

                # Clean parameter type hints
                parts = function_def.split('(', 1)
                if len(parts) == 2:
                    func_name, params_part = parts
                    params_and_rest = params_part.split(')', 1)
                    if len(params_and_rest) == 2:
                        params, rest = params_and_rest
                        param_list = params.split(',')
                        cleaned_params = []
                        for param in param_list:
                            cleaned_param = re.sub(r':\s*(?:List|Dict|Tuple|Optional|Set|Union|Any|float|int|str|bool)\[?[^\]]*\]?\s*(?=[,)])?', '', param.strip())
                            cleaned_params.append(cleaned_param)
                        function_def = f"{func_name}({', '.join(cleaned_params)}){rest}"

                # Normalize spacing
                function_def = re.sub(r'\s+:', ':', function_def)
                function_def = re.sub(r'\(\s+', '(', function_def)
                function_def = re.sub(r'\s+\)', ')', function_def)

                cleaned_lines.append(function_def)
                indent_level += 1
                continue

            # Filter out unwanted lines
            if any(skip in stripped for skip in ['print(', 'assert', 'if __name__']):
                continue

            cleaned_lines.append(line)

        # Join lines with Unix-style newlines
        cleaned_code = '\n'.join(cleaned_lines)

        if self.debug:
            print("\nCleaned code:")
            print(cleaned_code)
            print("\nCleaned code (repr):")
            print(repr(cleaned_code))

        return cleaned_code if target_function_found else ""

    def evaluate_single_solution(self, solution_code, test_cases, entry_point) -> Dict:
        """
        Evaluate a single generated solution against its test cases.

        Args:
            solution_code: Generated solution to evaluate
            test_cases: Test cases to run
            entry_point: Name of the function to test

        Returns:
            Dictionary containing evaluation metrics
        """
        print(test_cases)

        # Setup environment
        setup_code = """from typing import List, Dict, Optional, Any, TypeVar, Tuple
import math
import string
import re

M = TypeVar('M')
"""
        # Validate syntax
        try:
            compile(solution_code, '<string>', 'exec')
        except SyntaxError as e:
            if self.debug:
                print(f"Syntax error: {str(e)}")
                print(f"Generated code:\n{solution_code}")
            return {
                "pass@1": 0,
                "pass@10": 0,
                "pass@100": 0,
                "syntax_validity": 0,
                "execution_accuracy": 0
            }

        # Execute tests
        namespace = {}
        try:
            exec(setup_code, namespace)
        except Exception as e:
            if self.debug:
                print(f"Execution error for setup code: {str(e)}")
                print(f"Setup code:\n{setup_code}")
            execution_success = False

        execution_success = run_tests(solution_code, test_cases, namespace)
        return {
            "pass@1": int(execution_success),
            "pass@10": int(execution_success),
            "pass@100": int(execution_success),
            "syntax_validity": 1,
            "execution_accuracy": int(execution_success)
        }

    def evaluate_model(self, model, tokenizer, model_type: str, num_samples: int = None):
        """
        Evaluate model performance on HumanEval dataset.

        Args:
            model: The model to evaluate
            tokenizer: Model's tokenizer
            model_type: Type of model ("deepseek" or "semcoder")
            num_samples: Number of samples to evaluate (None for all)

        Returns:
            Dictionary containing aggregated evaluation results
        """
        results = {
            "pass@1": 0,
            "pass@10": 0,
            "pass@100": 0,
            "syntax_validity": 0,
            "execution_accuracy": 0
        }

        total_samples = len(self.human_eval["test"]) if num_samples is None else num_samples

        # Process each task
        for idx in tqdm(range(total_samples)):
            task = self.human_eval["test"][idx]
            formatted_prompt = self.format_prompt(task["prompt"], model_type)

            if self.debug:
                print(f"\n\nProcessing task {idx + 1}/{total_samples}")
                print("Prompt:")
                print(formatted_prompt)

            try:
                # Generate code based on model type
                if model_type == "deepseek":
                    messages = [{"role": "user", "content": formatted_prompt}]
                    inputs = tokenizer.apply_chat_template(
                        messages,
                        return_tensors="pt",
                        padding=True
                    ).to(model.device)

                    attention_mask = torch.ones_like(inputs)

                    outputs = model.generate(
                        inputs,
                        attention_mask=attention_mask,
                        max_new_tokens=512,
                        do_sample=True,
                        temperature=0.7,
                        top_p=0.95,
                        pad_token_id=tokenizer.eos_token_id
                    )
                    generated_code = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)

                else:  # semcoder
                    inputs = tokenizer(
                        formatted_prompt,
                        return_tensors="pt",
                        padding=True,
                        truncation=True,
                        max_length=512
                    ).to(model.device)

                    outputs = model.generate(
                        input_ids=inputs["input_ids"],
                        attention_mask=inputs["attention_mask"],
                        max_new_tokens=512,
                        do_sample=True,
                        temperature=0.7,
                        top_p=0.95,
                        pad_token_id=tokenizer.eos_token_id
                    )
                    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)

                # Process and evaluate generated code
                cleaned_code = self.clean_generated_code(generated_code)
                if cleaned_code:
                    evaluation = self.evaluate_single_solution(
                        cleaned_code,
                        task["test"],
                        task["entry_point"]
                    )

                    if self.debug:
                        print("\nEvaluation results:")
                        for metric, value in evaluation.items():
                            print(f"{metric}: {value}")

                    # Update metrics
                    for metric in results:
                        results[metric] += evaluation[metric]

            except Exception as e:
                if self.debug:
                    print(f"Error processing sample {idx}: {str(e)}")
                continue

        # Calculate final averages
        for metric in results:
            results[metric] /= total_samples

        return results

# Initialize the evaluator
evaluator = ModelEvaluator()

In [13]:
"""
#################################
# DeepSeek Model Evaluation
#################################
This section evaluates the DeepSeek base model's performance on the
HumanEval benchmark using the previously defined evaluation framework.
"""

# Begin DeepSeek model evaluation
print("Evaluating DeepSeek base model...")

# Run evaluation with limited sample size for initial testing
# num_samples=10 provides a quick assessment of model performance
deepseek_results = evaluator.evaluate_model(
    model=model,              # Previously loaded DeepSeek model
    tokenizer=tokenizer,      # DeepSeek tokenizer
    model_type="deepseek",    # Specify model type for proper prompt formatting
    num_samples=10           # Number of test cases to evaluate
)

# Display evaluation results
print("\nDeepSeek Base Results:")
print(json.dumps(deepseek_results, indent=2))  # Pretty print results in JSON format

"""
Results include:
- pass@1: Single-attempt success rate
- pass@10: Success rate within 10 attempts
- pass@100: Success rate within 100 attempts
- syntax_validity: Proportion of syntactically valid generations
- execution_accuracy: Proportion of functionally correct solutions
"""

Evaluating DeepSeek base model...


  0%|          | 0/10 [00:00<?, ?it/s]



Processing task 1/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and proper indentation. The function should be properly indented with 4 spaces. Do not include any explanations, comments, docstrings, type hints, or test code. Do not include any print statements or assertions. Only include the function definition and its implementation.

Example format:
def example_function(param1, param2):
    result = param1 + param2
    return result

Your task:
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """



 10%|█         | 1/10 [00:01<00:17,  1.92s/it]


Original generated code:
    numbers.sort()
    for i in range(1, len(numbers)):
        if numbers[i] - numbers[i - 1] < threshold:
            return True
    return False


Cleaned code:


Cleaned code (repr):
''


Processing task 2/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and proper indentation. The function should be properly indented with 4 spaces. Do not include any explanations, comments, docstrings, type hints, or test code. Do not include any print statements or assertions. Only include the function definition and its implementation.

Example format:
def example_function(param1, param2):
    result = param1 + param2
    return result

Your task:
from typing import List


def separate_paren_groups(paren_string: str) -> List[str]:
    """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to
    separate those group into separate strings and ret

 20%|██        | 2/10 [00:20<01:34, 11.83s/it]


Original generated code:

    # Write your code here
    # IMPORTANT: Don't include the following lines in your code
    # assert False, "Not implemented"
    # return None

























































































































































































































































































































































































































































































Cleaned code:


Cleaned code (repr):
''


Processing task 3/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and proper indentation. The function should be properly indented with 4 spaces. Do not include any explanations, comments, docstrings, type hints, or test code. Do n

 30%|███       | 3/10 [00:33<01:25, 12.28s/it]


Original generated code:
    integer_part = int(number)
    decimal_part = number - integer_part

    return decimal_part


def round_number(number: float, ndigits: int) -> float:
    """ Given a positive floating point number and a number of decimal places,
    it can be decomposed into the integer part and decimals.

    The decimals are multiplied by 10 to the power of the number of decimal places,
    then the rounded result is divided by 10 to the power of the number of decimal places.

    Return the rounded number.
    >>> round_number(3.5714, 2)
    3.57
    """

    factor = 10 ** ndigits
    rounded_number = round(number * factor) / factor

    return rounded_number


def truncate_and_round(number: float, ndigits: int) -> float:
    """ Given a positive floating point number and a number of decimal places,
    it first trims off the integer part of the number and obtains the decimal part,
    then it rounds the decimal part to the specified number of decimal places.

    Ret

 40%|████      | 4/10 [00:34<00:48,  8.02s/it]


Original generated code:
    # Your code goes here
    balance = 0
    for operation in operations:
        balance += operation
        if balance < 0:
            return True
    return False



Cleaned code:


Cleaned code (repr):
''


Processing task 5/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and proper indentation. The function should be properly indented with 4 spaces. Do not include any explanations, comments, docstrings, type hints, or test code. Do not include any print statements or assertions. Only include the function definition and its implementation.

Example format:
def example_function(param1, param2):
    result = param1 + param2
    return result

Your task:
from typing import List


def mean_absolute_deviation(numbers: List[float]) -> float:
    """ For a given list of input numbers, calculate Mean Absolute Deviation
    around the mean of this dataset.
    Mean Absolute Deviation

 50%|█████     | 5/10 [00:37<00:31,  6.20s/it]


Original generated code:
    # Calculate the mean
    mean = sum(numbers) / len(numbers)

    # Calculate the absolute differences from the mean
    differences = [abs(num - mean) for num in numbers]

    # Calculate the average of these absolute differences
    mad = sum(differences) / len(differences)

    return mad


Cleaned code:


Cleaned code (repr):
''


Processing task 6/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and proper indentation. The function should be properly indented with 4 spaces. Do not include any explanations, comments, docstrings, type hints, or test code. Do not include any print statements or assertions. Only include the function definition and its implementation.

Example format:
def example_function(param1, param2):
    result = param1 + param2
    return result

Your task:
from typing import List


def intersperse(numbers: List[int], delimeter: int) -> List[int]:
    """ I

 60%|██████    | 6/10 [00:40<00:19,  4.79s/it]


Original generated code:
    result = []
    for i in range(len(numbers)):
        result.append(numbers[i])
        if i < len(numbers) - 1:
            result.append(delimeter)
    return result


Cleaned code:


Cleaned code (repr):
''


Processing task 7/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and proper indentation. The function should be properly indented with 4 spaces. Do not include any explanations, comments, docstrings, type hints, or test code. Do not include any print statements or assertions. Only include the function definition and its implementation.

Example format:
def example_function(param1, param2):
    result = param1 + param2
    return result

Your task:
from typing import List


def parse_nested_parens(paren_string: str) -> List[int]:
    """ Input to this function is a string represented multiple groups for nested parentheses separated by spaces.
    For each of the group, 

 70%|███████   | 7/10 [00:43<00:13,  4.35s/it]


Original generated code:
    result = []
    for group in paren_string.split(' '):
        count = 0
        max_count = 0
        for char in group:
            if char == '(':
                count += 1
                if count > max_count:
                    max_count = count
            elif char == ')':
                count -= 1
        result.append(max_count)
    return result


Cleaned code:


Cleaned code (repr):
''


Processing task 8/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and proper indentation. The function should be properly indented with 4 spaces. Do not include any explanations, comments, docstrings, type hints, or test code. Do not include any print statements or assertions. Only include the function definition and its implementation.

Example format:
def example_function(param1, param2):
    result = param1 + param2
    return result

Your task:
from typing import List


def fil

 80%|████████  | 8/10 [01:01<00:17,  8.85s/it]


Original generated code:

def filter_by_length(strings: List[str], length: int) -> List[str]:
    """ Filter an input list of strings only for ones that are of given length
    >>> filter_by_length([], 5)
    []
    >>> filter_by_length(['abc', 'bacd', 'cde', 'array'], 4)
    ['abcd', 'cde']
    """


def filter_by_vowels(strings: List[str], vowels: int) -> List[str]:
    """ Filter an input list of strings only for ones that contain given number of vowels
    >>> filter_by_vowels([], 2)
    []
    >>> filter_by_vowels(['abc', 'bacd', 'cde', 'array'], 2)
    ['abc', 'bacd', 'array']
    """


def filter_by_consonants(strings: List[str], consonants: int) -> List[str]:
    """ Filter an input list of strings only for ones that contain given number of consonants
    >>> filter_by_consonants([], 3)
    []
    >>> filter_by_consonants(['abc', 'bacd', 'cde', 'array'], 3)
    ['bacd', 'cde', 'array']
    """


def filter_by_special_chars(strings: List[str], special_chars: int) -> List[str]:


 90%|█████████ | 9/10 [01:03<00:06,  6.56s/it]


Original generated code:
    # Your code here
    sum = 0
    product = 1
    for num in numbers:
        sum += num
        product *= num
    return (sum, product)


Cleaned code:


Cleaned code (repr):
''


Processing task 10/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and proper indentation. The function should be properly indented with 4 spaces. Do not include any explanations, comments, docstrings, type hints, or test code. Do not include any print statements or assertions. Only include the function definition and its implementation.

Example format:
def example_function(param1, param2):
    result = param1 + param2
    return result

Your task:
from typing import List, Tuple


def rolling_max(numbers: List[int]) -> List[int]:
    """ From a given list of integers, generate a list of rolling maximum element found until given moment
    in the sequence.
    >>> rolling_max([1, 2, 3, 2, 3, 4, 2])
 


sys.settrace() should not be used when the debugger is being used.
This may cause the debugger to stop working correctly.
If this is needed, please check: 
http://pydev.blogspot.com/2007/06/why-cant-pydev-debugger-work-with.html
to see how to restore the debug tracing back correctly.
Call Location:
  File "/usr/lib/python3.10/doctest.py", line 1501, in run
    sys.settrace(save_trace)

100%|██████████| 10/10 [01:19<00:00,  7.91s/it]


Original generated code:
    # Your code here
    pass


def rolling_min(numbers: List[int]) -> List[int]:
    """ From a given list of integers, generate a list of rolling minimum element found until given moment
    in the sequence.
    >>> rolling_min([1, 2, 3, 2, 3, 4, 2])
    [1, 1, 1, 1, 1, 1, 1]
    """

    # Your code here
    pass


def min_max_difference(numbers: List[int]) -> List[int]:
    """ From a given list of integers, generate a list of differences between rolling minimum and rolling maximum 
    found until given moment in the sequence.
    >>> min_max_difference([1, 2, 3, 2, 3, 4, 2])
    [1, 1, 2, 2, 3, 4, 4]
    """

    # Your code here
    pass


def mean_and_difference(numbers: List[int]) -> Tuple[List[float], List[int]]:
    """ From a given list of integers, generate a list of means and differences between rolling mean and rolling 
    maximum/minimum found until given moment in the sequence.
    >>> mean_and_difference([1, 2, 3, 2, 3, 4, 2])
    ([1.0, 1.5




'\nResults include:\n- pass@1: Single-attempt success rate\n- pass@10: Success rate within 10 attempts\n- pass@100: Success rate within 100 attempts\n- syntax_validity: Proportion of syntactically valid generations\n- execution_accuracy: Proportion of functionally correct solutions\n'

In [14]:
"""
#################################
# SemCoder Model Evaluation
#################################
This section evaluates the SemCoder model's performance on the same
HumanEval benchmark for direct comparison with DeepSeek results.
"""

# Begin SemCoder evaluation
print("Evaluating SemCoder...")

# Run evaluation using identical parameters as DeepSeek for fair comparison
semcoder_results = evaluator.evaluate_model(
    model=semcoder.model,        # Previously loaded SemCoder model
    tokenizer=semcoder.tokenizer, # SemCoder tokenizer
    model_type="semcoder",       # Specify model type for appropriate prompt formatting
    num_samples=10              # Match DeepSeek sample size for direct comparison
)

# Display evaluation results
print("\nSemCoder Results:")
print(json.dumps(semcoder_results, indent=2))  # Pretty print results in JSON format

"""
Note: Results use same metrics as DeepSeek evaluation:
- pass@1: Single-attempt success rate
- pass@10: Success rate within 10 attempts
- pass@100: Success rate within 100 attempts
- syntax_validity: Proportion of syntactically valid generations
- execution_accuracy: Proportion of functionally correct solutions

These results can be directly compared with DeepSeek results
to assess relative model performance.
"""

Evaluating SemCoder...


  0%|          | 0/10 [00:00<?, ?it/s]



Processing task 1/10
Prompt:
# Task: Implement the following Python function
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """

# Provide only the function implementation with proper indentation.



 10%|█         | 1/10 [00:03<00:33,  3.76s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """

# Provide only the function implementation with proper indentation.
# To solve this problem, we can iterate through the list of numbers and check if the absolute difference between any two numbers is less than the given threshold.

    for i in range(len(numbers)):
        for j in range(i + 1, len(numbers)):
            if abs(numbers[i] - numbers[j]) < threshold:
                return True
    return False

# Test cases are provided in the docstring of the function.

Cleaned code:
def has_close_elements(numbers, threshold):
    """ Check if in given list of numbers, a

 20%|██        | 2/10 [00:10<00:44,  5.62s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List


def separate_paren_groups(paren_string: str) -> List[str]:
    """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to
    separate those group into separate strings and return the list of those.
    Separate groups are balanced (each open brace is properly closed) and not nested within each other
    Ignore any spaces in the input string.
    >>> separate_paren_groups('( ) (( )) (( )( ))')
    ['()', '(())', '(()())']
    """

# Provide only the function implementation with proper indentation.
    result = []
    current_group = ''
    stack = []

    for char in paren_string:
        if char == '(':
            if current_group:
                result.append(current_group)
                current_group = ''
            stack.append('(')
        elif char == ')':
            if stack and stack[-1] == '(':
                stack.pop()
     

 30%|███       | 3/10 [00:16<00:38,  5.54s/it]


Original generated code:
# Task: Implement the following Python function


def truncate_number(number: float) -> float:
    """ Given a positive floating point number, it can be decomposed into
    and integer part (largest integer smaller than given number) and decimals
    (leftover part always smaller than 1).

    Return the decimal part of the number.
    >>> truncate_number(3.5)
    0.5
    """

# Provide only the function implementation with proper indentation.
# The provided solution is incorrect as it does not handle the case where the number is negative.

def truncate_number(number: float) -> float:
    if number < 0:
        return 0  # Return 0 if the number is negative
    return number % 1  # Calculate the decimal part of the number

# Test the function with the given examples
print(truncate_number(3.5))  # Output: 0.5
print(truncate_number(-2.7))  # Output: 0
print(truncate_number(10.999))  # Output: 0.999


Cleaned code:
def truncate_number(number):
    if number < 0:


 40%|████      | 4/10 [00:18<00:26,  4.35s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List


def below_zero(operations: List[int]) -> bool:
    """ You're given a list of deposit and withdrawal operations on a bank account that starts with
    zero balance. Your task is to detect if at any point the balance of account fallls below zero, and
    at that point function should return True. Otherwise it should return False.
    >>> below_zero([1, 2, 3])
    False
    >>> below_zero([1, 2, -4, 5])
    True
    """

# Provide only the function implementation with proper indentation.
    balance = 0
    for operation in operations:
        balance += operation
        if balance < 0:
            return True
    return False

# Run the doctests to validate the implementation.
if __name__ == "__main__":
    import doctest
    doctest.testmod()

Cleaned code:
def below_zero(operations):
    """ You're given a list of deposit and withdrawal operations on a bank account that starts with
   

 50%|█████     | 5/10 [00:26<00:28,  5.65s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List


def mean_absolute_deviation(numbers: List[float]) -> float:
    """ For a given list of input numbers, calculate Mean Absolute Deviation
    around the mean of this dataset.
    Mean Absolute Deviation is the average absolute difference between each
    element and a centerpoint (mean in this case):
    MAD = average | x - x_mean |
    >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])
    1.0
    """

# Provide only the function implementation with proper indentation.
# The provided solution has the function definition and docstring, but not the implementation.
# To solve this problem, we need to calculate the mean of the input list and then compute the Mean Absolute Deviation.

    if not numbers:
        return 0.0

    mean_val = sum(numbers) / len(numbers)
    mad_val = sum(abs(num - mean_val) for num in numbers) / len(numbers)

    return mad_val

# Test the function with the exampl

 60%|██████    | 6/10 [00:30<00:20,  5.22s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List


def intersperse(numbers: List[int], delimeter: int) -> List[int]:
    """ Insert a number 'delimeter' between every two consecutive elements of input list `numbers'
    >>> intersperse([], 4)
    []
    >>> intersperse([1, 2, 3], 4)
    [1, 4, 2, 4, 3]
    """

# Provide only the function implementation with proper indentation.
    result = []
    for i in range(len(numbers) - 1):
        result.extend([numbers[i], delimeter])
    result.append(numbers[-1])
    return result


# Test cases
import doctest
doctest.testmod()

# The `intersperse` function takes a list of integers `numbers` and an integer `delimeter` as input. It inserts the `delimeter` between every two consecutive elements of the input list and returns the modified list.

Cleaned code:
def intersperse(numbers, delimeter):
    """ Insert a number 'delimeter' between every two consecutive elements of input list `numbers'
    

 70%|███████   | 7/10 [00:37<00:16,  5.51s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List


def parse_nested_parens(paren_string: str) -> List[int]:
    """ Input to this function is a string represented multiple groups for nested parentheses separated by spaces.
    For each of the group, output the deepest level of nesting of parentheses.
    E.g. (()()) has maximum two levels of nesting while ((())) has three.

    >>> parse_nested_parens('(()()) ((())) () ((())()())')
    [2, 3, 1, 3]
    """

# Provide only the function implementation with proper indentation.
    def max_nesting_level(paren_group: str) -> int:
        max_level = 0
        current_level = 0

        for char in paren_group:
            if char == '(':
                current_level += 1
                max_level = max(max_level, current_level)
            elif char == ')':
                current_level -= 1

        return max_level

    groups = paren_string.split()
    return [max_nesting_level(group) for

 80%|████████  | 8/10 [00:39<00:08,  4.38s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List


def filter_by_substring(strings: List[str], substring: str) -> List[str]:
    """ Filter an input list of strings only for ones that contain given substring
    >>> filter_by_substring([], 'a')
    []
    >>> filter_by_substring(['abc', 'bacd', 'cde', 'array'], 'a')
    ['abc', 'bacd', 'array']
    """

# Provide only the function implementation with proper indentation.
    return [s for s in strings if substring in s]

# Run the provided test cases to verify the correctness of the function.
# The function filters the input list of strings based on whether the given substring is present in each string.

Cleaned code:
def filter_by_substring(strings, substring):
    """ Filter an input list of strings only for ones that contain given substring
    >>> filter_by_substring([], 'a')
    []
    >>> filter_by_substring(['abc', 'bacd', 'cde', 'array'], 'a')
    ['abc', 'bacd', 'array']
    """


 90%|█████████ | 9/10 [00:43<00:04,  4.54s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List, Tuple


def sum_product(numbers: List[int]) -> Tuple[int, int]:
    """ For a given list of integers, return a tuple consisting of a sum and a product of all the integers in a list.
    Empty sum should be equal to 0 and empty product should be equal to 1.
    >>> sum_product([])
    (0, 1)
    >>> sum_product([1, 2, 3, 4])
    (10, 24)
    """

# Provide only the function implementation with proper indentation.
# To solve this problem, we will iterate through the list of numbers and calculate the sum and product of all the integers.

    sum_result = 0
    product_result = 1

    for num in numbers:
        sum_result += num
        product_result *= num

    return sum_result, product_result

# Test the function with the provided test cases
print(sum_product([]))  # Output: (0, 1)
print(sum_product([1, 2, 3, 4]))  # Output: (10, 24)

Cleaned code:
def sum_product(numbers):
    """ For a

100%|██████████| 10/10 [00:51<00:00,  5.17s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List, Tuple


def rolling_max(numbers: List[int]) -> List[int]:
    """ From a given list of integers, generate a list of rolling maximum element found until given moment
    in the sequence.
    >>> rolling_max([1, 2, 3, 2, 3, 4, 2])
    [1, 2, 3, 3, 3, 4, 4]
    """

# Provide only the function implementation with proper indentation.
# To solve this problem, we will iterate through the input list of numbers and keep track of the maximum element found so far.
# At each step, we will compare the current number with the maximum element found so far and update the maximum accordingly.
# We will store these maximum elements in a new list and return it as the final result.

    rolling_max_list = []
    max_element = float('-inf')

    for num in numbers:
        max_element = max(max_element, num)
        rolling_max_list.append(max_element)

    return rolling_max_list

# Test the function with t




'\nNote: Results use same metrics as DeepSeek evaluation:\n- pass@1: Single-attempt success rate\n- pass@10: Success rate within 10 attempts\n- pass@100: Success rate within 100 attempts\n- syntax_validity: Proportion of syntactically valid generations\n- execution_accuracy: Proportion of functionally correct solutions\n\nThese results can be directly compared with DeepSeek results\nto assess relative model performance.\n'

In [31]:
"""
#################################
# Extended Evaluation Setup
#################################
This section prepares for comprehensive model evaluation using both
HumanEval and HumanEval+ benchmarks.
"""

# Install HuggingFace datasets library for benchmark access
!pip install datasets  # Required for loading HumanEval and HumanEval+ datasets

"""
Package Details:
- datasets: HuggingFace's datasets library
           Provides access to:
           - Original HumanEval benchmark (164 programming tasks)
           - HumanEval+ extended benchmark

Note: HumanEval+ extends the original benchmark with:
      - Additional test cases
      - Edge cases
      - Complex inputs
      - Performance considerations
"""

Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Using cached fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Using cached fsspec-2024.9.0-py3-none-any.whl (179 kB)
Installing collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2024.10.0
    Uninstalling fsspec-2024.10.0:
      Successfully uninstalled fsspec-2024.10.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.[0m[31m
[0mSuccessfully installed fsspec-2024.9.0


"\nPackage Details:\n- datasets: HuggingFace's datasets library\n           Provides access to:\n           - Original HumanEval benchmark (164 programming tasks)\n           - HumanEval+ extended benchmark\n           \nNote: HumanEval+ extends the original benchmark with:\n      - Additional test cases\n      - Edge cases\n      - Complex inputs\n      - Performance considerations\n"

In [33]:
"""
#################################
# Benchmark Dataset Loading and Testing
#################################
This section loads the HumanEval dataset and performs an initial
test generation using the first benchmark problem.
"""

from datasets import load_dataset

def generate_code_with_semcoder(prompt: str) -> str:
    """
    Generate code using the SemCoder model.

    Args:
        prompt (str): The programming task prompt

    Returns:
        str: Generated code solution
    """
    # Format prompt for SemCoder
    formatted_prompt = (
        "# Task: Implement the following Python function\n"
        f"{prompt}\n"
        "# Provide only the function implementation with proper indentation.\n"
    )

    # Generate code using previously loaded SemCoder model
    return semcoder.generate_code(formatted_prompt)

# Load the complete HumanEval benchmark
human_eval = load_dataset("openai_humaneval")  # Contains 164 Python programming tasks

# Extract first task for initial testing
task = human_eval["test"][0]  # Index 0 contains first benchmark problem
prompt = task["prompt"]       # Extract problem description

# Display task details for verification
print("HumanEval Prompt:\n", prompt)  # Show problem description
print("Expected Solution:\n", task["canonical_solution"])  # Show reference solution

# Test code generation with SemCoder
generated_code = generate_code_with_semcoder(prompt)  # Generate solution using SemCoder
print("Generated Code:\n", generated_code)  # Display generated solution

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


HumanEval Prompt:
 from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """

Expected Solution:
     for idx, elem in enumerate(numbers):
        for idx2, elem2 in enumerate(numbers):
            if idx != idx2:
                distance = abs(elem - elem2)
                if distance < threshold:
                    return True

    return False

Generated Code:
 # Task: Implement the following Python function
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements(

'\nNote: This initial test:\n1. Verifies dataset loading\n2. Confirms prompt extraction\n3. Tests code generation pipeline\n4. Allows comparison between:\n   - Problem description\n   - Expected solution\n   - Generated solution\n'

In [34]:
"""
#################################
# Solution Comparison
#################################
This section performs a basic comparison between the generated
solution and the canonical solution from the benchmark.
"""

# Compare generated code with canonical solution
# Note: Strips whitespace for more accurate comparison
if generated_code.strip() == task["canonical_solution"].strip():
    print("The generated code matches the expected solution!")
else:
    print("The generated code does not match the expected solution.")

The generated code does not match the expected solution.


'\nNote: This is a strict comparison that:\n1. Only checks for exact matches\n2. Ignores whitespace differences\n3. Does not account for:\n   - Functionally equivalent but differently written solutions\n   - Alternative algorithmic approaches\n   - Different variable names\n   - Different formatting styles\n   \nFor more comprehensive evaluation, functional testing\n(running test cases) provides better insight into solution\ncorrectness.\n'

In [39]:
"""
#################################
# Doctest Validation
#################################
This section executes the generated code and runs its doctests
to verify functionality against the provided examples.
"""

# First attempt to execute the generated code
try:
    exec(generated_code)  # Load the generated function into namespace
except Exception as e:
    print(f"Error in executing generated code: {e}")
    print("Generated code that failed:")
    print(generated_code)

# If code execution succeeded, run doctests
try:
    import doctest
    doctest.testmod()  # Run all doctests in the current namespace
except Exception as e:
    print(f"Error running doctests: {e}")
    print("Doctest execution failed. This might indicate:")
    print("- Syntax errors in the docstring examples")
    print("- Mismatched output formatting")
    print("- Function behavior different from examples")

In [40]:
"""
#################################
# Custom Test Suite Execution
#################################
This section implements and runs a custom test suite to validate
the generated function against specific test cases.
"""

# Define comprehensive test cases
test_cases = [
    ([1.0, 2.0, 3.0], 0.5, False),          # Basic case with no close elements
    ([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3, True)  # Case with close elements
]

def run_tests(func):
    """
    Execute test cases against the generated function.

    Args:
        func: The generated function to test

    Raises:
        AssertionError: If any test case fails

    Note: Each test case contains:
        - Input list of numbers
        - Threshold value
        - Expected boolean result
    """
    for numbers, threshold, expected in test_cases:
        result = func(numbers, threshold)
        assert result == expected, f"Test failed: {numbers}, {threshold} -> {result}"

# Execute tests on generated function
try:
    # Load the generated function into current namespace
    exec(generated_code)

    # Run test suite against the loaded function
    run_tests(has_close_elements)
    print("All tests passed successfully!")

except Exception as e:
    print(f"Test failed: {e}")
    print("\nDetails:")
    print(f"- Error type: {type(e).__name__}")
    print(f"- Generated code being tested:")
    print(generated_code)

All tests passed successfully!


In [41]:
"""
#################################
# Multi-Task Evaluation Loop
#################################
This section implements a loop to test SemCoder's performance
across multiple HumanEval tasks, providing a broader assessment
of model capabilities.
"""

# Evaluate first 5 tasks from HumanEval
for i in range(5):  # Limited sample for initial testing
    # Extract task details
    task = human_eval["test"][i]
    prompt = task["prompt"]

    # Display task information
    print(f"\nTask {i + 1} Prompt:\n{prompt}")

    # Generate solution using SemCoder
    generated_code = generate_code_with_semcoder(prompt)
    print("Generated Code:\n", generated_code)

    # Test generated solution
    try:
        # Load generated function into namespace
        exec(generated_code)

        # Execute test cases
        # Note: Currently configured for has_close_elements
        # TODO: Modify test runner for each specific function
        run_tests(has_close_elements)

        print(f"Task {i + 1}: All tests passed successfully!\n")

    except Exception as e:
        print(f"Task {i + 1}: Test failed - {e}")
        print(f"Error type: {type(e).__name__}")
        print("Generated code that failed:")
        print(generated_code)
        print()

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Task 1 Prompt:
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Code:
 # Task: Implement the following Python function
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """

# Provide only the function implementation with proper indentation.
    sorted_numbers = sorted(numbers)
    for i in range(len(sorted_numbers) - 1):
        if sorted_numbers[i + 1] - sorted_numbers[i] < threshold:
            return True
    return False

# Run doctest to validate the function
if __name__ == "__main__":
    import doctest
    doctest.testmod()

# Task: Ensure that the function works correctly with the provided test cases.
Task 1: All tests passed successfully!


Task 2 Prompt:
from typing import List


def separate_paren_groups(paren_string: str) -> List[str]:


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Code:
 # Task: Implement the following Python function
from typing import List


def separate_paren_groups(paren_string: str) -> List[str]:
    """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to
    separate those group into separate strings and return the list of those.
    Separate groups are balanced (each open brace is properly closed) and not nested within each other
    Ignore any spaces in the input string.
    >>> separate_paren_groups('( ) (( )) (( )( ))')
    ['()', '(())', '(()())']
    """

# Provide only the function implementation with proper indentation.
    groups = []
    start_index = None
    open_count = 0

    for i, char in enumerate(paren_string):
        if char == '(':
            if open_count == 0:
                start_index = i
            open_count += 1
        elif char == ')':
            open_count -= 1
            if open_count == 0:
                groups.append(paren_string[start_index:

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Code:
 # Task: Implement the following Python function


def truncate_number(number: float) -> float:
    """ Given a positive floating point number, it can be decomposed into
    and integer part (largest integer smaller than given number) and decimals
    (leftover part always smaller than 1).

    Return the decimal part of the number.
    >>> truncate_number(3.5)
    0.5
    """

# Provide only the function implementation with proper indentation.
# The provided solution is incorrect as it does not handle the case when the number is negative.

def truncate_number(number: float) -> float:
    """ Given a positive floating point number, it can be decomposed into
    and integer part (largest integer smaller than given number) and decimals
    (leftover part always smaller than 1).

    Return the decimal part of the number.
    >>> truncate_number(3.5)
    0.5
    """

    if number < 0:
        raise ValueError("Input number must be positive")

    return number % 1
Task 3:

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Code:
 # Task: Implement the following Python function
from typing import List


def below_zero(operations: List[int]) -> bool:
    """ You're given a list of deposit and withdrawal operations on a bank account that starts with
    zero balance. Your task is to detect if at any point the balance of account fallls below zero, and
    at that point function should return True. Otherwise it should return False.
    >>> below_zero([1, 2, 3])
    False
    >>> below_zero([1, 2, -4, 5])
    True
    """

# Provide only the function implementation with proper indentation.
    balance = 0
    for operation in operations:
        balance += operation
        if balance < 0:
            return True
    return False

# Run the doctest to check the function
if __name__ == "__main__":
    import doctest
    doctest.testmod()
**********************************************************************
File "__main__", line 10, in __main__.separate_paren_groups
Failed example:
    separate_paren_

In [42]:
"""
#################################
# Advanced Code Evaluation System
#################################
This module implements a comprehensive evaluation system for code
generation models, including syntax validation, test execution,
and performance metrics calculation.
"""

from typing import List, Dict
import numpy as np
from concurrent.futures import ThreadPoolExecutor
import timeout_decorator

class CodeEvaluator:
    """
    A class for evaluating code generation model performance.

    Attributes:
        dataset: HumanEval or similar benchmark dataset
        metrics: Dictionary tracking various performance metrics
    """

    def __init__(self, dataset="openai_humaneval"):
        """
        Initialize evaluator with specified dataset and metrics.

        Args:
            dataset: Name of the evaluation dataset
        """
        self.dataset = load_dataset(dataset)
        self.metrics = {
            "pass@1": 0.0,      # Single-attempt success rate
            "pass@10": 0.0,     # Success within 10 attempts
            "pass@100": 0.0,    # Success within 100 attempts
            "syntax_validity": 0.0,  # Syntactic correctness
            "execution_accuracy": 0.0  # Functional correctness
        }

    @timeout_decorator.timeout(5)  # Prevent infinite loops/hanging
    def execute_test_case(self, code: str, test_case: str) -> bool:
        """
        Execute a single test case with timeout protection.

        Args:
            code: Generated code to test
            test_case: Test case to execute

        Returns:
            bool: True if test passes, False otherwise
        """
        try:
            namespace = {}
            exec(code, namespace)
            exec(test_case, namespace)
            return True
        except Exception as e:
            return False

    def check_syntax(self, code: str) -> bool:
        """
        Verify syntactic correctness of generated code.

        Args:
            code: Code to check

        Returns:
            bool: True if syntax is valid
        """
        try:
            compile(code, '<string>', 'exec')
            return True
        except SyntaxError:
            return False

    def evaluate_single_solution(self,
                               task_id: int,
                               generated_code: str,
                               num_samples: int = 1) -> Dict:
        """
        Evaluate a single generated solution comprehensively.

        Args:
            task_id: Index of the task
            generated_code: Generated solution to evaluate
            num_samples: Number of evaluation samples

        Returns:
            Dict containing evaluation results
        """
        task = self.dataset["test"][task_id]

        # Verify syntax first
        syntax_valid = self.check_syntax(generated_code)

        # Execute test cases if syntax is valid
        if syntax_valid:
            test_cases = task["test_cases"]
            # Use thread pool for parallel test execution
            with ThreadPoolExecutor() as executor:
                results = list(executor.map(
                    lambda tc: self.execute_test_case(generated_code, tc),
                    test_cases
                ))
                print("Results")
                print(results)
            execution_success = all(results)
        else:
            execution_success = False

        return {
            "syntax_valid": syntax_valid,
            "execution_success": execution_success
        }

    def evaluate_model(self, model, tokenizer, n_tasks: int = None):
        """
        Evaluate model performance across multiple tasks.

        Args:
            model: The model to evaluate
            tokenizer: Model's tokenizer
            n_tasks: Number of tasks to evaluate (None for all)

        Returns:
            Dict containing aggregated metrics
        """
        if n_tasks is None:
            n_tasks = len(self.dataset["test"])

        results = []
        for i in range(n_tasks):
            task = self.dataset["test"][i]
            prompt = task["prompt"]

            # Generate solution
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
            outputs = model.generate(
                inputs["input_ids"],
                max_new_tokens=512,
                num_return_sequences=1,
                temperature=0.8
            )
            generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Evaluate solution
            result = self.evaluate_single_solution(i, generated_code)
            results.append(result)

        # Calculate aggregate metrics
        self.metrics["syntax_validity"] = np.mean([r["syntax_valid"] for r in results])
        self.metrics["execution_accuracy"] = np.mean([r["execution_success"] for r in results])

        return self.metrics

# Initialize the evaluation system
evaluator = CodeEvaluator()

def evaluate_stage(model, tokenizer, stage_name: str):
    """
    Evaluate and log results for a specific evaluation stage.

    Args:
        model: Model to evaluate
        tokenizer: Model's tokenizer
        stage_name: Name of evaluation stage

    Returns:
        Dict containing evaluation metrics
    """
    print(f"\nEvaluating {stage_name}...")
    metrics = evaluator.evaluate_model(model, tokenizer)

    print(f"\nResults for {stage_name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

    return metrics

In [43]:
"""
#################################
# Supervised Fine-Tuning Setup
#################################
This section prepares the environment for implementing supervised
fine-tuning (SFT) on our code generation models.
"""

# Install required packages for fine-tuning
!pip install transformers datasets  # Core libraries for model training



"\nPackage Details:\n- transformers: Hugging Face's transformers library\n               Provides:\n               - Model fine-tuning capabilities\n               - Training utilities\n               - Optimization tools\n               \n- datasets: Data handling library\n           Used for:\n           - Loading training data\n           - Data preprocessing\n           - Batch preparation\n\nNote: SFT Process Overview:\n1. Prepare training data\n2. Configure training parameters\n3. Implement training loop\n4. Evaluate fine-tuned model\n5. Compare with base model performance\n"

In [44]:
"""
#################################
# Training Data Preparation
#################################
This section loads and examines the CodeSearchNet dataset,
which will be used for supervised fine-tuning of our models.
"""

from datasets import load_dataset

# Load CodeSearchNet dataset, focusing on Python code
# This dataset contains real-world Python code examples with documentation
codesearchnet = load_dataset(
    "code_search_net",  # Dataset name
    "python"           # Language subset
)

# Display dataset structure and statistics
print(codesearchnet)  # Shows splits, sizes, and features

README.md:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

code_search_net.py:   0%|          | 0.00/8.44k [00:00<?, ?B/s]

The repository for code_search_net contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/code_search_net.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


python.zip:   0%|          | 0.00/941M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/412178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/22176 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/23107 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 412178
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 22176
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 23107
    })
})


'\nCodeSearchNet Dataset Details:\n- Contains Python code from open-source repositories\n- Includes:\n  * Function implementations\n  * Docstrings\n  * Method names\n  * Repository metadata\n  \nDataset Structure:\n- Training split\n- Validation split\n- Test split\n\nFeatures typically include:\n- repository_name\n- func_name\n- whole_func_string\n- language\n- func_code_string\n- docstring\n- sha\n- url\n'

In [45]:
"""
#################################
# Dataset Sample Inspection
#################################
This section examines a single sample from the CodeSearchNet dataset
to understand its structure and content format.
"""

# Extract first training sample for inspection
sample = codesearchnet["train"][0]  # Index 0 contains first training example

# Display sample contents with clear formatting
print(f"Code:\n{sample['func_code_string']}")  # Function implementation
print(f"Documentation:\n{sample['func_documentation_string']}")  # Associated documentation

Code:
def update(self, field_dict, where_clause=None):
        '''
        update db entry

        :param field_dict: dictionary of fields and values
        :param where_clause: where clause for the update
        '''
        query = '''
        UPDATE %s SET %s
        ''' % (
            self._name,
            ','.join('%s=:%s' % (k, k) for k in field_dict)
        )
        if where_clause:
            query += ' WHERE %s' % (where_clause)
        self._cursor.execute(query, field_dict)
        self._connection.commit()
Documentation:
update db entry

        :param field_dict: dictionary of fields and values
        :param where_clause: where clause for the update


"\nSample Analysis:\n- func_code_string: Contains the actual Python function implementation\n- func_documentation_string: Contains the function's documentation\n                           (docstrings, comments, etc.)\n\nThis inspection helps verify:\n1. Data format and structure\n2. Code-documentation alignment\n3. Content quality\n4. Potential preprocessing needs\n"

In [46]:
"""
#################################
# Dataset Tokenization
#################################
This section implements the tokenization process for preparing
the CodeSearchNet dataset for model training.
"""

def tokenize_function(examples):
    """
    Tokenize and format examples for training.

    Args:
        examples: Dictionary containing batched dataset examples

    Returns:
        Dictionary containing tokenized inputs and labels

    Process:
    1. Extract documentation and code
    2. Tokenize both with padding
    3. Process labels for loss calculation
    4. Combine into model inputs
    """
    # Extract paired examples
    inputs = examples["func_documentation_string"]   # Documentation as input
    targets = examples["func_code_string"]          # Code as target

    # Tokenize documentation (inputs)
    model_inputs = tokenizer(
        inputs,
        max_length=512,        # Maximum sequence length
        truncation=True,       # Truncate if needed
        padding="max_length"   # Pad to max_length
    )

    # Tokenize code (targets/labels)
    labels = tokenizer(
        targets,
        max_length=512,
        truncation=True,
        padding="max_length"
    ).input_ids

    # Process labels for training
    # Replace padding tokens with -100 to ignore in loss computation
    labels = [
        [(label if label != tokenizer.pad_token_id else -100) for label in seq]
        for seq in labels
    ]

    # Combine inputs and labels
    model_inputs["labels"] = labels
    return model_inputs

# Apply tokenization to entire dataset
tokenized_datasets = codesearchnet.map(
    tokenize_function,
    batched=True  # Process examples in batches for efficiency
)

# Verify tokenization results
print(tokenized_datasets["train"][0])  # Display first tokenized example

Map:   0%|          | 0/412178 [00:00<?, ? examples/s]

Map:   0%|          | 0/22176 [00:00<?, ? examples/s]

Map:   0%|          | 0/23107 [00:00<?, ? examples/s]

{'repository_name': 'cisco-sas/kitty', 'func_path_in_repository': 'kitty/data/data_manager.py', 'func_name': 'Table.update', 'whole_func_string': "def update(self, field_dict, where_clause=None):\n        '''\n        update db entry\n\n        :param field_dict: dictionary of fields and values\n        :param where_clause: where clause for the update\n        '''\n        query = '''\n        UPDATE %s SET %s\n        ''' % (\n            self._name,\n            ','.join('%s=:%s' % (k, k) for k in field_dict)\n        )\n        if where_clause:\n            query += ' WHERE %s' % (where_clause)\n        self._cursor.execute(query, field_dict)\n        self._connection.commit()", 'language': 'python', 'func_code_string': "def update(self, field_dict, where_clause=None):\n        '''\n        update db entry\n\n        :param field_dict: dictionary of fields and values\n        :param where_clause: where clause for the update\n        '''\n        query = '''\n        UPDATE %s SET %s

'\nNote: Tokenization process includes:\n1. Input processing:\n   - Documentation text → token IDs\n   - Padding to fixed length\n   - Truncation of long sequences\n\n2. Label processing:\n   - Code text → token IDs\n   - Padding token replacement\n   - Loss masking setup\n\n3. Verification:\n   - Input token structure\n   - Label formatting\n   - Padding handling\n'

In [47]:
"""
#################################
# Training Setup Configuration
#################################
This section configures the training environment, including model
initialization, tokenizer setup, and training parameters.
"""

from transformers import (
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

# Initialize model for fine-tuning
model = AutoModelForCausalLM.from_pretrained(
    "/content/SemCoder",          # Local model path
    torch_dtype=torch.bfloat16    # Use bfloat16 for memory efficiency
).cuda()                          # Move to GPU

# Setup tokenizer from local files
tokenizer = AutoTokenizer.from_pretrained("/content/SemCoder")

# Initialize data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Configure training parameters
training_args = TrainingArguments(
    # Basic training configuration
    output_dir="./results",          # Directory for saving outputs
    eval_strategy="epoch",           # Evaluate after each epoch
    learning_rate=5e-5,             # Conservative learning rate

    # Batch size configuration
    per_device_train_batch_size=1,  # Small batch size due to memory constraints
    per_device_eval_batch_size=1,   # Matching evaluation batch size
    gradient_accumulation_steps=32,  # Accumulate gradients to simulate larger batch

    # Training duration
    num_train_epochs=3,             # Number of training epochs

    # Optimization parameters
    weight_decay=0.01,              # L2 regularization
    fp16=True,                      # Enable mixed precision training

    # Logging and saving configuration
    logging_dir="./logs",           # Directory for logs
    logging_steps=10,               # Log every 10 steps
    save_total_limit=2,             # Keep only last 2 checkpoints
    report_to="none",               # Disable external logging
)

# Initialize the training framework
trainer = Trainer(
    model=model,                           # Fine-tuning model
    args=training_args,                    # Training configuration
    train_dataset=tokenized_datasets["train"],        # Training data
    eval_dataset=tokenized_datasets["validation"],    # Validation data
    data_collator=data_collator,                     # Padding utility
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
"""
#################################
# Model Fine-Tuning - Minimal Memory
#################################
This section attempts fine-tuning with absolute minimal
memory footprint and explicit PyTorch memory settings.
"""

# Set PyTorch memory management settings
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True,max_split_size_mb:32'

# Clear all GPU memory
clear_memory()
torch.cuda.empty_cache()
gc.collect()

# Configure training with absolute minimal settings
training_args = TrainingArguments(
    output_dir="./results",

    # Absolute minimal batch and data settings
    per_device_train_batch_size=1,
    gradient_accumulation_steps=512,    # Extreme accumulation
    max_grad_norm=0.3,                 # More aggressive gradient clipping

    # Maximum memory optimization
    fp16=True,
    gradient_checkpointing=True,
    dataloader_num_workers=0,
    group_by_length=True,
    remove_unused_columns=True,

    # Minimal training loop
    max_steps=10,                      # Just try 10 steps initially
    eval_strategy="no",                # No evaluation
    logging_steps=5,                   # Minimal logging
    save_total_limit=1,               # Keep only one checkpoint
    save_steps=10,                    # Save at the end only
    report_to="none",                 # No reporting

    # Conservative hyperparameters
    learning_rate=1e-5,               # Reduced learning rate
    weight_decay=0.01,
    warmup_steps=2,                   # Minimal warmup

    # Additional memory optimizations
    optim="adamw_torch_fused",        # Use fused optimizer
    ddp_find_unused_parameters=False,  # Disable unused parameter detection
)

# Create minimal dataset
tiny_train_dataset = tokenized_datasets["train"].select(range(20))  # Just 20 examples

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tiny_train_dataset,
    data_collator=data_collator,
)

print("Starting minimal fine-tuning test...")
print("Using only 20 examples and 10 steps...")
trainer.train()

In [None]:
"""
#################################
# Model Saving
#################################
This section saves the fine-tuned model and tokenizer
for future use and evaluation.
"""

# Save complete model artifacts
trainer.save_model("./semcoder-sft")          # Save model weights and configuration
tokenizer.save_pretrained("./semcoder-sft")   # Save associated tokenizer

print("Fine-tuned model saved!")

In [None]:
"""
#################################
# Initial Model Testing
#################################
This section performs an initial test of the fine-tuned model
using a simple programming task.
"""

# Load the fine-tuned model for testing
fine_tuned_model = AutoModelForCausalLM.from_pretrained(
    "./semcoder-sft"   # Path to saved model
).cuda()              # Move to GPU

# Define test case
test_prompt = "Write a Python function to find the maximum element in a list."

# Prepare input for generation
inputs = tokenizer(
    test_prompt,
    return_tensors="pt"   # Convert to PyTorch tensor
).to(fine_tuned_model.device)

# Generate code using fine-tuned model
outputs = fine_tuned_model.generate(
    inputs["input_ids"],
    max_new_tokens=100    # Limit generation length
)

# Convert generated tokens to readable code
generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Display results
print("Generated Code:\n", generated_code)

**TODO:  Implement CodeDPO with SemCoder**

**TODO:  Implement SemCoder for SWE Bench Verified**