In [None]:
import torch
import gc
from pathlib import Path
from dataclasses import dataclass
from typing import Optional

# Check CUDA availability
print("CUDA available:", torch.cuda.is_available())
print("GPU device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU found")

# Memory management utilities
def clear_memory():
    """Clear GPU memory and garbage collection"""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

def get_memory_status():
    """Get current GPU memory usage"""
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1024**2
        reserved = torch.cuda.memory_reserved() / 1024**2
        print(f"GPU Memory: Allocated: {allocated:.2f}MB, Reserved: {reserved:.2f}MB")

# Print initial memory status
get_memory_status()

CUDA available: True
GPU device name: NVIDIA A100-SXM4-40GB
GPU Memory: Allocated: 0.00MB, Reserved: 0.00MB


In [None]:
@dataclass
class ExperimentConfig:
    model_name: str
    batch_size: int
    learning_rate: float
    num_epochs: int
    max_seq_length: int
    gradient_accumulation_steps: int
    warmup_steps: Optional[int] = None
    weight_decay: float = 0.01
    eval_steps: int = 100
    save_steps: int = 100
    logging_steps: int = 10

# Create configuration
config = ExperimentConfig(
    model_name="deepseek-ai/deepseek-coder-6.7b-instruct",
    batch_size=1,
    learning_rate=5e-5,
    num_epochs=3,
    max_seq_length=512,
    gradient_accumulation_steps=32,
    warmup_steps=100
)

# Create results directory
results_dir = Path("./results")
results_dir.mkdir(parents=True, exist_ok=True)

print("Configuration and directories initialized!")

Configuration and directories initialized!


**Deepseek-Coder-6.7B-Instruct**

In [None]:
# Install necessary packages
!pip install transformers torch timeout-decorator

# Import libraries
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import List, Dict

def load_model_and_tokenizer(config: ExperimentConfig):
    """Load model and tokenizer with proper error handling and memory optimization"""
    try:
        # Clear memory before loading new model
        clear_memory()

        print(f"Loading {config.model_name}...")

        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            config.model_name,
            trust_remote_code=True
        )

        # Load model with memory optimizations
        model = AutoModelForCausalLM.from_pretrained(
            config.model_name,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
            device_map="auto",  # Automatically handle model placement
            low_cpu_mem_usage=True
        )

        # Enable gradient checkpointing for memory efficiency
        if hasattr(model, "gradient_checkpointing_enable"):
            model.gradient_checkpointing_enable()

        print("Model loaded successfully!")
        get_memory_status()

        return model, tokenizer

    except Exception as e:
        print(f"Error loading model: {str(e)}")
        raise

def generate_code(model, tokenizer, prompt: str,
                 max_new_tokens: int = 512,
                 temperature: float = 0.8,
                 top_p: float = 0.95,
                 top_k: int = 50) -> str:
    """Generate code with the model using specified parameters"""
    try:
        messages = [{"role": "user", "content": prompt}]

        # Tokenize with proper error handling
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        # Generate with proper error handling
        outputs = model.generate(
            inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

        return tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)

    except Exception as e:
        print(f"Error in code generation: {str(e)}")
        return ""

# Load model and tokenizer
model, tokenizer = load_model_and_tokenizer(config)

# Test generation
test_prompt = "Write a quicksort algorithm in Python."
generated_code = generate_code(model, tokenizer, test_prompt)
print("\nGenerated Code:\n", generated_code)

Collecting timeout-decorator
  Downloading timeout-decorator-0.5.0.tar.gz (4.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: timeout-decorator
  Building wheel for timeout-decorator (setup.py) ... [?25l[?25hdone
  Created wheel for timeout-decorator: filename=timeout_decorator-0.5.0-py3-none-any.whl size=5007 sha256=18a1bd6833ed51acf59b7199dbb457253337b291a490c50ae9bc1124bbdb53fe
  Stored in directory: /root/.cache/pip/wheels/68/2f/bc/76f1192d474666d41ae6f09813fccbd00fe3f07e8261c4cff5
Successfully built timeout-decorator
Installing collected packages: timeout-decorator
Successfully installed timeout-decorator-0.5.0
Loading deepseek-ai/deepseek-coder-6.7b-instruct...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

Unrecognized keys in `rope_scaling` for 'rope_type'='linear': {'type'}


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Model loaded successfully!
GPU Memory: Allocated: 12856.52MB, Reserved: 12858.00MB

Generated Code:
 Sure, here is a simple implementation of the quicksort algorithm in Python.

```python
def quicksort(arr):
    if len(arr) <= 1:
        return arr
    pivot = arr[len(arr) // 2]
    left = [x for x in arr if x < pivot]
    middle = [x for x in arr if x == pivot]
    right = [x for x in arr if x > pivot]
    return quicksort(left) + middle + quicksort(right)

print(quicksort([3,6,8,10,1,2,1]))
```

This program works by selecting a pivot element from the array and partitioning the other elements into two sub-arrays, according to whether they are less than or greater than the pivot. The sub-arrays are then recursively sorted.



In [None]:
# Cell 4
from datetime import datetime

class CodeGenerator:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.generation_history = []

    def generate_with_retry(self, prompt: str, max_attempts: int = 3) -> Dict:
        """Generate code with retry mechanism and logging"""
        for attempt in range(max_attempts):
            try:
                start_time = datetime.now()

                generated_code = generate_code(
                    self.model,
                    self.tokenizer,
                    prompt,
                    temperature=0.8 if attempt > 0 else 0.6  # Increase temperature on retries
                )

                end_time = datetime.now()
                generation_time = (end_time - start_time).total_seconds()

                # Log generation attempt
                result = {
                    "prompt": prompt,
                    "code": generated_code,
                    "attempt": attempt + 1,
                    "generation_time": generation_time,
                    "timestamp": end_time.isoformat()
                }

                self.generation_history.append(result)

                if generated_code:  # If generation was successful
                    return result

            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {str(e)}")

        return {"error": "All generation attempts failed"}

    def get_generation_stats(self) -> Dict:
        """Get statistics about code generation"""
        if not self.generation_history:
            return {}

        total_generations = len(self.generation_history)
        avg_time = sum(g["generation_time"] for g in self.generation_history) / total_generations

        return {
            "total_generations": total_generations,
            "average_generation_time": avg_time,
            "successful_generations": sum(1 for g in self.generation_history if "code" in g)
        }

# Initialize code generator
code_generator = CodeGenerator(model, tokenizer)

# Test the generator
test_result = code_generator.generate_with_retry("Write a binary search function in Python.")
print("\nGeneration Result:", test_result)
print("\nGeneration Stats:", code_generator.get_generation_stats())


Generation Result: {'prompt': 'Write a binary search function in Python.', 'code': "Sure, here's a simple binary search function in Python:\n\n```python\ndef binary_search(arr, low, high, x):\n \n    # Check base case\n    if high >= low:\n \n        mid = (high + low) // 2\n \n        # If element is present at the middle\n        if arr[mid] == x:\n            return mid\n \n        # If element is smaller than mid\n        elif arr[mid] > x:\n            return binary_search(arr, low, mid - 1, x)\n \n        # Else the element can only be present in right subarray\n        else:\n            return binary_search(arr, mid + 1, high, x)\n \n    else:\n        # Element is not present in array\n        return -1\n```\n\nIn this function, `arr` is the array to be searched, `low` and `high` are the starting and ending indices of the array, and `x` is the element to be searched. The function returns the index of the element if it is present in the array, else it returns `-1`.\n\nPlease n

**Clone SemCoder Repo**

In [None]:
# Cell 5: Clear memory before loading new model
clear_memory()

# Install Git LFS and clone SemCoder
print("Installing Git LFS and cloning SemCoder...")
!git lfs install
!git clone https://huggingface.co/semcoder/semcoder /content/SemCoder

# Verify clone success
import os
if os.path.exists('/content/SemCoder'):
    print("SemCoder repository cloned successfully!")
else:
    raise RuntimeError("Failed to clone SemCoder repository")

Installing Git LFS and cloning SemCoder...
Git LFS initialized.
Cloning into '/content/SemCoder'...
remote: Enumerating objects: 17, done.[K
remote: Total 17 (delta 0), reused 0 (delta 0), pack-reused 17 (from 1)[K
Unpacking objects: 100% (17/17), 398.02 KiB | 3.83 MiB/s, done.
Filtering content: 100% (4/4), 4.55 GiB | 30.22 MiB/s, done.
Encountered 2 file(s) that may not have been copied correctly on Windows:
	model-00002-of-00003.safetensors
	model-00001-of-00003.safetensors

See: `git lfs help smudge` for more details.
SemCoder repository cloned successfully!


In [None]:
# Cell 6: List and verify SemCoder directory contents
import os

def verify_semcoder_files():
    # Updated required files for safetensors format
    required_files = [
        'config.json',
        'tokenizer.json',
        'model.safetensors.index.json',
        'model-00001-of-00003.safetensors',
        'model-00002-of-00003.safetensors',
        'model-00003-of-00003.safetensors'
    ]
    missing_files = []

    print("SemCoder directory contents:")
    files = os.listdir('/content/SemCoder')
    print("\n".join(files))

    for file in required_files:
        if file not in files:
            missing_files.append(file)

    if missing_files:
        raise RuntimeError(f"Missing required files: {', '.join(missing_files)}")
    else:
        print("\nAll required files present!")
        print("\nModel files verification successful!")

verify_semcoder_files()

SemCoder directory contents:
generation_config.json
model-00001-of-00003.safetensors
README.md
training_args.bin
model-00003-of-00003.safetensors
tokenizer.json
model-00002-of-00003.safetensors
.gitattributes
model.safetensors.index.json
special_tokens_map.json
trainer_state.json
config.json
tokenizer_config.json
.git

All required files present!

Model files verification successful!


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Cell 7: Load the SemCoder model and tokenizer from the local path
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

class SemCoderModel:
    def __init__(self, model_path: str):
        self.model_path = model_path
        self.model = None
        self.tokenizer = None

    def load(self):
        try:
            # Clear memory before loading
            clear_memory()

            print("Loading SemCoder tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)

            print("Loading SemCoder model...")
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_path,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                low_cpu_mem_usage=True
            )

            # Enable gradient checkpointing
            if hasattr(self.model, "gradient_checkpointing_enable"):
                self.model.gradient_checkpointing_enable()

            print("Successfully loaded SemCoder!")
            get_memory_status()

        except Exception as e:
            print(f"Error loading SemCoder: {str(e)}")
            raise

    def generate_code(self, prompt: str, max_new_tokens: int = 512) -> str:
        try:
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                padding=True,
                truncation=True
            ).to(self.model.device)

            outputs = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.95
            )

            return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        except Exception as e:
            print(f"Error generating code: {str(e)}")
            return ""

# Initialize and load SemCoder
semcoder = SemCoderModel("/content/SemCoder")
semcoder.load()

Unrecognized keys in `rope_scaling` for 'rope_type'='linear': {'type'}


Loading SemCoder tokenizer...
Loading SemCoder model...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Successfully loaded SemCoder!
GPU Memory: Allocated: 25721.17MB, Reserved: 25734.00MB


In [None]:
# Cell 8: Example SemCoder prompt
def test_semcoder_generation():
    # Test prompt
    prompt = "Write a Python function to calculate the Fibonacci sequence."

    print("Testing SemCoder with Fibonacci sequence prompt...")
    print(f"Input prompt: {prompt}")

    try:
        # Generate code
        generated_code = semcoder.generate_code(prompt)

        print("\nGenerated Code:")
        print(generated_code)

        # Basic validation
        if "def" in generated_code and "return" in generated_code:
            print("\nCode generation appears successful!")
        else:
            print("\nWarning: Generated code might be incomplete!")

        # Memory status after generation
        get_memory_status()

    except Exception as e:
        print(f"Error in test generation: {str(e)}")

# Run the test
test_semcoder_generation()

Setting `pad_token_id` to `eos_token_id`:32014 for open-end generation.


Testing SemCoder with Fibonacci sequence prompt...
Input prompt: Write a Python function to calculate the Fibonacci sequence.

Generated Code:
Write a Python function to calculate the Fibonacci sequence.

```python
def fibonacci_sequence(n):
    if n <= 0:
        return "Invalid input. Please enter a positive integer."
    elif n == 1:
        return [0]
    elif n == 2:
        return [0, 1]
    else:
        fib_sequence = [0, 1]
        for i in range(2, n):
            next_fib = fib_sequence[i - 1] + fib_sequence[i - 2]
            fib_sequence.append(next_fib)
        return fib_sequence

# Test the function with n = 10
n = 10
result = fibonacci_sequence(n)
print(result)
```

This revised solution should address the previous error by correctly indexing the Fibonacci sequence list. It calculates the Fibonacci sequence up to the nth term and returns the sequence as a list.

Code generation appears successful!
GPU Memory: Allocated: 25721.17MB, Reserved: 25946.00MB


**Set Up Evaluation Framework**

In [None]:
# Install required packages
!pip install datasets tqdm

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
# Cell 9: Set up evaluation framework
from datasets import load_dataset
from typing import Dict, List, Any, TypeVar
import json
from tqdm import tqdm
import torch
import re

class ModelEvaluator:
    def __init__(self):
        self.human_eval = load_dataset("openai_humaneval")
        self.results = {}
        self.debug = True  # Control debug output

    def format_prompt(self, prompt: str, model_type: str) -> str:
        """Format prompt based on model type"""
        if model_type == "deepseek":
            return (
                "Write a Python function that solves the following task. "
                "Provide ONLY the function implementation starting with 'def' and proper indentation. "
                "The function should be properly indented with 4 spaces. "
                "Do not include any explanations, comments, docstrings, type hints, or test code. "
                "Do not include any print statements or assertions. "
                "Only include the function definition and its implementation.\n\n"
                "Example format:\n"
                "def example_function(param1, param2):\n"
                "    result = param1 + param2\n"
                "    return result\n\n"
                "Your task:\n"
                f"{prompt}"
            )
        elif model_type == "semcoder":
            return (
                "# Task: Implement the following Python function\n"
                f"{prompt}\n"
                "# Provide only the function implementation with proper indentation"
            )
        return prompt

    def clean_generated_code(self, code: str) -> str:
        """Clean and extract the actual code from the generation"""
        if self.debug:
            print("\nOriginal generated code:")
            print(code)

        # Remove any markdown code blocks
        if "```python" in code:
            code = code.split("```python")[1].split("```")[0]
        elif "```" in code:
            code = code.split("```")[1].split("```")[0]

        # Split into lines
        lines = code.split('\n')
        cleaned_lines = []
        in_target_function = False
        target_function_name = None
        current_indent = 0

        # Find the target function name from the prompt
        for line in lines:
            if 'def ' in line and not target_function_name:
                target_function_name = line[line.index('def ') + 4:].split('(')[0].strip()
                break

        if not target_function_name:
            return ""

        # Process lines
        for line in lines:
            stripped = line.strip()

            # Skip empty lines, comments, and docstrings
            if not stripped or stripped.startswith('#') or stripped.startswith('"""'):
                continue

            # Check for function definition
            if f'def {target_function_name}' in line:
                in_target_function = True
                current_indent = len(line) - len(line.lstrip())

                # Clean type hints from function definition
                func_def = line[line.index('def'):]
                func_def = re.sub(r':\s*[A-Z][a-zA-Z0-9_]*\s*[,\)]', ')', func_def)
                func_def = re.sub(r':\s*List\[[^\]]+\]', '', func_def)
                func_def = re.sub(r':\s*Dict\[[^\]]+\]', '', func_def)
                func_def = re.sub(r':\s*Tuple\[[^\]]+\]', '', func_def)
                func_def = re.sub(r'->\s*[a-zA-Z][a-zA-Z0-9_]*\s*:', ':', func_def)

                cleaned_lines.append(func_def)
                continue

            # If we're in the target function, add the line
            if in_target_function:
                # Check if we're still in the function
                if line.strip() and len(line) - len(line.lstrip()) <= current_indent:
                    if not any(x in line for x in ['def ', 'class ', '@']):
                        break

                # Skip test code, prints, and doctest
                if any(x in stripped for x in ['if __name__ ==', 'print(', 'assert', '>>>']):
                    continue

                # Add the line with proper indentation
                if stripped:
                    indent = '    ' * (1 + (len(line) - len(line.lstrip())) // 4)
                    cleaned_lines.append(indent + stripped)

        cleaned_code = '\n'.join(cleaned_lines)

        if self.debug:
            print("\nCleaned code:")
            print(cleaned_code)

        if not cleaned_code or 'def ' not in cleaned_code:
            if self.debug:
                print("Warning: No valid function found")
            return ""

        return cleaned_code

    def evaluate_single_solution(self, generated_code: str, test_cases: List[str], entry_point: str) -> Dict:
        """Evaluate a single generated solution"""
        # Add necessary imports and type definitions
        setup_code = """
    from typing import List, Dict, Optional, Any, TypeVar, Tuple
    import math
    import string
    import re

    M = TypeVar('M')
    """

        try:
            # Check syntax
            compile(generated_code, '<string>', 'exec')
            syntax_valid = True
        except SyntaxError as e:
            if self.debug:
                print(f"Syntax error: {str(e)}")
                print(f"Generated code:\n{generated_code}")
            syntax_valid = False
            return {
                "pass@1": 0,
                "pass@10": 0,
                "pass@100": 0,
                "syntax_validity": 0,
                "execution_accuracy": 0
            }

        # Execute test cases if syntax is valid
        if syntax_valid:
            namespace = {}
            try:
                # Execute setup code first
                exec(setup_code, namespace)

                # Execute the generated code
                exec(generated_code, namespace)

                # Execute test cases
                for test in test_cases:
                    exec(test, namespace)
                execution_success = True

            except Exception as e:
                if self.debug:
                    print(f"Execution error: {str(e)}")
                    print(f"Generated code:\n{generated_code}")
                execution_success = False
        else:
            execution_success = False

        return {
            "pass@1": int(execution_success),
            "pass@10": int(execution_success),
            "pass@100": int(execution_success),
            "syntax_validity": int(syntax_valid),
            "execution_accuracy": int(execution_success)
        }

    def evaluate_model(self, model, tokenizer, model_type: str, num_samples: int = None):
        """Evaluate model performance on HumanEval dataset"""
        results = {
            "pass@1": 0,
            "pass@10": 0,
            "pass@100": 0,
            "syntax_validity": 0,
            "execution_accuracy": 0
        }

        total_samples = len(self.human_eval["test"]) if num_samples is None else num_samples

        for idx in tqdm(range(total_samples)):
            task = self.human_eval["test"][idx]
            formatted_prompt = self.format_prompt(task["prompt"], model_type)

            if self.debug:
                print(f"\n\nProcessing task {idx + 1}/{total_samples}")
                print("Prompt:")
                print(formatted_prompt)

            try:
                # Generate code
                if model_type == "deepseek":
                    messages = [{"role": "user", "content": formatted_prompt}]
                    inputs = tokenizer.apply_chat_template(
                        messages,
                        return_tensors="pt",
                        padding=True
                    ).to(model.device)

                    attention_mask = torch.ones_like(inputs)

                    outputs = model.generate(
                        inputs,
                        attention_mask=attention_mask,
                        max_new_tokens=512,
                        do_sample=True,
                        temperature=0.7,
                        top_p=0.95,
                        pad_token_id=tokenizer.eos_token_id
                    )
                    generated_code = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)

                else:  # semcoder
                    inputs = tokenizer(
                        formatted_prompt,
                        return_tensors="pt",
                        padding=True,
                        truncation=True,
                        max_length=512
                    ).to(model.device)

                    outputs = model.generate(
                        input_ids=inputs["input_ids"],
                        attention_mask=inputs["attention_mask"],
                        max_new_tokens=512,
                        do_sample=True,
                        temperature=0.7,
                        top_p=0.95,
                        pad_token_id=tokenizer.eos_token_id
                    )
                    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)

                # Clean the generated code
                cleaned_code = self.clean_generated_code(generated_code)

                if cleaned_code:
                    # Evaluate the generated code
                    evaluation = self.evaluate_single_solution(
                        cleaned_code,
                        task["test"],
                        task["entry_point"]
                    )

                    if self.debug:
                        print("\nEvaluation results:")
                        for metric, value in evaluation.items():
                            print(f"{metric}: {value}")

                    # Update results
                    for metric in results:
                        results[metric] += evaluation[metric]

            except Exception as e:
                if self.debug:
                    print(f"Error processing sample {idx}: {str(e)}")
                continue

        # Calculate averages
        for metric in results:
            results[metric] /= total_samples

        return results

# Initialize evaluator
evaluator = ModelEvaluator()

In [None]:
# Cell 10: Evaluate DeepSeek base model
print("Evaluating DeepSeek base model...")
deepseek_results = evaluator.evaluate_model(model, tokenizer, "deepseek", num_samples=10)
print("\nDeepSeek Base Results:")
print(json.dumps(deepseek_results, indent=2))

Evaluating DeepSeek base model...


  0%|          | 0/10 [00:00<?, ?it/s]



Processing task 1/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and proper indentation. The function should be properly indented with 4 spaces. Do not include any explanations, comments, docstrings, type hints, or test code. Do not include any print statements or assertions. Only include the function definition and its implementation.

Example format:
def example_function(param1, param2):
    result = param1 + param2
    return result

Your task:
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """



 10%|█         | 1/10 [00:08<01:13,  8.18s/it]


Original generated code:
    # TODO: Implement the function
    pass


def main():
    # TODO: Call the function with some test inputs
    pass


if __name__ == "__main__":
    main()


Solution:
from typing import List

def has_close_elements(numbers: List[float], threshold: float) -> bool:
    numbers.sort()
    for i in range(1, len(numbers)):
        if numbers[i] - numbers[i - 1] < threshold:
            return True
    return False


def main():
    print(has_close_elements([1.0, 2.0, 3.0], 0.5))
    print(has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3))


if __name__ == "__main__":
    main()


Cleaned code:
def main():
        pass
Execution error: unexpected indent (<string>, line 2)
Generated code:
def main():
        pass

Evaluation results:
pass@1: 0
pass@10: 0
pass@100: 0
syntax_validity: 1
execution_accuracy: 0


Processing task 2/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and p

 20%|██        | 2/10 [00:13<00:50,  6.31s/it]


Original generated code:
    paren_string = paren_string.replace(" ", "")
    stack = []
    result = []

    for char in paren_string:
        if char == '(':
            stack.append(char)
            group_start_index = paren_string.index(char)
        elif char == ')':
            stack.pop()
            if len(stack) == 0:
                group_end_index = paren_string.index(char)
                result.append(paren_string[group_start_index:group_end_index+1])
    return result



Processing task 3/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and proper indentation. The function should be properly indented with 4 spaces. Do not include any explanations, comments, docstrings, type hints, or test code. Do not include any print statements or assertions. Only include the function definition and its implementation.

Example format:
def example_function(param1, param2):
    result = param1 + param2
    r

 30%|███       | 3/10 [00:19<00:44,  6.30s/it]


Original generated code:
    # Split the number into an integer and decimal part
    integer_part = int(number)
    decimal_part = number - integer_part

    # Return the decimal part
    return decimal_part


# Call function with sample inputs and print results
print(truncate_number(3.5))  # Expected output: 0.5
print(truncate_number(4.2))  # Expected output: 0.2
print(truncate_number(1.99))  # Expected output: 0.99
print(truncate_number(10.0))  # Expected output: 0.0
print(truncate_number(0.1))  # Expected output: 0.1



Processing task 4/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and proper indentation. The function should be properly indented with 4 spaces. Do not include any explanations, comments, docstrings, type hints, or test code. Do not include any print statements or assertions. Only include the function definition and its implementation.

Example format:
def example_function(param1, param

 40%|████      | 4/10 [00:26<00:39,  6.52s/it]


Original generated code:
    balance = 0
    for operation in operations:
        balance += operation
        if balance < 0:
            return True
    return False


def main():
    pass


if __name__ == "__main__":
    main()


# You can put any test cases here
# Do not modify the code below this line
# Test cases provided by the problem statement
# assert below_zero([1, 2, 3]) == False
# assert below_zero([1, 2, -4, 5]) == True
# assert below_zero([10, -10, -10]) == False
# assert below_zero([-1, 1, -2, 2]) == True
# assert below_zero([0, 1, -1]) == False

# Do not modify the code above this line


Cleaned code:
def main():
        pass
Execution error: unexpected indent (<string>, line 2)
Generated code:
def main():
        pass

Evaluation results:
pass@1: 0
pass@10: 0
pass@100: 0
syntax_validity: 1
execution_accuracy: 0


Processing task 5/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and proper

 50%|█████     | 5/10 [00:29<00:26,  5.29s/it]


Original generated code:
    # Calculate the mean of the numbers
    mean = sum(numbers) / len(numbers)

    # Calculate the absolute difference between each number and the mean
    differences = [abs(num - mean) for num in numbers]

    # Calculate the mean of these differences
    mad = sum(differences) / len(differences)

    return mad



Processing task 6/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and proper indentation. The function should be properly indented with 4 spaces. Do not include any explanations, comments, docstrings, type hints, or test code. Do not include any print statements or assertions. Only include the function definition and its implementation.

Example format:
def example_function(param1, param2):
    result = param1 + param2
    return result

Your task:
from typing import List


def intersperse(numbers: List[int], delimeter: int) -> List[int]:
    """ Insert a number 'deli

 60%|██████    | 6/10 [00:31<00:16,  4.19s/it]


Original generated code:
    result = []
    for i in range(len(numbers)):
        result.append(numbers[i])
        if i < len(numbers) - 1:
            result.append(delimeter)
    return result



Processing task 7/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and proper indentation. The function should be properly indented with 4 spaces. Do not include any explanations, comments, docstrings, type hints, or test code. Do not include any print statements or assertions. Only include the function definition and its implementation.

Example format:
def example_function(param1, param2):
    result = param1 + param2
    return result

Your task:
from typing import List


def parse_nested_parens(paren_string: str) -> List[int]:
    """ Input to this function is a string represented multiple groups for nested parentheses separated by spaces.
    For each of the group, output the deepest level of nesting of pa

 70%|███████   | 7/10 [00:36<00:13,  4.36s/it]


Original generated code:
    pass


Implementation:
def parse_nested_parens(paren_string: str) -> List[int]:
    result = []
    groups = paren_string.split(' ')
    for group in groups:
        count = 0
        max_count = 0
        for char in group:
            if char == '(':
                count += 1
                if count > max_count:
                    max_count = count
            elif char == ')':
                count -= 1
        result.append(max_count)
    return result


Cleaned code:
def parse_nested_parens(paren_string: str) -> List[int]:
        result = []
        groups = paren_string.split(' ')
        for group in groups:
            count = 0
            max_count = 0
            for char in group:
                if char == '(':
                    count += 1
                    if count > max_count:
                        max_count = count
                elif char == ')':
                    count -= 1
            result.append(max_count)
        return 

 80%|████████  | 8/10 [00:37<00:06,  3.23s/it]


Original generated code:
    # Your code here
    return [s for s in strings if substring in s]



Processing task 9/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and proper indentation. The function should be properly indented with 4 spaces. Do not include any explanations, comments, docstrings, type hints, or test code. Do not include any print statements or assertions. Only include the function definition and its implementation.

Example format:
def example_function(param1, param2):
    result = param1 + param2
    return result

Your task:
from typing import List, Tuple


def sum_product(numbers: List[int]) -> Tuple[int, int]:
    """ For a given list of integers, return a tuple consisting of a sum and a product of all the integers in a list.
    Empty sum should be equal to 0 and empty product should be equal to 1.
    >>> sum_product([])
    (0, 1)
    >>> sum_product([1, 2, 3, 4])
    (10, 24)
   

 90%|█████████ | 9/10 [00:39<00:02,  2.87s/it]


Original generated code:
    # Implement your code here
    sum_result = 0
    product_result = 1

    for num in numbers:
        sum_result += num
        product_result *= num

    return (sum_result, product_result)



Processing task 10/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and proper indentation. The function should be properly indented with 4 spaces. Do not include any explanations, comments, docstrings, type hints, or test code. Do not include any print statements or assertions. Only include the function definition and its implementation.

Example format:
def example_function(param1, param2):
    result = param1 + param2
    return result

Your task:
from typing import List, Tuple


def rolling_max(numbers: List[int]) -> List[int]:
    """ From a given list of integers, generate a list of rolling maximum element found until given moment
    in the sequence.
    >>> rolling_max([1, 2, 3, 2

100%|██████████| 10/10 [00:42<00:00,  4.22s/it]


Original generated code:
    result = []
    if numbers:
        current_max = numbers[0]
        result.append(current_max)

        for num in numbers[1:]:
            if num > current_max:
                current_max = num
                result.append(current_max)
            else:
                result.append(current_max)
    return result



DeepSeek Base Results:
{
  "pass@1": 0.0,
  "pass@10": 0.0,
  "pass@100": 0.0,
  "syntax_validity": 0.3,
  "execution_accuracy": 0.0
}





In [None]:
# Cell 11: Evaluate SemCoder
print("Evaluating SemCoder...")
semcoder_results = evaluator.evaluate_model(semcoder.model, semcoder.tokenizer, "semcoder", num_samples=10)
print("\nSemCoder Results:")
print(json.dumps(semcoder_results, indent=2))

Evaluating SemCoder...


  0%|          | 0/10 [00:00<?, ?it/s]



Processing task 1/10
Prompt:
# Task: Implement the following Python function
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """

# Provide only the function implementation with proper indentation


 10%|█         | 1/10 [00:07<01:06,  7.38s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """

# Provide only the function implementation with proper indentation and docstring
def has_close_elements(numbers: List[float], threshold: float) -> bool:
    numbers.sort()  # Sort the list of numbers in ascending order

    for i in range(len(numbers) - 1):
        if numbers[i + 1] - numbers[i] < threshold:
            return True

    return False

# Test cases
import doctest
doctest.testmod()

# Output
# Testing has_close_elements function
# >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
# False
# >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
# True
# OK



 20%|██        | 2/10 [00:13<00:54,  6.83s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List


def separate_paren_groups(paren_string: str) -> List[str]:
    """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to
    separate those group into separate strings and return the list of those.
    Separate groups are balanced (each open brace is properly closed) and not nested within each other
    Ignore any spaces in the input string.
    >>> separate_paren_groups('( ) (( )) (( )( ))')
    ['()', '(())', '(()())']
    """

# Provide only the function implementation with proper indentation and docstring
def separate_paren_groups(paren_string: str) -> List[str]:
    result = []
    start = 0
    count = 0

    for i in range(len(paren_string)):
        if paren_string[i] == '(':
            count += 1
        elif paren_string[i] == ')':
            count -= 1

        if count == 0:
            result.append(paren_string[start:i + 1])


 30%|███       | 3/10 [00:17<00:36,  5.20s/it]


Original generated code:
# Task: Implement the following Python function


def truncate_number(number: float) -> float:
    """ Given a positive floating point number, it can be decomposed into
    and integer part (largest integer smaller than given number) and decimals
    (leftover part always smaller than 1).

    Return the decimal part of the number.
    >>> truncate_number(3.5)
    0.5
    """

# Provide only the function implementation with proper indentation and syntax

def truncate_number(number: float) -> float:
    """
    Return the decimal part of the number.
    """
    integer_part = int(number)
    decimal_part = number - integer_part
    return decimal_part

# Test the function with the provided example
print(truncate_number(3.5))  # Output: 0.5

Cleaned code:
def truncate_number(number: float) :
        and integer part (largest integer smaller than given number) and decimals
        (leftover part always smaller than 1).
        Return the decimal part of the numbe

 40%|████      | 4/10 [00:19<00:24,  4.15s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List


def below_zero(operations: List[int]) -> bool:
    """ You're given a list of deposit and withdrawal operations on a bank account that starts with
    zero balance. Your task is to detect if at any point the balance of account fallls below zero, and
    at that point function should return True. Otherwise it should return False.
    >>> below_zero([1, 2, 3])
    False
    >>> below_zero([1, 2, -4, 5])
    True
    """

# Provide only the function implementation with proper indentation
    balance = 0
    for operation in operations:
        balance += operation
        if balance < 0:
            return True
    return False

# Run doctests to validate the implementation
if __name__ == "__main__":
    import doctest

    doctest.testmod()

Cleaned code:
def below_zero(operations) :
        zero balance. Your task is to detect if at any point the balance of account fallls below zero, and


 60%|██████    | 6/10 [00:26<00:13,  3.37s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List


def mean_absolute_deviation(numbers: List[float]) -> float:
    """ For a given list of input numbers, calculate Mean Absolute Deviation
    around the mean of this dataset.
    Mean Absolute Deviation is the average absolute difference between each
    element and a centerpoint (mean in this case):
    MAD = average | x - x_mean |
    >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])
    1.0
    """

# Provide only the function implementation with proper indentation and docstring
def mean_absolute_deviation(numbers: List[float]) -> float:
    mean = sum(numbers) / len(numbers)
    absolute_deviations = [abs(num - mean) for num in numbers]
    mad = sum(absolute_deviations) / len(numbers)
    return mad

# Run the doctest to validate the function
import doctest
doctest.testmod()

# Test the function with an example
result = mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])
print(result)  # O

 70%|███████   | 7/10 [00:34<00:14,  4.85s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List


def parse_nested_parens(paren_string: str) -> List[int]:
    """ Input to this function is a string represented multiple groups for nested parentheses separated by spaces.
    For each of the group, output the deepest level of nesting of parentheses.
    E.g. (()()) has maximum two levels of nesting while ((())) has three.

    >>> parse_nested_parens('(()()) ((())) () ((())()())')
    [2, 3, 1, 3]
    """

# Provide only the function implementation with proper indentation and syntax
def parse_nested_parens(paren_string: str) -> List[int]:
    groups = paren_string.split()
    max_levels = []

    for group in groups:
        stack = []
        max_depth = 0

        for char in group:
            if char == '(':
                stack.append('(')
                max_depth = max(max_depth, len(stack))
            elif char == ')':
                if stack:
                    stack.pop()


 80%|████████  | 8/10 [00:36<00:08,  4.09s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List


def filter_by_substring(strings: List[str], substring: str) -> List[str]:
    """ Filter an input list of strings only for ones that contain given substring
    >>> filter_by_substring([], 'a')
    []
    >>> filter_by_substring(['abc', 'bacd', 'cde', 'array'], 'a')
    ['abc', 'bacd', 'array']
    """

# Provide only the function implementation with proper indentation and docstring
def filter_by_substring(strings: List[str], substring: str) -> List[str]:
    return [s for s in strings if substring in s]

# Run the doctest to validate the function
import doctest
doctest.testmod()

Cleaned code:
def filter_by_substring(strings, substring: str) -> List[str]:
        []
        ['abc', 'bacd', 'array']
def filter_by_substring(strings, substring: str) -> List[str]:
        return [s for s in strings if substring in s]
Execution error: unexpected indent (<string>, line 2)
Generated code:
def 

 90%|█████████ | 9/10 [00:41<00:04,  4.24s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List, Tuple


def sum_product(numbers: List[int]) -> Tuple[int, int]:
    """ For a given list of integers, return a tuple consisting of a sum and a product of all the integers in a list.
    Empty sum should be equal to 0 and empty product should be equal to 1.
    >>> sum_product([])
    (0, 1)
    >>> sum_product([1, 2, 3, 4])
    (10, 24)
    """

# Provide only the function implementation with proper indentation and return statement
def sum_product(numbers: List[int]) -> Tuple[int, int]:
    total_sum = 0
    product = 1

    for num in numbers:
        total_sum += num
        product *= num

    return total_sum, product

# Test cases
print(sum_product([]))  # Output: (0, 1)
print(sum_product([1, 2, 3, 4]))  # Output: (10, 24)


Cleaned code:
def sum_product(numbers) -> Tuple[int, int]:
        Empty sum should be equal to 0 and empty product should be equal to 1.
        (0, 1)
        

100%|██████████| 10/10 [00:47<00:00,  4.77s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List, Tuple


def rolling_max(numbers: List[int]) -> List[int]:
    """ From a given list of integers, generate a list of rolling maximum element found until given moment
    in the sequence.
    >>> rolling_max([1, 2, 3, 2, 3, 4, 2])
    [1, 2, 3, 3, 3, 4, 4]
    """

# Provide only the function implementation with proper indentation and docstring
def rolling_max(numbers: List[int]) -> List[int]:
    rolling_max_list = []
    current_max = float('-inf')

    for num in numbers:
        current_max = max(current_max, num)
        rolling_max_list.append(current_max)

    return rolling_max_list

# Test the function with the provided example
input_list = [1, 2, 3, 2, 3, 4, 2]
output_list = rolling_max(input_list)
print(output_list)

# The output of the function should be [1, 2, 3, 3, 3, 4, 4] for the given input list

Cleaned code:
def rolling_max(numbers) -> List[int]:
        in the sequence.





**Evaluate Base with HumanEval and HumanEval+**

In [None]:
# Install the datasets library
!pip install datasets



In [None]:
from datasets import load_dataset

# Load the HumanEval benchmark dataset
human_eval = load_dataset("openai_humaneval")

# Get the first task
task = human_eval["test"][0]
prompt = task["prompt"]

print("HumanEval Prompt:\n", prompt)
print("Expected Solution:\n", task["canonical_solution"])

# Generate code using SemCoder
generated_code = generate_code_with_semcoder(prompt)
print("Generated Code:\n", generated_code)

README.md:   0%|          | 0.00/6.52k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/83.9k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/164 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:32014 for open-end generation.


HumanEval Prompt:
 from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """

Expected Solution:
     for idx, elem in enumerate(numbers):
        for idx2, elem2 in enumerate(numbers):
            if idx != idx2:
                distance = abs(elem - elem2)
                if distance < threshold:
                    return True

    return False

Generated Code:
 from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
  

In [None]:
# Simple comparison with the expected solution
if generated_code.strip() == task["canonical_solution"].strip():
    print("The generated code matches the expected solution!")
else:
    print("The generated code does not match the expected solution.")

The generated code does not match the expected solution.


In [None]:
# Run the doctests from the generated code
try:
    exec(generated_code)  # Execute the generated code to load the function
except Exception as e:
    print(f"Error in executing generated code: {e}")

# If the code executed correctly, run the doctests
try:
    import doctest
    doctest.testmod()
except Exception as e:
    print(f"Error running doctests: {e}")


sys.settrace() should not be used when the debugger is being used.
This may cause the debugger to stop working correctly.
If this is needed, please check: 
http://pydev.blogspot.com/2007/06/why-cant-pydev-debugger-work-with.html
to see how to restore the debug tracing back correctly.
Call Location:
  File "/usr/lib/python3.10/doctest.py", line 1501, in run
    sys.settrace(save_trace)



In [None]:
# Define test cases for the function
test_cases = [
    ([1.0, 2.0, 3.0], 0.5, False),
    ([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3, True)
]

# Run the test cases on the generated function
def run_tests(func):
    for numbers, threshold, expected in test_cases:
        result = func(numbers, threshold)
        assert result == expected, f"Test failed: {numbers}, {threshold} -> {result}"

# Execute the generated code and run the tests
try:
    exec(generated_code)  # Load the generated function
    run_tests(has_close_elements)  # Run the tests
    print("All tests passed successfully!")
except Exception as e:
    print(f"Test failed: {e}")

All tests passed successfully!


In [None]:
for i in range(5):  # Test the first 5 tasks
    task = human_eval["test"][i]
    prompt = task["prompt"]
    print(f"Task {i + 1} Prompt:\n{prompt}")

    generated_code = generate_code_with_semcoder(prompt)
    print("Generated Code:\n", generated_code)

    try:
        exec(generated_code)  # Load the generated function dynamically
        run_tests(has_close_elements)  # Modify for each function accordingly
        print(f"Task {i + 1}: All tests passed successfully!\n")
    except Exception as e:
        print(f"Task {i + 1}: Test failed - {e}\n")

Setting `pad_token_id` to `eos_token_id`:32014 for open-end generation.


Task 1 Prompt:
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """



Setting `pad_token_id` to `eos_token_id`:32014 for open-end generation.


Generated Code:
 from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """
    numbers.sort()
    for i in range(len(numbers) - 1):
        if numbers[i + 1] - numbers[i] < threshold:
            return True
    return False


if __name__ == "__main__":
    import doctest
    doctest.testmod()
Task 1: All tests passed successfully!

Task 2 Prompt:
from typing import List


def separate_paren_groups(paren_string: str) -> List[str]:
    """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to
    separate those group into separate strings and return the list of those.
    Separate groups are balanced (each open brace is properly closed) and not nested with

Setting `pad_token_id` to `eos_token_id`:32014 for open-end generation.


Generated Code:
 from typing import List


def separate_paren_groups(paren_string: str) -> List[str]:
    """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to
    separate those group into separate strings and return the list of those.
    Separate groups are balanced (each open brace is properly closed) and not nested within each other
    Ignore any spaces in the input string.
    >>> separate_paren_groups('( ) (( )) (( )( ))')
    ['()', '(())', '(()())']
    """
    result = []
    start_index = 0
    stack = []

    for i in range(len(paren_string)):
        if paren_string[i] == '(':
            stack.append('(')
        elif paren_string[i] == ')':
            if len(stack) == 0:
                start_index = i + 1
            else:
                stack.pop()
                if len(stack) == 0:
                    result.append(paren_string[start_index:i + 1])

    return result


if __name__ == "__main__":
    import doctest


Setting `pad_token_id` to `eos_token_id`:32014 for open-end generation.


Generated Code:
 

def truncate_number(number: float) -> float:
    """ Given a positive floating point number, it can be decomposed into
    and integer part (largest integer smaller than given number) and decimals
    (leftover part always smaller than 1).

    Return the decimal part of the number.
    >>> truncate_number(3.5)
    0.5
    """
    return number % 1


def test_truncate_number():
    assert truncate_number(3.5) == 0.5
    assert truncate_number(10.25) == 0.25
    assert truncate_number(7.0) == 0.0
    assert truncate_number(123.456) == 0.456
    assert truncate_number(9876.54321) == 0.54321


if __name__ == "__main__":
    import doctest

    doctest.testmod()
**********************************************************************
File "__main__", line 9, in __main__.separate_paren_groups
Failed example:
    separate_paren_groups('( ) (( )) (( )( ))')
Expected:
    ['()', '(())', '(()())']
Got:
    ['( )', '( ) (( ))', '( ) (( )) (( )( ))']
*****************************

Setting `pad_token_id` to `eos_token_id`:32014 for open-end generation.


Generated Code:
 from typing import List


def below_zero(operations: List[int]) -> bool:
    """ You're given a list of deposit and withdrawal operations on a bank account that starts with
    zero balance. Your task is to detect if at any point the balance of account fallls below zero, and
    at that point function should return True. Otherwise it should return False.
    >>> below_zero([1, 2, 3])
    False
    >>> below_zero([1, 2, -4, 5])
    True
    """
    balance = 0
    for operation in operations:
        balance += operation
        if balance < 0:
            return True
    return False


if __name__ == "__main__":
    import doctest
    doctest.testmod()
**********************************************************************
File "__main__", line 9, in __main__.separate_paren_groups
Failed example:
    separate_paren_groups('( ) (( )) (( )( ))')
Expected:
    ['()', '(())', '(()())']
Got:
    ['( )', '( ) (( ))', '( ) (( )) (( )( ))']
*************************************

In [None]:
from typing import List, Dict
import numpy as np
from concurrent.futures import ThreadPoolExecutor
import timeout_decorator

class CodeEvaluator:
    def __init__(self, dataset="openai_humaneval"):
        self.dataset = load_dataset(dataset)
        self.metrics = {
            "pass@1": 0.0,
            "pass@10": 0.0,
            "pass@100": 0.0,
            "syntax_validity": 0.0,
            "execution_accuracy": 0.0
        }

    @timeout_decorator.timeout(5)  # 5 second timeout for each test
    def execute_test_case(self, code: str, test_case: str) -> bool:
        try:
            namespace = {}
            exec(code, namespace)
            exec(test_case, namespace)
            return True
        except Exception as e:
            return False

    def check_syntax(self, code: str) -> bool:
        try:
            compile(code, '<string>', 'exec')
            return True
        except SyntaxError:
            return False

    def evaluate_single_solution(self,
                               task_id: int,
                               generated_code: str,
                               num_samples: int = 1) -> Dict:
        task = self.dataset["test"][task_id]

        # Check syntax
        syntax_valid = self.check_syntax(generated_code)

        # Execute test cases
        if syntax_valid:
            test_cases = task["test_cases"]
            with ThreadPoolExecutor() as executor:
                results = list(executor.map(
                    lambda tc: self.execute_test_case(generated_code, tc),
                    test_cases
                ))
            execution_success = all(results)
        else:
            execution_success = False

        return {
            "syntax_valid": syntax_valid,
            "execution_success": execution_success
        }

    def evaluate_model(self, model, tokenizer, n_tasks: int = None):
        if n_tasks is None:
            n_tasks = len(self.dataset["test"])

        results = []
        for i in range(n_tasks):
            task = self.dataset["test"][i]
            prompt = task["prompt"]

            # Generate code
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
            outputs = model.generate(
                inputs["input_ids"],
                max_new_tokens=512,
                num_return_sequences=1,
                temperature=0.8
            )
            generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Evaluate the generated code
            result = self.evaluate_single_solution(i, generated_code)
            results.append(result)

        # Calculate metrics
        self.metrics["syntax_validity"] = np.mean([r["syntax_valid"] for r in results])
        self.metrics["execution_accuracy"] = np.mean([r["execution_success"] for r in results])

        return self.metrics

# Initialize evaluator
evaluator = CodeEvaluator()

# Function to evaluate and log results for each stage
def evaluate_stage(model, tokenizer, stage_name: str):
    print(f"\nEvaluating {stage_name}...")
    metrics = evaluator.evaluate_model(model, tokenizer)

    print(f"\nResults for {stage_name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

    return metrics

**Apply Supervised Fine Tuning to Base Model**

In [None]:
# Install the necessary packages
!pip install transformers datasets



In [None]:
from datasets import load_dataset

# Load the CodeSearchNet dataset (Python subset)
codesearchnet = load_dataset("code_search_net", "python")

# Check the structure of the dataset
print(codesearchnet)

README.md:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

code_search_net.py:   0%|          | 0.00/8.44k [00:00<?, ?B/s]

The repository for code_search_net contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/code_search_net.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


python.zip:   0%|          | 0.00/941M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/412178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/22176 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/23107 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 412178
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 22176
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 23107
    })
})


In [None]:
# Inspect the first sample from the training set with the correct field names
sample = codesearchnet["train"][0]

# Print the code and corresponding documentation
print(f"Code:\n{sample['func_code_string']}")
print(f"Documentation:\n{sample['func_documentation_string']}")

Code:
def html(self):
        """ Wrapper for only doing the rendering on request (drastically reduces memory) """
        return self.render(self.data, self.proj, self.obj)
Documentation:
Wrapper for only doing the rendering on request (drastically reduces memory)


In [None]:
def tokenize_function(examples):
    # Extract docstrings and code strings from examples
    inputs = examples["func_documentation_string"]
    targets = examples["func_code_string"]

    # Tokenize inputs with padding and truncation
    model_inputs = tokenizer(
        inputs, max_length=512, truncation=True, padding="max_length"
    )

    # Tokenize targets (labels) with padding and truncation
    labels = tokenizer(
        targets, max_length=512, truncation=True, padding="max_length"
    ).input_ids

    # Replace padding tokens in the labels with -100 to ignore them in loss calculation
    labels = [
        [(label if label != tokenizer.pad_token_id else -100) for label in seq]
        for seq in labels
    ]

    # Add labels to the model inputs
    model_inputs["labels"] = labels
    return model_inputs

# Tokenize the datasets
tokenized_datasets = codesearchnet.map(tokenize_function, batched=True)

# Check the first example to confirm tokenization
print(tokenized_datasets["train"][0])

Map:   0%|          | 0/412178 [00:00<?, ? examples/s]

Map:   0%|          | 0/22176 [00:00<?, ? examples/s]

Map:   0%|          | 0/23107 [00:00<?, ? examples/s]

{'repository_name': 'Fortran-FOSS-Programmers/ford', 'func_path_in_repository': 'ford/output.py', 'func_name': 'BasePage.html', 'whole_func_string': 'def html(self):\n        """ Wrapper for only doing the rendering on request (drastically reduces memory) """\n        return self.render(self.data, self.proj, self.obj)', 'language': 'python', 'func_code_string': 'def html(self):\n        """ Wrapper for only doing the rendering on request (drastically reduces memory) """\n        return self.render(self.data, self.proj, self.obj)', 'func_code_tokens': ['def', 'html', '(', 'self', ')', ':', 'return', 'self', '.', 'render', '(', 'self', '.', 'data', ',', 'self', '.', 'proj', ',', 'self', '.', 'obj', ')'], 'func_documentation_string': 'Wrapper for only doing the rendering on request (drastically reduces memory)', 'func_documentation_tokens': ['Wrapper', 'for', 'only', 'doing', 'the', 'rendering', 'on', 'request', '(', 'drastically', 'reduces', 'memory', ')'], 'split_name': 'train', 'func_c

In [None]:
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorWithPadding

# Load SemCoder with training mode enabled
model = AutoModelForCausalLM.from_pretrained(
    "/content/SemCoder", torch_dtype=torch.bfloat16
).cuda()

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("/content/SemCoder")

# Initialize the data collator to dynamically pad inputs
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define training arguments with W&B disabled
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=1,  # Minimal batch size
    per_device_eval_batch_size=1,   # Minimal batch size
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    report_to="none", # Disable W&B integration
    fp16=True,  # Use mixed precision
    gradient_accumulation_steps=32,  # Larger gradient accumulation
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
)

In [None]:
# Start fine-tuning SemCoder on the CodeSearchNet dataset
trainer.train()

In [None]:
# Save the fine-tuned model and tokenizer
trainer.save_model("./semcoder-sft")
tokenizer.save_pretrained("./semcoder-sft")

print("Fine-tuned model saved!")

In [None]:
# Load the fine-tuned model
fine_tuned_model = AutoModelForCausalLM.from_pretrained("./semcoder-sft").cuda()

# Test the fine-tuned model with a new prompt
test_prompt = "Write a Python function to find the maximum element in a list."

# Tokenize the prompt
inputs = tokenizer(test_prompt, return_tensors="pt").to(fine_tuned_model.device)

# Generate code
outputs = fine_tuned_model.generate(inputs["input_ids"], max_new_tokens=100)
generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated Code:\n", generated_code)

**Implement CodeDPO with SemCoder**