In [1]:
#!/usr/bin/env python3
"""
Evaluation script using real samples from the training dataset
Compares model output with expected output from the dataset
"""
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
import random
import json

# =============================================================================
# CONFIGURATION
# =============================================================================

MODEL_NAME = "nareshmlx/code-reviewer-opencv-16k"
DATASET_NAME = "nareshmlx/16k_opencvpr"
MAX_SEQ_LENGTH = 16384
LOAD_IN_4BIT = True
NUM_SAMPLES = 5  # Number of random samples to test

# =============================================================================
# LOAD MODEL
# =============================================================================

print("="*80)
print("LOADING MODEL")
print("="*80)
print(f"Model: {MODEL_NAME}")
print(f"Max sequence length: {MAX_SEQ_LENGTH}")
print(f"4-bit quantization: {LOAD_IN_4BIT}\n")

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,
    load_in_4bit=LOAD_IN_4BIT,
    trust_remote_code=True,
    device_map="auto",
)

# Enable fast inference mode
FastLanguageModel.for_inference(model)

# Set padding token if not set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("âœ“ Model loaded successfully!\n")

# =============================================================================
# LOAD DATASET
# =============================================================================

print("="*80)
print("LOADING DATASET")
print("="*80)

dataset = load_dataset(DATASET_NAME, split="train")
print(f"Dataset size: {len(dataset)} examples")

# Filter for reasonable-sized examples (< 8000 tokens)
print("Filtering for reasonably-sized examples...")
filtered_indices = []
for i in range(len(dataset)):
    input_text = dataset[i].get('input', '')
    instruction = dataset[i].get('instruction', '')
    total_len = len(input_text) + len(instruction)
    if total_len < 32000:  # ~8000 tokens
        filtered_indices.append(i)

print(f"Filtered dataset size: {len(filtered_indices)} examples")
print(f"Testing on: {NUM_SAMPLES} random samples\n")

# Sample random examples from filtered dataset
random.seed(42)
if len(filtered_indices) < NUM_SAMPLES:
    sample_indices = filtered_indices
else:
    sample_indices = random.sample(filtered_indices, NUM_SAMPLES)
samples = [dataset[i] for i in sample_indices]

print("âœ“ Dataset loaded successfully!\n")

# =============================================================================
# INFERENCE FUNCTION
# =============================================================================

def generate_review(instruction, code, max_tokens=1024):
    """Generate code review for given prompt"""
    # Format prompt similar to training data
    if code.strip():
        prompt = f"{instruction}\n\nHere is the code:\n{code}"
    else:
        prompt = instruction
    
    messages = [{"role": "user", "content": prompt}]
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")
    
    outputs = model.generate(
        input_ids=inputs,
        max_new_tokens=max_tokens,
        use_cache=True,
        temperature=0.5,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.15,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract assistant response
    if "assistant" in response:
        response = response.split("assistant")[-1].strip()
    
    return response

# =============================================================================
# EVALUATION
# =============================================================================

def truncate_text(text, max_lines=15, max_chars=2000):
    """Truncate text to max_lines or max_chars for display"""
    if len(text) > max_chars:
        return text[:max_chars] + f"\n... (truncated, {len(text) - max_chars} more chars)"
    
    lines = text.split('\n')
    if len(lines) > max_lines:
        return '\n'.join(lines[:max_lines]) + f"\n... ({len(lines) - max_lines} more lines)"
    return text

def extract_code_diff(code_input):
    """Extract only the code diff from input JSON"""
    try:
        code_data = json.loads(code_input)
        # Try to get target_hunk first (the specific change being reviewed)
        if 'target_hunk' in code_data and code_data['target_hunk'].strip():
            return code_data['target_hunk']
        # Otherwise get full_diff
        elif 'full_diff' in code_data:
            return code_data['full_diff']
        else:
            return code_input
    except (json.JSONDecodeError, TypeError):
        return code_input

def main():
    print("="*80)
    print("RUNNING EVALUATION ON DATASET SAMPLES")
    print("="*80 + "\n")
    
    for i, sample in enumerate(samples, 1):
        print(f"\n{'='*80}")
        print(f"SAMPLE {i}/{len(samples)} (Index: {sample_indices[i-1]})")
        print(f"{'='*80}\n")
        
        # Extract fields
        instruction = sample.get('instruction', '')
        code_input = sample.get('input', '')
        expected_output = sample.get('output', '')
        
        # Display instruction (truncated)
        print(f"INSTRUCTION:")
        print(f"{'-'*80}")
        print(truncate_text(instruction, max_lines=5, max_chars=500))
        print(f"{'-'*80}\n")
        
        # Extract and display only the code diff
        code_diff = extract_code_diff(code_input)
        print(f"CODE DIFF:")
        print(f"{'-'*80}")
        print(truncate_text(code_diff, max_lines=30, max_chars=3000))
        print(f"{'-'*80}\n")
        
        # Display expected output
        print(f"EXPECTED OUTPUT (from dataset):")
        print(f"{'-'*80}")
        print(expected_output)
        print(f"{'-'*80}\n")
        
        # Generate model output
        print(f"MODEL OUTPUT:")
        print(f"{'-'*80}")
        model_output = generate_review(instruction, code_input, max_tokens=1024)
        print(model_output)
        print(f"{'-'*80}\n")
        
        # Simple comparison
        print(f"COMPARISON:")
        print(f"{'-'*80}")
        
        # Check if key phrases match
        expected_words = set(expected_output.lower().split())
        model_words = set(model_output.lower().split())
        overlap = len(expected_words & model_words)
        total = len(expected_words)
        similarity = (overlap / total * 100) if total > 0 else 0
        
        print(f"Word overlap: {overlap}/{total} ({similarity:.1f}%)")
        print(f"Expected length: {len(expected_output)} chars")
        print(f"Model length: {len(model_output)} chars")
        
        print(f"{'-'*80}\n")
        
        if i < len(samples):
            choice = input("Press Enter for next sample, 'q' to quit, 'f' for full output: ").strip().lower()
            if choice == 'q':
                break
            elif choice == 'f':
                print("\n" + "="*80)
                print("FULL CODE DIFF")
                print("="*80)
                print(code_diff)
                print("\n" + "="*80)
                print("FULL EXPECTED OUTPUT")
                print("="*80)
                print(expected_output)
                print("\n" + "="*80)
                print("FULL MODEL OUTPUT")
                print("="*80)
                print(model_output)
                print("="*80 + "\n")
                input("Press Enter to continue...")
    
    print("\n" + "="*80)
    print("EVALUATION COMPLETE!")
    print("="*80)
    
    # Summary statistics
    print("\n" + "="*80)
    print("SUMMARY")
    print("="*80)
    print(f"Tested {len(samples)} samples from dataset")
    print(f"Model: {MODEL_NAME}")
    print(f"Dataset: {DATASET_NAME}")
    print("="*80)

if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\n\nEvaluation interrupted by user")
    except Exception as e:
        print(f"\n\nError: {e}")
        import traceback
        traceback.print_exc()

ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.


Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu128 for torchao version 0.14.0         Please see GitHub issue #2919 for more info


Unsloth: Your Flash Attention 2 installation seems to be broken?
A possible explanation is you have a new CUDA version which isn't
yet compatible with FA2? Please file a ticket to Unsloth or FA2.
We shall now use Xformers instead, which does not have any performance hits!
We found this negligible impact by benchmarking on 1x A100.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
LOADING MODEL
Model: nareshmlx/code-reviewer-opencv-16k
Max sequence length: 16384
4-bit quantization: True

Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.10.5: Fast Qwen2 patching. Transformers: 4.55.4.
   \\   /|    NVIDIA RTX PRO 6000 Blackwell Server Edition. Num GPUs = 1. Max memory: 94.971 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 12.0. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled 

INFO:accelerate.utils.modeling: We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Unsloth 2025.10.5 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


âœ“ Model loaded successfully!

LOADING DATASET
Dataset size: 10282 examples
Filtering for reasonably-sized examples...
Filtered dataset size: 7320 examples
Testing on: 5 random samples

âœ“ Dataset loaded successfully!

RUNNING EVALUATION ON DATASET SAMPLES


SAMPLE 1/5 (Index: 7392)

INSTRUCTION:
--------------------------------------------------------------------------------
# OpenCV Expert Code Reviewer

You are a **world-class expert code reviewer** for the OpenCV computer vision library. Your mission is to conduct thorough, actionable reviews that maintain OpenCV's high standards.

## ðŸŽ¯ **Core Objectives**
1. **Improve code health** - Every change should enhance the codebase
2. **Maintain OpenCV standards** - Enforce C++11 compliance and cross-platform compatibility  
3. **Provide actionable feedback** - Specific, educational comments with clear paths forward
4. 
... (truncated, 2981 more chars)
--------------------------------------------------------------------------------

