In [1]:
import torch
from unsloth import FastLanguageModel
from tqdm import tqdm
import os

# ==============================================================================
#  CONFIGURATION
# ==============================================================================
# Model repository id; passed as `model_name` to FastLanguageModel.from_pretrained.
MODEL_ID = "onekq-ai/starcoder2-3b-bnb-4bit"
# Passed as `max_seq_length` to FastLanguageModel.from_pretrained.
MAX_SEQ_LENGTH = 2048
# Path of the UTF-8 text report written once every prompt has been run.
OUTPUT_FILE = "starcoder_results.txt"

# ==============================================================================
#  THE BENCHMARK DATASET (Same 50 Questions)
# ==============================================================================
# Each entry is a dict with three keys:
#   "id"     - problem number (1-50), written to the results file
#   "cat"    - category label, written next to the id in the results file
#   "prompt" - task text handed to generate_starcoder_response()
# NOTE(review): the example in problem 3 looks wrong. Keeping the vowels of
# 'design' in place and reversing the remaining letters gives 'negisd', not
# 'nisedg'. The prompt text is part of the benchmark, so confirm before editing.
# NOTE(review): problems 49 ("Algo") and 50 ("Creative") sit under the
# "SECURITY & EDGE CASES" section header; confirm whether that is intended.
BENCHMARK_DATASET = [
    # --- CATEGORY 1: CONSTRAINT TRAPS ---
    {"id": 1, "cat": "Constraint", "prompt": "Write a Python function `weird_sort(nums)` that sorts a list of integers, BUT odd numbers must remain in their original index positions. Only even numbers should be sorted in ascending order."},
    {"id": 2, "cat": "Constraint", "prompt": "Implement a `limited_stack` class. It works like a normal stack, but if the sum of elements inside exceeds 100, the `push` operation should automatically `pop` elements from the bottom until the sum is under 100."},
    {"id": 3, "cat": "Constraint", "prompt": "Write a function to reverse a string, but keep all vowels in their original positions. Example: 'design' -> 'nisedg'."},
    {"id": 4, "cat": "Constraint", "prompt": "Create a function that generates the Fibonacci sequence up to N, but replace any prime number in the sequence with the string 'PRIME'."},
    {"id": 5, "cat": "Constraint", "prompt": "Write a Python dictionary comprehension that maps numbers 1 to 10 to their squares, but ONLY if the square is an even number. If the square is odd, map it to 0."},
    {"id": 6, "cat": "Constraint", "prompt": "Write a function that takes a list of strings and sorts them based on the *third* letter of each word. If a word is shorter than 3 letters, put it at the end."},
    {"id": 7, "cat": "Constraint", "prompt": "Implement a counter that counts from 1 to 100, but skips numbers that contain the digit '3' (e.g., 3, 13, 30-39)."},
    {"id": 8, "cat": "Constraint", "prompt": "Write a function `merge_alternating(list1, list2)` that merges two lists by taking elements alternately. If one list runs out, reverse the remaining elements of the other list and append them."},
    {"id": 9, "cat": "Constraint", "prompt": "Create a class `OneTimeDict`. It behaves like a dictionary, but once a key is read (accessed), that key is automatically deleted."},
    {"id": 10, "cat": "Constraint", "prompt": "Write a function that finds the maximum number in a list, but you are NOT allowed to use `max()`, `sorted()`, `sort()`, or any comparison operators like `>` or `<`. (Hint: use subtraction and absolute values)."},

    # --- CATEGORY 2: FORMAT TORTURE ---
    {"id": 11, "cat": "Format", "prompt": "Write a Python Hello World script. However, you MUST NOT use the string 'Hello World' directly. You must construct it using ASCII character codes and the `chr()` function."},
    {"id": 12, "cat": "Format", "prompt": "Generate a Python function to calculate factorial. BUT, the entire function body must be written on a single line using lambda functions. No `def` allowed."},
    {"id": 13, "cat": "Format", "prompt": "Write a standard binary search function. However, all variable names must be fruits (e.g., 'apple' for left, 'banana' for right)."},
    {"id": 14, "cat": "Format", "prompt": "Create a JSON object representing a user profile. The keys must be 'u_id', 'u_name', and 'u_age'. Do NOT write any Python code, just output the raw JSON string inside markdown code blocks."},
    {"id": 15, "cat": "Format", "prompt": "Write a Python comment block describing the Theory of Relativity. Do not write any executable code."},
    {"id": 16, "cat": "Format", "prompt": "Write a SQL query to select users, but format the SQL query as a single Python string variable named `sql_query`. Do not explain the query."},
    {"id": 17, "cat": "Format", "prompt": "Write a Python function to add two numbers, but you must use a nested function architecture (a closure) to achieve it."},
    {"id": 18, "cat": "Format", "prompt": "Output the CSS code to center a div, but you must use Grid, not Flexbox. Provide ONLY the CSS."},
    {"id": 19, "cat": "Format", "prompt": "Write a Python list comprehension that produces the first 10 even numbers. However, you must wrap the list comprehension in a `try-except` block within the function."},
    {"id": 20, "cat": "Format", "prompt": "Write a bash script to list files, but all comments in the script must be written in French."},

    # --- CATEGORY 3: REAL WORLD MESSY ---
    {"id": 21, "cat": "Real World", "prompt": "I have a log string: '[ERROR] 2023-10-05 User:admin caused:Timeout'. Write a regex to extract the severity (ERROR), date, username, and error type into a dictionary."},
    {"id": 22, "cat": "Real World", "prompt": "Write a Pandas one-liner to filter a DataFrame `df`. Keep rows where column 'A' is greater than 10 OR column 'B' is less than 5, AND column 'C' is not Null."},
    {"id": 23, "cat": "Real World", "prompt": "Write a python script to rename all files in a folder. If a file is 'image.jpg', rename it to 'img_001.jpg', 'img_002.jpg', etc. It must handle padding zeros correctly based on the total file count."},
    {"id": 24, "cat": "Real World", "prompt": "Implement a 'rate_limiter' decorator in Python. It should allow a function to be called only 5 times every 10 seconds. If exceeded, raise an exception."},
    {"id": 25, "cat": "Real World", "prompt": "Write a function to validate a credit card number using the Luhn algorithm."},
    {"id": 26, "cat": "Real World", "prompt": "Given a list of dirty phone numbers like ['123-456-7890', '(123) 456 7890', '123.456.7890'], write a function to normalize them all to '1234567890'."},
    {"id": 27, "cat": "Real World", "prompt": "Parse a CSV string manually without using the `csv` library. The string deals with quoted fields containing commas. Example line: `1, \"Apple, Red\", $1.00`."},
    {"id": 28, "cat": "Real World", "prompt": "Write a function that takes a URL and returns the domain name (e.g., 'https://www.google.com/search' -> 'google.com'). Handle subdomains correctly."},
    {"id": 29, "cat": "Real World", "prompt": "Implement a simple exponential backoff strategy for a failing network request using a while loop."},
    {"id": 30, "cat": "Real World", "prompt": "Write a function to convert a nested JSON object into a flat dictionary where keys are separated by dots (e.g., {'a': {'b': 1}} -> {'a.b': 1})."},

    # --- CATEGORY 4: ALGORITHMIC & CREATIVE ---
    {"id": 31, "cat": "Creative", "prompt": "Write a Python script to print a pyramid of stars of height 5. It must be centered."},
    {"id": 32, "cat": "Creative", "prompt": "Write code to generate a random maze using Depth First Search (DFS). Represent the maze using '#' for walls and ' ' for paths."},
    {"id": 33, "cat": "Creative", "prompt": "Simulate a text-based traffic light system. It should loop forever, printing 'RED', waiting 3s, 'GREEN', waiting 3s, 'YELLOW', waiting 1s."},
    {"id": 34, "cat": "Creative", "prompt": "Write a Python class `VirtualPet`. It has `hunger` and `energy`. Methods: `feed()` decreases hunger, `play()` decreases energy. If energy < 0, it sleeps."},
    {"id": 35, "cat": "Creative", "prompt": "Write a function that takes a sentence and prints it out vertically, like a banner."},
    {"id": 36, "cat": "Algo", "prompt": "Implement the 'Sieve of Eratosthenes' to find all primes up to N, but optimize it to use a bitarray instead of a list of booleans for memory efficiency."},
    {"id": 37, "cat": "Algo", "prompt": "Write a function to check if two strings are anagrams, but you must do it in O(n) time and O(1) space (assuming fixed alphabet size)."},
    {"id": 38, "cat": "Algo", "prompt": "Implement a 'MinStack' that supports push, pop, top, and retrieving the minimum element in constant time O(1)."},
    {"id": 39, "cat": "Algo", "prompt": "Write a function to find the longest substring without repeating characters in a string."},
    {"id": 40, "cat": "Algo", "prompt": "Implement a basic version of the 'cd' (change directory) command logic. Given a current path and a command (like '../abc/./def'), resolve the new path."},

    # --- CATEGORY 5: SECURITY & EDGE CASES ---
    {"id": 41, "cat": "Security", "prompt": "Write a SQL query construction to select a user by ID, but protect it against SQL Injection without using an ORM. Use parameterized queries logic."},
    {"id": 42, "cat": "Security", "prompt": "Write a Python function to safely delete a file. It must check if the file exists and ensure the path is not outside the allowed directory (prevent directory traversal)."},
    {"id": 43, "cat": "Edge Case", "prompt": "Write a function to calculate the average of a list. Handle the edge cases: empty list, list with None values, and list with strings."},
    {"id": 44, "cat": "Edge Case", "prompt": "Implement division of two numbers, but handle division by zero gracefully by returning None, and ensure floating point precision issues are minimized."},
    {"id": 45, "cat": "Edge Case", "prompt": "Write a function to parse a date string 'YYYY-MM-DD'. Handle invalid dates like '2023-02-30' (February 30th) without crashing."},
    {"id": 46, "cat": "Security", "prompt": "Write a function to generate a secure random token of 32 bytes, encoded in URL-safe Base64."},
    {"id": 47, "cat": "Edge Case", "prompt": "Merge two dictionaries. If a key exists in both, the value should become a list containing both values. Handle cases where the original value is already a list."},
    {"id": 48, "cat": "Security", "prompt": "Sanitize a user input string to remove any potential HTML tags to prevent XSS attacks."},
    {"id": 49, "cat": "Algo", "prompt": "Write a function to detect if a binary tree is balanced. Return True or False."},
    {"id": 50, "cat": "Creative", "prompt": "Write a Python script that simulates a simple ATM state machine (Idle -> PIN -> Menu -> Withdraw -> Dispense). Use a while loop and user input."}
]

# ==============================================================================
#  INFERENCE ENGINE
# ==============================================================================
def generate_starcoder_response(model, tokenizer, prompt):
    """Generate a completion for one benchmark prompt and return only the new text.

    The task description is wrapped in a triple-quoted, docstring-style header
    so that the model can be driven in plain completion mode.

    Args:
        model: Object exposing a Hugging Face style ``generate(**inputs, ...)``
            method that returns a batch of token-id sequences.
        tokenizer: Callable tokenizer whose result supports ``.to(device)`` and
            ``["input_ids"]``; it must also provide ``decode`` and
            ``eos_token_id``.
        prompt: Natural-language task description.

    Returns:
        The generated continuation, without the prompt, with surrounding
        whitespace stripped.
    """
    # StarCoder2 is a base model. It works best with "Completion" style prompts.
    # We format the prompt as a Python docstring or comment block to trigger code generation.
    formatted_prompt = f"\"\"\"\nTask: {prompt}\n\nWrite a Python solution:\n\"\"\"\n"

    inputs = tokenizer([formatted_prompt], return_tensors="pt").to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            temperature=0.1,  # Low temp for coding accuracy
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence starts with the prompt tokens. Drop them by token
    # count instead of matching decoded text: decoding is not guaranteed to
    # reproduce the prompt string exactly, and a failed text match would leave
    # the whole prompt inside the returned "response".
    prompt_token_count = inputs["input_ids"].shape[1]
    generated_tokens = outputs[0][prompt_token_count:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return response.strip()

# ==============================================================================
#  MAIN EXECUTION
# ==============================================================================
if __name__ == "__main__":
    # Script entry point: load the model, run every benchmark prompt through
    # it, then write all prompts and responses to OUTPUT_FILE.
    print("\n🤖 Loading Rival 2: StarCoder2-3B (4-bit)...")

    try:
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=MODEL_ID,
            max_seq_length=MAX_SEQ_LENGTH,
            dtype=None,
            load_in_4bit=True,
        )
        FastLanguageModel.for_inference(model)
        print("✅ StarCoder2 loaded successfully.")
    except Exception as e:
        print(f"❌ Error loading StarCoder2: {e}")
        # Stop with a non-zero status so callers can detect the failure; the
        # interactive `exit()` helper reports success and is not meant for
        # use in programs.
        raise SystemExit(1)

    print("\n🚀 Starting Benchmark on StarCoder2...")

    # One response per dataset entry, in dataset order.
    results = [
        generate_starcoder_response(model, tokenizer, task["prompt"])
        for task in tqdm(BENCHMARK_DATASET, desc="StarCoder Progress")
    ]

    # --- SAVE RESULTS ---
    print(f"\n💾 Saving results to {OUTPUT_FILE}...")
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        f.write("=== STARCODER2-3B BENCHMARK RESULTS ===\n\n")
        # Pair each task with its response directly instead of indexing a
        # parallel list by position.
        for task, response in zip(BENCHMARK_DATASET, results):
            f.write(f"PROBLEM ID: {task['id']} ({task['cat']})\n")
            f.write(f"PROMPT: {task['prompt']}\n")
            f.write("-" * 40 + "\n")
            f.write(f"[STARCODER2 RESPONSE]:\n{response}\n")
            f.write("=" * 80 + "\n\n")

    print("✅ Done! StarCoder2 results saved.")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!

🤖 Loading Rival 2: StarCoder2-3B (4-bit)...
==((====))==  Unsloth 2025.10.12: Fast Starcoder2 patching. Transformers: 4.57.1. vLLM: 0.11.2.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 11.629 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.79G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

onekq-ai/starcoder2-3b-bnb-4bit does not have a padding token! Will use pad_token = <|endoftext|>.
✅ StarCoder2 loaded successfully.

🚀 Starting Benchmark on StarCoder2...


StarCoder Progress: 100%|██████████| 50/50 [44:36<00:00, 53.53s/it]


💾 Saving results to starcoder_results.txt...
✅ Done! StarCoder2 results saved.



