In [0]:
# Declare the notebook's input widgets. Each entry is (widget name, label);
# every widget is a free-text field whose default value is the empty string.
for _widget_name, _widget_label in (
    ("BASE_MODEL_NAME", "Base model"),
    ("NEW_MODEL_NAME", "Train name"),
    ("CHALLENGER_VERSION", "Challenger version"),
):
    dbutils.widgets.text(_widget_name, "", _widget_label)

In [0]:
!pip install -q transformers tf-keras torch evaluate pylint radon pyyaml bitsandbytes accelerate git+https://github.com/google-research/bleurt.git

%restart_python

In [0]:
import mlflow
import os
import multiprocessing


# Work out how many CPU threads are available (os.cpu_count() first,
# multiprocessing.cpu_count() if that returns nothing).
num_threads = os.cpu_count() or multiprocessing.cpu_count()
print(f"Number of available threads: {num_threads}")

# Export the thread count through the threading-related environment variables
# and set TOKENIZERS_PARALLELISM to "false", all in one update.
os.environ.update({
    "OMP_NUM_THREADS": f"{num_threads}",
    "MKL_NUM_THREADS": f"{num_threads}",
    "NUMEXPR_NUM_THREADS": f"{num_threads}",
    "BLAS_NUM_THREADS": f"{num_threads}",
    "TOKENIZERS_PARALLELISM": "false",
})

# Use the Databricks model registry and create the MLflow client that later
# cells use to look up model versions.
mlflow.set_registry_uri("databricks")
client = mlflow.tracking.MlflowClient()

# Read the run parameters from the notebook widgets, in declaration order.
BASE_MODEL_NAME, NEW_MODEL_NAME, CHALLENGER_VERSION = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("BASE_MODEL_NAME", "NEW_MODEL_NAME", "CHALLENGER_VERSION")
)

print(f"Starting evaluation for model: {NEW_MODEL_NAME}")
print(f"Challenger version: {CHALLENGER_VERSION}")

In [0]:
import torch
import ast
import re


def generate_from_original(model, tokenizer, original_code, device="cpu",
                           max_new_tokens=128, eos_token_id=32021):
    """Ask ``model`` to refactor ``original_code`` and return the generated code.

    The source is wrapped in a fixed instruction prompt that ends with an
    opened ```python fence, one sequence is sampled, and everything the model
    produced before the first closing fence is returned.

    Args:
        model: Object exposing ``generate(**inputs, ...)``.
        tokenizer: Callable tokenizer that also exposes ``decode`` and
            ``pad_token_id``.
        original_code: Python source to place in the prompt.
        device: Device the tokenized inputs are moved to.
        max_new_tokens: Upper bound on generated tokens (default 128, the
            previously hard-coded value).
        eos_token_id: Token id (or list of ids) that stops generation
            (default 32021, the previously hard-coded value).

    Returns:
        The generated text up to the first closing code fence, or the whole
        response when there is no fence, with surrounding whitespace removed.
    """
    # NOTE(review): the prompt is kept verbatim (spelling included) because it
    # presumably has to match the prompt used for fine-tuning - confirm before
    # editing the wording.
    prompt = f"""
You are a coding assistant. Your task is to refactore a provided Python code.
Respond ONLY with a clean and efficient Python code and nothing else.
No comments or other text. No explanations. No markdown.<|EOT|>

User:
{original_code}
<|EOT|>

A:
```python
"""
    print("Tokenizing")
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)

    # Some tokenizers define no pad token; fall back to the EOS id instead of
    # handing ``None`` to generate().
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        if isinstance(eos_token_id, (list, tuple)):
            pad_token_id = eos_token_id[0]
        else:
            pad_token_id = eos_token_id

    print("Generating")
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            num_return_sequences=1,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id
        )

    # The returned sequence starts with the prompt tokens; decode only the
    # tokens that come after them.
    prompt_length = inputs['input_ids'].shape[1]
    response = tokenizer.decode(
        output_ids[0][prompt_length:],
        skip_special_tokens=True
    )

    # The prompt already opened a code fence, so the code is whatever precedes
    # the first closing fence; without a fence the whole response is used.
    fence_match = re.search(r"(.*?)```", response, re.DOTALL)
    code = (fence_match.group(1) if fence_match else response).strip()

    print(f"Trimmed code:\n{code}")

    return code

In [0]:
import yaml
import torch
from pathlib import Path
from transformers import AutoModelForCausalLM, AutoTokenizer


# Load the test suite from ./tests.yaml (resolved against the current working
# directory) and keep the value stored under its top-level "tests" key.
tests_file = Path("./tests.yaml")
py_tests = yaml.safe_load(tests_file.read_text())["tests"]
print(f"Successfully loaded {len(py_tests)} tests")


def run_tests(model_uri):
    """Generate code for every entry of ``py_tests`` with the model at ``model_uri``.

    The model is loaded on CPU through ``mlflow.transformers.load_model`` and
    each test's ``"original"`` source is passed to ``generate_from_original``.

    Args:
        model_uri: MLflow model URI handed to ``mlflow.transformers.load_model``.

    Returns:
        A list with one dict per test, holding the test's ``"original"``
        source, its ``"refactored"`` reference from the test file, and the
        model's ``"generated"`` output.

    Raises:
        KeyError: If a test entry lacks the ``"original"`` or ``"refactored"`` key.
    """
    import gc  # local import: only needed for the cleanup at the end

    print(f"Running generation for model at: {model_uri}")

    loaded_model_pipeline = mlflow.transformers.load_model(
        model_uri,
        device_map="cpu",
        low_cpu_mem_usage=True
    )

    model = loaded_model_pipeline.model
    tokenizer = loaded_model_pipeline.tokenizer

    results = []
    for index, test in enumerate(py_tests, start=1):
        refactored_code = test["refactored"]
        original_code = test["original"]

        print(f"Generating code for test {index}...")
        generated_code = generate_from_original(model, tokenizer, original_code)

        results.append({
            "original": original_code,
            # Carried along so downstream scoring can compare against the
            # expected refactoring instead of only the input.
            "refactored": refactored_code,
            "generated": generated_code,
        })

    # Drop every local reference to the model and force a collection so that
    # objects kept alive only by reference cycles are reclaimed before another
    # model is loaded.
    del model, tokenizer, loaded_model_pipeline
    gc.collect()

    return results

In [0]:
def run_benchmark(tests, metric, model_uri, reference_key="original"):
    """Score generated code with ``metric`` and return the mean score.

    Each test is scored on its own: ``metric.compute`` receives the test's
    ``"generated"`` text as the single prediction and ``test[reference_key]``
    as the single reference, and the first entry of the returned ``"scores"``
    list is kept.

    Args:
        tests: Sequence of dicts with a ``"generated"`` key and a
            ``reference_key`` key.
        metric: Object exposing ``compute(predictions=..., references=...)``
            that returns a mapping with a ``"scores"`` list.
        model_uri: Identifier of the evaluated model; only used for logging.
        reference_key: Name of the field used as the reference text. Defaults
            to ``"original"``, which is what this function always used.
            NOTE(review): that default scores the output against the *input*
            code; if the entries carry the expected refactoring under another
            key, passing that key is probably what is wanted - confirm.

    Returns:
        The arithmetic mean of the per-test scores.

    Raises:
        ValueError: If ``tests`` is empty, since no average can be computed.
        KeyError: If a test lacks ``"generated"`` or ``reference_key``.
    """
    print(f"Running benchmark for model at: {model_uri}")

    scores = []
    for index, test in enumerate(tests, start=1):
        print(f"Computing benchmark score for test {index}...")
        result = metric.compute(
            predictions=[test["generated"]],
            references=[test[reference_key]]
        )
        scores.append(result['scores'][0])

    # Fail with a clear message instead of an opaque ZeroDivisionError.
    if not scores:
        raise ValueError(
            f"Cannot compute a benchmark score for {model_uri}: no tests were provided"
        )

    average_score = sum(scores) / len(scores)
    print(f"Average benchmark score: {average_score}")

    return average_score

In [0]:
# Generate outputs with the challenger, i.e. the model version under evaluation.
challenger_uri = f"models:/{NEW_MODEL_NAME}/{CHALLENGER_VERSION}"
challenger_tests = run_tests(challenger_uri)

# Always (re)bind the champion variables so they are defined on a fresh kernel
# and cannot keep stale values from an earlier run of this cell when no
# Production version exists.
champion = None
champion_version = None
champion_uri = None
champion_tests = None

# Look up the version of the same registered model currently in "Production".
latest_prod = client.get_latest_versions(NEW_MODEL_NAME, stages=["Production"])

if latest_prod:
    champion = latest_prod[0]
    champion_version = champion.version
    print(f"Found Champion: version {champion_version}")
    champion_uri = f"models:/{NEW_MODEL_NAME}/{champion_version}"
    champion_tests = run_tests(champion_uri)
else:
    print("No Production champion found; only the challenger will be scored")


In [0]:
import evaluate


# Load the BLEURT metric (BLEURT-20 configuration) and score the generations.
bleurt = evaluate.load("bleurt", module_type="metric", config_name="BLEURT-20")
challenger_score = run_benchmark(challenger_tests, bleurt, challenger_uri)

# The champion is scored only when a Production version was found earlier;
# otherwise its score is reported as None.
if latest_prod:
    champion_score = run_benchmark(champion_tests, bleurt, champion_uri)
else:
    champion_score = None
del bleurt


# Return scores
# NOTE(review): the dict is handed to dbutils.notebook.exit as-is; how the
# calling job parses the exit value is not visible here - confirm before
# changing its format.
exit_payload = {
    "champion_score": champion_score,
    "challenger_score": challenger_score
}
dbutils.notebook.exit(exit_payload)