In [None]:
# Both environment variables are assigned before the transformers / sglang imports below.
# NOTE(review): the two paths are absolute and specific to one machine/user.
import os
os.environ["HF_HOME"] = "/network/scratch/a/aghajohm/hf_home" # set before transformers
os.environ["CUDA_HOME"] = "/cvmfs/ai.mila.quebec/apps/arch/common/cuda/12.5.1" # Hardcoded for now
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import sglang
import sys
# Add the parent directory to the path so we can import from aha.py
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# NOTE(review): the generate_r1_prompt imported here is shadowed by the notebook-local
# function of the same name defined in a later cell.
from aha import initialize_model, generate_r1_prompt

In [2]:
def format_response(response):
    """Render generated text inside a styled HTML box for notebook display.

    The text is HTML-escaped before it is embedded in the markup, so content
    that looks like tags (for example ``<think>`` in a model response) is shown
    literally instead of being interpreted, and hidden, by the browser.

    Args:
        response: Text to display. Non-string values are converted with ``str``.

    Returns:
        An ``IPython.display.HTML`` object wrapping the formatted markup.
    """
    import html
    from IPython.display import HTML

    # Escape &, <, > and quotes so the response cannot inject or lose markup.
    safe_text = html.escape(str(response))

    formatted_html = f"""
    <div style="background-color: #f8f9fa; padding: 15px; border-radius: 5px; border: 1px solid #ddd;">
        <h3 style="color: #333; margin-top: 0;">Generated Response:</h3>
        <pre style="background-color: #f5f5f5; padding: 10px; border-radius: 3px; overflow-x: auto; white-space: pre-wrap; word-wrap: break-word;">{safe_text}</pre>
    </div>
    """

    return HTML(formatted_html)

In [None]:
# Model selection for the judge/chat side of the notebook.
# NOTE: CHECKPOINT_OR_NAME is assigned twice in this cell and the last assignment wins,
# so the local test checkpoint path on the first line is not used unless the final line is removed.
CHECKPOINT_OR_NAME = '/network/scratch/a/aghajohm/aha_models/test_checkpoint'
CHAT_MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" # should have the tokenizer we trained the checkpoint with
# Tokenizer used for chat templating and for decoding in the cells below.
tokenizer = AutoTokenizer.from_pretrained(CHAT_MODEL_NAME)
CHECKPOINT_OR_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

In [13]:
def generate_r1_prompt():
    """Build the fixed "What is gravity?" demo prompt for the chat model.

    Uses the module-level ``tokenizer``: the three-message conversation is run
    through its chat template with the final assistant message left open for
    continuation, and the resulting ids are decoded back to text with special
    tokens kept.

    Returns:
        dict with ``"prompt"`` (the decoded text) and ``"input_ids"`` (the
        templated token ids).
    """
    system_turn = {
        "role": "system",
        "content": "You are a helpful assistant. You first thinks about the reasoning process in the mind and then provides the user with the answer.",
    }
    user_turn = {"role": "user", "content": "What is gravity?"}
    # Partial assistant message; continue_final_message=True below keeps it open.
    assistant_turn = {"role": "assistant", "content": "Gravity"}

    token_ids = tokenizer.apply_chat_template(
        [system_turn, user_turn, assistant_turn],
        tokenize=True,
        continue_final_message=True,
    )
    decoded_text = tokenizer.decode(
        token_ids,
        skip_special_tokens=False,
        clean_up_tokenization_spaces=False,
    )
    return {"prompt": decoded_text, "input_ids": token_ids}

In [None]:
# Show the demo prompt dict (decoded text and token ids) as the cell output.
generate_r1_prompt()

In [None]:
# SGLang engine for the judge/chat model selected by CHECKPOINT_OR_NAME.
# With skip_tokenizer_init=True, later cells pass pre-tokenized input_ids to generate()
# and decode the returned output_ids with the Hugging Face tokenizer themselves.
sglang_engine = sglang.Engine(
        model_path=CHECKPOINT_OR_NAME,
        enable_memory_saver=True,
        skip_tokenizer_init=True,
        mem_fraction_static=0.20,
        schedule_policy="fcfs",
        schedule_conservativeness=0.001,
        max_running_requests=10000,
    )

In [None]:
# Base (pretrained) model whose raw text continuations are scored by the judge model.
PRETRAINED_CHECKPOINT_OR_NAME = "Qwen/Qwen2.5-1.5B"
# Tokenizer matching the base model; used to encode the text prefixes fed to this engine.
pretrained_tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_CHECKPOINT_OR_NAME)
# Same engine settings as sglang_engine, except that max_running_requests is not passed here.
pretrained_sglang_engine = sglang.Engine(
    model_path=PRETRAINED_CHECKPOINT_OR_NAME,
    enable_memory_saver=True,
    skip_tokenizer_init=True,
    mem_fraction_static=0.20,
    schedule_policy="fcfs",
    schedule_conservativeness=0.001,
)

In [None]:
import random
# Reference passage; a prefix is given to the base model and the remainder is the ground truth.
full_text = "In physics, gravity (from Latin gravitas 'weight'[1]) is a fundamental interaction primarily observed as a mutual attraction between all things that have mass. Gravity is, by far, the weakest of the four fundamental interactions, approximately 1038 times weaker than the strong interaction, 1036 times weaker than the electromagnetic force, and 1029 times weaker than the weak interaction. As a result, it has no significant influence at the level of subatomic particles.[2] However, gravity is the most significant interaction between objects at the macroscopic scale, and it determines the motion of planets, stars, galaxies, and even light."
# Character index where the passage is split. randint is inclusive on both ends, so the
# upper bound leaves at least 100 characters for the ground-truth remainder.
# NOTE(review): no random seed is set, so the cutoff differs from run to run.
random_cutoff = random.randint(0, len(full_text) - 100)

In [None]:
print(f"random cutoff: {random_cutoff}")
# Split the passage: the prefix is the prompt, the remainder is the reference continuation.
text_to_complete = full_text[:random_cutoff]
ground_truth_completion = full_text[random_cutoff:]
text_tokenized = pretrained_tokenizer(text_to_complete)
pretrain_sampling_params = {
    "temperature": 1.0,
    "max_new_tokens": 1024,
    "top_p": 1.0,
    "n": 1,  # Only generate one completion per text
}

generation = pretrained_sglang_engine.generate(
    input_ids=text_tokenized["input_ids"],
    sampling_params=pretrain_sampling_params,
)
# The ids come from the base model, so decode them with the tokenizer that encoded
# its input (pretrained_tokenizer) rather than the chat model's tokenizer.
response = pretrained_tokenizer.decode(generation["output_ids"])
pretrained_completion = response
format_response(response)

In [234]:
# Judge conversation: the system and user turns ask for a 0-100 content-match score
# announced as \score{...}; the assistant turn is a partial message left open for the
# model to continue.
chat = [
    {
        "role": "system",
        "content": "You are a helpful assistant. You first thinks about the reasoning process in the mind and then provides the user with the answer. Always announce your score in \\score{} like this: \\score{your_score}\n",
    },
    {
        "role": "user",
        # Only the A/B pieces are f-strings; the literal braces in \score{} live in plain strings.
        "content": (
            "Your task is to compare A and B. "
            "Give the completion a score from 0 to 100 based on how closely B matches A in terms of content. It can have a different style, but the content should be the same. "
            "Always announce your score in \\score{} like this: \\score{your_score}\n"
            f"A:\n{ground_truth_completion}\n"
            f"B:\n{pretrained_completion}\n"
        ),
    },
    {"role": "assistant", "content": "Let's compare B against A.<think>"},
]


In [None]:
# NOTE(review): `prompt` is assigned in the following cell (decode of the templated chat).
# On a fresh kernel this cell must run after that one; before it, the name is undefined.
prompt

In [236]:
# Apply the chat template to the judge conversation, leaving the final assistant
# message open, then decode the ids back to text (special tokens kept) for inspection.
input_ids = tokenizer.apply_chat_template(
    chat,
    tokenize=True,
    continue_final_message=True,
)
prompt = tokenizer.decode(
    input_ids,
    skip_special_tokens=False,
    clean_up_tokenization_spaces=False,
)
chat_tokenized = dict(prompt=prompt, input_ids=input_ids)

In [237]:
def extract_score(response):
    r"""Return the first integer score announced as ``\score{N}`` in ``response``.

    Whitespace between the braces and the digits is tolerated, so
    ``\score{ 85 }`` is parsed the same as ``\score{85}``. Only non-negative
    integers are recognised; a placeholder such as ``\score{your_score}`` does
    not match.

    Args:
        response: Text to search. Anything that is not a ``str`` (for example
            ``None``) is treated as containing no score.

    Returns:
        The score as an ``int``, or ``None`` when no score is found.
    """
    import re

    if not isinstance(response, str):
        return None

    # First occurrence wins, as before; \s* allows padding inside the braces.
    score_match = re.search(r'\\score\{\s*(\d+)\s*\}', response)
    if score_match is None:
        return None
    try:
        return int(score_match.group(1))
    except ValueError:
        # int() can still reject absurdly long digit strings on newer Python versions.
        return None

In [None]:
# Sampling settings for the judge model; also read by rank_completion.
eval_sampling_params = {
    "temperature": 0.7,
    "max_new_tokens": 1024*4,
    "top_p": 1.0,
    "n": 40,  # sample 40 judge responses for the single prompt
}

generations = sglang_engine.generate(input_ids=chat_tokenized["input_ids"], sampling_params=eval_sampling_params)
scores = []
for generation in generations:
    response = tokenizer.decode(generation["output_ids"])
    scores.append(extract_score(response))

# numpy must be bound as `np`: np.mean / np.std are used below.
import numpy as np
print(scores)
# Drop responses in which no score could be parsed.
scores = [score for score in scores if score is not None]
if len(scores) > 0:
    print(f"mean score: {np.mean(scores)}, std score: {np.std(scores)}")
else:
    print("No valid scores found")

# plot the scores
import matplotlib.pyplot as plt
plt.hist(scores, bins=20, range=(0, 100), density=True)
plt.show()
# Display the last decoded judge response.
format_response(response)

In [None]:
# Inspect the second-to-last sampled judge response.
format_response(tokenizer.decode(generations[-2]["output_ids"]))

In [None]:
# Parse the score from the current value of `response`.
extract_score(response)

In [None]:
def rank_completion(ground_truth_completion, pretrained_completion):
    r"""Score how closely a generated completion matches the reference text.

    Builds the judge conversation (reference as "A", candidate as "B"), applies
    the chat template with the module-level ``tokenizer``, samples judge
    responses from ``sglang_engine`` using ``eval_sampling_params``, and parses
    a ``\score{N}`` value from each response with ``extract_score``.

    Args:
        ground_truth_completion: Reference continuation, shown to the judge as A.
        pretrained_completion: Model-generated continuation, shown as B.

    Returns:
        ``(mean, std)`` of the parsed scores, or ``(None, None)`` when no
        response contained a parsable score.
    """
    # Imported locally so the function does not depend on another cell binding `np`.
    import numpy as np

    chat = [
        {
            "role": "system",
            "content": "You are a helpful assistant. You first thinks about the reasoning process in the mind and then provides the user with the answer. Always announce your score in \\score{} like this: \\score{your_score}\n",
        },
        {
            "role": "user",
            "content": f"Your task is to compare A and B. "\
                        "Give the completion a score from 0 to 100 based on how closely B matches A in terms of content. It can have a different style, but the content should be the same. "\
                        "Always announce your score in \\score{} like this: \\score{your_score}\n"\
                       f"A:\n{ground_truth_completion}\n"\
                       f"B:\n{pretrained_completion}\n",
        },
        {"role": "assistant", "content": "Let's compare B against A.<think>"},
    ]
    input_ids = tokenizer.apply_chat_template(chat, tokenize=True, continue_final_message=True)
    generations = sglang_engine.generate(input_ids=input_ids, sampling_params=eval_sampling_params)
    # Elsewhere in this notebook an n=1 request is indexed directly as a dict,
    # so accept a single-result dict as well as a list of results.
    if isinstance(generations, dict):
        generations = [generations]

    parsed = (extract_score(tokenizer.decode(generation["output_ids"])) for generation in generations)
    scores = [score for score in parsed if score is not None]
    if not scores:
        return None, None
    return np.mean(scores), np.std(scores)

In [None]:
import random

full_text = "In physics, gravity (from Latin gravitas 'weight'[1]) is a fundamental interaction primarily observed as a mutual attraction between all things that have mass. Gravity is, by far, the weakest of the four fundamental interactions, approximately 1038 times weaker than the strong interaction, 1036 times weaker than the electromagnetic force, and 1029 times weaker than the weak interaction. As a result, it has no significant influence at the level of subatomic particles.[2] However, gravity is the most significant interaction between objects at the macroscopic scale, and it determines the motion of planets, stars, galaxies, and even light."

# Sampling settings for the base-model continuation; constant for the whole sweep, so built once.
pretrain_sampling_params = {
    "temperature": 1.0,
    "max_new_tokens": 1024,
    "top_p": 1.0,
    "n": 1,  # Only generate one completion per text
}

# Sweep the split point every 50 characters; for each split, sample three
# continuations from the base model and have the judge model score each one.
for cutoff in range(50, len(full_text), 50):
    print(f"random cutoff: {cutoff}")
    # The prefix, reference and tokenization depend only on the cutoff, not on the try.
    text_to_complete = full_text[:cutoff]
    ground_truth_completion = full_text[cutoff:]
    text_tokenized = pretrained_tokenizer(text_to_complete)
    for try_num in range(3):
        print(f"try_num: {try_num}")
        generation = pretrained_sglang_engine.generate(
            input_ids=text_tokenized["input_ids"],
            sampling_params=pretrain_sampling_params,
        )
        # Decode with the base model's own tokenizer, not the chat model's tokenizer.
        response = pretrained_tokenizer.decode(generation["output_ids"])
        pretrained_completion = response
        mean_score, std_score = rank_completion(ground_truth_completion, pretrained_completion)
        print(f"mean score: {mean_score}, std score: {std_score}", end='\t')
    print()
