In [1]:
import gc
import re
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm

# Import from unsloth
from unsloth import FastModel
from unsloth.chat_templates import get_chat_template, standardize_data_formats

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer


In [None]:
#########################
# 1. Load the Fine-Tuned Model
#########################
# IMPORTANT: Make sure you pass the local path you used to save your model
# e.g. new_model_local = "Gemma-3-12B-it-FirstResponder"
FINETUNED_MODEL_PATH = "google/gemma-3-12b-it"
torch.cuda.empty_cache()

print("Loading the fine-tuned model...")
model, tokenizer = FastModel.from_pretrained(
    model_name = FINETUNED_MODEL_PATH,
    max_seq_length = 1024,
    load_in_4bit = True,
    load_in_8bit = False,
    full_finetuning = False
    # token = "hf_..."  # If your model is gated and requires a token
)

# # Because we used the gemma-3 chat template, fetch it again
tokenizer = get_chat_template(
    tokenizer,
    chat_template="gemma-3",  # same template used during finetuning
)

model.eval()

In [5]:
#########################
# 1. Load the QWAN Model
#########################
# IMPORTANT: Replace with your actual model path if using a local fine-tuned version
FINETUNED_MODEL_PATH = "Qwen/Qwen1.5-7B-Chat"
torch.cuda.empty_cache()

print("Loading the QWAN model...")
model, tokenizer = FastModel.from_pretrained(
    model_name = FINETUNED_MODEL_PATH,
    max_seq_length = 1024,
    load_in_4bit = True,
    load_in_8bit = False,
    full_finetuning = False
    # token = "hf_..."  # If gated
)

# # Use the Qwen chat template
# tokenizer = get_chat_template(
#     tokenizer,
#     chat_template="qwen",  # match your finetuning template
# )

model.eval()


Loading the QWAN model...
==((====))==  Unsloth 2025.4.7: Fast Qwen2 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.643 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.61s/it]


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 4096)
    (layers): ModuleList(
      (0-31): 32 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=True)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=True)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=True)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((4096,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((4096,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((4096,)

In [6]:
model.device

device(type='cuda', index=0)

In [7]:
from Config import Config
config = Config()

In [8]:
#########################
# 2. Load the Test Data
########################
data = 0
snapshot = -1
name = f"{'CW' if data == 0 else 'FC'}sn{'first' if snapshot == 0 else 'last'}"

path_df = config.path_saved_codeworkout if data == 0 else config.path_saved_falcon
questions_df = pd.read_excel(config.codeworkout_questions_path) if data == 0 else pd.read_csv(config.falconcode_questions_path)
id_question = 'ProblemID' if data == 0 else 'id'
prompt = 'Requirement' if data == 0 else 'prompt'

df = pd.read_pickle(path_df)
df = df[['student_id', 'prev_tasks_id', 'prev_tasks']] if data == 0 else df[['student_id', 'course_id', 'prev_tasks_id', 'prev_tasks']]
df = df[~df['student_id'].duplicated()]

In [9]:
system_prompt = (
    "You are a computer science teacher evaluating student code. "
    "Evaluate the following programming skills on a scale from 1 to 5, where "
    "1 = weak, 2 = fair, 3 = average, 4 = strong, 5 = excellent. "
    "Provide ONLY a JSON object, no extra text, no explanation, no code block, no markdown.Format strictly like this:\n"
    "{\"Decomposition\": 3, \"Algorithmic Design\": 4, \"Reading Comprehension\": 5}"
    "Definitions:\n"
    "- Decomposition: breaking the problem into parts — 1 = no understanding, 2 = only simple parts, "
    "3 = basic decomposition, 4 = mostly complete, 5 = excellent decomposition.\n"
    "- Algorithmic Design: choosing and structuring algorithms — 1 = no algorithm or wrong approach, "
    "2 = basic approach, 3 = reasonable structure, 4 = good design, 5 = excellent design.\n"
    "- Reading Comprehension: understanding the problem description — 1 = misunderstood task, "
    "2 = partially understood, 3 = mostly understood, 4 = good understanding, 5 = excellent understanding."
)

In [24]:
#########################
# 4. Helper: Build Chat Template
#########################
def build_prompt(user_input):
    """
    Build the conversation with roles: system, user.
    We'll pass that to tokenizer.apply_chat_template(...).
    """
    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": system_prompt}
            ]
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": user_input}
            ]
        }
    ]
    # Return the formatted text ready for generation
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,               # ← important!
        add_generation_prompt=True
    )

In [82]:
def build_prompt(user_input):
    """
    Build prompt for Qwen1.5-Chat using role-based messages.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_input}
    ]
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

In [83]:
if data == 0:
    df['user_input'] = df.apply(
        lambda row: [
            f'Task: {questions_df[questions_df[id_question] == int(row["prev_tasks_id"][i])][prompt].iloc[0]}\n\n'
            f'Student’s code:\n{row["prev_tasks"][i][snapshot]}'
            for i in range(len(row['prev_tasks_id']))
        ],
        axis=1
    )
else:
    # clean question
    questions_df['prompt'] = questions_df['prompt'].apply(lambda x: x.split('PROBLEM STATEMENT:')[-1] if x.__contains__("PROBLEM STATEMENT:") else x)
    questions_df['prompt'] = questions_df.prompt.apply(lambda text: re.sub(r'\bPROBLEM STATEMENT: \b', '', text).strip())
    questions_df[id_question] = questions_df[id_question].apply(lambda x: x.lower())
    df['user_input'] = df.apply(
        lambda row: [
            f"Task: {questions_df[(questions_df[id_question] == row['prev_tasks_id'][i]) & (questions_df['course_id'] == row['course_id'])][prompt].iloc[0]}\n\n"
            f"Student’s code:\n{row['prev_tasks'][i][snapshot]}"
            for i in range(len(row['prev_tasks_id']))
        ],
        axis=1
    )

In [84]:
def gen(batch_prompts):
    inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True)
    prompt_lens = [len(input_ids) for input_ids in inputs['input_ids']]
    inputs = inputs.to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=80,
            temperature=0.3,
            top_p=0.95,
            top_k=64,
            do_sample=True
        )

    batch_results = []
    for j in range(outputs.shape[0]):
        gen_text = tokenizer.decode(outputs[j][prompt_lens[j]:], skip_special_tokens=True)
        batch_results.append(gen_text)
        
    return batch_results

In [85]:
def gen(batch_prompts):
    tokenized = [tokenizer(p, return_tensors="pt") for p in batch_prompts]
    prompt_lens = [t["input_ids"].shape[-1] for t in tokenized]

    inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True).to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=80,
            temperature=0.3,
            top_p=0.95,
            top_k=64,
            do_sample=True
        )

    results = []
    for out, prompt_len in zip(outputs, prompt_lens):
        text = tokenizer.decode(out[prompt_len:], skip_special_tokens=True, clean_up_tokenization_spaces=True)
        results.append(text)

    return results


In [86]:
all_prompts = [build_prompt(prompt) for row in df['user_input'].head(2) for prompt in row]

In [87]:
#########################
# 5. Generate Predictions
#########################
batch_size = 8
all_outputs = []

for i in tqdm(range(0, len(all_prompts), batch_size)):
    batch = all_prompts[i:i + batch_size]
    outputs = gen(batch)  # your existing function
    all_outputs.extend(outputs)
    if (i // batch_size) % 50 == 0 and i != 0:
        print(f"{i} / {len(all_prompts)}")
        gc.collect()
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

100%|██████████| 8/8 [00:36<00:00,  4.51s/it]


In [88]:
import re
import json

def extract_scores(text, i, loop_num=0):
    # Find the first JSON block in the text
    text = re.sub(r'[^\x00-\x7F]+', '', text).lower()  # removes non-ASCII
    match = re.search(r'\{[\s\S]*?\}', text)
    if match:
        json_str = match.group(0)
        try:
            data = json.loads(json_str)
            decomposition = data.get("decomposition")
            algorithmic = data.get("algorithmic_design") or data.get("algorithmic design")
            reading = data.get("reading comprehension") or data.get("reading_comprehension")
            return decomposition, algorithmic, reading
        except json.JSONDecodeError:
            print("Failed to parse JSON!")
            if loop_num > 2:
                return None, None, None
            return extract_scores(gen(all_prompts[i])[0], i, loop_num+1)
    else:
        print("No JSON found!")
        if loop_num > 2:
            return None, None, None
        return extract_scores(gen(all_prompts[i])[0], i, loop_num+1)

In [89]:
decomposition = []
alg = []
reading = []
for i, o in enumerate(all_outputs): 
    decomp, algo, read = extract_scores(o, i)
    decomposition.append(decomp)
    alg.append(algo)
    reading.append(read)

No JSON found!
Failed to parse JSON!
Failed to parse JSON!
Failed to parse JSON!
Failed to parse JSON!
Failed to parse JSON!
Failed to parse JSON!
Failed to parse JSON!
No JSON found!
Failed to parse JSON!
Failed to parse JSON!
Failed to parse JSON!
No JSON found!
Failed to parse JSON!
Failed to parse JSON!
Failed to parse JSON!
Failed to parse JSON!
Failed to parse JSON!
Failed to parse JSON!
Failed to parse JSON!
No JSON found!
Failed to parse JSON!
Failed to parse JSON!
Failed to parse JSON!
No JSON found!
Failed to parse JSON!
Failed to parse JSON!
Failed to parse JSON!


In [90]:
re.sub(r'[^\x00-\x7F]+', '', all_outputs[35])

' JSON:\n\n```java\n{\n  "decomposition": {\n    "decomposition_level": 4,\n    "algorithmic_design": 4,\n    "reading_comprehension": 4\n  },\n  "feedback": {\n    "strengths": [\n      "Decomposition is well-structured and organized.",\n      "Algorithmic design'

In [79]:
all_outputs[35]

'熟读并评估以下代码，然后给出相应的 JSON 对象表示\n\n```json\n{\n  "decomposition": 4,\n  "algorithmic_design": 4,\n  "reading_comprehension": 5\n}\n```\n\nThe given code snippet is a simple Java function named `answerCell` that takes three boolean parameters: `isMorning`, `isMom`, and `is'

In [80]:
reading

[5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 3,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5]

In [68]:
d = []
a = []
r = []
for i in range(0, len(decomposition), 30):
    d.append(decomposition[i:i+30])
    a.append(alg[i:i+30])
    r.append(reading[i:i+30])

df['decomposition'] = d
df['alg'] = a
df['reading'] = r

df.to_csv(r'/home/nogaschw/Codeworkout/Thesis/gemma1to5.csv')

ValueError: Length of values (2) does not match length of index (630)