# Fine-tuning GPT-2 on SPOC (pseudo-code → Python)

This notebook shows my step-by-step work: I unzip the dataset, build pseudo-code → code pairs,
tokenize with special markers, fine-tune GPT-2 on the causal LM objective (only code tokens
contribute to loss), and run simple automatic evaluations (BLEU + instructions for CodeBLEU).

Notes from me:
- I prefer short runnable cells; run them in order.
- Use a GPU runtime in Colab.
- I used friendly variable names and inline comments so you can follow what I did.


In [None]:
!pip install -q transformers datasets accelerate evaluate sacrebleu sentencepiece
!pip install -q git+https://github.com/huggingface/transformers.git@main

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m502.0/502.0 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the f

# Unzip and inspect dataset

In [None]:
import zipfile, os
from pathlib import Path

ZIP_PATH = "/content/spoc.zip"
WORK_DIR = Path("/content/spoc_data")
WORK_DIR.mkdir(parents=True, exist_ok=True)

with zipfile.ZipFile(ZIP_PATH, "r") as z:
    z.extractall(WORK_DIR)

for p in sorted(WORK_DIR.rglob("*"))[:80]:
    print(p.relative_to(WORK_DIR))


LICENSE
README.md
test
test/spoc-testp.tsv
test/spoc-testw.tsv
testcases
testcases/1000A
testcases/1000A/1000A_testcases.txt
testcases/1000A/1000A_testcases_hidden.txt
testcases/1000A/1000A_testcases_public.txt
testcases/1003A
testcases/1003A/1003A_testcases.txt
testcases/1003A/1003A_testcases_hidden.txt
testcases/1003A/1003A_testcases_public.txt
testcases/1004A
testcases/1004A/1004A_testcases.txt
testcases/1004A/1004A_testcases_hidden.txt
testcases/1004A/1004A_testcases_public.txt
testcases/1005A
testcases/1005A/1005A_testcases.txt
testcases/1005A/1005A_testcases_hidden.txt
testcases/1005A/1005A_testcases_public.txt
testcases/1006A
testcases/1006A/1006A_testcases.txt
testcases/1006A/1006A_testcases_hidden.txt
testcases/1006A/1006A_testcases_public.txt
testcases/1007A
testcases/1007A/1007A_testcases.txt
testcases/1007A/1007A_testcases_custom.txt
testcases/1007A/1007A_testcases_hidden.txt
testcases/1007A/1007A_testcases_public.txt
testcases/1008A
testcases/1008A/1008A_testcases.txt
test

# Build pseudo-code → code pairs (code)

In [None]:
import pandas as pd
from pathlib import Path

root = Path("/content/spoc_data")
train_file = root / "train" / "spoc-train.tsv"
test_file = root / "test" / "spoc-testp.tsv"

def process_spoc_file(file_path, split_name):
    print(f"\nProcessing {split_name} data from: {file_path}")
    print(f"File exists: {file_path.exists()}")

    df = pd.read_csv(file_path, sep='\t', encoding='utf-8')
    print(f"Total rows loaded: {len(df)}")
    print(f"Columns: {df.columns.tolist()}")

    pairs = []
    grouped = df.groupby(['probid', 'subid'])
    print(f"Grouping into {len(grouped)} unique programs...")

    for (probid, subid), group in grouped:
        group = group.sort_values('line')
        pseudo_lines = group['text'].fillna('').tolist()
        code_lines = group['code'].fillna('').tolist()

        pseudo = '\n'.join(pseudo_lines).strip()
        code = '\n'.join(code_lines).strip()

        if pseudo and code:
            pairs.append({'pseudo': pseudo, 'code': code})

    print(f"Created {len(pairs)} valid pairs for {split_name}")
    return pairs

train_pairs = process_spoc_file(train_file, "train")
test_pairs = process_spoc_file(test_file, "test")

train_df = pd.DataFrame(train_pairs)
test_df = pd.DataFrame(test_pairs)

train_csv = "/content/spoc_train_pairs.csv"
test_csv = "/content/spoc_test_pairs.csv"

train_df.to_csv(train_csv, index=False)
test_df.to_csv(test_csv, index=False)

print(f"\n✓ Saved train to: {train_csv} ({len(train_df)} pairs)")
print(f"✓ Saved test to: {test_csv} ({len(test_df)} pairs)")
print(f"\nFirst train example:")
print(f"Pseudo: {train_df.iloc[0]['pseudo'][:200]}...")
print(f"Code: {train_df.iloc[0]['code'][:200]}...")


Processing train data from: /content/spoc_data/train/spoc-train.tsv
File exists: True
Total rows loaded: 293854
Columns: ['text', 'code', 'workerid', 'probid', 'subid', 'line', 'indent']
Grouping into 14548 unique programs...
Created 14548 valid pairs for train

Processing test data from: /content/spoc_data/test/spoc-testp.tsv
File exists: True
Total rows loaded: 52057
Columns: ['text', 'code', 'workerid', 'probid', 'subid', 'line', 'indent']
Grouping into 1778 unique programs...
Created 1778 valid pairs for test

✓ Saved train to: /content/spoc_train_pairs.csv (14548 pairs)
✓ Saved test to: /content/spoc_test_pairs.csv (1778 pairs)

First train example:
Pseudo: create a map from strings to integers mp

create new integers n and sum with sum = 0
create new string variable s
read from the input to n
for i from 1 to n inclusive, read standard input to s and inc...
Code: map<string, int> mp;
int main() {
int n, sum = 0;
string s;
cin >> n;
for (int i = 1; i <= n; i++) cin >> s, mp[s]++;


Preprocessing notes (my choices):
- I add two simple special markers to separate input (pseudo) and output (code): <|pseudo|> and <|code|>.
- For GPT-2 training we concatenate: "<|pseudo|>\n{pseudo}\n<|code|>\n{code}\n"
- During training I mask (set label = -100) for the prompt tokens so loss is computed only on the code.
- I use a max sequence length (e.g., 512) — change if you have more VRAM.
- I prefer reproducibility: we save a CSV and re-load it.


In [None]:
import pandas as pd
pairs_csv = "/content/spoc_train_pairs.csv"

df = pd.read_csv(pairs_csv)

print(f"CSV shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
print(df.head(3))

print("\n" + "="*60)
print("Sample example:")
print(f"\nPseudocode:\n{df.iloc[0]['pseudo'][:300]}")
print(f"\nCode:\n{df.iloc[0]['code'][:300]}")
print("\n" + "="*60)
print(f"Null values:\n{df.isnull().sum()}")

CSV shape: (14548, 2)
Columns: ['pseudo', 'code']

First few rows:
                                              pseudo  \
0  create a map from strings to integers mp\n\ncr...   
1  INF = const int with INF = 0x3f3f3f3f\n\ni, j,...   
2  create int n and ans, set ans to 0\ncreate map...   

                                                code  
0  map<string, int> mp;\nint main() {\nint n, sum...  
1  const int INF = 0x3f3f3f3f;\nint main() {\nint...  
2  int main() {\nint n, ans = 0;\nmap<string, int...  

Sample example:

Pseudocode:
create a map from strings to integers mp

create new integers n and sum with sum = 0
create new string variable s
read from the input to n
for i from 1 to n inclusive, read standard input to s and increment mp[s]
in a loop, change i from 1 to n inclusive
read s from the user input
if mp[s] is true
d

Code:
map<string, int> mp;
int main() {
int n, sum = 0;
string s;
cin >> n;
for (int i = 1; i <= n; i++) cin >> s, mp[s]++;
for (int i = 1; i <= n; i++) {
c

# Tokenizer & helper functions (code)

In [None]:
from transformers import GPT2TokenizerFast
from datasets import load_dataset, Dataset

MODEL = "gpt2"
SPECIAL_PSEUDO = "<|pseudo|>"
SPECIAL_CODE = "<|code|>"
MAX_LEN = 512

tokenizer = GPT2TokenizerFast.from_pretrained(MODEL)
tokenizer.add_special_tokens({"additional_special_tokens": [SPECIAL_PSEUDO, SPECIAL_CODE]})
tokenizer.pad_token = tokenizer.eos_token

print("Tokenizer length:", len(tokenizer))

train_ds = load_dataset("csv", data_files="/content/spoc_train_pairs.csv")["train"]
test_ds = load_dataset("csv", data_files="/content/spoc_test_pairs.csv")["train"]

train_ds = train_ds.filter(lambda ex: ex["pseudo"] is not None and ex["code"] is not None)
test_ds = test_ds.filter(lambda ex: ex["pseudo"] is not None and ex["code"] is not None)

def build_example(pseudo, code):
    pseudo = pseudo.strip()
    code = code.strip()
    prefix = f"{SPECIAL_PSEUDO}\n{pseudo}\n{SPECIAL_CODE}\n"
    full = prefix + code + "\n"
    enc = tokenizer(full, truncation=True, max_length=MAX_LEN, padding="max_length")

    prefix_enc = tokenizer(prefix, truncation=True, max_length=MAX_LEN)["input_ids"]
    prefix_len = min(len(prefix_enc), MAX_LEN)
    labels = enc["input_ids"].copy()

    for i in range(prefix_len):
        if i < len(labels):
            labels[i] = -100

    enc["labels"] = labels
    return enc

def tokenize_batch(batch):
    inputs = {"input_ids": [], "attention_mask": [], "labels": []}
    for p, c in zip(batch["pseudo"], batch["code"]):
        ex = build_example(p, c)
        inputs["input_ids"].append(ex["input_ids"])
        inputs["attention_mask"].append(ex["attention_mask"])
        inputs["labels"].append(ex["labels"])
    return inputs

train_tokenized = train_ds.map(tokenize_batch, batched=True, batch_size=8,remove_columns=train_ds.column_names)
test_tokenized = test_ds.map(tokenize_batch, batched=True, batch_size=8,remove_columns=test_ds.column_names)

for col in train_tokenized.column_names:
    if col not in ["input_ids", "attention_mask", "labels"]:
        train_tokenized = train_tokenized.remove_column(col)

for col in test_tokenized.column_names:
    if col not in ["input_ids", "attention_mask", "labels"]:
        test_tokenized = test_tokenized.remove_column(col)

print(f"Train samples: {len(train_tokenized)}")
print(f"Test samples: {len(test_tokenized)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Tokenizer length: 50259


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/14548 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1778 [00:00<?, ? examples/s]

Map:   0%|          | 0/14548 [00:00<?, ? examples/s]

Map:   0%|          | 0/1778 [00:00<?, ? examples/s]

Train samples: 14548
Test samples: 1778


# Prepare model, data collator, Trainer (code)

In [None]:
from transformers import GPT2LMHeadModel, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, TaskType
import torch

model = GPT2LMHeadModel.from_pretrained(MODEL)
model.resize_token_embeddings(len(tokenizer))

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["c_attn", "c_proj"],
    bias="none",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

print("✓ LoRA applied successfully!")
print(f"Trainable params: {model.num_parameters(only_trainable=True):,}")
print(f"Total params: {model.num_parameters():,}")
print(f"Trainable %: {100 * model.num_parameters(only_trainable=True) / model.num_parameters():.2f}%")

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


trainable params: 1,622,016 || all params: 126,063,360 || trainable%: 1.2867
✓ LoRA applied successfully!
Trainable params: 1,622,016
Total params: 126,063,360
Trainable %: 1.29%




# Start training (code)

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="/content/gpt2_spoc_lora",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    fp16=torch.cuda.is_available(),
    learning_rate=3e-4,
    warmup_steps=500,
    weight_decay=0.01,
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    save_steps=1000,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="none",
    seed=42,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    data_collator=data_collator,
)

print("✓ Trainer initialized!")
print(f"Training samples: {len(train_tokenized)}")
print(f"Eval samples: {len(test_tokenized)}")

trainer.train()
trainer.save_model("/content/gpt2_spoc_finetuned_final")
tokenizer.save_pretrained("/content/gpt2_spoc_finetuned_final")
print("Saved model & tokenizer to /content/gpt2_spoc_finetuned_final")


✓ Trainer initialized!
Training samples: 14548
Eval samples: 1778


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [1]:
from google.colab import files
import shutil

# Zip the folders
shutil.make_archive('gpt2_spoc_lora', 'zip', 'gpt2_spoc_lora')
shutil.make_archive('gpt2_spoc_finetuned_final', 'zip', 'gpt2_spoc_finetuned_final')


# Download the zip files
files.download('gpt2_spoc_lora.zip')
files.download('gpt2_spoc_finetuned_final.zip')



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Simple inference function (code)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

print("="*70)
print("LOADING LORA MODEL")
print("="*70)

base_model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("/content/gpt2_spoc_finetuned_final", local_files_only=True)
base_model.resize_token_embeddings(len(tokenizer))
model = PeftModel.from_pretrained(base_model, "/content/gpt2_spoc_finetuned_final")

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()

print(f"✓ MODEL READY ON {device}!\n")

SPECIAL_PSEUDO = "<|pseudo|>"
SPECIAL_CODE = "<|code|>"

def generate_code(pseudo, max_new_tokens=120, num_beams=5, temperature=None):
    """
    Generate C++ code from pseudocode

    Args:
        pseudo: Pseudocode input
        max_new_tokens: Maximum tokens to generate
        num_beams: Number of beams for beam search (higher = better quality)
        temperature: If set, uses sampling instead of beam search
    """
    prompt = f"{SPECIAL_PSEUDO}\n{pseudo.strip()}\n{SPECIAL_CODE}\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        if temperature is None:
            # Beam search (default - best results)
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                num_beams=num_beams,
                num_return_sequences=1,
                early_stopping=True,
                no_repeat_ngram_size=4,
                repetition_penalty=1.3,
                length_penalty=0.8,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )
        else:

            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_k=40,
                top_p=0.85,
                do_sample=True,
                repetition_penalty=1.5,
                no_repeat_ngram_size=4,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

    if SPECIAL_CODE in generated_text:
        parts = generated_text.split(SPECIAL_CODE, 1)
        generated = parts[1] if len(parts) > 1 else ""
    else:
        generated = generated_text[len(prompt):]

    for stop in [SPECIAL_PSEUDO, SPECIAL_CODE, '<|endoftext|>', tokenizer.eos_token]:
        if stop and stop in generated:
            generated = generated.split(stop)[0]
            break

    lines = generated.strip().split('\n')
    cleaned_lines = []
    seen_lines = set()
    brace_count = 0

    for line in lines:
        stripped = line.strip()

        if not stripped and cleaned_lines:
            continue

        brace_count += stripped.count('{') - stripped.count('}')

        if stripped in seen_lines and len(stripped) > 5:
            continue

        cleaned_lines.append(line.rstrip())
        seen_lines.add(stripped)

        if brace_count == 0 and cleaned_lines and stripped == '}':
            break

    return '\n'.join(cleaned_lines).strip()

def quick_generate(pseudo):
    """Quick generation with optimal settings"""
    return generate_code(pseudo, max_new_tokens=100, num_beams=5)

print("="*70)
print("COMPREHENSIVE CODE GENERATION TESTS")
print("="*70)

test_cases = [
    ("read integer n\nprint n", "Simple I/O"),
    ("read integer n\nif n greater than 0\n  print n", "Conditional"),
    ("read integer x\nread integer y\nprint x plus y", "Addition"),
    ("for i from 1 to 10\n  print i", "Simple Loop"),
    ("read integer n\nfor i from 1 to n\n  print i", "Loop with variable"),
    ("read integer n\nfor i from 1 to n\n  if i modulo 2 equals 0\n    print i", "Nested condition"),
    ("read two integers a and b\nprint maximum of a and b", "Max function"),
    ("read integer n\nset sum to 0\nfor i from 1 to n\n  add i to sum\nprint sum", "Sum loop")
]

for pseudo, description in test_cases:
    print(f"\n{'='*70}")
    print(f"Test: {description}")
    print(f"{'='*70}")
    print(f"Pseudocode:\n{pseudo}\n")

    code = quick_generate(pseudo)
    print(f"Generated Code:\n{code}\n")

print("\n" + "="*70)
print("COMPARING DIFFERENT BEAM SIZES")
print("="*70)

example = "read integer n\nfor i from 1 to n\n  if i modulo 2 equals 0\n    print i"
print(f"Pseudocode:\n{example}\n")

for beams in [3, 5, 7]:
    print(f"\nBeam size = {beams}:")
    code = generate_code(example, max_new_tokens=120, num_beams=beams)
    print(code)
    print("-" * 50)

print("\n" + "="*70)
print("READY FOR EVALUATION!")
print("="*70)
print("\nUse: quick_generate(pseudocode) for best results")
print("Example: quick_generate('read integer n\\nprint n')")

LOADING LORA MODEL
✓ MODEL READY ON cuda!

COMPREHENSIVE CODE GENERATION TESTS

Test: Simple I/O
Pseudocode:
read integer n
print n

Generated Code:
int main() {
cin >> n;
cout << n << endl;
return 0;
}


Test: Conditional
Pseudocode:
read integer n
if n greater than 0
  print n

Generated Code:
int main() {
cin >> n;
if (n > 0) {
cout << n << endl;
return 0;
}
}


Test: Addition
Pseudocode:
read integer x
read integer y
print x plus y

Generated Code:
int main() {
cin >> x;
cin >> y;
cout << x + y << endl;
return 0;
}


Test: Simple Loop
Pseudocode:
for i from 1 to 10
  print i

Generated Code:
int main() {
for (int i = 1; i <= 10; i++) {
cin >> i;
}
return 0;
}


Test: Loop with variable
Pseudocode:
read integer n
for i from 1 to n
  print i

Generated Code:
int main() {
cin >> n;
for (int i = 1; i <= n; i++) {
int cout << i;
return 0;
}
}


Test: Nested condition
Pseudocode:
read integer n
for i from 1 to n
  if i modulo 2 equals 0
    print i

Generated Code:
int main() {
cin >> n;

# BLEU evaluation (code)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

torch.cuda.empty_cache()

base_model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("/content/gpt2_spoc_finetuned_final", local_files_only=True)
base_model.resize_token_embeddings(len(tokenizer))
model = PeftModel.from_pretrained(base_model, "/content/gpt2_spoc_finetuned_final")

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()

SPECIAL_PSEUDO = "<|pseudo|>"
SPECIAL_CODE = "<|code|>"

def quick_generate(pseudo, max_new_tokens=100):
    """Safe code generation"""
    try:
        if len(pseudo) > 1200:
            pseudo = pseudo[:1200]

        prompt = f"{SPECIAL_PSEUDO}\n{pseudo.strip()}\n{SPECIAL_CODE}\n"
        inputs = tokenizer(prompt, return_tensors="pt", max_length=450, truncation=True).to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                num_beams=5,
                early_stopping=True,
                no_repeat_ngram_size=4,
                repetition_penalty=1.3,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

        if SPECIAL_CODE in generated_text:
            generated = generated_text.split(SPECIAL_CODE, 1)[1]
        else:
            generated = generated_text[len(prompt):]

        for stop in [SPECIAL_PSEUDO, SPECIAL_CODE, '<|endoftext|>']:
            if stop in generated:
                generated = generated.split(stop)[0]
                break

        return generated.strip()
    except:
        torch.cuda.empty_cache()
        return ""

print("✓ Model loaded successfully!")

✓ Model loaded successfully!


In [None]:
import evaluate
from tqdm import tqdm
import pandas as pd
import numpy as np

print("="*70)
print("BLEU EVALUATION")
print("="*70)

test_df = pd.read_csv("/content/spoc_test_pairs.csv")
test_df = test_df[test_df['pseudo'].str.len() < 1500].copy()  # Filter long samples

sample_size = 100
test_sample = test_df.sample(n=sample_size, random_state=42).reset_index(drop=True)

predictions = []
references = []

for idx in tqdm(range(len(test_sample)), desc="Generating"):
    try:
        pred = quick_generate(test_sample.iloc[idx]['pseudo'])
        predictions.append(pred)
        references.append([test_sample.iloc[idx]['code']])
    except:
        predictions.append("")
        references.append([test_sample.iloc[idx]['code']])

# Calculate BLEU on valid predictions
valid_preds = [p for p in predictions if p.strip()]
valid_refs = [r for p, r in zip(predictions, references) if p.strip()]

bleu = evaluate.load("sacrebleu")
bleu_result = bleu.compute(predictions=valid_preds, references=valid_refs)

print("\n" + "="*70)
print("BLEU RESULTS")
print("="*70)
print(f"BLEU Score:        {bleu_result['score']:.2f}")
print(f"Valid Predictions: {len(valid_preds)}/{sample_size} ({100*len(valid_preds)/sample_size:.1f}%)")
print(f"Precisions:        {[f'{p:.2f}' for p in bleu_result['precisions']]}")
print("="*70)

# Save results
results_df = pd.DataFrame({
    'pseudocode': test_sample['pseudo'],
    'reference': test_sample['code'],
    'prediction': predictions
})
results_df.to_csv('/content/bleu_results.csv', index=False)
print(f"✓ Results saved to /content/bleu_results.csv")

BLEU EVALUATION


Generating: 100%|██████████| 100/100 [03:27<00:00,  2.08s/it]


Downloading builder script: 0.00B [00:00, ?B/s]


BLEU RESULTS
BLEU Score:        13.93
Valid Predictions: 100/100 (100.0%)
Precisions:        ['91.10', '77.05', '65.35', '56.96']
✓ Results saved to /content/bleu_results.csv


In [None]:
print("\n" + "="*70)
print("CODE QUALITY METRICS (Alternative to CodeBLEU)")
print("="*70)

import re

def comprehensive_quality_score(prediction, reference):
    """Calculate detailed code quality metrics"""
    scores = {}

    pred_has_main = bool(re.search(r'int\s+main\s*\(', prediction))
    ref_has_main = bool(re.search(r'int\s+main\s*\(', reference))
    scores['structure'] = 1.0 if pred_has_main == ref_has_main else 0.0
    braces_balanced = prediction.count('{') == prediction.count('}')
    has_semicolons = prediction.count(';') >= 1
    scores['syntax'] = (int(braces_balanced) + int(has_semicolons)) / 2

    keywords = ['int', 'cin', 'cout', 'for', 'while', 'if', 'else', 'return', 'long', 'void']
    pred_keywords = set(kw for kw in keywords if kw in prediction.lower())
    ref_keywords = set(kw for kw in keywords if kw in reference.lower())

    if ref_keywords:
        scores['keywords'] = len(pred_keywords & ref_keywords) / len(ref_keywords)
    else:
        scores['keywords'] = 0.0


    pred_io = ('cin' in prediction, 'cout' in prediction)
    ref_io = ('cin' in reference, 'cout' in reference)
    io_match = sum(p == r for p, r in zip(pred_io, ref_io)) / 2
    scores['io_operations'] = io_match

    control_flow = ['for', 'while', 'if']
    pred_cf = set(cf for cf in control_flow if cf in prediction)
    ref_cf = set(cf for cf in control_flow if cf in reference)

    if ref_cf:
        scores['control_flow'] = len(pred_cf & ref_cf) / len(ref_cf)
    else:
        scores['control_flow'] = 1.0 if not pred_cf else 0.5

    pred_len = len(prediction.split())
    ref_len = len(reference.split())
    length_ratio = min(pred_len, ref_len) / max(pred_len, ref_len) if max(pred_len, ref_len) > 0 else 0
    scores['length_similarity'] = length_ratio

    scores['overall'] = np.mean(list(scores.values()))

    return scores

print(f"\nAnalyzing {len(valid_preds)} predictions...\n")

all_quality_scores = []
for pred, ref in zip(valid_preds, [r[0] for r in valid_refs]):
    quality = comprehensive_quality_score(pred, ref)
    all_quality_scores.append(quality)

avg_scores = {
    'structure': np.mean([s['structure'] for s in all_quality_scores]),
    'syntax': np.mean([s['syntax'] for s in all_quality_scores]),
    'keywords': np.mean([s['keywords'] for s in all_quality_scores]),
    'io_operations': np.mean([s['io_operations'] for s in all_quality_scores]),
    'control_flow': np.mean([s['control_flow'] for s in all_quality_scores]),
    'length_similarity': np.mean([s['length_similarity'] for s in all_quality_scores]),
    'overall': np.mean([s['overall'] for s in all_quality_scores])
}

print("="*70)
print("CODE QUALITY ASSESSMENT RESULTS")
print("="*70)
print(f"\n📊 Component Scores (0.0 - 1.0):")
print(f"  Structure Match:       {avg_scores['structure']:.4f}")
print(f"  Syntax Correctness:    {avg_scores['syntax']:.4f}")
print(f"  Keyword Overlap:       {avg_scores['keywords']:.4f}")
print(f"  I/O Operations Match:  {avg_scores['io_operations']:.4f}")
print(f"  Control Flow Match:    {avg_scores['control_flow']:.4f}")
print(f"  Length Similarity:     {avg_scores['length_similarity']:.4f}")

print(f"\n OVERALL QUALITY SCORE: {avg_scores['overall']:.4f}")


combined_score = (
    bleu_result['score']/100 * 0.25 +
    avg_scores['syntax'] * 0.25 +
    avg_scores['keywords'] * 0.25 +
    avg_scores['control_flow'] * 0.25
)

print(f"\n🔬 COMBINED METRIC (BLEU + Quality):")
print(f"  BLEU Component:        {bleu_result['score']/100:.4f}")
print(f"  Quality Component:     {np.mean([avg_scores['syntax'], avg_scores['keywords'], avg_scores['control_flow']]):.4f}")
print(f"  Combined Score:        {combined_score:.4f}")

print("\n" + "="*70)
print("✓ This provides similar insights to CodeBLEU")
print("="*70)

# Save detailed results
quality_df = pd.DataFrame(all_quality_scores)
quality_df['prediction'] = valid_preds
quality_df['reference'] = [r[0] for r in valid_refs]
quality_df.to_csv('/content/quality_metrics.csv', index=False)
print(f"\n✓ Detailed quality metrics saved to /content/quality_metrics.csv")


CODE QUALITY METRICS (Alternative to CodeBLEU)

Analyzing 100 predictions...

CODE QUALITY ASSESSMENT RESULTS

📊 Component Scores (0.0 - 1.0):
  Structure Match:       0.8200
  Syntax Correctness:    0.5500
  Keyword Overlap:       0.6768
  I/O Operations Match:  0.5800
  Control Flow Match:    0.7450
  Length Similarity:     0.4692

🎯 OVERALL QUALITY SCORE: 0.6402

🔬 COMBINED METRIC (BLEU + Quality):
  BLEU Component:        0.1393
  Quality Component:     0.6573
  Combined Score:        0.5278

✓ This provides similar insights to CodeBLEU

✓ Detailed quality metrics saved to /content/quality_metrics.csv


In [None]:
import random
from IPython.display import display, HTML, clear_output
from ipywidgets import Button, VBox, HBox, Label, Textarea, RadioButtons

print("="*70)
print("HUMAN EVALUATION")
print("="*70)

num_human_eval = 20
human_eval_indices = random.sample(range(len(valid_preds)), min(num_human_eval, len(valid_preds)))
human_eval_samples = []

for idx in human_eval_indices:
    valid_idx = 0
    for i, p in enumerate(predictions):
        if p.strip():
            if valid_idx == idx:
                human_eval_samples.append({
                    'pseudocode': test_sample.iloc[i]['pseudo'],
                    'reference': test_sample.iloc[i]['code'],
                    'prediction': predictions[i]
                })
                break
            valid_idx += 1

human_eval_results = []
current_sample = 0

def show_sample():
    """Display current sample for evaluation"""
    clear_output(wait=True)

    if current_sample >= len(human_eval_samples):
        show_results()
        return

    sample = human_eval_samples[current_sample]

    html = f"""
    <div style="font-family: monospace; background: #f5f5f5; padding: 20px; border-radius: 10px;">
        <h2 style="color: #2c3e50;">Sample {current_sample + 1} of {len(human_eval_samples)}</h2>

        <h3 style="color: #27ae60;">📝 Pseudocode:</h3>
        <pre style="background: white; padding: 10px; border-left: 3px solid #27ae60;">{sample['pseudocode']}</pre>

        <h3 style="color: #3498db;">🎯 Reference Code:</h3>
        <pre style="background: white; padding: 10px; border-left: 3px solid #3498db;">{sample['reference'][:300]}...</pre>

        <h3 style="color: #e74c3c;"> Generated Code:</h3>
        <pre style="background: white; padding: 10px; border-left: 3px solid #e74c3c;">{sample['prediction']}</pre>
    </div>
    """

    display(HTML(html))

    # Rating options
    print("\n" + "="*70)
    print("Rate the generated code on the following criteria:")
    print("="*70)

    print("\n1. CORRECTNESS: Does the code correctly implement the pseudocode?")
    correctness = input("   Score (1-5): ")

    print("\n2. SYNTAX: Is the C++ syntax correct?")
    syntax = input("   Score (1-5): ")

    print("\n3. COMPLETENESS: Does it have all necessary components (main, I/O, etc.)?")
    completeness = input("   Score (1-5): ")

    print("\n4. READABILITY: Is the code well-structured and readable?")
    readability = input("   Score (1-5): ")

    print("\n5. OVERALL: Overall quality of the generated code?")
    overall = input("   Score (1-5): ")

    # Store results
    human_eval_results.append({
        'sample_num': current_sample + 1,
        'correctness': int(correctness) if correctness.isdigit() else 3,
        'syntax': int(syntax) if syntax.isdigit() else 3,
        'completeness': int(completeness) if completeness.isdigit() else 3,
        'readability': int(readability) if readability.isdigit() else 3,
        'overall': int(overall) if overall.isdigit() else 3,
        'pseudocode': sample['pseudocode'],
        'prediction': sample['prediction']
    })

    print(f"\n✓ Evaluation saved! ({current_sample + 1}/{len(human_eval_samples)})")

def show_results():
    """Display human evaluation results"""
    clear_output(wait=True)

    print("="*70)
    print("HUMAN EVALUATION RESULTS")
    print("="*70)

    df = pd.DataFrame(human_eval_results)

    print(f"\nTotal samples evaluated: {len(df)}")
    print("\nAverage Scores (out of 5):")
    print(f"  Correctness:   {df['correctness'].mean():.2f}")
    print(f"  Syntax:        {df['syntax'].mean():.2f}")
    print(f"  Completeness:  {df['completeness'].mean():.2f}")
    print(f"  Readability:   {df['readability'].mean():.2f}")
    print(f"  Overall:       {df['overall'].mean():.2f}")

    print("\nScore Distribution:")
    for criterion in ['correctness', 'syntax', 'completeness', 'readability', 'overall']:
        print(f"\n{criterion.capitalize()}:")
        for score in range(1, 6):
            count = (df[criterion] == score).sum()
            bar = '█' * count
            print(f"  {score}: {bar} ({count})")

    # Save results
    df.to_csv('/content/human_evaluation_results.csv', index=False)
    print(f"\n✓ Results saved to /content/human_evaluation_results.csv")
    print("="*70)

# Start evaluation
print(f"\nPrepared {len(human_eval_samples)} samples for human evaluation")
print("\nINSTRUCTIONS:")
print("  • Rate each criterion from 1 (poor) to 5 (excellent)")
print("  • Press Enter after each score")
print("  • Evaluation will proceed automatically through all samples")
print("\n" + "="*70)

input("\nPress Enter to start human evaluation...")

# Run evaluation
for i in range(len(human_eval_samples)):
    current_sample = i
    show_sample()

show_results()


Rate the generated code on the following criteria:

1. CORRECTNESS: Does the code correctly implement the pseudocode?

2. SYNTAX: Is the C++ syntax correct?
   Score (1-5): 4

3. COMPLETENESS: Does it have all necessary components (main, I/O, etc.)?
