In [1]:
from google.colab import files

uploaded = files.upload()  # a popup will appear


Saving spoc-train-eval.tsv to spoc-train-eval.tsv
Saving spoc-train-test.tsv to spoc-train-test.tsv
Saving spoc-train-train.tsv to spoc-train-train.tsv


In [2]:
import pandas as pd
import re

# Paths
train_path = "spoc-train-train.tsv"
eval_path  = "spoc-train-eval.tsv"
test_path  = "spoc-train-test.tsv"

# Load TSV files
train_df = pd.read_csv(train_path, sep="\t")
eval_df  = pd.read_csv(eval_path, sep="\t")
test_df  = pd.read_csv(test_path, sep="\t")

# Inspect structure
print(train_df.columns)
print(eval_df.columns)
print(test_df.columns)
print("----------------------------------------------")
print(train_df.head(2))
print("----------------------------------------------")

print(eval_df.head(2))
print("----------------------------------------------")

print(test_df.head(2))


Index(['text', 'code', 'workerid', 'probid', 'subid', 'line', 'indent'], dtype='object')
Index(['text', 'code', 'workerid', 'probid', 'subid', 'line', 'indent'], dtype='object')
Index(['text', 'code', 'workerid', 'probid', 'subid', 'line', 'indent'], dtype='object')
----------------------------------------------
              text          code  workerid probid     subid  line  indent
0              NaN  int main() {         1     3A  41470897     0       0
1  create string s     string s;         1     3A  41470897     1       1
----------------------------------------------
                            text                code  workerid probid  \
0                            NaN        int main() {        32   478A   
1  let a, b, c, d, e be integers  int a, b, c, d, e;        32   478A   

      subid  line  indent  
0  41682790     0       0  
1  41682790     1       1  
----------------------------------------------
               text          code  workerid probid     subid  line

In [3]:
print("Train data rows:", len(train_df))
print("Eval data rows:", len(eval_df))
print("Test data rows:", len(test_df))



Train data rows: 246086
Eval data rows: 27288
Test data rows: 20480


In [4]:
def clean_text(text):
    if not isinstance(text, str):
        return ""                          # (1)
    text = text.strip()                    # (2)
    text = re.sub(r'\s+', ' ', text)       # (3)
    text = text.replace("’", "'").replace("`", "'")  # (4)
    return text


In [5]:
# -----------------------------
# 3 Cleaning functions
# -----------------------------
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)  # normalize whitespace
    text = text.replace("’", "'").replace("`", "'")  # normalize quotes
    return text

def clean_code(code):
    if not isinstance(code, str):
        return ""
    return code.strip()

# -----------------------------
# 4 Apply cleaning + drop NaN
# -----------------------------
for df in [train_df, eval_df, test_df]:
    df['text'] = df['text'].apply(clean_text)
    df['code'] = df['code'].apply(clean_code)
    # Drop rows where both pseudo or code are empty
    df.dropna(subset=['text', 'code'], inplace=True)
    df = df[df['text'].str.strip() != ""]
    df = df[df['code'].str.strip() != ""]

In [6]:
# -----------------------------
# 5 Group by subid (one program per row)
# -----------------------------
def group_by_subid(df):
    grouped = df.groupby('subid').agg({
        'text': lambda x: ' '.join(x.astype(str).tolist()),
        'code': lambda x: '\n'.join(x.astype(str).tolist())
    }).reset_index()
    return grouped

train_grouped = group_by_subid(train_df)
eval_grouped  = group_by_subid(eval_df)
test_grouped  = group_by_subid(test_df)

print("After grouping:")
print("Train:", train_grouped.shape)
print("Eval:", eval_grouped.shape)
print("Test:", test_grouped.shape)

# -----------------------------
# 6 Format for the model
# -----------------------------
def format_pair(pseudo, code):
    return f"<|startoftext|>Pseudo-code: {pseudo}\nPython code:\n{code}<|endoftext|>"

train_grouped['formatted'] = train_grouped.apply(lambda x: format_pair(x['text'], x['code']), axis=1)
eval_grouped['formatted']  = eval_grouped.apply(lambda x: format_pair(x['text'], x['code']), axis=1)
test_grouped['formatted']  = test_grouped.apply(lambda x: format_pair(x['text'], x['code']), axis=1)

# -----------------------------
# 7 Save as plain text files
# -----------------------------
train_grouped['formatted'].to_csv("/content/train.txt", index=False, header=False)
eval_grouped['formatted'].to_csv("/content/valid.txt", index=False, header=False)
test_grouped['formatted'].to_csv("/content/test.txt", index=False, header=False)

print("Preprocessing complete!")
print("Files saved as train.txt, valid.txt, test.txt in /content/")

After grouping:
Train: (11528, 3)
Eval: (2001, 3)
Test: (1019, 3)
Preprocessing complete!
Files saved as train.txt, valid.txt, test.txt in /content/


**TOKENIZATION**

In [10]:
# =============================
# TOKENIZATION & DATASET PREP   FIXED
# =============================
from datasets import load_dataset
from transformers import GPT2Tokenizer

# Load dataset (from preprocessed text files)
dataset = load_dataset('text', data_files={
    'train': '/content/train.txt',
    'valid': '/content/valid.txt'
})

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add special tokens for start and end markers
special_tokens_dict = {
    'pad_token': '[PAD]',
    'additional_special_tokens': ['<|startoftext|>', '<|endoftext|>']
}
tokenizer.add_special_tokens(special_tokens_dict)

# Tokenize function (adds labels for causal LM)
def tokenize_function(examples):
    tokens = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )
    # Add labels for causal language modeling loss
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Check one sample
print(tokenized_datasets["train"][0].keys())
# Should print: dict_keys(['input_ids', 'attention_mask', 'labels'])


Map:   0%|          | 0/269142 [00:00<?, ? examples/s]

Map:   0%|          | 0/31290 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'attention_mask', 'labels'])


In [19]:
# =============================
# FINE-TUNING GPT-2 (Colab Optimized)
# =============================

from transformers import GPT2LMHeadModel, Trainer, TrainingArguments

# ---- Load model & tokenizer ----
model = GPT2LMHeadModel.from_pretrained("gpt2")
# avoid long warning and re-init randomness
model.resize_token_embeddings(len(tokenizer), mean_resizing=False)

# ---- ⚙️ Create smaller subsets for faster training ----
# You can increase these numbers later for better accuracy
small_train_dataset = tokenized_datasets["train"].select(range(min(2000, len(tokenized_datasets["train"]))))
small_eval_dataset = tokenized_datasets["valid"].select(range(min(500, len(tokenized_datasets["valid"]))))

print(f"Using {len(small_train_dataset)} training samples and {len(small_eval_dataset)} validation samples.")

# ---- Training Arguments ----
training_args = TrainingArguments(
    output_dir="./results",
    do_eval=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    weight_decay=0.01,
    warmup_steps=100,
    save_total_limit=2,
    report_to="none",
    load_best_model_at_end=True,
    push_to_hub=False,
    fp16=True,                       # ⚡ mixed precision (faster on GPU)
    max_steps=5000,                  # optional: stop early even if data is large
)

# ---- Trainer ----
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

# ---- Train ----
print(" Starting fine-tuning...")
trainer.train()

# ---- Save model ----
print("Saving fine-tuned model...")
trainer.save_model("./finetuned_gpt2")
tokenizer.save_pretrained("./finetuned_gpt2")

print(" Training complete! Model saved to './finetuned_gpt2'")


Epoch,Training Loss,Validation Loss
1,0.093,0.059126
2,0.0489,0.059814
3,0.0481,0.054524
4,0.0604,0.059467
5,0.0267,0.062093


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Saving fine-tuned model...
 Training complete! Model saved to './finetuned_gpt2'
Using 2000 training samples and 500 validation samples.
 Starting fine-tuning...


Epoch,Training Loss,Validation Loss
1,0.1,0.055211
2,0.0495,0.056725
3,0.0478,0.055593
4,0.0609,0.056141
5,0.0271,0.059368


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Saving fine-tuned model...
 Training complete! Model saved to './finetuned_gpt2'


In [20]:
# =============================
# EVALUATION: BLEU + CodeBLEU
# =============================

import torch
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm

# ---- Load your fine-tuned model and tokenizer ----
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_path = "./finetuned_gpt2"  # from fine-tuning step
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

# ---- Evaluate on a subset of validation data ----
n_samples = 100  # you can increase later
refs, hyps = [], []

for i in tqdm(range(min(n_samples, len(tokenized_datasets["valid"])))):
    # Get pseudo-code input (the natural language instruction)
    input_text = tokenized_datasets["valid"][i]

    # For this dataset, the original text file is under 'dataset', not tokenized_datasets
    # So we get from dataset["valid"]
    input_text = dataset["valid"][i]["text"]

    # Generate code from pseudo-code
    inputs = tokenizer("<|startoftext|>" + input_text + "<|endoftext|>", return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_length=256,
        temperature=0.7,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id
    )

    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Append reference and hypothesis for BLEU
    reference = input_text.split("Python code:")[-1].strip()  # assuming structure
    refs.append([reference.split()])
    hyps.append(generated_code.split())

# ---- Compute BLEU ----
chen = SmoothingFunction().method1
bleu_scores = [sentence_bleu(r, h, smoothing_function=chen) for r, h in zip(refs, hyps)]
print(f"\nAverage BLEU Score: {np.mean(bleu_scores):.4f}")


100%|██████████| 100/100 [04:42<00:00,  2.83s/it]


Average BLEU Score: 0.0718





In [None]:
# =============================
# CODEBLEU EVALUATION
# =============================

# 1 Clone CodeXGLUE (contains CodeBLEU)
!git clone https://github.com/microsoft/CodeXGLUE.git

# 2 Create reference and generated text files
ref_path = "/content/reference.txt"
gen_path = "/content/generated.txt"

with open(ref_path, "w") as ref, open(gen_path, "w") as hyp:
    for i in range(min(100, len(dataset["valid"]))):
        input_text = dataset["valid"][i]["text"]
        inputs = tokenizer("<|startoftext|>" + input_text + "<|endoftext|>", return_tensors="pt").to(model.device)
        outputs = model.generate(
            **inputs,
            max_length=256,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id
        )
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        reference = input_text.split("Python code:")[-1].strip()

        ref.write(reference + "\n")
        hyp.write(prediction + "\n")

# 3 Run CodeBLEU script
!python CodeXGLUE/Code-Code/code-to-code-trans/CodeBLEU/calc_code_bleu.py \
    --refs /content/reference.txt \
    --hypo /content/generated.txt \
    --lang python


In [21]:
# =============================
# GRADIO APP
# =============================
import gradio as gr
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained("./finetuned_gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("./finetuned_gpt2")

def generate_code(pseudocode):
    input_text = f"<|startoftext|>Pseudo-code: {pseudocode}\nPython code:"
    inputs = tokenizer.encode(input_text, return_tensors="pt")
    outputs = model.generate(
        inputs,
        max_length=200,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        num_return_sequences=1
    )
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Return only the Python part
    if "Python code:" in result:
        result = result.split("Python code:")[-1]
    return result.strip()

gr.Interface(
    fn=generate_code,
    inputs=gr.Textbox(lines=6, placeholder="Enter pseudo-code here..."),
    outputs="text",
    title="Pseudo-code -> Python Code Generator",
    description="Fine-tuned GPT-2 model that converts structured pseudo-code into executable Python."
).launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a54860a8ffbfc2334d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


