In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Hugging Face - Fine-Tuning CodeT5 for Code Translation (AI4SE Focus)

# This notebook demonstrates how to fine-tune the CodeT5 model using Hugging Face Transformers
# for a Software Engineering task: translating Python code to Java.

# ------------------------
# 1. Install Required Libraries
# ------------------------
!pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124
!pip install transformers datasets evaluate
!pip install scikit-learn pandas sacrebleu
!pip install codebleu tree-sitter==0.23.1 tree_sitter_languages==1.7.0

Looking in indexes: https://download.pytorch.org/whl/cu124
Collecting torch==2.5.1
  Downloading https://download.pytorch.org/whl/cu124/torch-2.5.1%2Bcu124-cp311-cp311-linux_x86_64.whl (908.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m908.3/908.3 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.20.1
  Downloading https://download.pytorch.org/whl/cu124/torchvision-0.20.1%2Bcu124-cp311-cp311-linux_x86_64.whl (7.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m79.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio==2.5.1
  Downloading https://download.pytorch.org/whl/cu124/torchaudio-2.5.1%2Bcu124-cp311-cp311-linux_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m64.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.5.1)
  Downloading https://download.pytorch.org/whl/cu124/nvidia_cuda_nvrtc_cu1

✅ This following loads a pre-trained models & tokenizer from Hugging Face using the checkpoint name (e.g., "Salesforce/codet5-small").


*  The tokenizer knows how to convert text into tokens that the model

*   It also handles things like padding, truncation, special tokens, etc.

*	It comes with a fixed vocabulary learned during pretraining, that however we can expand if needed as shown

In [None]:
# ------------------------------------------------------------------------
# 3. Load Pre-trained Model & Tokenizer
# ------------------------------------------------------------------------
from transformers import T5ForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import RobertaTokenizer
from datasets import DatasetDict
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback

model_checkpoint = "Salesforce/codet5-small"

model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)
if "<mask>" not in tokenizer.get_vocab(): #add <mask> token to vocabulary
    tokenizer.add_tokens(["<mask>"])
    model.resize_token_embeddings(len(tokenizer))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

⚠️⚠️⚠️ If you add new tokens like this, you must also resize the model’s embedding layer: model.resize_token_embeddings(len(tokenizer))

Otherwise, the model won’t know what to do with the new token IDs!


In [None]:
# 4: Preprocess Function

def preprocess_function(example):
    input_text = example["input"]
    target_text = example["target"]

    model_inputs = tokenizer(input_text, max_length=256, padding="max_length", truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(target_text, max_length=64, padding="max_length", truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply the preprocessing
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Check one sample
print(tokenized_datasets["train"][0])


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]



Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

{'input': 'def get_paths():\n    """Parses command-line arguments, if present; else uses defaults"""\n    src = join(dirname(argv[0]), \'../extracted-files/tags\')\n    doc = join(dirname(argv[0]), \'../specification\')\n<mask>:\n        doc, src, dst = argv[1:]\n    if len(argv) == 3:\n        src, dst = argv[1:]\n    elif len(argv) == 2:\n        dst = argv[1]\n    else:\n        raise Exception(\'ERROR: Must specify path to local clone of gedcom.io repository on command line\')\n    if not isdir(src):\n        raise Exception(\'ERROR: Source directory \' + repr(src) + \' is not a directory\')\n    if not isdir(join(dst, \'.git\')):\n        raise Exception(\'ERROR: Destination directory \' + repr(dst) + \' is not git repostory\')\n    dst = join(dst, \'_pages\', \'tag-def\')\n    if not isdir(dst):\n        makedirs(dst)\n    return (doc, src, dst)', 'target': 'len(argv) == 4', 'tokens_in_method': 100, 'input_ids': [1, 536, 336, 67, 4481, 13332, 203, 565, 3536, 6656, 1296, 17, 1369,

In [None]:
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpjmitchell[0m ([33mpjmitchell-william-mary[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# ------------------------------------------------------------------------
# 5. Define Training Arguments and Trainer
# ------------------------------------------------------------------------


training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Gen_AI_Homework_2/results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=7,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_steps=50,
    push_to_hub=False,
    report_to="wandb",  # 👈 W&B enabled
    run_name="codet5-masked-if-prediction"  # 👈 Custom name in W&B dashboard
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

  trainer = Trainer(


In [None]:
# ------------------------
# 6. Train the Model
# ------------------------
trainer.train(resume_from_checkpoint=True)

# ------------------------
# 7. Evaluate on Test Set
# ------------------------
metrics = trainer.evaluate(tokenized_datasets["test"])
print("Test Evaluation Metrics:", metrics)

# ------------------------
# 8. Test Code Translation
# ------------------------
input_code = "def foo(x):\n    <mask>:\n        return True"

inputs = tokenizer(input_code, return_tensors="pt", padding=True, truncation=True)

# 🧠 Move inputs to the same device as the model
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Generate output
outputs = model.generate(**inputs, max_length=256)

# Decode and print
print("Generated Prediction:\n", tokenizer.decode(outputs[0], skip_special_tokens=True))

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
  checkpoint_rng_state = torch.load(rng_file)


Epoch,Training Loss,Validation Loss
7,0.0839,0.163179


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Test Evaluation Metrics: {'eval_loss': 0.15267856419086456, 'eval_runtime': 33.4578, 'eval_samples_per_second': 149.442, 'eval_steps_per_second': 18.68, 'epoch': 7.0}
Generated Prediction:
 x is not None


In [3]:
#------------- Ran after Testing ----------------------#
#list all saved checkpoints
!ls /content/drive/MyDrive/Gen_AI_Homework_2/results

checkpoint-12500  checkpoint-43750


In [5]:
#------------- Ran after Testing ----------------------#
from transformers import T5ForConditionalGeneration

checkpoint_path = "/content/drive/MyDrive/Gen_AI_Homework_2/results/checkpoint-43750"
model = T5ForConditionalGeneration.from_pretrained(checkpoint_path)
model.to("cuda")

T5ForConditionalGeneration(
  (shared): Embedding(32100, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [6]:
#------------- Ran after Testing ----------------------#
import pandas as pd
from datasets import Dataset, DatasetDict

# dataset path in Google Drive
base_path = "/content/drive/MyDrive/Gen_AI_Homework_2/dataset"

# Load each CSV split
train_df = pd.read_csv(f"{base_path}/train.csv")
val_df = pd.read_csv(f"{base_path}/val.csv")
test_df = pd.read_csv(f"{base_path}/test.csv")

# columns named
train_df = train_df.rename(columns={"cleaned_method": "input", "target_block": "target"})
val_df = val_df.rename(columns={"cleaned_method": "input", "target_block": "target"})
test_df = test_df.rename(columns={"cleaned_method": "input", "target_block": "target"})

# Convert to Hugging Face datasets
dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "validation": Dataset.from_pandas(val_df),
    "test": Dataset.from_pandas(test_df)
})

# Sample check
print("Example input:", dataset["train"][0]["input"])
print("Example target:", dataset["train"][0]["target"])

Example input: def get_paths():
    """Parses command-line arguments, if present; else uses defaults"""
    src = join(dirname(argv[0]), '../extracted-files/tags')
    doc = join(dirname(argv[0]), '../specification')
<mask>:
        doc, src, dst = argv[1:]
    if len(argv) == 3:
        src, dst = argv[1:]
    elif len(argv) == 2:
        dst = argv[1]
    else:
        raise Exception('ERROR: Must specify path to local clone of gedcom.io repository on command line')
    if not isdir(src):
        raise Exception('ERROR: Source directory ' + repr(src) + ' is not a directory')
    if not isdir(join(dst, '.git')):
        raise Exception('ERROR: Destination directory ' + repr(dst) + ' is not git repostory')
    dst = join(dst, '_pages', 'tag-def')
    if not isdir(dst):
        makedirs(dst)
    return (doc, src, dst)
Example target: len(argv) == 4


In [7]:
#------------- Ran after Testing ----------------------#
from transformers import T5ForConditionalGeneration, RobertaTokenizer

# ✅ Restore fine-tuned model from saved checkpoint
checkpoint_path = "/content/drive/MyDrive/Gen_AI_Homework_2/results/checkpoint-43750"
model = T5ForConditionalGeneration.from_pretrained(checkpoint_path)
model.to("cuda")  # not to "cuda" because its out of GPU access

tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-small")

# Optional: re-add <mask> token if needed
if "<mask>" not in tokenizer.get_vocab():
    tokenizer.add_tokens(["<mask>"])
    model.resize_token_embeddings(len(tokenizer))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

In [8]:
#------------- Ran after Testing ----------------------#
import torch
# Generate predictions on the test set
test_inputs = dataset["test"]["input"]
generated_outputs = []

model.eval()
for input_code in test_inputs:
    inputs = tokenizer(input_code, return_tensors="pt", padding=True, truncation=True).to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=128)
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    generated_outputs.append(prediction)


In [10]:
!git clone https://github.com/k4black/codebleu.git
%cd codebleu

Cloning into 'codebleu'...
remote: Enumerating objects: 761, done.[K
remote: Counting objects: 100% (243/243), done.[K
remote: Compressing objects: 100% (105/105), done.[K
remote: Total 761 (delta 205), reused 138 (delta 138), pack-reused 518 (from 2)[K
Receiving objects: 100% (761/761), 1.32 MiB | 3.16 MiB/s, done.
Resolving deltas: 100% (407/407), done.
/content/codebleu


In [11]:
import sys
sys.path.append("/content/codebleu")

In [14]:
!pip uninstall tree-sitter -y
!pip install tree-sitter==0.23.1

[0mCollecting tree-sitter==0.23.1
  Using cached tree_sitter-0.23.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.8 kB)
Downloading tree_sitter-0.23.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (561 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m561.9/561.9 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tree-sitter
Successfully installed tree-sitter-0.23.1


In [22]:
!pip uninstall tree_sitter_languages -y
!pip install tree_sitter_languages==1.7.0 --force-reinstall

Found existing installation: tree-sitter-languages 1.5.0
Uninstalling tree-sitter-languages-1.5.0:
  Successfully uninstalled tree-sitter-languages-1.5.0
Collecting tree_sitter_languages==1.7.0
  Using cached tree_sitter_languages-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting tree-sitter (from tree_sitter_languages==1.7.0)
  Using cached tree_sitter-0.24.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.8 kB)
Using cached tree_sitter_languages-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.7 MB)
Using cached tree_sitter-0.24.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (575 kB)
Installing collected packages: tree-sitter, tree_sitter_languages
  Attempting uninstall: tree-sitter
    Found existing installation: tree-sitter 0.24.0
    Uninstalling tree-sitter-0.24.0:
      Successfully uninstalled tree-sitter-0.24.0
Successfully installed tree-sitter-0.24.0 tree_sitter_languages-1.7

In [25]:
# Download precompiled .so for Python parser only (hosted on GitHub)
!wget https://huggingface.co/datasets/paulmitchell/codebleu-grammars/resolve/main/my-languages.so -O /content/my-languages.so


--2025-04-10 06:43:10--  https://huggingface.co/datasets/paulmitchell/codebleu-grammars/resolve/main/my-languages.so
Resolving huggingface.co (huggingface.co)... 18.164.174.118, 18.164.174.17, 18.164.174.23, ...
Connecting to huggingface.co (huggingface.co)|18.164.174.118|:443... connected.
HTTP request sent, awaiting response... 401 Unauthorized

Username/Password Authentication Failed.


In [26]:
from codebleu import bleu, weighted_ngram_match, syntax_match, dataflow_match
from tree_sitter_languages import get_language
from tree_sitter import Parser

# Custom weights for CodeBLEU components
alpha = 0.3   # BLEU
beta = 0.1    # weighted n-gram
gamma = 0.5   # syntax match
theta = 0.1   # dataflow match

# This line causes the TypeError in Colab (Python 3.11)
language = get_language("python")  # ⛔ TypeError: __init__() takes exactly 1 argument (2 given)
parser = Parser()
parser.set_language(language)

# Custom-weighted CodeBLEU scoring
codebleu_scores = []
for ref, pred in zip(test_df["target"], test_df["prediction"]):
    try:
        bleu_score = bleu.compute_bleu([pred], [ref]) * 100
        ngram_score = weighted_ngram_match.weighted_ngram_match(pred, ref, lang="python") * 100
        syntax_score = syntax_match.syntax_match(pred, ref, lang="python", parser=parser) * 100
        dataflow_score = dataflow_match.dataflow_match(pred, ref, lang="python", parser=parser) * 100

        final_score = round(
            alpha * bleu_score +
            beta * ngram_score +
            gamma * syntax_score +
            theta * dataflow_score, 2
        )

    except Exception as e:
        print(f"CodeBLEU failed: {e}")
        final_score = "N/A"

    codebleu_scores.append(final_score)

test_df["CodeBLEU prediction score (0-100)"] = codebleu_scores


TypeError: __init__() takes exactly 1 argument (2 given)

In [28]:
import sacrebleu

# Step 1: Assign model predictions
test_df["prediction"] = generated_outputs

# Step 2: Compute Exact Match
test_df["Whether the prediction is correct"] = test_df.apply(
    lambda row: row["prediction"].strip() == row["target"].strip(), axis=1
)

# Step 3: Compute BLEU-4
def compute_bleu4(ref, pred):
    return sacrebleu.sentence_bleu(pred, [ref]).score

test_df["BLEU-4 prediction score (0-100)"] = test_df.apply(
    lambda row: compute_bleu4(row["target"], row["prediction"]), axis=1
)

# Step 4: Add placeholder for CodeBLEU
test_df["CodeBLEU prediction score (0-100)"] = "N/A"

# Step 5: Rename for submission format
test_df.rename(columns={
    "input": "Input function with masked if condition",
    "target": "Expected if condition",
    "prediction": "Predicted if condition"
}, inplace=True)

# Step 6: Save to Drive
test_df.to_csv("/content/drive/MyDrive/Gen_AI_Homework_2/testset-results.csv", index=False)
print("Saved testset-results.csv to Google Drive.")


Saved testset-results.csv to Google Drive.


In [51]:
import pandas as pd
import sacrebleu
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

# Load predictions and references
df = pd.read_csv("/content/drive/MyDrive/Gen_AI_Homework_2/testset_predictions.csv")
preds = df["prediction"].astype(str).tolist()
refs = df["ground_truth"].astype(str).tolist()

# BLEU score using SacreBLEU
refs_wrapped = [[r] for r in refs]
bleu = sacrebleu.corpus_bleu(preds, refs_wrapped)
print(f"BLEU score: {bleu.score:.2f}")

# F1 Score (token-level micro average)
pred_tokens = [p.split() for p in preds]
ref_tokens = [r.split() for r in refs]
mlb = MultiLabelBinarizer().fit(pred_tokens + ref_tokens)
y_pred = mlb.transform(pred_tokens)
y_true = mlb.transform(ref_tokens)
f1 = f1_score(y_true, y_pred, average="micro") * 100
print(f"F1 Score: {f1:.2f}%")

# Exact Match
exact_match = np.mean([p.strip() == r.strip() for p, r in zip(preds, refs)]) * 100
print(f"Exact Match: {exact_match:.2f}%")


BLEU score: 75.98
F1 Score: 49.32%
Exact Match: 33.62%


In [54]:
# Final evaluation metrics
results = {
    "Metric": ["BLEU", "F1 Score", "Exact Match"],
    "Value": [75.98, 49.32, 33.62]
}

# Convert to DataFrame
results_df = pd.DataFrame(results)

# Save to Drive folder
output_path = "/content/drive/MyDrive/Gen_AI_Homework_2/testset_eval_summary.csv"
results_df.to_csv(output_path, index=False)
