In [13]:
# Hugging Face - Fine-Tuning CodeT5 for Code Translation (AI4SE Focus)

# This notebook demonstrates how to fine-tune the CodeT5 model using Hugging Face Transformers
# for a Software Engineering task: translating Python code to Java.

# ------------------------
# 1. Install Required Libraries
# ------------------------
!pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124
!pip install transformers datasets evaluate -q

Looking in indexes: https://download.pytorch.org/whl/cu124


In [None]:
from transformers import T5ForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import RobertaTokenizer
from datasets import DatasetDict, Dataset
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback

from datasets import load_dataset
import warnings
import pandas as pd

In [37]:
# ------------------------------------------------------------------------
# 2. Load Dataset (CodeXGLUE - Code Translation Java <=> C#)
# ------------------------------------------------------------------------
import re
warnings.simplefilter(action='ignore', category=FutureWarning)

# CodeXGLUE is a benchmark dataset collection by Microsoft for code-related tasks.
# Here, we use the code-translation-python-java dataset.
#dataset = load_dataset("google/code_x_glue_cc_code_to_code_trans")
df_train = pd.read_csv('ft_train.csv')
df_valid = pd.read_csv('ft_valid.csv')
df_test = pd.read_csv('ft_test.csv')
df_train = df_train[['cleaned_method', 'target_block']]
df_valid = df_valid[['cleaned_method', 'target_block']]
df_test = df_test[['cleaned_method', 'target_block']]

def mask_and_flatten_methods(df):
    result = []

    for i in range(len(df)):
        method = df.at[i, "cleaned_method"]
        target = df.at[i, "target_block"]

        lines = method.splitlines()
        processed_lines = [
            "<mask>" if line.replace(" ", "") == target.replace(" ", "") else line
            for line in lines
        ]

        flattened = " ".join(processed_lines)
        cleaned = re.sub(r"\s+", " ", flattened).strip()
        result.append(cleaned)

    return pd.Series(result, index=df.index)



df_train["masked_method"] = mask_and_flatten_methods(df_train)
df_test["masked_method"] = mask_and_flatten_methods(df_test)
df_valid["masked_method"] = mask_and_flatten_methods(df_valid)

test = df_test
train = df_train
valid = df_valid

df_train.to_csv("masked_train.csv", index=False)
df_test.to_csv("masked_test.csv", index=False)
df_valid.to_csv("masked_valid.csv", index=False)

✅ This following loads a pre-trained models & tokenizer from Hugging Face using the checkpoint name (e.g., "Salesforce/codet5-small").


*  The tokenizer knows how to convert text into tokens that the model

*   It also handles things like padding, truncation, special tokens, etc.

*	It comes with a fixed vocabulary learned during pretraining, that however we can expand if needed as shown

In [38]:
# ------------------------------------------------------------------------
# 3. Load Pre-trained Model & Tokenizer
# ------------------------------------------------------------------------
from transformers import T5ForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import RobertaTokenizer
from datasets import DatasetDict
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback

model_checkpoint = "Salesforce/codet5-small"

model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)
tokenizer.add_tokens(["<mask>"]) #Imagine we need an extra token. This line adds the extra token to the vocabulary

model.resize_token_embeddings(len(tokenizer))




Embedding(32100, 512)

⚠️⚠️⚠️ If you add new tokens like this, you must also resize the model’s embedding layer: model.resize_token_embeddings(len(tokenizer))

Otherwise, the model won’t know what to do with the new token IDs!


In [39]:
# ------------------------------------------------------------------------------------------------
# 4. We prepare now the fine-tuning dataset using the tokenizer we preloaded
# ------------------------------------------------------------------------------------------------

def preprocess_function(examples):
    inputs = examples["masked_method"]
    targets = examples["target_block"]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=256, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


train_dataset = Dataset.from_pandas(train)
val_dataset = Dataset.from_pandas(valid)
test_dataset = Dataset.from_pandas(test)
dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [40]:
# ------------------------------------------------------------------------
# 5. Define Training Arguments and Trainer
# ------------------------------------------------------------------------


training_args = TrainingArguments(
    output_dir="./codet5-finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=2,
    num_train_epochs=7,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=2,
    logging_steps=100,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

In [41]:

trainer.train()


metrics = trainer.evaluate(tokenized_datasets["test"])

trainer.save_model("t5_Model")

print("Test Evaluation Metrics:", metrics)


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [42]:
trainer.save_model("t5_Model")

In [43]:
trainer = AutoModelForSeq2SeqLM.from_pretrained("t5_Model")

In [44]:
! pip install transformers
!pip install tree_sitter==0.2.0
! git clone -q https://github.com/microsoft/CodeXGLUE.git

fatal: destination path 'CodeXGLUE' already exists and is not an empty directory.


In [56]:
# Initialize the output_file DataFrame
output_file = pd.DataFrame(columns=[
    'Masked Method',
    'Exact Match',
    'Expected if Condition',
    'Predicted if Condition',
    'CodeBLEU Score',
    'Bleu4 Score'
])

# Evaluate each sample
for idx, row in test.iterrows():
    print(f"Evaluating sample {idx + 1} of {len(test)}")

    input_code = row['masked_method']
    expected_condition = row['target_block']

    # Ensure the masked token exists in input
    if "<mask>" not in input_code:
        print(f"Skipping index {idx}: no <mask> token found.")
        continue

    # Tokenize and generate prediction
    encoded_input = tokenizer(input_code, return_tensors="pt", padding=True, truncation=True)
    encoded_input = encoded_input.to(model.device)
    generated_output = model.generate(**encoded_input, max_length=256)

    predicted_condition = tokenizer.decode(generated_output[0], skip_special_tokens=True)

    # Write predictions and ground truth to files for external eval
    with open("predicted.txt", "w") as pred_file, open("actual.txt", "w") as actual_file:
        pred_file.write(input_code.replace("<mask>", predicted_condition))
        actual_file.write(input_code.replace("<mask>", expected_condition))

    # Determine exact match
    is_exact = str(predicted_condition == expected_condition)

    # Run external CodeBLEU script
    codebleu_output = !cd /content/CodeXGLUE/Code-Code/code-to-code-trans/evaluator/CodeBLEU/ && python calc_code_bleu.py --refs /content/actual.txt --hyp /content/predicted.txt --lang java --params 0.25,0.25,0.25,0.25
    codebleu_score = codebleu_output[1].split()[-1]
    bleu4_score = codebleu_output[0].split()[2][:-1]

    # Append row to output_file
    output_file.loc[idx] = [
        input_code,
        is_exact,
        expected_condition,
        predicted_condition,
        codebleu_score,
        bleu4_score
    ]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processing example 38 of 5000
Processing example 39 of 5000
Processing example 40 of 5000
Processing example 41 of 5000
Processing example 42 of 5000
Processing example 43 of 5000
Processing example 44 of 5000
Processing example 45 of 5000
Processing example 46 of 5000
Processing example 47 of 5000
Processing example 48 of 5000
Processing example 49 of 5000
Processing example 50 of 5000
Processing example 51 of 5000
Processing example 52 of 5000
Processing example 53 of 5000
Processing example 54 of 5000
Processing example 55 of 5000
Processing example 56 of 5000
Processing example 57 of 5000
Processing example 58 of 5000
Processing example 59 of 5000
Processing example 60 of 5000
Processing example 61 of 5000
Processing example 62 of 5000
Processing example 63 of 5000
Processing example 64 of 5000
Processing example 65 of 5000
Processing example 66 of 5000
Processing example 67 of 5000
Processing example 68 of 5000
Proce

In [57]:
output_file

Unnamed: 0,Masked Method,Exact Match,Expected if Condition,Predicted if Condition,CodeBLEU Score,Bleu4 Score
0,"def read(self, count=True, timeout=None, ignor...",False,if ignore_timeouts and is_timeout ( e ) :,,0.8360444067589783,0.7702328629053108
1,"def _cache_mem(curr_out, prev_mem, mem_len, re...",False,if prev_mem is None :,,0.8645191431479302,0.8685781707542944
2,def filtered(gen): for example in gen: example...,False,if example_len > max_length :,,0.8520675515339557,0.846919166520185
3,"def search(self, query): # ""Search.ashx?query=...",False,"if item . get ( ""type"" , """" ) == ""audio"" :",,0.843963905291848,0.7573779250257932
4,"def _check_script(self, script, directive): fo...",False,"if var . must_contain ( ""/"" ) :",if var .,0.8975264088228275,0.8760919591529776
...,...,...,...,...,...,...
4995,"def _super_function(args): passed_class, passe...",False,"if isinstance ( pyclass , pyobjects . Abstract...",,0.6738689620562354,0.6770034465014849
4996,"def get_data(row): data = [] for field_name, f...",False,if result :,,0.8141464481572617,0.8816789818696822
4997,"def say(jarvis, s): """"""Reads what is typed.""""""...",False,if not voice_state :,,0.8445670309367515,0.7936553953409219
4998,"def __import__(name, globals=None, locals=None...",False,"if ""*"" in fromlist :",,0.8711695857742497,0.8658669363158178


In [58]:
output_file.to_csv("testset-results.csv")