# Notebook to evaluate code-t5+ model for naming methods in java code

Made for the Jetbrains internship application

Inspired by [this notebook](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/T5/Fine_tune_CodeT5_for_generating_docstrings_from_Ruby_code.ipynb)


In [2]:
import json
import re

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, T5ForConditionalGeneration
from datasets import Dataset, DatasetDict

torch.set_float32_matmul_precision('medium')
SEED = 42

import os

# os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
def tokenize(batch):
    model_inputs = tokenizer(batch["method"], padding="max_length", max_length=max_input_length, return_tensors="pt", truncation=True)

    labels = tokenizer(batch["name"], padding="max_length", max_length=max_target_length, return_tensors="pt", truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

In [4]:
CHECKPOINT = "Salesforce/codet5p-220m"
batch_size = 6
num_workers = 32
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
mask_token = "<extra_id_0>"
device = "cuda"
max_input_length = 512
max_target_length = 64
data_file = "extract_methods/output.json"

with open(data_file, "r") as f:
    data = json.loads(f.read())
data = pd.DataFrame(data["input"])
# remove all samples with a method body longer than max_length tokens or a method name longer than 64 tokens
data = data[
    (data["method"].str.len() <= max_input_length) & (data["name"].str.len() <= max_target_length)]

# remove all samples with no method body
data = data[data["hasBody"] == True]

# replace in each row in the method body the method name by <extra_id_0>
data.loc[:, "method"] = data.apply(lambda row: row["method"].replace(row["name"], mask_token),
                                             axis=1)

# add the <extra_id_0> token before each method name
data.loc[:, "name"] = data.apply(lambda row: mask_token + row["name"], axis=1)

dataset = Dataset.from_pandas(data)

# tokenize the data
dataset = dataset.map(tokenize, batched=True, batch_size=1000, num_proc=os.cpu_count(), remove_columns=["method", "name", "hasBody", "__index_level_0__"])
dataset.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])

Map (num_proc=64):   0%|          | 0/99040 [00:00<?, ? examples/s]

In [5]:
dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 99040
})

In [6]:
# split the data into train, validation and test sets
dataset = dataset.shuffle(seed=SEED)
train_test_dataset = dataset.train_test_split(test_size=0.2, seed=SEED)
val_test_dataset = train_test_dataset["test"].train_test_split(test_size=0.5, seed=SEED)

dataset_split = DatasetDict({
    # !!!! IMPORTANT !!!!
    # decease the training set size to 1/8th of the original size for faster training
    'train':train_test_dataset["train"].shard(index=0, num_shards=8),
    'val':val_test_dataset["train"],
    'test':val_test_dataset["test"]
})

In [7]:
dataset_split

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9904
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9904
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9904
    })
})

# Training

In [8]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT)

In [25]:
# manually generate a batch to determine the model inputs
batch_size = 4
index = 200

batch = dataset_split["train"][index:index + batch_size]

# move to gpu
batch = {k: v.to(device) for k, v in batch.items()}
model = model.to(device)

# generate the output
outputs = model.generate(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"],
                         max_length=64 + len(tokenizer.eos_token), num_beams=5, early_stopping=True)

# decode the output
decoded_outputs = tokenizer.batch_decode(outputs)

print(decoded_outputs)

# print labels
print([tokenizer.decode(label, skip_special_tokens=True) for label in batch["labels"]])

['<pad><extra_id_0> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1> main<extra_id_1>', '<pad><s>/*\n * Licensed to the Apache Software Foundation (ASF) under one or more\n * contributor license agreements.  See the NOTICE file distributed with\n * this work for additional information regarding copyright ownership.\n * The ASF licenses this file to You under the Apache License, Version 2.0\n * (', '<pad><extra_id_0> getAgentProgressPoint<extra_id_1> getAgentProgressPoint<extra_id_1> getAgentProgressP

In [9]:
training_args = Seq2SeqTrainingArguments(
    f"{CHECKPOINT}-finetuned-method-name-generation",
    # output_dir="./results",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size * 2,
    predict_with_generate=True,
    do_train=True,
    do_eval=True,
    logging_steps=100,
    save_steps=1000,
    eval_steps=1000,
    warmup_steps=1000,
    max_steps=10000,
    overwrite_output_dir=True,
    save_total_limit=3,
    fp16=True,
    fp16_opt_level="O2",
    # gradient_accumulation_steps=1,
    learning_rate=5e-5,
    adam_epsilon=1e-8,
    seed=SEED,
    dataloader_num_workers=os.cpu_count(),
    disable_tqdm=False,
    # load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

In [10]:
from evaluate import load

metric = load("rouge")

pad_token_id = tokenizer.pad_token_id

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    # extract method name from the prediction
    decoded_preds = map(lambda x: re.findall(r"<extra_id_0>[^\w<]*(\w+)", x), decoded_preds)
    decoded_preds = map(lambda x: x[0] if len(x) > 0 else "", decoded_preds)
    # split at camel case
    camel_case_split = lambda x: re.sub('([A-Z][a-z]+)', r' \1', re.sub('([A-Z]+)', r' \1', x))
    decoded_preds = list(map(camel_case_split, decoded_preds))
    decoded_labels = list(map(camel_case_split, decoded_labels))

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)#, use_stemmer=True)
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return result

In [11]:
# manually test the compute_metrics function
preds = ["<extra_id_0> print", "<extra_id_0> getItems", "<extra_id_0> testSomeFunction"]
labels = ["print", "getItems", "testNoFunction"]
preds = tokenizer(preds, padding=True, truncation=True, return_tensors="pt")
labels = tokenizer(labels, padding=True, truncation=True, return_tensors="pt")
compute_metrics((preds["input_ids"], labels["input_ids"]))

{'rouge1': 0.8888888888888888,
 'rouge2': 0.3333333333333333,
 'rougeL': 0.8888888888888888,
 'rougeLsum': 0.8888888888888888,
 'gen_len': 4.666666666666667}

In [12]:
# evaluate before training
trainer = Seq2SeqTrainer(
    model=model,
args=training_args,
    train_dataset=dataset_split["train"],
    eval_dataset=dataset_split["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.evaluate()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenize

{'eval_loss': 17.450483322143555,
 'eval_rouge1': 0.30621238783994986,
 'eval_rouge2': 0.11262148556631454,
 'eval_rougeL': 0.3055586402679647,
 'eval_rougeLsum': 0.305375283636965,
 'eval_gen_len': 18.999697092084006,
 'eval_runtime': 365.029,
 'eval_samples_per_second': 27.132,
 'eval_steps_per_second': 2.263}

In [13]:
# train the model
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenize

Step,Training Loss


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenize

TrainOutput(global_step=10000, training_loss=0.40386996908187867, metrics={'train_runtime': 1941.1047, 'train_samples_per_second': 30.91, 'train_steps_per_second': 5.152, 'total_flos': 3.653016593891328e+16, 'train_loss': 0.40386996908187867, 'epoch': 6.06})

In [ ]:
# evaluate after training
trainer.evaluate()

In [ ]:
# save the model

trainer.save_model(f"{CHECKPOINT}-finetuned-method-name-generation")

In [ ]:
# test the model

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_split["train"],
    eval_dataset=dataset_split["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.evaluate()


# Evaluation

In [31]:
# batch_size = 16
# index = 400
# input_ids = model_inputs["input_ids"][index].unsqueeze(0).to(device)
# attention_mask = model_inputs["attention_mask"][index].unsqueeze(0).to(device)
# label = tokenizer.decode(model_inputs["labels"][index][model_inputs["labels"][index] != -100], skip_special_tokens=True)
# 
# outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=64 + len(tokenizer.eos_token),
#                          num_beams=5, early_stopping=True)
# 
# predicted_name = tokenizer.decode(outputs[0], skip_special_tokens=True).split()[0]
# method_body = tokenizer.decode(input_ids.squeeze()).replace("<pad>", "")
# print(f"predicted method name: {predicted_name} \nlabel: {label}")
# print(f"method body: {method_body}")

predicted method name: delete 
label: visitElement
method body: <s>@Override
public void<extra_id_0>(@NotNull PsiElement element) {
    if (used) {
        return;
    }
    super.<extra_id_0>(element);
}</s>
