In [1]:
import json

# Read the extracted step definitions. The encoding is stated explicitly:
# without it, open() falls back to the platform locale encoding, which is
# not UTF-8 on every system, while JSON text is UTF-8 by specification.
with open("step_definitions.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Preview the first three records to show the record schema.
data[:3]


[{'AttributeText': 'Given("Returns a Task")',
  'MethodSignature': 'ReturnsATask()',
  'MethodBody': '{\n            throw new NotSupportedException("should be mocked");\n        }',
  'SourceFile': '/tmp/repos_56e52f16-5434-4b1d-a1e1-219bacac2163/reqnroll_Reqnroll/Tests/Reqnroll.RuntimeTests/StepExecutionTests.cs'},
 {'AttributeText': 'Then("SpecificBindingRegistryTests")',
  'MethodSignature': 'Transform(string val)',
  'MethodBody': '{\n                return 42;\n            }',
  'SourceFile': '/tmp/repos_56e52f16-5434-4b1d-a1e1-219bacac2163/reqnroll_Reqnroll/Tests/Reqnroll.RuntimeTests/RuntimeBindingRegistryBuilderTests.cs'},
 {'AttributeText': 'Then("SpecificBindingRegistryTests")',
  'MethodSignature': 'Transform(string val)',
  'MethodBody': '{\n                return 24;\n            }',
  'SourceFile': '/tmp/repos_56e52f16-5434-4b1d-a1e1-219bacac2163/reqnroll_Reqnroll/Tests/Reqnroll.RuntimeTests/RuntimeBindingRegistryBuilderTests.cs'}]

In [3]:
!pip install transformers datasets scikit-learn evaluate --quiet

import json
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
import evaluate

# Load the raw step-definition records. The encoding is explicit so the file
# is decoded as UTF-8 regardless of the platform's locale default.
with open("step_definitions.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# Build (input, output) text pairs: the binding attribute is the model input,
# the method signature followed by its body is the generation target.
pairs = [
    {
        "input": item["AttributeText"],
        "output": item["MethodSignature"] + "\n" + item["MethodBody"],
    }
    for item in raw_data
]

# Hold out 10% of the pairs for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the sample records show the same AttributeText mapped to
# different bodies, so near-duplicate inputs may land on both sides of the
# split -- confirm whether de-duplication is wanted.
train_data, eval_data = train_test_split(pairs, test_size=0.1, random_state=42)
train_dataset = Dataset.from_list(train_data)
eval_dataset = Dataset.from_list(eval_data)

model_name = "Salesforce/codet5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(example):
    """Tokenize one example (or one batch of examples) for seq2seq training.

    Args:
        example: mapping with an "input" and an "output" entry; each is either
            a single string (``map(..., batched=False)``) or a list of strings
            (``map(..., batched=True)``).

    Returns:
        The tokenizer output for the input text (padded/truncated to 128
        tokens) with an added "labels" entry holding the target token ids
        (padded/truncated to 256 tokens), where every padding position is
        replaced by -100.
    """
    model_inputs = tokenizer(example["input"], max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(example["output"], max_length=256, truncation=True, padding="max_length")

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the label value the loss skips, so padded target positions
        # must carry it instead of the pad token id; otherwise the model is
        # trained to predict padding.
        return [tok if tok != pad_id else -100 for tok in token_ids]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        model_inputs["labels"] = [_mask_padding(seq) for seq in label_ids]
    else:
        model_inputs["labels"] = _mask_padding(label_ids)
    return model_inputs

# Tokenize both splits one example at a time (batched=False), train split first.
tokenized_train, tokenized_eval = (
    split.map(tokenize_function, batched=False)
    for split in (train_dataset, eval_dataset)
)

# Pretrained seq2seq checkpoint that will be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Text-generation metrics used by compute_metrics.
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
exact_match_metric = evaluate.load("exact_match")

def postprocess(preds, labels):
    """Decode predicted and reference token ids into whitespace-trimmed text.

    Args:
        preds: batch of predicted token-id sequences.
        labels: batch of reference token-id sequences.

    Returns:
        A ``(decoded_preds, decoded_labels)`` tuple of string lists, decoded
        with special tokens skipped and stripped of surrounding whitespace.
    """
    def _decode_and_trim(token_id_batch):
        texts = tokenizer.batch_decode(token_id_batch, skip_special_tokens=True)
        return [text.strip() for text in texts]

    return _decode_and_trim(preds), _decode_and_trim(labels)

def compute_metrics(eval_preds):
    """Compute BLEU, ROUGE-L and exact match for one evaluation pass.

    Args:
        eval_preds: ``(predictions, labels)`` pair. ``predictions`` may be
            token ids (2-D), raw logits (3-D), or a tuple whose first element
            is one of those; ``labels`` are token ids with -100 marking
            ignored positions.

    Returns:
        dict with the BLEU results, a "rougeL" score and the exact-match
        result.
    """
    preds, labels = eval_preds

    # A plain Trainer passes raw model outputs here, not generated ids: for
    # seq2seq models a tuple whose first element is the logits array.
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = np.asarray(preds)
    if preds.ndim == 3:
        # (batch, seq, vocab) logits -> greedy token ids.
        preds = preds.argmax(axis=-1)

    # -100 cannot be decoded; map it back to the pad token on both sides.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds, decoded_labels = postprocess(preds, labels)

    result = {}
    try:
        # evaluate's BLEU takes raw strings and tokenizes them itself;
        # str.split keeps the whitespace tokenization originally intended.
        result.update(bleu.compute(predictions=decoded_preds,
                                   references=[[l] for l in decoded_labels],
                                   tokenizer=str.split))
    except ZeroDivisionError:
        # Raised by the BLEU implementation when every prediction is empty.
        result["bleu"] = 0.0

    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    rouge_l = rouge_result["rougeL"]
    # evaluate returns a plain float; the legacy metric API returned an
    # aggregate object exposing .mid.fmeasure. Accept either.
    result["rougeL"] = rouge_l.mid.fmeasure if hasattr(rouge_l, "mid") else rouge_l

    result.update(exact_match_metric.compute(predictions=decoded_preds, references=decoded_labels))
    return result

def preprocess_logits_for_metrics(logits, labels):
    """Reduce raw model outputs to predicted token ids before they are cached.

    Without this hook the Trainer keeps the full vocabulary-sized logits of
    every evaluation example (each label sequence is 256 positions long) in
    memory until compute_metrics runs; storing only the argmax ids keeps the
    cached predictions small.

    Args:
        logits: model outputs for one eval batch; a logits tensor or a tuple
            whose first element is the logits tensor.
        labels: label tensor for the batch (unused).

    Returns:
        Tensor of greedy token ids with the vocabulary dimension removed.
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# The install above is unpinned, so pick whichever name this version defines.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./model_output",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_strategy="epoch",
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    # Evaluate and checkpoint once per epoch, then restore the checkpoint
    # with the lowest eval loss when training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    **{_eval_strategy_key: "epoch"},
)

# NOTE(review): recent transformers releases prefer `processing_class=` over
# `tokenizer=`; the original keyword is kept here -- confirm against the
# installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

trainer.train()

# Persist the (best) model together with its tokenizer so the inference cell
# can reload both from one directory.
best_model_dir = "./best_model"
trainer.save_model(best_model_dir)
tokenizer.save_pretrained(best_model_dir)


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\blaga\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Map:   0%|          | 0/14383 [00:00<?, ? examples/s]

Map:   0%|          | 0/1599 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

ImportError: To be able to use evaluate-metric/rouge, you need to install the following dependencies['nltk', 'rouge_score'] using 'pip install # Here to have a nice missing dependency error message early on rouge_score' for instance'

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Reload the fine-tuned checkpoint and its tokenizer from the directory
# written at the end of training (`best_model_dir`).
tokenizer = AutoTokenizer.from_pretrained(best_model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(best_model_dir)

# Two hand-written step texts used as a smoke test.
test_inputs = ["Given I log in as user", "When I click the submit button"]

# Same input length limit and padding scheme as used for training inputs.
inputs = tokenizer(
    test_inputs,
    max_length=128,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)

# Beam search with 5 beams, capped at 256 tokens.
outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=256,
    num_beams=5,
    early_stopping=True,
)

predicted_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Print each input next to the text generated for it.
for source_text, prediction in zip(test_inputs, predicted_texts):
    print(f"Input: {source_text}")
    print(f"Generated output:\n{prediction}\n")
