In [1]:
import json

with open("step_definitions.json", "r") as f:
    data = json.load(f)

data[:3]


[{'AttributeText': 'Given("Returns a Task")',
  'MethodSignature': 'ReturnsATask()',
  'MethodBody': '{\n            throw new NotSupportedException("should be mocked");\n        }',
  'SourceFile': '/tmp/repos_56e52f16-5434-4b1d-a1e1-219bacac2163/reqnroll_Reqnroll/Tests/Reqnroll.RuntimeTests/StepExecutionTests.cs'},
 {'AttributeText': 'Then("SpecificBindingRegistryTests")',
  'MethodSignature': 'Transform(string val)',
  'MethodBody': '{\n                return 42;\n            }',
  'SourceFile': '/tmp/repos_56e52f16-5434-4b1d-a1e1-219bacac2163/reqnroll_Reqnroll/Tests/Reqnroll.RuntimeTests/RuntimeBindingRegistryBuilderTests.cs'},
 {'AttributeText': 'Then("SpecificBindingRegistryTests")',
  'MethodSignature': 'Transform(string val)',
  'MethodBody': '{\n                return 24;\n            }',
  'SourceFile': '/tmp/repos_56e52f16-5434-4b1d-a1e1-219bacac2163/reqnroll_Reqnroll/Tests/Reqnroll.RuntimeTests/RuntimeBindingRegistryBuilderTests.cs'}]

In [4]:
!pip install transformers datasets scikit-learn --quiet

import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split

with open("step_definitions.json", "r") as f:
    raw_data = json.load(f)

pairs = []
for item in raw_data:
    input_text = item["AttributeText"]
    output_text = item["MethodSignature"] + "\n" + item["MethodBody"]
    pairs.append({"input": input_text, "output": output_text})

train_data, eval_data = train_test_split(pairs, test_size=0.1, random_state=42)

train_dataset = Dataset.from_list(train_data)
eval_dataset = Dataset.from_list(eval_data)

model_name = "Salesforce/codet5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(example):
    model_inputs = tokenizer(example["input"], max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(example["output"], max_length=256, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = train_dataset.map(tokenize_function, batched=False)
tokenized_eval = eval_dataset.map(tokenize_function, batched=False)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

training_args = TrainingArguments(
    output_dir="./model_output",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

def compute_metrics(eval_preds):
    from datasets import load_metric
    import numpy as np
    metric = load_metric("accuracy")
    preds, labels = eval_preds
    preds = np.argmax(preds, axis=1)
    return metric.compute(predictions=preds, references=labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer
)

trainer.train()

best_model_dir = "./best_model"
trainer.save_model(best_model_dir)
tokenizer.save_pretrained(best_model_dir)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/14383 [00:00<?, ? examples/s]

Map:   0%|          | 0/1599 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.319,0.340659
2,0.3333,0.300148
3,0.2957,0.282697
4,0.2262,0.277155


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Input: Given I have a logged in user
Generated output:
GivenIHaveALoggedInUser()
{
            //Create an instance for the HomePage
            HomePage homePage = new HomePage(driver);

            homePage.Login();
        }

Input: When I click the submit button
Generated output:
WhenIClickTheSubmitButton()
{
            _submitPage.ClickSubmitButton();
        }



In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained(best_model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(best_model_dir)

test_inputs = [
    "Given I log in as user",
    "When I click the submit button"
]

inputs = tokenizer(test_inputs, max_length=128, truncation=True, padding="max_length", return_tensors="pt")

outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=256,
    num_beams=5,
    early_stopping=True
)

predicted_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)

for i, prediction in enumerate(predicted_texts):
    print(f"Input: {test_inputs[i]}")
    print(f"Generated output:\n{prediction}\n")


Input: Given I log in as user
Generated output:
GivenILogInAsUser()
{
            loginPage.Login();
        }

Input: When I click the submit button
Generated output:
WhenIClickTheSubmitButton()
{
            _submitPage.ClickSubmitButton();
        }

