# Prepare the data

In [91]:
import pandas as pd

data = pd.read_csv("it5_fine_tuned/nl2optim_augmented.tsv", delimiter="\t")
data

Unnamed: 0,split,xml_tagged_preferences,constraint_representation
0,train,voglio che l'acqua calda sia <CONSTR_TIME>semp...,s_t=1 for each t
1,train,Desidero avere l'acqua calda <CONSTR_TIME>semp...,s_t=1 for each t
2,train,Mi piacerebbe che l'acqua calda fosse <CONSTR_...,s_t=1 for each t
3,train,Voglio che l'acqua calda sia <CONSTR_TIME>disp...,s_t=1 for each t
4,train,Vorrei che l'acqua calda rimanesse <CONSTR_TIM...,s_t=1 for each t
...,...,...,...
281,test,Richiedo che la temperatura sia di <CONSTR_TEM...,h_t=23 for t>=18 ; s_t=1 for t>=18
282,test,È importante per me che la temperatura sia a <...,h_t=23 for t>=18 ; s_t=1 for t>=18
283,test,Mi occorre mantenere la temperatura a <CONSTR_...,h_t=23 for t>=18 ; s_t=1 for t>=18
284,test,Vorrei che la temperatura rimanesse a <CONSTR_...,h_t=23 for t>=18 ; s_t=1 for t>=18


In [92]:
from datasets import Dataset

dataset = Dataset.from_pandas(data)
print(dataset)
dataset_train = dataset.filter(lambda d: d["split"]=="train")
split = dataset_train.train_test_split(test_size=0.2)
train = split["train"]
val = split["test"]
test = dataset.filter(lambda d: d["split"]=="test")
train, val, test

Dataset({
    features: ['split', 'xml_tagged_preferences', 'constraint_representation'],
    num_rows: 286
})


Filter: 100%|████████████████████████████████████████████████████████████████████████████| 286/286 [00:00<00:00, 57253.29 examples/s]
Filter: 100%|████████████████████████████████████████████████████████████████████████████| 286/286 [00:00<00:00, 73426.64 examples/s]


(Dataset({
     features: ['split', 'xml_tagged_preferences', 'constraint_representation'],
     num_rows: 184
 }),
 Dataset({
     features: ['split', 'xml_tagged_preferences', 'constraint_representation'],
     num_rows: 47
 }),
 Dataset({
     features: ['split', 'xml_tagged_preferences', 'constraint_representation'],
     num_rows: 55
 }))

In [93]:
import torch
from transformers import AutoTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

tokenizer = AutoTokenizer.from_pretrained('gsarti/it5-large')

def tokenization_function(data):
  inputs = tokenizer(data["xml_tagged_preferences"], padding=True, truncation=True, max_length=60, return_tensors="pt")
  target_constraints = tokenizer(data["constraint_representation"], padding=True, truncation=True, max_length=30, return_tensors="pt")
  inputs["labels"] = target_constraints["input_ids"]

  return inputs


tokenized_train = train.map(lambda t: tokenization_function(t), batched=True)
tokenized_val = val.map(lambda t: tokenization_function(t), batched=True)
tokenized_test = test.map(lambda t: tokenization_function(t), batched=True)

print(tokenized_val)
print(tokenizer.decode(tokenized_val["input_ids"][4]))
print(tokenizer.decode(tokenized_val["labels"][4]))

cpu


Map: 100%|████████████████████████████████████████████████████████████████████████████████| 184/184 [00:00<00:00, 7435.78 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████| 47/47 [00:00<00:00, 3533.91 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████| 55/55 [00:00<00:00, 3506.73 examples/s]

Dataset({
    features: ['split', 'xml_tagged_preferences', 'constraint_representation', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 47
})
voglio che l'acqua calda sia <constr_time>disponibile in ogni momento</constr_time>.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
s_t=1 for each t</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>





# Training steps

In [94]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model="gsarti/it5-large")

- define the evaluation metric

In [95]:
import datasets
import numpy as np
from transformers import EvalPrediction
from sklearn.metrics import f1_score, precision_score, recall_score


def eval(predictions, labels):
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    print(decoded_preds)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    print(decoded_labels)
    chrf_metric = datasets.load_metric("chrf")
    chrf=chrf_metric.compute(predictions=[decoded_preds], references=[decoded_labels])
    precision = precision_score(decoded_labels, decoded_preds, average='micro')
    recall = recall_score(decoded_labels, decoded_preds, average='micro')
    f1 = f1_score(decoded_labels, decoded_preds, average='micro')
    
    return {"micro-precision": round(precision, 4),
          "micro-recall": round(recall, 4),
          "f1": round(f1, 4),
          "chrf": round(chrf["score"], 4),
        }

def compute_metric(eval_pred: EvalPrediction):
  predictions, labels = eval_pred
  res = eval(predictions, labels)

  return res

#eval([[34, 67, 32, 45]], [[34, 67, 32]])

- define the hyperparameters



In [99]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
model_name = "it5_nl2optim"

if device == "cpu":
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

batch_size = 8
model_dir = f"it5_fine_tuned/{model_name}"
args = Seq2SeqTrainingArguments(
    output_dir=model_dir,
    evaluation_strategy="steps",
    eval_steps=20,
    logging_strategy="steps",
    logging_steps=20,
    save_strategy="steps",
    save_steps=20,
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=10,
    predict_with_generate=True,
    #fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="chrf",
    report_to=[]
)

print(model_dir)

it5_fine_tuned/it5_nl2optim


- load the model and configure the training process

In [100]:
from transformers import AutoModelForSeq2SeqLM
if device == "cpu":
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

model_name = "gsarti/it5-large"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metric,
)


Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


- start the training process

In [None]:
if device == "cpu":
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    
trainer.train()

- validate

In [None]:
trainer.evaluate()

# Evaluation

In [104]:
ft_config=f"{model_dir}/checkpoint-160"
tokenizer = AutoTokenizer.from_pretrained(ft_config)
model = AutoModelForSeq2SeqLM.from_pretrained(ft_config)

def inference(dataset, model, tokenizer):
  inputs = dataset["xml_tagged_preferences"]
  input_ids = tokenizer(inputs, return_tensors="pt", max_length=60, truncation=True, padding=True)
  input_ids.to(device)
  output_ids = model.generate(**input_ids, max_length=30)
  output = [tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]

  for utterance, constraint in zip(dataset["xml_tagged_preferences"], output):
      print(f"{constraint}")
  #return output

inference(tokenized_test, model, tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


h_t=22 for t<=6
h_t=22 for t<=6
h_t=22 for t<=6
h_t=22 for t<=6
h_t=22 for t<=6
h_t=22 for t<=6
h_t=22 for t<=6
h_t=22 for t<=6
h_t=22 for t<=6
h_t=22 for t<=6
h_t=22 for t<=6
s_t=1 for 9<=t<=18
s_t=1 for 9<=t<=18
s_t=1 for 9<=t<=18
s_t=1 for 9<=t<=18
s_t=1 for 9<=t<=18
s_t=1 for 9<=t<=18
s_t=1 for 9<=t<=18
s_t=1 for 9<=t<=18
s_t=1 for 9<=t<=18
s_t=1 for 9<=t<=18
s_t=1 for 9<=t<=18
h_t=1 for t<=9
h_t=1 for t<=9
s_t=1 for t<=17
h_t=1 for t<=9
h_t=1 for t<=17
h_t=1 for t<=17
s_t=1 for t<=1:30 ; s_t=1 for t<=1
h_t=1 for t<=9
s_t=1 for t<=1
s_t=1 for t<=1
s_t=1 for t<=9
s_t=1 for t<=7
s_t=1 for 7<=t<=8
s_t=1 for 7<=t<=8
s_t=1 for 7<=t<=8
s_t=1 for 7<=t<=8
s_t=1 for 7<=t<=8
s_t=1 for t<=7
s_t=1 for t<=7
s_t=1 for t<=7
s_t=1 for 7<=t<=8
s_t=1 for 7<=t<=8
h_t=23 for 18<=t<=18
h_t=23 for 18<=t<=18
h_t=23 for 18<=t<=18
h_t=23 for 18<=t<=18
h_t=23 for 18<=t<=18
h_t=23 for 18<=t<=18
h_t=23 for 18<=t<=18
h_t=23 for 18<=t<=18
h_t=23 for 18<=t<=18
h_t=23 for 18<=t<=18
h_t=23 for 18<=t<=18
