In [1]:
! pip install datasets transformers nltk evaluate sacrebleu ipdb

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ipdb
  Downloading ipdb-0.13.13-py3-none-any.whl.metadata (14 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting portalocker (from 

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
import nltk
nltk.download('punkt_tab')
import transformers

from IPython.core import error
import json
from fnmatch import fnmatchcase as match
import random
import os
import pandas as pd
import datasets
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
import numpy as np
from argparse import ArgumentParser
from typing import Dict, List, Union
import ast
import csv
import ipdb
import ast
import copy
import shutil

## params
int_seed = 1203
dataset_name = "LTL_koreauniv"
init_weight = "with_pre-train"
data_size = "0.1-0.9"
model_checkpoint = "t5-base"
print(model_checkpoint)
print('\n')

home_path = "/content/drive/MyDrive/Colab_Notebooks/github/NL2TL"
data_dir = "dataset"
data_filename = "command_LTL_dataset_v01.csv"
data_augment_filename = "aug.csv"
data_path = os.path.join(home_path, data_dir, data_filename)
data_augment_path = os.path.join(home_path, data_dir, data_augment_filename)

symbol_to_word_map = {
    "F(": "finally(",
    "G(": "globally(",
    "U(": "until(",
    "&": "and",
    "¬": "not",
    "∧": "and",
    "∨": "or",
    "→": "imply",
}

# read original csv
with open(data_path, newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    data = []
    for row in reader:
        data.append([ast.literal_eval(row["generated_command"]),
                        ast.literal_eval(row["generated_LTL"])])

# align dataset format and write to csv
align_data_filepath = os.path.join(home_path, data_dir, data_filename[:-4] + "_realign.csv")
align_data = []
with open(align_data_filepath, 'w') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["id", "ltl", "sentence"])
    i = 1
    for item in data:
        sents, ltls = item[0], item[1]
        for s in sents:
            for l in ltls:
                # convert symbol to word
                for k, v in symbol_to_word_map.items():
                    l = l.replace(k, v)
                csv_writer.writerow([i, l, s])
                align_data.append([i, l, s])
                i += 1
align_data_num = i - 1

# augment data and write csv
align_aug_data_filepath = align_data_filepath[:-4] + "_aug.csv"
aug_filepath = os.path.join(home_path, data_dir, "aug.csv")
shutil.copy(align_data_filepath, align_aug_data_filepath)
with open(align_aug_data_filepath, 'a') as f:
    csv_writer = csv.writer(f)
    with open(aug_filepath, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            ltl_obj, com_obj = row["ltl_obj"], row["com_obj"]
            aug_ltl_obj = ast.literal_eval(row["ltl_obj_aug"])
            aug_com_obj = ast.literal_eval(row["com_obj_aug"])
            for ad in align_data:
                if ltl_obj in ad[1] and com_obj in ad[2]:
                    for aug_l, aug_c in zip(aug_ltl_obj, aug_com_obj):
                        csv_writer.writerow([i, ad[1].replace(ltl_obj, aug_l), ad[2].replace(com_obj, aug_c)])
                        i += 1
align_aug_data_num = i - 1

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


t5-base




In [4]:
from transformers import AutoTokenizer
import evaluate

# path
input_model_dir = os.path.join(home_path, "model", "t5-base-epch20-infix-word-04-21", "checkpoint-62500")
output_model_dir = os.path.join(home_path, "model", "t5-base-transfer-learning/")
if not os.path.exists(output_model_dir):
  os.mkdir(output_model_dir)

# tokenizer and params
tokenizer = AutoTokenizer.from_pretrained(input_model_dir)
max_input_length = 1024
max_target_length = 128
prefix = "Transform the following sentence into Signal Temporal logic: "

# preprocess data
def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["sentence"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["ltl"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    model_inputs["sentence"] = examples["sentence"]
    model_inputs["ltl"] = examples["ltl"]
    model_inputs["id"] = examples["id"]
    return model_inputs

# define compute metrics
bleu_metric = evaluate.load("bleu", force_prefix=True)
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # print(predictions)
    # print(labels)
    # Replace -100 in the labels as we can't decode them.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    count = 0

    # top-1 accuracy
    for i in range(len(decoded_preds)):
        pred = nltk.sent_tokenize(decoded_preds[i].strip())
        label = nltk.sent_tokenize(decoded_labels[i].strip())
        if pred == label:
            count += 1

    bleu = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {'top-1 accuracy': round(count / len(decoded_preds), 6),
            'bleu score': bleu['bleu'],
            'bleu precisions': bleu['precisions']}

# correct parenthesis
def correct_parenthe(input_str):
  count = 0
  original_list = input_str.split(' ')
  for index, item in enumerate(original_list):
    if len(item) >2:
      if item[-1] == '.':
        original_list[index] = original_list[index][:-1]
    if item == '(':
      count += 1
    elif item == ')':
      count -= 1
  if count >0:
    for i in range(count):
      original_list.append(')')
  if count <0:
    for i in range(-count):
      original_list.pop(-1)
  return ' '.join(original_list)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

The below is the training code block. So you would need to run the below for transfer-learning fit to the specific dataset.

In [None]:
# transfer learning
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# load realigned dataset
AUGMENT_DATA = True
dataset = load_dataset('csv', data_files=align_aug_data_filepath if AUGMENT_DATA else align_data_filepath)["train"]
train_dataset, test_dataset = dataset.train_test_split(test_size=0.1, shuffle=True, seed=int_seed).values()
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

# load pretrained model
model = AutoModelForSeq2SeqLM.from_pretrained(input_model_dir)
batch_size = 16
output_model_name = model_checkpoint.split("/")[-1]+'-'+dataset_name+"-epoch20-trainpoint"+ \
                    str(align_aug_data_num) if AUGMENT_DATA else str(align_data_num)
output_model_dir = output_model_dir+output_model_name

# set trainer params
args = Seq2SeqTrainingArguments(
    output_model_dir,
    output_model_name,
    evaluation_strategy = "steps",
    eval_steps=1000,
    logging_steps=1000,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    seed=int_seed,
    save_total_limit=1,
    num_train_epochs=20,
    predict_with_generate=True,
    fp16=False,
    #push_to_hub=True,
    #report_to="tensorboard",
    #load_best_model_at_end=True,
    #save_strategy = "no"
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# instantiate trainer
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_train_dataset ,
    eval_dataset=tokenized_test_dataset ,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# run train
trainer.train()
trainer.save_model()

  trainer = Seq2SeqTrainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss,Top-1 accuracy,Bleu score,Bleu precisions
1000,0.1517,0.001714,0.2723,0.382877,"[0.9312436804853387, 0.9177489177489178, 0.9002364066193853, 0.8774422735346359]"
2000,0.0062,0.001354,0.274648,0.386235,"[0.9334006054490414, 0.9218688653317628, 0.9061763319189062, 0.8831858407079646]"
3000,0.0031,0.000954,0.274648,0.386235,"[0.9334006054490414, 0.9218688653317628, 0.9061763319189062, 0.8831858407079646]"
4000,0.0019,0.001036,0.274648,0.386235,"[0.9334006054490414, 0.9218688653317628, 0.9061763319189062, 0.8831858407079646]"


Trainer is attempting to log a value of "[0.9312436804853387, 0.9177489177489178, 0.9002364066193853, 0.8774422735346359]" of type <class 'list'> for key "eval/bleu precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.9334006054490414, 0.9218688653317628, 0.9061763319189062, 0.8831858407079646]" of type <class 'list'> for key "eval/bleu precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.9334006054490414, 0.9218688653317628, 0.9061763319189062, 0.8831858407079646]" of type <class 'list'> for key "eval/bleu precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.9334006054490414, 0.9218688653317628, 0.9061763319189062, 0.8831858407079646]" of type <class 'lis

The below code block is for testing using the above transfer-learned model or already pre-trained model. If you want to only test without training, then run from the first code block but skip the just previous training code block and run the below code block.

In [5]:
# Test with a transfer learning model
import torch
import evaluate
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from functools import reduce

int_seed = 1203
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
show_diff_parenthe_results = False

# transfer learning model with 2082 dataset
# output_model_dir = os.path.join(home_path, "model", "t5-base-transfer-learning",
#                                f"t5-base-LTL_koreauniv-epoch20-trainpoint2082", "checkpoint-2340")

# transfer learning model with 33080 dataset with augmentation
output_model_dir = os.path.join(home_path, "model", "t5-base-transfer-learning",
                                  f"t5-base-LTL_koreauniv-epoch20-trainpoint33080", "checkpoint-28500")

# pre-trained model
# output_model_dir = os.path.join(home_path, "model", "t5-base-epch20-infix-word-04-21", "checkpoint-62500")

model = AutoModelForSeq2SeqLM.from_pretrained(output_model_dir).to(device)
acc_top1 = 0.
acc_bleu_scores = 0.
acc_bleu_precisions = [0., 0., 0., 0.]

metric = evaluate.load("accuracy")
dataset = load_dataset('csv', data_files=align_data_filepath)['train']
train_dataset, test_dataset = dataset.train_test_split(test_size=0.1, shuffle=True, seed=int_seed).values()
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)
prefix = "Transform the following sentence into Signal Temporal logic: "

with open(output_model_dir +'/result.txt', 'w') as f_result:
    for i in range(len(tokenized_test_dataset)):
        input = [prefix + tokenized_test_dataset[i]['sentence']]
        f_result.write("input:\n")
        f_result.write(f"{input[0]}\n")

        input = tokenizer(input, max_length=max_input_length, truncation=True, return_tensors="pt").to(device)
        output = model.generate(**input, num_beams=8, do_sample=True, min_length=10, max_length=64)
        decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
        label = tokenized_test_dataset[i]['ltl']
        predicted_output = correct_parenthe(decoded_output.strip())

        multi_labels = [t["ltl"] for t in dataset if test_dataset[i]["sentence"] == t["sentence"]]
        if predicted_output in multi_labels:
            acc_top1 += 1
        #if predicted_output == label:
        #    acc_top1 += 1

        f_result.write(f"label:\n")
        for l in multi_labels:
            f_result.write(f"{l}\n")
        f_result.write(f"pred:\n")
        f_result.write(f"{predicted_output}\n")
        f_result.write("\n")

        bleus = [bleu_metric.compute(predictions=[predicted_output], references=[ml]) for ml in multi_labels]
        acc_bleu_scores += reduce(lambda x, y: max(x, y), [bs["bleu"] for bs in bleus])
        acc_bleu_precisions = [acc_bleu_precisions[n] + j for n, j in enumerate(reduce(lambda x, y: max(x, y), [bs["precisions"] for bs in bleus]))]

        #bleu = bleu_metric.compute(predictions=[predicted_output], references=[label])
        #acc_bleu_scores += bleu["bleu"]
        #acc_bleu_precisions = [acc_bleu_precisions[n] + j for n, j in enumerate(bleu["precisions"])]

    test_data_num = i + 1
    acc_top1 /= test_data_num
    acc_bleu_scores /= test_data_num
    acc_bleu_precisions = [j / test_data_num for j in acc_bleu_precisions]

    print(f"The test data number = {test_data_num}")
    print(f"Top-1 accuracy = {acc_top1}")
    print(f"Bleu score = {acc_bleu_scores}")
    print(f"Bleu precision = {acc_bleu_precisions}")
    f_result.write('\n')
    f_result.write(f"The test data number = {test_data_num}\n")
    f_result.write(f"Top-1 accuracy = {acc_top1}\n")
    f_result.write(f"Bleu score = {acc_bleu_scores}\n")
    f_result.write(f"Bleu precision = {acc_bleu_precisions}\n")

f_result.close()

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]



The test data number = 208
Top-1 accuracy = 0.9855769230769231
Bleu score = 0.9855769230769231
Bleu precision = [0.9913461538461539, 0.9903846153846154, 0.9855769230769231, 0.9855769230769231]
