In [5]:
import torch
import os
os.chdir("/AIHCM/KGQA/NLPCore/uyenpp/NLG/AMRBART/fine-tune")

# os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
with open("/AIHCM/KGQA/NLPCore/uyenpp/NLG/data/tmt/test.jsonl", 'r', encoding='utf-8') as f:
    data = f.readline()

print(data)

In [6]:
import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
import pytorch_lightning.callbacks as plc
from pytorch_lightning.loggers import TensorBoardLogger
from model_interface.model_amr2text import AMR2TextModelModule
from data_interface.dataset_pl import AMR2TextDataModulesP
from common.options import add_model_specific_args
from common.callbacks import LoggingCallback, get_early_stopping_callback, get_checkpoint_callback
from transformers import AutoConfig
from spring_amr.tokenization_bart import PENMANBartTokenizer

In [7]:
import pickle 

with open("objargs.pkl", 'rb') as f:
    args = pickle.load(f)

In [8]:
tokenizer_name_or_path = args.tokenizer_name_or_path if args.tokenizer_name_or_path is not None else args.model_name_or_path

amr_tokenizer = PENMANBartTokenizer.from_pretrained(
    tokenizer_name_or_path,
    collapse_name_ops=False,
    use_pointer_tokens=True,
    raw_graph=False,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
model = AMR2TextModelModule(amr_tokenizer, args)
model.eval()

Loading pretrained model from /AIHCM/KGQA/NLPCore/uyenpp/NLG/data/finetuned-model/bart-large
Ori EMbeddings:  53844
Resized EMbeddings:  53846


AMR2TextModelModule(
  (model): BartForConditionalGeneration(
    (model): BartModel(
      (shared): Embedding(53846, 1024)
      (encoder): BartEncoder(
        (embed_tokens): Embedding(53846, 1024)
        (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
        (layers): ModuleList(
          (0): BartEncoderLayer(
            (self_attn): BartAttention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)
            (fc2): Linear(in_features=4096, out_features=1024, bias=True)
            (final_layer_norm): LayerNorm((1024,), eps

In [None]:
def tokenize_function(examples):
    # Remove empty lines
    # sents = examples["src"]  # text tokens
    amrs = examples  # amr tokens
    model_inputs = {}
    amr_tokens = [[amr_tokenizer.bos_token] + amr.split() + [amr_tokenizer.eos_token]
                    for amr in amrs
                ]
    src_ids = amr_batch_encode(
                amr_tokens, max_length=512, pad_to_max_length=True
            )
    model_inputs["input_ids"] = src_ids
    return model_inputs

def amr_batch_encode(input_lst, max_length, pad_to_max_length=False):
    res = []
    for itm_lst in input_lst:
        res.append(
            get_ids(itm_lst, max_length=max_length, pad_to_max_length=pad_to_max_length)
        )
    return res

def get_ids(tokens, max_length=0, pad_to_max_length=False):
    token_ids = [amr_tokenizer.encoder.get(b, amr_tokenizer.unk_token_id) for b in tokens]
    if pad_to_max_length:
        assert max_length > 0, "Invalid max-length: {}".format(max_length)
        pad_ids = [amr_tokenizer.pad_token_id for _ in range(max_length)]
        len_tok = len(token_ids)
        if max_length > len_tok:
            pad_ids[:len_tok] = map(int, token_ids)
        else:
            pad_ids = token_ids[:max_length]
        return pad_ids
    return token_ids

In [None]:
step_count = 0
val_count = -1
saved_dummy = False
vocab_size = len(amr_tokenizer)
eval_beams = 5
eval_lenpen = 1.0
eval_max_length = 512
decoder_start_token_id = amr_tokenizer.bos_token_id
decoder_end_token_id = amr_tokenizer.eos_token_id

In [None]:
examples = ["\u0120<pointer:0> \u0120cause-01 \u0120:ARG1 \u0120<pointer:1> \u0120demonstrate-01 \u0120<stop> \u0120<pointer:1> \u0120:ARG0 \u0120<pointer:2> \u0120astronomer \u0120:time \u0120<pointer:3> \u0120date-entity \u0120:mod \u0120<pointer:4> \u0120again \u0120<stop> \u0120<pointer:2> \u0120:ARG1 -of \u0120<pointer:5> \u0120dress-01 \u0120<stop> \u0120<pointer:3> \u0120:year \u01201920 \u0120<stop> \u0120<pointer:4> \u0120:mod \u0120<pointer:6> \u0120all - over \u0120<stop> \u0120<pointer:5> \u0120:ARG2 \u0120<pointer:7> \u0120and \u0120<stop> \u0120<pointer:7> \u0120:op1 \u0120<pointer:8> \u0120style \u0120:op2 \u0120<pointer:9> \u0120elegance \u0120:ARG0 -of \u0120<pointer:10> \u0120impress-01 \u0120<stop> \u0120</s>"]

input_ids = tokenize_function(examples)["input_ids"]
src_ids = torch.IntTensor(input_ids)

In [None]:
import time

t0 = time.time()

generated_ids = model.model.generate(src_ids,
            # attention_mask=src_mask,
            use_cache=True,
            decoder_start_token_id=decoder_start_token_id,
            eos_token_id=decoder_end_token_id,
            num_beams=eval_beams,
            no_repeat_ngram_size=0,
            min_length=0,
            max_length=eval_max_length,
            length_penalty=eval_lenpen,
        )
gen_time = (time.time() - t0) / src_ids.shape[0]
preds = model.ids_to_clean_text(generated_ids)
print(preds)
print("Avg gen time: ", gen_time)