###1. Chuẩn bị ngữ liệu và thư viện

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [12]:
from google.colab import files
from transformers import MBartForConditionalGeneration,MBart50Tokenizer,Trainer,TrainingArguments
import torch
from datasets import Dataset
import os
import re

In [13]:


uploaded = files.upload()

for filename in uploaded.keys():
    with open(filename, 'wb') as f:
        f.write(uploaded[filename])


Saving additions.txt to additions.txt
Saving amr-bank-struct-vietnamese.txt to amr-bank-struct-vietnamese.txt
Saving predicates.txt to predicates.txt


###2. Tiền xử lý

In [14]:
def load_file(file_path):
  with open(file_path,"r",encoding = "utf-8") as f:
    data = f.read()

  amr_blocks  = data.strip().split("\n\n")
  sentences = []
  amrs = []


  for block in amr_blocks:
    lines = block.strip().split("\n")

    sentence = ""
    amr = ""

    for line in lines:
      if line.startswith("# ::snt"):
        sentence = line.replace("# ::snt","").strip()
      elif not line.startswith("#"):
        amr += line.strip() +" "

    if sentence and amr :
      sentences.append(sentence)
      amrs.append(amr.strip())

  return {"input_text": sentences,"target_text": amrs}


file_path = "amr-bank-struct-vietnamese.txt"

raw_data = load_file(file_path)


if raw_data is None:
  print("Không tải được file")
  exit()
else:
  print("Tải file thành công")


print(f"Số lượng các câu trong corpus: {len(raw_data['input_text'])}")
print(f"Số lượng AMR trong corpus: {len(raw_data['target_text'])}")



Tải file thành công
Số lượng các câu trong corpus: 1572
Số lượng AMR trong corpus: 1572


In [15]:
file_add = "additions.txt"

with open(file_add,"r",encoding = "utf-8") as f:
  additions = [line.strip() for line in f.readlines() if line.strip()]


file_predicates  = "predicates.txt"

with open(file_predicates,"r",encoding = "utf-8") as f:
  predicates = [predicates.strip() for predicates in f.readlines() if predicates.strip()]

# file_vocab = "vocab.txt"
# with open(file_vocab,"r",encoding = "utf-8") as f:
#   vocab = [line.strip() for line in f.readlines() if line.strip()]

In [17]:
def linearize_amr(amr):
  amr = "\n".join(line for line in amr.splitlines() if not line.startswith("#"))
  amr = re.sub(r"\s+"," ",amr)
  return amr.strip()


data = []
for i in range(len(raw_data["input_text"])):
  data.append({"sentence": raw_data["input_text"][i],"amr": linearize_amr(raw_data["target_text"][i])})



###3. Thêm các token từ additions.txt và predicates.txt vào tokenizer

In [18]:
tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50")

source_language = "vi_VN"
target_language = "vi_VN"
tokenizer.src_lang = source_language
tokenizer.tgt_lang = target_language

def preprocess_function(examples):
    inputs = examples['sentence']
    targets = examples['amr']

    model_inputs = tokenizer(
        inputs,
        max_length=512,
        padding="max_length",
        truncation=True,
        text_target=targets
    )

    return model_inputs


dataset = Dataset.from_dict({"sentence": [item["sentence"] for item in data],"amr": [item["amr"] for item in data]})

dataset = dataset.map(preprocess_function,batched = True)
tokenized_datasets = dataset.train_test_split(test_size = 0.1)
tokenizer.add_tokens(additions)
tokenizer.add_tokens(predicates)
# tokenizer.add_tokens(vocab)


Map:   0%|          | 0/1572 [00:00<?, ? examples/s]

4436

###4. Huấn luyện mô hình

In [19]:
from transformers import MBartForConditionalGeneration, MBart50Tokenizer


model_name = "facebook/mbart-large-50"
model = MBartForConditionalGeneration.from_pretrained(model_name)



training_args = TrainingArguments(
    output_dir="./amr_model",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",*
    save_total_limit=3,
    logging_steps=100,
    learning_rate=5e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    push_to_hub=False,
    fp16=True
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_datasets["train"],
    eval_dataset = tokenized_datasets["test"],
    tokenizer = tokenizer
)

trainer.train()
trainer.save_model("./amr_best_model")



pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33manhphuc22062004[0m ([33manhphuc22062004-tr-ng-i-h-c-khoa-h-c-t-nhi-n-hqg-hcm[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,0.1284,0.135001
2,0.0691,0.109296
3,0.0427,0.106242


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


###5. Generate AMR từ các câu tiếng Việt chưa có nhãn bằng model đã train ở phần 4.

In [20]:
!pip install penman



In [21]:
import penman
import re


model_name = "phuc22062004/1"
tokenizer = MBart50Tokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

def clean_amr_output(amr_text):
    amr_text = re.sub(r"\s+", " ", amr_text)

    amr_text = re.sub(r"(\w+)\s*-\s*(\d+)", r"\1-\2", amr_text)

    if amr_text.count("(") > amr_text.count(")"):
        amr_text += ")" * (amr_text.count("(") - amr_text.count(")"))

    return amr_text

def generate_amr(sentence):
    try:
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True)
        
        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=1024)
        
        amr_text = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
        amr_text = clean_amr_output(amr_text)

        amr_graph = penman.decode(amr_text)  
        formatted_amr = penman.encode(amr_graph, indent=4)  
    except Exception as e:
        formatted_amr = f"AMR Parsing Error: {str(e)}\nOriginal Text: {amr_text}"

    return formatted_amr


In [None]:
import os
from datetime import datetime

def generate_id(index):
    return f"lpp_{index}"

def generate_date():
    return datetime.now().strftime("%Y-%m-%dT%H:%M:%S")



def format_amr_output(index, sentence, amr):
    amr_id = generate_id(index)
    date = generate_date()
    annotator = "AMR-Generator"
    file_name = f"{amr_id}.txt"

    formatted_amr = (
        f"# ::id {amr_id} ::date {date} ::annotator {annotator} ::preferred\n"
        f"# ::snt {sentence}\n"
        f"# ::save-date {date} ::file {file_name}\n"
        f"{amr}"
    )
    return formatted_amr

output_file = "all_amrs.txt"

with open(output_file, "w", encoding="utf-8") as f:
    for index, sentence in enumerate(nolabels, start=1):
        amr = generate_amr(sentence)
        formatted_amr = format_amr_output(index, sentence, amr)
        f.write(formatted_amr + "\n\n")

