In [1]:
import torch
# One annotated AMR sample (metadata header lines followed by the graph) that
# serves as the running example for every later step of the notebook.
example = """
# ::id bolt12_64556_5627.3 ::date 2012-12-04T18:01:17 ::annotator SDL-AMR-09 ::preferred
# ::snt Fleets bumping fishing boats. Little evil Japanese ghosts stirring up trouble and unrest. With hearts of thieves and arrogant form, they again show their wolfish appearance
# ::save-date Sat Jan 10, 2015 ::file bolt12_64556_5627_3.txt
(m / multi-sentence
      :snt1 (b / bump-01
            :ARG1 (b2 / boat
                  :purpose (f / fish-01))
            :ARG2 (f2 / fleet))
      :snt2 (s / stir-up-04
            :ARG0 (g / ghost
                  :mod (c / country :wiki "Japan" :name (n / name :op1 "Japan"))
                  :mod (l / little)
                  :mod (e / evil))
            :ARG1 (a / and
                  :op1 (t / trouble)
                  :op2 (u / unrest)))
      :snt3 (s2 / show-01
            :ARG0 (t2 / they)
            :ARG1 (a3 / appearance
                  :poss t2
                  :mod (w / wolfish))
            :mod (a2 / again)
            :prep-with (a4 / and
                  :op1 (h / heart
                        :mod (p / person
                              :ARG0-of (s3 / steal-01)))
                  :op2 (f3 / form
                        :mod (a5 / arrogance)))))"""
example_path = "example.txt"
# Write explicitly as UTF-8 so the encoding does not depend on the platform
# default and matches every other file this notebook writes.
with open(example_path, "w", encoding="utf-8") as f:
    f.write(example)

# Anotasi Fitur (POS, NER, LEM, TOK)
## Menggunakan CoreNLP Server 3.9.1

> Sebelum menjalankan script ini pastikan sudah ada CoreNLP server yang jalan di lokal dengan minimal 8GB RAM

> Kalau run pertama gagal, coba lagi; biasanya itu dikarenakan timeout, karena pada saat pertama kali dijalankan dibutuhkan waktu lama untuk men-download package bahasa Inggris terlebih dahulu.

Bila ingin menggunakan docker bisa dicoba docker registry berikut: [banditelol/CoreNLP](https://hub.docker.com/repository/docker/banditelol/corenlp) jangan lupa untuk membuka port yang dibutuhkan

## Compound File

Bisa di-download dari [repo ChunchuanLv](https://github.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/blob/master/data/joints.txt). File ini berisi kata kerja majemuk yang ada di PropBank; untuk bahasa Indonesia masih perlu ditentukan seperti apa bentuk compound verb-nya.

Ini berisi compound word atau kata majemuk dalam bahasa Inggris. Untuk implementasi bahasa Indonesia berarti bisa untuk menangani masalah Rumah Sakit, Rumah Makan, etc.

In [6]:
from stog.data.dataset_readers.amr_parsing.io import AMRIO
from stog.data.dataset_readers.amr_parsing.preprocess.feature_annotator import FeatureAnnotator
from stog.utils import logging
from tqdm.notebook import tqdm

# Logger reused by the rest of the notebook.
logger = logging.init_logger()

# The annotator talks to the CoreNLP server on localhost:9000 and is given the
# compound-word list described in the markdown section above.
compound_file = "data/AMR/amr_2.0_utils/joints.txt"
annotator = FeatureAnnotator('http://localhost:9000', compound_file)

features_path = example_path + '.features'
logger.info('Processing {}'.format(example_path))
with open(features_path, 'w', encoding='utf-8') as features_out:
    processed = 0
    for amr in AMRIO.read(example_path):
        processed += 1
        # Progress message every 1000 AMRs.
        if processed % 1000 == 0:
            logger.info('{} processed.'.format(processed))
        annotation = annotator(amr.sentence)
        # Copy the four annotation layers onto the AMR object before dumping it.
        for layer in ('tokens', 'lemmas', 'pos_tags', 'ner_tags'):
            setattr(amr, layer, annotation[layer])
        AMRIO.dump([amr], features_out)
# Show the last annotated AMR for inspection.
print(amr)

[2021-04-21 00:51:36,189 INFO] Processing example.txt
[2021-04-21 00:51:36,194 INFO] # ::id bolt12_64556_5627.3 ::date 2012-12-04T18:01:17 ::annotator SDL-AMR-09 ::preferred
# ::snt Fleets bumping fishing boats. Little evil Japanese ghosts stirring up trouble and unrest. With hearts of thieves and arrogant form, they again show their wolfish appearance
# ::save-date Sat Jan 10, 2015 ::file bolt12_64556_5627_3.txt
(m / multi-sentence
      :snt1 (b / bump-01
            :ARG1 (b2 / boat
                  :purpose (f / fish-01))
            :ARG2 (f2 / fleet))
      :snt2 (s / stir-up-04
            :ARG0 (g / ghost
                  :mod (c / country
                        :name (n / name
                              :op1 "Japan")
                        :wiki "Japan")
                  :mod (l / little)
                  :mod (e / evil))
            :ARG1 (a / and
                  :op1 (t / trouble)
                  :op2 (u / unrest)))
      :snt3 (s2 / show-01
            :ARG0 (t

# Preprocessing

Clean -> ~~Recategorize~~ -> Anonymization -> Sense Remover

In [11]:
from stog.data.dataset_readers.amr_parsing.preprocess.input_cleaner import clean

# Directory holding the AMR 2.0 utility files used by later preprocessing steps.
util_dir = "data/AMR/amr_2.0_utils"
# Output of the feature-annotation step.
example_annotated = example_path + ".features"

# Run `clean` on every annotated AMR (its return value is not used) and write
# the AMRs out separated by a blank line.
cleaned_path = example_annotated + '.input_clean'
with open(cleaned_path, 'w', encoding='utf-8') as cleaned_out:
    for amr in AMRIO.read(example_annotated):
        clean(amr)
        cleaned_out.write(str(amr))
        cleaned_out.write('\n\n')
# Show the last cleaned AMR for inspection.
print(amr)

# ::id bolt12_64556_5627.3 ::date 2012-12-04T18:01:17 ::annotator SDL-AMR-09 ::preferred
# ::snt Fleets bumping fishing boats. Little evil Japanese ghosts stirring up trouble and unrest. With hearts of thieves and arrogant form, they again show their wolfish appearance
# ::tokens ["Fleets", "bumping", "fishing", "boats", ".", "Little", "evil", "Japanese", "ghosts", "stirring-up", "trouble", "and", "unrest", ".", "With", "hearts", "of", "thieves", "and", "arrogant", "form", ",", "they", "again", "show", "their", "wolfish", "appearance"]
# ::lemmas ["fleet", "bump", "fishing", "boat", ".", "little", "evil", "japanese", "ghost", "stir-up", "trouble", "and", "unrest", ".", "with", "heart", "of", "thief", "and", "arrogant", "form", ",", "they", "again", "show", "they", "wolfish", "appearance"]
# ::pos_tags ["NNS", "VBG", "NN", "NNS", ".", "JJ", "JJ", "JJ", "NNS", "COMP", "NN", "CC", "NN", ".", "IN", "NNS", "IN", "NNS", "CC", "JJ", "NN", ",", "PRP", "RB", "VBP", "PRP$", "JJ", "NN"]
# ::ner_t

In [13]:
from stog.data.dataset_readers.amr_parsing.preprocess.recategorizer import Recategorizer

# Output of the input-cleaning step.
example_cleaned = example_annotated + ".input_clean"
# No training file is supplied and build_utils is off; dump_dir points at the
# existing utility directory.
amr_train_file, build_utils, dump_dir = None, False, util_dir

recategorizer = Recategorizer(train_data=amr_train_file,
                              build_utils=build_utils,
                              util_dir=dump_dir)

# The recategorized AMRs are only printed here; this cell does not write them to a file.
for amr in recategorizer.recategorize_file(example_cleaned):
    print(amr)


[2021-04-21 01:01:46,434 INFO] Done.

# ::id bolt12_64556_5627.3 ::date 2012-12-04T18:01:17 ::annotator SDL-AMR-09 ::preferred
# ::snt Fleets bumping fishing boats. Little evil Japanese ghosts stirring up trouble and unrest. With hearts of thieves and arrogant form, they again show their wolfish appearance
# ::tokens ["Fleets", "bumping", "fishing", "boats", ".", "Little", "evil", "NATIONALITY_1", "ghosts", "stirring-up", "trouble", "and", "unrest", ".", "With", "hearts", "of", "thieves", "and", "arrogant", "form", ",", "they", "again", "show", "their", "wolfish", "appearance"]
# ::lemmas ["fleet", "bump", "fishing", "boat", ".", "little", "evil", "NATIONALITY_1", "ghost", "stir-up", "trouble", "and", "unrest", ".", "with", "heart", "of", "thief", "and", "arrogant", "form", ",", "they", "again", "show", "they", "wolfish", "appearance"]
# ::pos_tags ["NNS", "VBG", "NN", "NNS", ".", "JJ", "JJ", "NNP", "NNS", "COMP", "NN", "CC", "NN", ".", "IN", "NNS", "IN", "NNS", "CC", "JJ", "NN", ",", 

### Test Data Anonymize

Untuk test data tidak perlu recategorize, karena cukup mengambil informasi anonimisasi dari proses recategorize yang sudah dilakukan pada training data. Pada dasarnya model di-train dengan vocab dari training data, jadi percuma kalau test data juga di-recategorize.

In [14]:
from stog.data.dataset_readers.amr_parsing.preprocess.text_anonymizor import  TextAnonymizor

# `os` is needed for the path join below and is not imported by the earlier
# cells, so import it here to keep the cell runnable on a fresh kernel.
import os

# Load the anonymization rules from the utility directory.
text_anonymizor = TextAnonymizor.from_json(
    os.path.join(util_dir, "text_anonymization_rules.json"))

# Anonymize each cleaned AMR, keep the anonymizer's result on the AMR as
# `abstract_map`, and write the AMRs out separated by a blank line.
with open(example_cleaned + ".recategorize", "w", encoding="utf-8") as f:
    for amr in AMRIO.read(example_cleaned):
        amr.abstract_map = text_anonymizor(amr)
        f.write(str(amr) + "\n\n")
# Show the last anonymized AMR for inspection.
print(amr)

# ::id bolt12_64556_5627.3 ::date 2012-12-04T18:01:17 ::annotator SDL-AMR-09 ::preferred
# ::snt Fleets bumping fishing boats. Little evil Japanese ghosts stirring up trouble and unrest. With hearts of thieves and arrogant form, they again show their wolfish appearance
# ::tokens ["Fleets", "bumping", "fishing", "boats", ".", "Little", "evil", "NATIONALITY_1", "ghosts", "stirring-up", "trouble", "and", "unrest", ".", "With", "hearts", "of", "thieves", "and", "arrogant", "form", ",", "they", "again", "show", "their", "wolfish", "appearance"]
# ::lemmas ["fleet", "bump", "fishing", "boat", ".", "little", "evil", "NATIONALITY_1", "ghost", "stir-up", "trouble", "and", "unrest", ".", "with", "heart", "of", "thief", "and", "arrogant", "form", ",", "they", "again", "show", "they", "wolfish", "appearance"]
# ::pos_tags ["NNS", "VBG", "NN", "NNS", ".", "JJ", "JJ", "NNP", "NNS", "COMP", "NN", "CC", "NN", ".", "IN", "NNS", "IN", "NNS", "CC", "JJ", "NN", ",", "PRP", "RB", "VBP", "PRP$", "JJ", "NN"

In [16]:
from stog.data.dataset_readers.amr_parsing.preprocess.sense_remover import SenseRemover
from stog.data.dataset_readers.amr_parsing.node_utils import NodeUtilities as NU

# Output of the anonymization step.
example_recategorized = example_cleaned + ".recategorize"

# Node utilities are loaded from the utility directory and handed to the sense remover.
node_utils = NU.from_json(util_dir, 0)
remover = SenseRemover(node_utils)

# Write every AMR yielded by the remover, separated by a blank line.
nosense_path = example_recategorized + '.nosense'
with open(nosense_path, 'w', encoding='utf-8') as nosense_out:
    for amr in remover.remove_file(example_recategorized):
        nosense_out.write(str(amr))
        nosense_out.write('\n\n')
remover.reset_statistics()
# Show the last processed AMR for inspection.
print(amr)

# ::id bolt12_64556_5627.3 ::date 2012-12-04T18:01:17 ::annotator SDL-AMR-09 ::preferred
# ::snt Fleets bumping fishing boats. Little evil Japanese ghosts stirring up trouble and unrest. With hearts of thieves and arrogant form, they again show their wolfish appearance
# ::tokens ["Fleets", "bumping", "fishing", "boats", ".", "Little", "evil", "NATIONALITY_1", "ghosts", "stirring-up", "trouble", "and", "unrest", ".", "With", "hearts", "of", "thieves", "and", "arrogant", "form", ",", "they", "again", "show", "their", "wolfish", "appearance"]
# ::lemmas ["fleet", "bump", "fishing", "boat", ".", "little", "evil", "NATIONALITY_1", "ghost", "stir-up", "trouble", "and", "unrest", ".", "with", "heart", "of", "thief", "and", "arrogant", "form", ",", "they", "again", "show", "they", "wolfish", "appearance"]
# ::pos_tags ["NNS", "VBG", "NN", "NNS", ".", "JJ", "JJ", "NNP", "NNS", "COMP", "NN", "CC", "NN", ".", "IN", "NNS", "IN", "NNS", "CC", "JJ", "NN", ",", "PRP", "RB", "VBP", "PRP$", "JJ", "NN"

In [17]:
import os

# Give the fully preprocessed file the short name that the prediction step
# reads. os.replace is used instead of a shell `mv` so the cell does not depend
# on a Unix shell being available; like `mv`, it overwrites an existing target.
os.replace("example.txt.features.input_clean.recategorize.nosense",
           "example.txt.preproc")

# Prediction

``` bash
python -u -m stog.commands.predict \
    --archive-file ckpt-amr-2.0 \
    --weights-file ckpt-amr-2.0/best.th \
    --input-file data/AMR/amr_2.0/test.txt.features.preproc \
    --batch-size 16 \
    --use-dataset-reader \
    --cuda-device 0 \
    --output-file test.pred.txt \
    --silent \
    --beam-size 5 \
    --predictor STOG
```

In [2]:
from stog.commands.predict import _predict
from dataclasses import dataclass
import torch

@dataclass
class PredictArgs:
    """Argument container handed to ``_predict`` in place of parsed CLI arguments.

    The field names mirror the flags of the ``stog.commands.predict`` command
    line (dashes replaced by underscores); the defaults point at the example
    files produced by this notebook.
    """

    # Model checkpoint directory and the weights file inside it.
    archive_file: str = "ckpt-amr-2.0"
    weights_file: str = "ckpt-amr-2.0/best.th"

    # Preprocessed input and how it is read.
    input_file: str = "example.txt.preproc"
    batch_size: int = 1
    use_dataset_reader: bool = True

    # Integer device index; a negative value selects the CPU in the driver code.
    cuda_device: int = 0

    # Where predictions are written and the remaining prediction options.
    output_file: str = "example.pred.txt"
    silent: bool = True
    beam_size: int = 5
    predictor: str = "STOG"

# Use the defaults, overriding only `silent`.
args = PredictArgs(silent=False)

# `cuda_device` is replaced by a torch.device before calling _predict.
# Fall back to the CPU when a GPU index is requested but CUDA is not available,
# so the cell still runs on machines without a usable GPU.
if args.cuda_device >= 0 and torch.cuda.is_available():
    device = torch.device('cuda:{}'.format(args.cuda_device))
else:
    device = torch.device('cpu')
args.cuda_device = device

# Known issue: this is a problem because the input is not batched; try
# overriding things so that prediction still runs even without batching.
_predict(args)



[2021-04-21 11:55:21,874 INFO] loading archive file ckpt-amr-2.0
[2021-04-21 11:55:21,877 INFO] Loading token dictionary from ckpt-amr-2.0\vocabulary.
[2021-04-21 11:55:21,919 INFO] Building the STOG Model...
[2021-04-21 11:55:21,921 INFO] loading archive file data/bert-base-cased
[2021-04-21 11:55:21,925 INFO] Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 28996
}

[2021-04-21 11:55:27,373 INFO] encoder_token: 18002
[2021-04-21 11:55:27,374 INFO] encoder_chars: 113
[2021-04-21 11:55:27,378 INFO] decoder_token: 12202
[2021-04-21 11:55:27,381 INFO] decoder_chars: 87
[2021-04-21 11:55:32,092 INFO] loading vocabulary file data/bert-base-cased/bert-base-cased-vocab.txt
[2021-04-21 11:55:32,218 INFO] instantiating registered

# Post Processing

In [6]:
from stog.data.dataset_readers.amr_parsing.postprocess.node_restore import NodeRestore
from stog.data.dataset_readers.amr_parsing.node_utils import NodeUtilities as NU
# Utility directory and the raw prediction file produced by the prediction step.
util_dir = "data/AMR/amr_2.0_utils"
file_path = "example.pred.txt"

node_utils = NU.from_json(util_dir, 0)
nr = NodeRestore(node_utils)

# Write every restored AMR, each followed by a blank line.
with open(file_path + '.frame', 'w', encoding='utf-8') as frame_out:
    frame_out.writelines(
        str(amr) + '\n\n' for amr in nr.restore_file(file_path)
    )


In [10]:
from stog.data.dataset_readers.amr_parsing.postprocess.wikification import Wikification

# Output of the node-restore step.
file_path = "example.pred.txt.frame"
# Switch between dumping spotlight wiki data and wikifying the file.
dump_spotlight_wiki = False

wikification = Wikification(util_dir=util_dir)

if not dump_spotlight_wiki:
    # Default path: load the utilities, then write each wikified AMR followed
    # by a blank line.
    wikification.load_utils()
    with open(file_path + '.wiki', 'w', encoding='utf-8') as wiki_out:
        wiki_out.writelines(
            str(amr) + '\n\n' for amr in wikification.wikify_file(file_path)
        )
else:
    wikification.dump_spotlight_wiki(file_path)

In [11]:
from stog.data.dataset_readers.amr_parsing.postprocess.expander import Expander

# Output of the wikification step.
file_path = "example.pred.txt.frame.wiki"

expander = Expander(util_dir=util_dir)

# Write every expanded AMR, each followed by a blank line.
with open(file_path + '.expand', 'w', encoding='utf-8') as expand_out:
    expand_out.writelines(
        str(amr) + '\n\n' for amr in expander.expand_file(file_path)
    )

[2021-04-21 12:26:35,646 INFO] Restored 1 name nodes.
[2021-04-21 12:26:35,647 INFO] Expanded 1 name nodes.
[2021-04-21 12:26:35,650 INFO] Expanded 0 date nodes.
[2021-04-21 12:26:35,652 INFO] Expanded 0 score nodes.
[2021-04-21 12:26:35,655 INFO] Expanded 0 ordinal nodes.
[2021-04-21 12:26:35,658 INFO] Expanded 0 quantities.
[2021-04-21 12:26:35,662 INFO] Expanded 0 urls.
