In [2]:
from datasets import load_dataset
import amrlib
from amrlib.alignments.rbw_aligner import RBWAligner
from amrlib.graph_processing.annotator import add_lemmas
import spacy

class SemanticEntailmentAMRDataset:
    """Wrap a HuggingFace dataset together with an amrlib parser and a spaCy tokenizer.

    The dataset is selected by name ('hans' or one of the GLUE configurations
    listed in ``_GLUE_TASKS``). AMR conversion (``to_amr``) is only implemented
    for 'cola'.
    """

    # GLUE configurations that are loaded through load_dataset('glue', <name>).
    _GLUE_TASKS = ('mnli', 'wnli', 'qnli', 'mnli_mismatched', 'mnli_matched', 'cola')

    # NOTE(review): the default parser path is an absolute, machine-specific
    # location; callers on other machines must pass amr_parser_path explicitly.
    def __init__(self, dataset_name, amr_parser_path='/home/david/tmp/model_parse_xfm_bart_base-v0_1_0'):
        """Load the parser, the tokenizer and the requested dataset.

        Args:
            dataset_name: 'hans' or one of the names in ``_GLUE_TASKS``.
            amr_parser_path: directory handed to ``amrlib.load_stog_model``.

        Raises:
            ValueError: if ``dataset_name`` is not a known dataset.
        """
        # Validate the name first so a typo fails immediately instead of after
        # the parser and spaCy model have already been loaded.
        if dataset_name != 'hans' and dataset_name not in self._GLUE_TASKS:
            raise ValueError(f'Dataset {dataset_name} does not exist.')

        self.amr_parser = amrlib.load_stog_model(amr_parser_path)
        self.dataset_name = dataset_name
        self.nlp_tokenizer = spacy.load("en_core_web_sm")

        if dataset_name == 'hans':
            self.dataset = load_dataset('hans')
        else:
            self.dataset = load_dataset('glue', dataset_name)

    def splits(self):
        """Return the keys of the loaded dataset (its split names)."""
        return self.dataset.keys()

    def to_amr(self, split):
        """Parse every sentence of ``split`` with the AMR parser.

        Args:
            split: key into ``self.dataset``.

        Returns:
            A ``(parsed_sents, idx_list)`` pair: the parser output for each
            entry's 'sentence' field and the matching 'idx' values, in dataset
            order. Both are empty lists when the split has no entries.

        Raises:
            ValueError: if the wrapped dataset is not 'cola'.
        """
        if self.dataset_name != 'cola':
            raise ValueError(f'Dataset {self.dataset_name} not supported.')

        sent_list, idx_list = [], []
        for entry in self.dataset[split]:
            sent_list.append(entry['sentence'])
            idx_list.append(entry['idx'])

        # An empty split used to crash while unpacking zip(*[]); there is
        # nothing to parse, so return empty results without calling the parser.
        if not sent_list:
            return [], []

        parsed_sents = self.amr_parser.parse_sents(sent_list, add_metadata=True)
        return parsed_sents, idx_list

    def aligned_AMR(self, arm_graph_string):
        """Annotate a graph string with lemmas and return the aligner's penman graph.

        Args:
            arm_graph_string: AMR graph string; ``add_lemmas`` is called with
                ``snt_key='snt'``. (The parameter name is kept as-is for
                keyword-argument compatibility.)

        Returns:
            The result of ``RBWAligner.get_penman_graph()`` for the annotated graph.
        """
        pg = add_lemmas(arm_graph_string, snt_key='snt')
        aligner = RBWAligner.from_penman_w_json(pg)
        return aligner.get_penman_graph()

    def get_alignments(self, text):
        """Return character spans for each token of ``text``.

        The i-th entry is a ``(start, end)`` tuple for the i-th token produced
        by ``self.nlp_tokenizer``. ``start`` is ``token.idx`` and ``end`` is
        ``token.idx + len(token)``, i.e. the end index is exclusive.
        """
        return [(token.idx, token.idx + len(token)) for token in self.nlp_tokenizer(text)]

    def __str__(self):
        """Return the string form of the wrapped dataset."""
        return str(self.dataset)




# Build the dataset wrapper for the 'cola' dataset; the constructor also loads
# the AMR parser and the spaCy model.
a = SemanticEntailmentAMRDataset('cola')

# __str__ returns str(self.dataset), so this prints the loaded dataset's string form.
print(a)

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [17]:
# Parse the 'test' split: ps receives the parser output, li the matching 'idx' values.
ps, li = a.to_amr('test')

gid=x Start paren present but and is not a new concept
gid=x Start paren present but amr-unknown is not a new concept
gid=x Start paren present but old is not a new concept
gid=x Start paren present but likely-01 is not a new concept
gid=x Missing starting paren for node a3/any
gid=x Start paren present but name is not a new concept
gid=x Initial node constructed when triples not empty, ignoring token.
gid=x Missing starting paren for node g2/garden
gid=x Start paren present but person is not a new concept
gid=x Start paren present but asparagus is not a new concept
gid=x Start paren present but eat-01 is not a new concept
gid=x Start paren present but name is not a new concept
gid=x Start paren present but right-06 is not a new concept
gid=x Start paren present but believe-01 is not a new concept
gid=x Start paren present but play-11 is not a new concept
gid=x Start paren present but drop-01 is not a new concept
gid=x Start paren present but drop-01 is not a new concept
gid=x Start pa

In [18]:
# Display the first parsed result.
ps[0]

'# ::snt Bill whistled past the house.\n(w / whistle-01\n      :ARG0 (p / person\n            :name (n / name\n                  :op1 "Bill"))\n      :path (p2 / past\n            :op1 (h / house)))'

In [19]:
from amrlib.graph_processing.annotator import add_lemmas
import spacy
# Load the spaCy English model; `nlp` is not referenced again in this cell.
nlp = spacy.load("en_core_web_sm")

# Annotate the first parsed graph with lemmas, passing 'snt' as the sentence key.
penman_graph = add_lemmas(ps[0], snt_key='snt')

OSError: [E049] Can't find spaCy data directory: 'None'. Check your installation and permissions, or use spacy.util.set_data_path to customise the location if necessary.

In [None]:
from amrlib.alignments.rbw_aligner import RBWAligner
from amrlib.graph_processing.annotator import add_lemmas

# ps[0] comes straight from the parser and carries only '::snt' metadata, so it
# is not "properly annotated" for from_string_w_json. Annotate it with lemmas
# first and build the aligner from the resulting penman graph instead.
aligner = RBWAligner.from_penman_w_json(add_lemmas(ps[0], snt_key='snt'))
penman_graph = aligner.get_penman_graph()