In [39]:
import json

import amrlib
import penman
import spacy
from amrlib.alignments.rbw_aligner import RBWAligner
from amrlib.graph_processing.annotator import add_lemmas
from datasets import load_dataset

class SemanticEntailmentAMRDataset:
    """Wrap a HuggingFace dataset and convert its sentences to aligned AMR graphs.

    The constructor loads three things: the amrlib sentence-to-graph parser,
    the spaCy ``en_core_web_sm`` pipeline (used to recover character offsets of
    tokens) and the requested dataset.

    Only ``'cola'`` is currently supported by :meth:`to_amr`; the other dataset
    names can be loaded and inspected but not converted.
    """

    # GLUE configurations that are loaded through ``load_dataset('glue', name)``.
    _GLUE_DATASETS = ('mnli', 'wnli', 'qnli', 'mnli_mismatched', 'mnli_matched', 'cola')

    def __init__(self, dataset_name, amr_parser_path='/home/david/tmp/model_parse_xfm_bart_base-v0_1_0'):
        """Load the dataset, the AMR parser and the spaCy tokenizer.

        Args:
            dataset_name: ``'hans'`` or one of the GLUE names in ``_GLUE_DATASETS``.
            amr_parser_path: Directory of the amrlib sentence-to-graph model.
                NOTE(review): the default is a machine-specific absolute path;
                pass an explicit path on any other machine.

        Raises:
            ValueError: If ``dataset_name`` is not one of the known datasets.
        """
        # Validate the name first so a typo fails before the slow model loads.
        if dataset_name != 'hans' and dataset_name not in self._GLUE_DATASETS:
            raise ValueError(f'Dataset {dataset_name} does not exist.')

        self.amr_parser = amrlib.load_stog_model(amr_parser_path)
        self.dataset_name = dataset_name
        self.nlp_tokenizer = spacy.load('en_core_web_sm')

        if dataset_name == 'hans':
            self.dataset = load_dataset('hans')
        else:
            self.dataset = load_dataset('glue', dataset_name)

    def splits(self):
        """Return the split names (the keys of the loaded dataset dict)."""
        return self.dataset.keys()

    def to_amr(self, split):
        """Parse every sentence of ``split`` to AMR and align it to its tokens.

        Args:
            split: Key into the loaded dataset (one of :meth:`splits`).

        Returns:
            A list with one ``(amr_string, alignments)`` tuple per sentence, in
            dataset order. ``amr_string`` is the penman-encoded aligned graph
            including its metadata header; ``alignments`` is the list produced
            by ``__get_alignments``. An empty split yields an empty list.

        Raises:
            NotImplementedError: If the wrapped dataset is not ``'cola'``.
        """
        if self.dataset_name != 'cola':
            raise NotImplementedError(f'Dataset {self.dataset_name} not supported.')

        sent_list = [entry['sentence'] for entry in self.dataset[split]]
        if not sent_list:
            # Nothing to parse; avoid handing an empty batch to the parser.
            return []

        parsed_sents = self.amr_parser.parse_sents(sent_list, add_metadata=True)
        print('All parsed to AMR')

        aligned_amr_sentences = [self.__aligned_AMR(amr_sentence) for amr_sentence in parsed_sents]
        print('All AMR aligned')

        processed_amr_sentence_alignment_pairs = [
            (penman.encode(aligned), self.__get_alignments(aligned))
            for aligned in aligned_amr_sentences
        ]
        print('All Tokens aligned')

        return processed_amr_sentence_alignment_pairs

    def __aligned_AMR(self, arm_graph_string):
        """Annotate one AMR graph string with lemmas and run the RBW aligner on it.

        Args:
            arm_graph_string: A single parsed AMR graph string whose metadata
                contains the source sentence under the ``snt`` key.

        Returns:
            The penman graph object returned by the aligner.
        """
        pg = add_lemmas(arm_graph_string, snt_key='snt')
        aligner = RBWAligner.from_penman_w_json(pg)
        return aligner.get_penman_graph()

    def __get_alignments(self, penman_graph):
        """Return the character span of every token of the graph's sentence.

        The i-th entry is a ``(start, end)`` tuple of character offsets into
        ``metadata['snt']`` for the i-th entry of ``metadata['tokens']``.
        ``end`` is exclusive (it is computed as ``start + len(token)``).

        Raises:
            AssertionError: If a token stored in the graph metadata differs
                from the token spaCy produces at the same position.
        """
        ref_tokens = self.nlp_tokenizer(penman_graph.metadata['snt'])
        alignments = []
        for i, token in enumerate(json.loads(penman_graph.metadata['tokens'])):
            ref = ref_tokens[i]
            # The offsets come from spaCy, so they are only valid if both
            # tokenizations agree position by position.
            assert token == ref.text, f'token are unexpectedly different {token} - {ref.text}'
            alignments.append((ref.idx, ref.idx + len(ref)))
        return alignments

    def __str__(self):
        """Return the string form of the wrapped dataset."""
        return str(self.dataset)




# Build the wrapper for GLUE/CoLA; the constructor also loads the AMR parser and the spaCy model.
a = SemanticEntailmentAMRDataset('cola')

# __str__ returns str() of the wrapped dataset, so this prints the loaded DatasetDict.
print(a)

Reusing dataset glue (/home/david/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 3/3 [00:00<00:00, 1944.51it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})





In [40]:
# Parse and align the CoLA 'test' split; the result is a list of (AMR string, token character spans) pairs.
l = a.to_amr('test')

gid=x Start paren present but and is not a new concept
gid=x Start paren present but amr-unknown is not a new concept
gid=x Start paren present but old is not a new concept
gid=x Start paren present but likely-01 is not a new concept
gid=x Missing starting paren for node a3/any
gid=x Start paren present but name is not a new concept
gid=x Initial node constructed when triples not empty, ignoring token.
gid=x Missing starting paren for node g2/garden
gid=x Start paren present but person is not a new concept
gid=x Start paren present but asparagus is not a new concept
gid=x Start paren present but eat-01 is not a new concept
gid=x Start paren present but name is not a new concept
gid=x Start paren present but right-06 is not a new concept
gid=x Start paren present but believe-01 is not a new concept
gid=x Start paren present but play-11 is not a new concept
gid=x Start paren present but drop-01 is not a new concept
gid=x Start paren present but drop-01 is not a new concept
gid=x Start pa

All parsed to AMR
All AMR aligned
All Tokens aligned


In [42]:
# Inspect the first (AMR string, alignments) pair.
l[0]

('# ::snt Bill whistled past the house.\n# ::tokens ["Bill", "whistled", "past", "the", "house", "."]\n# ::lemmas ["Bill", "whistle", "past", "the", "house", "."]\n# ::alignments 0-1.1.1.1 1-1 2-1.2 4-1.2.1\n(w / whistle-01~e.1\n   :ARG0 (p / person\n            :name (n / name\n                     :op1 "Bill"~e.0))\n   :path (p2 / past~e.2\n             :op1 (h / house~e.4)))',
 [(0, 4), (5, 13), (14, 18), (19, 22), (23, 28), (28, 29)])

In [13]:
# NOTE(review): stale scratch cell -- its execution count (13) is lower than that of the cell
# defining the class (39). `ps` is not defined in any visible cell, and the class as shown only
# has the private `__aligned_AMR`, not `aligned_AMR`, so this cell presumably fails on
# Restart & Run All. TODO: delete it or port it to the current class API.
print(ps[0])
import penman
b = a.aligned_AMR(ps[0])
print(b.metadata['tokens'])
penman.encode(a.aligned_AMR(ps[0]))


# ::snt Bill whistled past the house.
(w / whistle-01
      :ARG0 (p / person
            :name (n / name
                  :op1 "Bill"))
      :path (p2 / past
            :op1 (h / house)))
["Bill", "whistled", "past", "the", "house", "."]


'# ::snt Bill whistled past the house.\n# ::tokens ["Bill", "whistled", "past", "the", "house", "."]\n# ::lemmas ["Bill", "whistle", "past", "the", "house", "."]\n# ::alignments 0-1.1.1.1 1-1 2-1.2 4-1.2.1\n(w / whistle-01~e.1\n   :ARG0 (p / person\n            :name (n / name\n                     :op1 "Bill"~e.0))\n   :path (p2 / past~e.2\n             :op1 (h / house~e.4)))'

In [None]:
# Dataset name -> list of split names.
combos = {'cola': ['train', 'validation', 'test']}

In [5]:
# Scratch check: load GLUE/CoLA directly and print the resulting DatasetDict.
from datasets import load_dataset
www = load_dataset('glue','cola')
print(www)

Reusing dataset glue (/home/david/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 3/3 [00:00<00:00, 1405.44it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})





In [None]:
# Dataset name -> list of split names to preprocess.
combos = {'cola': ['train', 'validation', 'test']}

def preprocess_all_data(tasks):
    """Convert every requested dataset split to aligned AMR and dump it as JSON.

    Args:
        tasks: Mapping from dataset name to an iterable of split names,
            e.g. ``{'cola': ['train', 'validation', 'test']}``.

    Side effects:
        Writes one file per (dataset, split) pair, named
        ``amr_data_{dataset}_{split}.json``, into the current working
        directory and prints a line after each file is written.
    """
    for dataset_name, split_names in tasks.items():
        # Built once per dataset and reused for all of its splits.
        ds = SemanticEntailmentAMRDataset(dataset_name)
        for split in split_names:
            # Pass the loop variable: the literal string 'task' is not a split name.
            res = ds.to_amr(split)
            f_name = f'amr_data_{dataset_name}_{split}.json'
            with open(f_name, 'w') as f:
                json.dump(res, f)
            print(f'Just finished {f_name}')

# Entry point: run the full preprocessing for every dataset/split listed in `combos`.
# NOTE(review): in a notebook kernel __name__ is '__main__', so executing this cell starts the run.
if __name__ == '__main__':
    preprocess_all_data(combos)
