# Manage all the multiple type/multiple class mapping nouns and verbs

Read the definition files (noun-multi-en.txt and verb-multi-en.txt) for terms that map to different or multiple DNA classes. Output the results as noun and verb pickle files (nouns-multiple-en.pickle and verbs-multiple-en.pickle) that can be imported into the DNA processing.

Eventually, these files will be output as TTL and loaded into the ontology directly.

In [4]:
# Map all the multiple choice and/or multiple type nouns/verbs (noun-multi-en.txt and verb-multi-en.txt)
#   into a dictionary format (key = term, value = list of class names) and then pickle the dictionaries
import pickle

def get_mappings(file_text: str, is_verb: bool, word_dict: dict):
    """
    Parse the text of a noun/verb multi-mapping definition file into word_dict.

    Each non-blank line is expected to have the form:
        <term> ['<class name>', '<class name>', ...]
    The term becomes a key of word_dict and the list of class names becomes its value.
    Problems are reported with print() and do not stop the processing:
      * A term that is already in word_dict is reported and its first definition is kept
      * A class name that does not start with 'urn:' (and is not '+do') is reported but still stored
      * A line without a ['...'] section is reported and skipped

    :param file_text: The complete text of the definition file
    :param is_verb: Not used by the parsing; retained so that existing calls remain valid
    :param word_dict: Dictionary that is updated in place with the term -> class names mappings
    :return: None (word_dict is updated)
    """
    for wmap in file_text.split("\n"):
        if not wmap.strip():
            continue                  # Blank line, such as the one after a trailing newline
        if "['" not in wmap:
            print('malformed line: ', wmap)
            continue
        word = wmap.split(" ['")[0]
        if word in word_dict:
            print('dup term: ', word)
            continue
        class_names = wmap.split("['")[1].split("']")[0]
        indiv_names = class_names.split("', '")
        for indiv_name in indiv_names:
            if not indiv_name.startswith('urn:') and indiv_name != '+do':
                print('not urn:', word, indiv_name)
        word_dict[word] = indiv_names

# Load the verb definitions (key = verb text, value = list of DNA class names)
verbs_multiple = dict()
with open("verb-multi-en.txt", "r") as f:   # Mapping of verb to DNA class names
    f_text = f.read()
get_mappings(f_text, True, verbs_multiple)
# Load the noun definitions (key = noun text, value = list of DNA class names)
nouns_multiple = dict()
with open("noun-multi-en.txt", "r") as f:   # Mapping of noun to DNA class names
    f_text = f.read()
get_mappings(f_text, False, nouns_multiple)  # These are nouns, so is_verb is False

In [5]:
# For example, show the verb mappings that were loaded (key = verb text, value = list of class names)
print(verbs_multiple)



In [6]:
# Save each dictionary to its own pickle file (nouns first, then verbs)
for pickle_name, mappings in (('nouns-multiple-en.pickle', nouns_multiple),
                              ('verbs-multiple-en.pickle', verbs_multiple)):
    with open(pickle_name, 'wb') as out_file:
        pickle.dump(mappings, out_file)

# Check for Duplicates in noun and verb synonyms in the ttl files

The current ontology files are loaded into the 'ontologies' db in Stardog.

In [4]:
import os

import stardog

# Connection settings for the Stardog server. The defaults are the values used for a local server,
# and can be overridden with environment variables so that real credentials are never stored in the notebook.
sd_conn_details = {'endpoint': os.environ.get('STARDOG_ENDPOINT', 'http://localhost:5820'),
                   'username': os.environ.get('STARDOG_USERNAME', 'admin'),
                   'password': os.environ.get('STARDOG_PASSWORD', 'admin')}

# Find noun synonym texts that are defined for two different classes. Classes whose IRIs contain
# "Currency" or ":enum:" are excluded, and the final FILTER reports each pair of classes only once.
query_dup_noun_syn = 'prefix : <urn:ontoinsights:dna:> select ?class1 ?class2 ?term where ' \
                     '{?class1 :noun_synonym ?term . ?class2 :noun_synonym ?term . ' \
                     'FILTER (!CONTAINS(str(?class1), "Currency")) . FILTER (!CONTAINS(str(?class2), "Currency")) . ' \
                     'FILTER (!CONTAINS(str(?class1), ":enum:")) . FILTER (!CONTAINS(str(?class2), ":enum:")) . ' \
                     'FILTER (str(?class1) < str(?class2))}'
# Find verb synonym texts that are defined for two different classes (each pair is reported once)
query_dup_verb_syn = 'prefix : <urn:ontoinsights:dna:> select ?class1 ?class2 ?term where ' \
                     '{?class1 :verb_synonym ?term . ?class2 :verb_synonym ?term . FILTER (str(?class1) < str(?class2))}'

# Connection to the database holding the current ontology files
dna_conn = stardog.Connection("ontologies", **sd_conn_details)

In [5]:
# Report each noun synonym text that is defined for two different classes
noun_dups = dna_conn.select(query_dup_noun_syn, content_type='application/sparql-results+json')
for binding in noun_dups['results']['bindings']:
    first_class, second_class = binding['class1']['value'], binding['class2']['value']
    shared_text = binding['term']['value']
    print('noun text: ', shared_text, ', in classes: ', first_class, second_class)
# Notes on the overlaps reported:
#   Types of business may overlap events
#   Antarctica is both a continent and country
#   Daba is both a kind of Tibetan Buddhism and an ethnicity (Nigeria and Cameroon)

noun text:  Antarctica , in classes:  urn:ontoinsights:geonames:6255152 urn:ontoinsights:geonames:6697173
noun text:  agriculture , in classes:  urn:ontoinsights:dna:Agribusiness urn:ontoinsights:dna:AgricultureApicultureAndAquacultureEvent
noun text:  apiculture , in classes:  urn:ontoinsights:dna:Agribusiness urn:ontoinsights:dna:AgricultureApicultureAndAquacultureEvent
noun text:  aquaculture , in classes:  urn:ontoinsights:dna:Agribusiness urn:ontoinsights:dna:AgricultureApicultureAndAquacultureEvent
noun text:  viticulture , in classes:  urn:ontoinsights:dna:Agribusiness urn:ontoinsights:dna:AgricultureApicultureAndAquacultureEvent
noun text:  Daba , in classes:  urn:ontoinsights:dna:Ethnicity urn:ontoinsights:dna:ReligiousBelief


In [6]:
# Report each verb synonym text that is defined for two different classes
verb_dups = dna_conn.select(query_dup_verb_syn, content_type='application/sparql-results+json')
for binding in verb_dups['results']['bindings']:
    first_class, second_class = binding['class1']['value'], binding['class2']['value']
    shared_text = binding['term']['value']
    print('verb text: ', shared_text, ', in classes: ', first_class, second_class)

# Check for duplicate terms in the multiple classes dictionaries and the DNA ontology TTL files

In [7]:
# Queries to find the class(es) that define a specific text as a noun/verb synonym.
# '?term' acts as a placeholder: before a query is executed, it is replaced by a quoted,
# @en-tagged literal holding the text to be checked.
query_multi_noun = 'prefix : <urn:ontoinsights:dna:> select ?class where {?class :noun_synonym ?term}'
query_multi_verb = 'prefix : <urn:ontoinsights:dna:> select ?class where {?class :verb_synonym ?term}'


In [8]:
# Report every term of the verb multi-mapping dictionary that is also a verb synonym in the ontology
for verb_text in verbs_multiple:
    verb_query = query_multi_verb.replace('?term', f'"{verb_text}"@en')
    verb_response = dna_conn.select(verb_query, content_type='application/sparql-results+json')
    if 'results' not in verb_response or 'bindings' not in verb_response['results']:
        continue
    for binding in verb_response['results']['bindings']:
        print(f'Duplicate {verb_text} in verbs-multi and {binding["class"]["value"]}')

In [9]:
# Report every term of the noun multi-mapping dictionary that is also a noun synonym in the ontology
for noun_text in nouns_multiple:
    noun_query = query_multi_noun.replace('?term', f'"{noun_text}"@en')
    noun_response = dna_conn.select(noun_query, content_type='application/sparql-results+json')
    if 'results' not in noun_response or 'bindings' not in noun_response['results']:
        continue
    for binding in noun_response['results']['bindings']:
        print(f'Duplicate {noun_text} in nouns-multi and {binding["class"]["value"]}')
# Duplicates due to enumerations

Duplicate basin in nouns-multi and urn:ontoinsights:dna:enum:CanyonAndValley
Duplicate female in nouns-multi and urn:ontoinsights:dna:enum:Female
Duplicate male in nouns-multi and urn:ontoinsights:dna:enum:Male
Duplicate stream in nouns-multi and urn:ontoinsights:dna:enum:RiverAndWaterway


# Check for duplicate synset ids and create dictionaries (key=synId, value=DNA class mapping)

This is based on the ontology files loaded into the 'ontologies' database.

In [10]:
# Retrieve every class that declares a WordNet noun synset id, sorted by the synset id
query_dup_noun_syn_id = (
    'prefix : <urn:ontoinsights:dna:> select ?class ?synset where {?class :wordnet_noun_synset ?synset } '
    'ORDER BY ?synset'
)
# Retrieve every class that declares a WordNet verb synset id, sorted by the synset id
query_dup_verb_syn_id = (
    'prefix : <urn:ontoinsights:dna:> select ?class ?synset where {?class :wordnet_verb_synset ?synset } '
    'ORDER BY ?synset'
)

dna_prefix = 'urn:ontoinsights:dna'

# Dictionaries to be filled with key = synset id and value = class mapping
mapped_noun_synsets = {}
mapped_verb_synsets = {}

In [11]:
# Check for overlapping WordNet noun synset ids: the first class found for an id is recorded
# (without the dna_prefix text) and any later class with the same id is reported
noun_response = dna_conn.select(query_dup_noun_syn_id, content_type='application/sparql-results+json')
if 'results' in noun_response and 'bindings' in noun_response['results']:
    for noun_binding in noun_response['results']['bindings']:
        noun_synset = noun_binding['synset']['value']
        noun_class = noun_binding['class']['value']
        if noun_synset not in mapped_noun_synsets:
            mapped_noun_synsets[noun_synset] = noun_class.replace(dna_prefix, '')
            continue
        print(f'Repeated noun {noun_synset} for {mapped_noun_synsets[noun_synset]} and {noun_class}')

In [12]:
# Check for overlapping WordNet verb synset ids: the first class found for an id is recorded
# (without the dna_prefix text) and any later class with the same id is reported
verb_response = dna_conn.select(query_dup_verb_syn_id, content_type='application/sparql-results+json')
if 'results' in verb_response and 'bindings' in verb_response['results']:
    for verb_binding in verb_response['results']['bindings']:
        verb_synset = verb_binding['synset']['value']
        verb_class = verb_binding['class']['value']
        if verb_synset not in mapped_verb_synsets:
            mapped_verb_synsets[verb_synset] = verb_class.replace(dna_prefix, '')
            continue
        print(f'Repeated verb {verb_synset} for {mapped_verb_synsets[verb_synset]} and {verb_class}')

In [13]:
# For example, show the verb synset dictionary (key = synset id, value = class with the dna_prefix text removed)
print(mapped_verb_synsets)



# Process the WordNet hypernym tree

First, create RDF representing the WordNet hypernym-hyponym trees for a synset id and the texts associated with the synset. The RDF is saved in the files, noun-hierarchy.ttl and verb-hierarchy.ttl, which are loaded into the 'noun-wn' and 'verb-wn' databases in Stardog.

Second, iterate through the complete set of synsets in the hierarchy and match them to DNA classes using the mapped_noun/verb_synsets dictionaries. Processing is explained inline, below.

## Create RDF representation of the WordNet hypo/hypernym trees

And load the results to the Stardog databases, noun-wn and verb-wn

In [14]:
# Parse the WordNet noun data: for each synset, capture the ids of its hypernyms and any of its terms
# that are marked with a lex_id of 0
# (Lex_id identifies a sense within a lexicographer file, starts with 0, and is incremented as new senses are added)
with open("wordnet-data-noun.txt", "r") as wn:
    wn_text = wn.read()
wn_detail = wn_text.split("\n")

syn_noun_dict = dict()      # Key = synset id, value = tuple of (list of terms, list of hypernym synset ids)
for syn_detail in wn_detail:
    if syn_detail.startswith("synOffset"):  # First line of the file
        continue
    if " @i " in syn_detail:                # Synset represents a particular instance
        continue
    syn_split = syn_detail.split()
    if len(syn_split) < 4:                  # Blank or truncated line (the word count at index 3 is required)
        continue
    syn = syn_split[0]
    # NOTE(review): In the standard WordNet data file format, the word count is a two-digit hexadecimal
    #   value. It is parsed as a decimal here - confirm that wordnet-data-noun.txt stores it as a decimal.
    number_words = int(syn_split[3])
    # The terms and their lex_ids alternate, starting at index 4
    syn_words = [syn_split[i].replace('_', ' ')
                 for i in range(4, 4 + (number_words * 2), 2)
                 if syn_split[i+1] == '0']   # First sense
    hypernyms = []
    if " @ " in syn_detail:
        at_split = syn_detail.split(" @ ")   # Indicates a hypernym
        for i in range(1, len(at_split)):
            hyper_split = at_split[i].split()
            if len(hyper_split) < 3:         # Not a complete pointer (synset id, part of speech, source/target)
                continue
            if hyper_split[1] == "n" and hyper_split[2] == "0000":
                hypernyms.append(f"{hyper_split[0]}")
    syn_noun_dict[syn] = (syn_words, hypernyms)

# Write the noun synsets as Turtle: each synset is a urn:Synset, is a subclass of each of its hypernyms,
# and has an rdfs:label for each of its terms
with open("noun-hierarchy.ttl", "w") as ttl:
    ttl.write('@prefix urn: <urn:ontoinsights:dna:> .\n')
    # The rdfs prefix is used by the triples below, and has to be declared for the file to be valid Turtle
    ttl.write('@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n')
    for key, value in syn_noun_dict.items():
        syn_words, hypernyms = value
        ttl.write(f"urn:{key} a urn:Synset .\n")
        for hyper in hypernyms:
            ttl.write(f"urn:{key} rdfs:subClassOf urn:{hyper} .\n")
        for word in syn_words:
            ttl.write(f'urn:{key} rdfs:label "{word}" .\n')

In [15]:
# Parse the WordNet verb data: for each synset, capture the ids of its hypernyms and any of its first
# two terms that are marked with a lex_id of 0
with open("wordnet-data-verb.txt", "r") as wn:
    wn_text = wn.read()
wn_detail = wn_text.split("\n")

syn_verb_dict = dict()      # Key = synset id, value = tuple of (list of terms, list of hypernym synset ids)
for syn_detail in wn_detail:
    if syn_detail.startswith("synOffset"):  # First line of the file
        continue
    if " @i " in syn_detail:                # Synset represents a particular instance
        continue
    syn_split = syn_detail.split()
    if len(syn_split) < 4:                  # Blank or truncated line (the word count at index 3 is required)
        continue
    syn = syn_split[0]
    # NOTE(review): In the standard WordNet data file format, the word count is a two-digit hexadecimal
    #   value. It is parsed as a decimal here - confirm that wordnet-data-verb.txt stores it as a decimal.
    number_words = min(int(syn_split[3]), 2)   # Restrict to first two words
    # The terms and their lex_ids alternate, starting at index 4
    syn_words = [syn_split[i].replace('_', ' ')
                 for i in range(4, 4 + (number_words * 2), 2)
                 if syn_split[i+1] == '0']   # First usage
    hypernyms = []
    if " @ " in syn_detail:
        at_split = syn_detail.split(" @ ")   # Indicates a hypernym
        for i in range(1, len(at_split)):
            hyper_split = at_split[i].split()
            if len(hyper_split) < 3:         # Not a complete pointer (synset id, part of speech, source/target)
                continue
            if hyper_split[1] == "v" and hyper_split[2] == "0000":
                hypernyms.append(f"{hyper_split[0]}")
    syn_verb_dict[syn] = (syn_words, hypernyms)

# Write the verb synsets as Turtle: each synset is a urn:Synset, is a subclass of each of its hypernyms,
# and has an rdfs:label for each of its terms
with open("verb-hierarchy.ttl", "w") as ttl:
    ttl.write('@prefix urn: <urn:ontoinsights:dna:> .\n')
    # The rdfs prefix is used by the triples below, and has to be declared for the file to be valid Turtle
    ttl.write('@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n')
    for key, value in syn_verb_dict.items():
        syn_words, hypernyms = value
        ttl.write(f"urn:{key} a urn:Synset .\n")
        for hyper in hypernyms:
            ttl.write(f"urn:{key} rdfs:subClassOf urn:{hyper} .\n")
        for word in syn_words:
            ttl.write(f'urn:{key} rdfs:label "{word}" .\n')


## Check for new terms from the WordNet hierarchy that may map to DNA classes 

In [16]:
# Database connections where noun/verb-hierarchy.ttl are loaded
noun_conn = stardog.Connection("noun-wn", **sd_conn_details)
verb_conn = stardog.Connection("verb-wn", **sd_conn_details)
# Note that dna_conn is already set up to query the ontologies database

query_top_syn_id = 'prefix urn: <urn:ontoinsights:dna:> select distinct ?synset where {?synset a urn:Synset. ' \
                   'FILTER NOT EXISTS {?synset rdfs:subClassOf ?x} }'

top_noun_syn_ids = []
top_verb_syn_ids = []

In [17]:
# Collect the ids of the top noun synsets
noun_top_response = noun_conn.select(query_top_syn_id, content_type='application/sparql-results+json')
if 'results' in noun_top_response and 'bindings' in noun_top_response['results']:
    top_noun_syn_ids.extend(binding['synset']['value'] for binding in noun_top_response['results']['bindings'])

print(len(top_noun_syn_ids))

1


In [18]:
# Collect the ids of the top verb synsets
verb_top_response = verb_conn.select(query_top_syn_id, content_type='application/sparql-results+json')
if 'results' in verb_top_response and 'bindings' in verb_top_response['results']:
    top_verb_syn_ids.extend(binding['synset']['value'] for binding in verb_top_response['results']['bindings'])

print(len(top_verb_syn_ids))
print(top_verb_syn_ids)

566
['urn:ontoinsights:dna:00001740', 'urn:ontoinsights:dna:00078513', 'urn:ontoinsights:dna:00233707', 'urn:ontoinsights:dna:00104622', 'urn:ontoinsights:dna:01835473', 'urn:ontoinsights:dna:00010428', 'urn:ontoinsights:dna:00014542', 'urn:ontoinsights:dna:00015706', 'urn:ontoinsights:dna:00016695', 'urn:ontoinsights:dna:00017275', 'urn:ontoinsights:dna:00017858', 'urn:ontoinsights:dna:00018151', 'urn:ontoinsights:dna:00126072', 'urn:ontoinsights:dna:00020126', 'urn:ontoinsights:dna:00020442', 'urn:ontoinsights:dna:00109468', 'urn:ontoinsights:dna:00099475', 'urn:ontoinsights:dna:01806476', 'urn:ontoinsights:dna:02134989', 'urn:ontoinsights:dna:02610777', 'urn:ontoinsights:dna:00173351', 'urn:ontoinsights:dna:02636270', 'urn:ontoinsights:dna:00052091', 'urn:ontoinsights:dna:01620211', 'urn:ontoinsights:dna:00057124', 'urn:ontoinsights:dna:00721987', 'urn:ontoinsights:dna:00059330', 'urn:ontoinsights:dna:00077122', 'urn:ontoinsights:dna:00077211', 'urn:ontoinsights:dna:02321848', 'urn:

## Iterating from the "top" synset IDs  

* Iterate through the list of top_noun/verb synset IDs 
* When an id is found that is in the mapped_noun/verb_synsets ...
  * Begin capturing the WordNet syns/terms (defined as rdfs:labels) and add them as synonyms for the mapped DNA class
  * The mapped class is the value of the mapped_noun/verb_synset entry for the key = matching synId
* Continue down the hypo/hypernym tree until the end or until another ID is found in mapped_noun/verb_synsets
* If another ID is found, return to processing at the second bullet

In [19]:
# Synset ids that are mapped to a class
matched_noun_syns = list(mapped_noun_synsets)
matched_verb_syns = list(mapped_verb_synsets)

# Query templates - the texts 'synset', 'currSyn' and 'xxx' are placeholders that are replaced before executing
query_labels = 'prefix urn: <urn:ontoinsights:dna:> select ?label where {urn:synset rdfs:label ?label}'
query_subclasses = 'prefix urn: <urn:ontoinsights:dna:> select ?subCl where {?subCl rdfs:subClassOf urn:currSyn}'

query_synonyms = 'prefix : <urn:ontoinsights:dna:> select ?syn where {?dna_class :xxx_synonym ?syn}'

In [20]:
def walk_wn_hierarchy(syn_id, dna_class, syns, multi_syns, file_name, is_verb, new_syns):
    """
    Recursively process a synset and the synsets below it, saving the new synonyms that are found.

    If the synset id is one of the mapped noun/verb synsets, its class replaces dna_class. When a
    class is in effect, each label of the synset that is not already in syns, multi_syns or new_syns
    is added to new_syns and appended to the file as a noun/verb synonym triple for that class.
    The same processing is then applied to every subclass of the synset, using the class in effect.

    :param syn_id: The synset identifier, with or without the '<dna_prefix>:' text
    :param dna_class: The class in effect for the synset's parent (an empty string if there is none)
    :param syns: The synonym texts that are already defined
    :param multi_syns: The texts that are defined in the multi-mapping dictionary
    :param file_name: The file to which the new synonym triples are appended
    :param is_verb: True to process the verb hierarchy, False to process the noun hierarchy
    :param new_syns: List of the texts already saved, which is updated in place
    :return: None
    """
    json_results = 'application/sparql-results+json'
    # Select the connection, mappings and predicate for the part of speech
    if is_verb:
        wn_conn, matched_ids, synset_map, predicate = \
            verb_conn, matched_verb_syns, mapped_verb_synsets, ':verb_synonym'
    else:
        wn_conn, matched_ids, synset_map, predicate = \
            noun_conn, matched_noun_syns, mapped_noun_synsets, ':noun_synonym'

    bare_id = syn_id.replace(f'{dna_prefix}:', '')
    if bare_id in matched_ids:
        dna_class = synset_map[bare_id]      # A mapped synset overrides the inherited class

    if dna_class:
        # Get WordNet synonyms/terms
        label_response = wn_conn.select(query_labels.replace("synset", bare_id), content_type=json_results)
        if 'results' in label_response and 'bindings' in label_response['results']:
            for label_binding in label_response['results']['bindings']:
                candidate = label_binding['label']['value']
                if candidate in syns or candidate in multi_syns or candidate in new_syns:
                    continue                 # Not a new label
                new_syns.append(candidate)
                with open(file_name, 'a') as save_syn_file:
                    save_syn_file.write(f'{dna_class} {predicate} "{candidate}"@en .\n')

    # Get next, lower level in the hierarchy
    child_response = wn_conn.select(query_subclasses.replace('currSyn', bare_id), content_type=json_results)
    if 'results' in child_response and 'bindings' in child_response['results']:
        for child in child_response['results']['bindings']:
            walk_wn_hierarchy(child['subCl']['value'], dna_class, syns, multi_syns, file_name, is_verb, new_syns)

In [21]:
# Start the output file with the prefix declaration (the new synonym triples are appended to it)
file_name = 'all_wn_verb_synonyms.ttl'
with open(file_name, 'w') as syn_file:
    syn_file.write('@prefix : <urn:ontoinsights:dna:> .\n')

# Verb synonyms that are already defined in the ontology
verb_syn_response = dna_conn.select(query_synonyms.replace('xxx', 'verb'),
                                    content_type='application/sparql-results+json')
verb_syns = [binding['syn']['value'] for binding in verb_syn_response['results']['bindings']]

# Walk down the hierarchy from each of the top verb synsets, with no class in effect at the start
new_verb_syns = []
verb_multi_terms = list(verbs_multiple.keys())
for top in top_verb_syn_ids:
    walk_wn_hierarchy(top, '', verb_syns, verb_multi_terms, file_name, True, new_verb_syns)
    

In [22]:
# Start the output file with the prefix declaration (the new synonym triples are appended to it)
file_name = 'all_wn_noun_synonyms.ttl'
with open(file_name, 'w') as syn_file:
    syn_file.write('@prefix : <urn:ontoinsights:dna:> .\n')

# Noun synonyms that are already defined in the ontology
noun_syn_response = dna_conn.select(query_synonyms.replace('xxx', 'noun'),
                                    content_type='application/sparql-results+json')
noun_syns = [binding['syn']['value'] for binding in noun_syn_response['results']['bindings']]

# Walk down the hierarchy from each of the top noun synsets, with no class in effect at the start
new_noun_syns = []
noun_multi_terms = list(nouns_multiple.keys())
for top in top_noun_syn_ids:
    walk_wn_hierarchy(top, '', noun_syns, noun_multi_terms, file_name, False, new_noun_syns)

## Cleanup and redefinition

* Above TTLs (all_wn*.ttl files) manually reviewed and cleaned
  * Animal, plant, health, ... and other synonyms moved directly to wordnet-*.ttl files (for loading to Stardog)
  * Other concept and multi-class mappings remain in the files, nouns.csv and verbs.csv, which have the format:
    * First column = triple definition of the form, <class> :noun/verb_synonym "<text>"@en. 
    * Second column is an optional, additional entry that defines an override for a multiple inheritance or multiple alternative mapping
* Processing below takes the contents of the nouns.csv and verbs.csv files and either writes the text definitions for the multi-mappings to multi-nouns.txt/multi-verbs.txt (which are then added to the existing noun/verb-multi-en.txt files) or writes the remaining triples to noun-synonyms.ttl/verb-synonyms.ttl, which can be loaded directly into Stardog
  * After this step, the first cells of this notebook are rerun to create the pickle files and then, the validation cells are rerun to guarantee that there are no duplicates 

In [1]:
# Split verbs.csv into the multi-mapping definitions (multi-verbs.txt) and the synonym triples that
# have no override (verb-synonyms.ttl)
with open('verbs.csv', 'r') as csvfile:
    verb_details = csvfile.read().split('\n')
with open('multi-verbs.txt', 'w') as multifile:
    with open('verb-synonyms.ttl', 'w') as ttlfile:
        ttlfile.write('@prefix : <urn:ontoinsights:dna:> .\n')
        for line in verb_details:
            line = line.rstrip()         # Trailing whitespace would otherwise become part of a class name
            if not line:
                continue                 # Blank line, such as the one after a trailing newline
            if '@en .,' in line:
                parts = line.split('@en .,')
                triple = parts[0]
                multi = parts[1]         # Empty if there is no override in the second column
            elif line.endswith('@en .'):
                triple = line[:-len('@en .')]   # Only the triple is present, without a second column
                multi = ''
            else:
                print('Skipping malformed line: ', line)
                continue
            if multi:
                # Convert the override to the format of the multi-mapping definition files
                text = triple.split('"')[1]
                updated = multi.replace('"', '').replace(', ', "', 'urn:ontoinsights:dna:")\
                               .replace('+', '+urn:ontoinsights:dna:')
                multifile.write(f"{text} ['urn:ontoinsights:dna:{updated}']\n")
            else:
                ttlfile.write(f'{triple}@en .\n')

In [3]:
# Split nouns.csv into the multi-mapping definitions (multi-nouns.txt) and the synonym triples that
# have no override (noun-synonyms.ttl)
with open('nouns.csv', 'r') as csvfile:
    noun_details = csvfile.read().split('\n')
with open('multi-nouns.txt', 'w') as multifile:
    with open('noun-synonyms.ttl', 'w') as ttlfile:
        ttlfile.write('@prefix : <urn:ontoinsights:dna:> .\n')
        for line in noun_details:
            line = line.rstrip()         # Trailing whitespace would otherwise become part of a class name
            if not line:
                continue                 # Blank line, such as the one after a trailing newline
            if '@en .,' in line:
                parts = line.split('@en .,')
                triple = parts[0]
                multi = parts[1]         # Empty if there is no override in the second column
            elif line.endswith('@en .'):
                triple = line[:-len('@en .')]   # Only the triple is present, without a second column
                multi = ''
            else:
                print('Skipping malformed line: ', line)
                continue
            if multi:
                # Convert the override to the format of the multi-mapping definition files
                text = triple.split('"')[1]
                updated = multi.replace('"', '').replace(', ', "', 'urn:ontoinsights:dna:")\
                               .replace('+', '+urn:ontoinsights:dna:')
                multifile.write(f"{text} ['urn:ontoinsights:dna:{updated}']\n")
            else:
                ttlfile.write(f'{triple}@en .\n')