In [83]:
import nltk
from nltk.corpus import wordnet as wn
from tqdm import tqdm
from collections import defaultdict, Counter
import numpy as np
from numpy import linalg as LA
from bert_serving.client import BertClient

import fasttext
import spacy

import re
import os

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load WordNet 3.0

In [67]:
def _load_wordnet_taxonomy(pos, pos_label):
    """Collect definitions, hypernymy edges, parents and lemmas for one WordNet POS.

    Args:
        pos: POS code passed to ``wn.all_synsets`` (``'n'`` or ``'v'`` here).
        pos_label: human-readable POS name used in the printed summary.

    Returns:
        A 4-tuple ``(synset2def, hypernymy, hypo2hyper, synset2lemma)`` where
        ``synset2def`` maps synset name -> definition string, ``hypernymy`` is a
        list of ``[parent_name, child_name]`` pairs, ``hypo2hyper`` maps synset
        name -> list of its hypernym names, and ``synset2lemma`` maps synset
        name -> the lemma part of that name.
    """
    synset2def = {}
    hypernymy = []
    hypo2hyper = {}
    synset2lemma = {}
    for synset in tqdm(list(wn.all_synsets(pos))):
        name = synset.name()
        synset2def[name] = synset.definition()
        hypernymy.extend([name, hypo.name()] for hypo in synset.hyponyms())
        hypo2hyper[name] = [ele.name() for ele in synset.hypernyms()]
        # Synset names look like "<lemma>.<pos>.<sense>"; the lemma part may itself
        # contain dots, so strip the two trailing fields from the right instead of
        # keeping only the text before the first dot.
        synset2lemma[name] = name.rsplit(".", 2)[0]
    print(f"Number of nodes in {pos_label} taxonomy: {len(synset2def)}")
    print(f"Number of edges in {pos_label} taxonomy: {len(hypernymy)}")
    return synset2def, hypernymy, hypo2hyper, synset2lemma


n_synset2def, n_hypernymy, n_hypo2hyper, n_synset2lemma = _load_wordnet_taxonomy("n", "noun")
v_synset2def, v_hypernymy, v_hypo2hyper, v_synset2lemma = _load_wordnet_taxonomy("v", "verb")

100%|██████████| 82115/82115 [00:00<00:00, 157245.13it/s]
  0%|          | 0/13767 [00:00<?, ?it/s]

Number of nodes in noun taxonomy: 82115
Number of edges in noun taxonomy: 75850


100%|██████████| 13767/13767 [00:00<00:00, 161900.15it/s]

Number of nodes in verb taxonomy: 13767
Number of edges in verb taxonomy: 13239





# Load SemEval Task 14 data and merge it with WordNet

In [68]:
# Each dataset is a [data file, gold key file] pair.
train_data = ["../data/semeval-2016-task-14/data/training/training.data.tsv",
              "../data/semeval-2016-task-14/keys/gold/training/training.key.tsv"]
trial_data = ["../data/semeval-2016-task-14/data/trial/trial.data.tsv",
              "../data/semeval-2016-task-14/keys/gold/trial/trial.key.tsv"]
test_data = ["../data/semeval-2016-task-14/data/test/test.data.tsv",
             "../data/semeval-2016-task-14/keys/gold/test/test.key.tsv"]

# NOTE: this cell appends to the n_*/v_* containers built above, so running it twice
# on the same kernel duplicates the added edges.
for prefix, dataset in zip(("train", "val", "test"), (train_data, trial_data, test_data)):
    # Pass 1: register lemma and definition of every new term under "<prefix>.<idx>".
    with open(dataset[0], "r", encoding="utf-8") as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            lemma, pos, idx, def_sent, url = line.split("\t")
            lemma = "_".join(lemma.split())
            idx = prefix + "." + idx
            if pos == "noun":  # noun taxonomy
                synset2def, synset2lemma = n_synset2def, n_synset2lemma
            else:  # everything else goes to the verb taxonomy
                synset2def, synset2lemma = v_synset2def, v_synset2lemma
            synset2def[idx] = def_sent
            synset2lemma[idx] = lemma

    # Pass 2: turn every gold placement into hypernymy edges [parent, new term].
    with open(dataset[1], "r", encoding="utf-8") as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            idx, correct_position, op = line.split("\t")
            wnid = wn.synset(correct_position.replace("#", ".").replace(' ', "_")).name()
            # Read the POS from the right: "<lemma>.<pos>.<sense>" may have dots in
            # the lemma, in which case the second dot-separated field is not the POS.
            pos = wnid.rsplit(".", 2)[1]
            if pos == "n":  # noun taxonomy
                tag, hypernymy, hypo2hyper = "Noun", n_hypernymy, n_hypo2hyper
            else:
                tag, hypernymy, hypo2hyper = "Verb", v_hypernymy, v_hypo2hyper

            new_id = prefix + "." + idx
            if op == "attach":
                # The new term becomes a child of the gold synset.
                hypernymy.append([wnid, new_id])
                continue

            # Any other operation: place the new term under the parent(s) of the
            # gold synset, falling back to the gold synset itself when it has none.
            parents = hypo2hyper[wnid]
            if len(parents) < 1:
                print(f"[{tag}] The merged place has no parent: {new_id}")
                hypernymy.append([wnid, new_id])
            else:
                if len(parents) > 1:
                    print(f"[{tag}] The merged place has multiple parents: {new_id} -- {parents}")
                hypernymy.extend([hyper, new_id] for hyper in parents)

[Noun] The merged place has multiple parents: train.withdef.209 -- ['hand_tool.n.01', 'lever.n.01']
[Verb] The merged place has no parent: train.withdef.241
[Noun] The merged place has multiple parents: train.withdef.245 -- ['expulsion.n.03', 'reflex.n.01']
[Noun] The merged place has no parent: train.withdef.356
[Noun] The merged place has multiple parents: val.withdef.4 -- ['name.n.01', 'slang.n.02']
[Verb] The merged place has no parent: val.withdef.51
[Verb] The merged place has no parent: val.withdef.53
[Verb] The merged place has no parent: val.withdef.121
[Noun] The merged place has no parent: test.test.536
[Noun] The merged place has multiple parents: test.test.545 -- ['physical_condition.n.01', 'waking.n.01']


# Save term/taxo files and define term lemmas/definition_sentences

In [69]:
# Spot check: show the definition stored for one of the merged SemEval training terms.
n_synset2def['train.withdef.3']

'A proband is an individual being studied or reported on. A proband is usually the first affected individual in a family who brings a genetic disorder to the attention of the medical community.'

In [74]:
# Sanity check: print every noun term id whose first character is uppercase.
for n_synset in n_synset2lemma:
    starts_with_upper = n_synset[0].isupper()
    if starts_with_upper:
        print(n_synset)

In [71]:
dir_path = "/datadrive/structure_expan/data/semeval-2016-task-14-new/"

# One row per taxonomy: output file names followed by the structures they are built from.
taxonomy_files = [
    # noun
    ("wordnet_noun.terms", "wordnet_noun.taxo", "wordnet_noun.definitions",
     n_synset2lemma, n_hypernymy, n_synset2def),
    # verb
    ("wordnet_verb.terms", "wordnet_verb.taxo", "wordnet_verb.definitions",
     v_synset2lemma, v_hypernymy, v_synset2def),
]

for terms_name, taxo_name, defs_name, synset2lemma, edges, synset2def in taxonomy_files:
    # <id> TAB <lemma>||<id>
    with open(os.path.join(dir_path, terms_name), "w") as fout:
        for synset, lemma in synset2lemma.items():
            fout.write(f"{synset}\t{lemma}||{synset}\n")

    # <first element of the edge> TAB <second element of the edge>
    with open(os.path.join(dir_path, taxo_name), "w") as fout:
        for hypernymy in edges:
            fout.write(f"{hypernymy[0]}\t{hypernymy[1]}\n")

    # <id> TAB <definition>, written in the same id order as the terms file
    with open(os.path.join(dir_path, defs_name), "w") as fout:
        for synset in synset2lemma:
            fout.write(f"{synset}\t{synset2def[synset]}\n")


# Generate and save term initial (embedding) features 

In [58]:
# The assignment below rebinds the global name `fasttext` from the module to the loaded
# model (later cells pass `fasttext` as the model). Importing the module under a separate
# alias keeps this cell re-runnable: on a second run `fasttext` is already the model, and
# calling `load_model` through that name would fail.
import fasttext as _fasttext_module

fasttext = _fasttext_module.load_model("/datadrive/fastText-pretrained-embedding/cc.en.300.bin")
nlp = spacy.load("en_core_web_sm")
# bert = spacy.load("en_trf_bertbaseuncased_lg")
bc = BertClient()

#### Embedding 1 (bert_base_uncased_defonly): BERT embedding of definition sentences

In [29]:
# Freeze the noun ids and their definitions into two index-aligned lists, so that
# row i of the encoded matrix corresponds to n_synset_list[i].
n_synset_list = list(n_synset2def)
n_synset_definition_list = [n_synset2def[term_id] for term_id in n_synset_list]

n_bert_def_only_embed = bc.encode(n_synset_definition_list)

here is what you can do:
- or, start a new server with a larger "max_seq_len"
  '- or, start a new server with a larger "max_seq_len"' % self.length_limit)


In [31]:
# Freeze the verb ids and their definitions into two index-aligned lists, so that
# row i of the encoded matrix corresponds to v_synset_list[i].
v_synset_list = list(v_synset2def)
v_synset_definition_list = [v_synset2def[term_id] for term_id in v_synset_list]

v_bert_def_only_embed = bc.encode(v_synset_definition_list)

In [38]:
dir_path = "/datadrive/structure_expan/data/semeval-2016-task-14/"

# (output file name, row ids, embedding matrix) for each taxonomy.
bert_outputs = [
    # noun
    ("wordnet_noun.terms.bert_uncased_defonly.embed", n_synset_list, n_bert_def_only_embed),
    # verb
    ("wordnet_verb.terms.bert_uncased_defonly.embed", v_synset_list, v_bert_def_only_embed),
]

for file_name, synset_ids, embed_matrix in bert_outputs:
    with open(os.path.join(dir_path, file_name), "w") as fout:
        # Header line: "<number of rows> <number of columns>".
        fout.write(f"{embed_matrix.shape[0]} {embed_matrix.shape[1]}\n")
        # One line per term: the id followed by its space-separated vector values.
        for row, synset in enumerate(synset_ids):
            values = " ".join(map(str, embed_matrix[row, :]))
            fout.write(f"{synset} {values}\n")

#### Embedding 2,3 (fasttext_defonly|fasttext_all_average): fastText-based embedding vectors (saved below as fasttext_mode1 to fasttext_mode4)

In [44]:
# Spot check: show the lemma stored for one of the merged SemEval test terms.
n_synset2lemma['test.test.2']

'never_event'

In [54]:
def obtain_fasttext_embed(model, token_list, vector_size=300):
    """Average the word vectors of the tokens that ``model`` reports as contained.

    Args:
        model: object supporting ``token in model`` and ``model.get_word_vector(token)``.
        token_list: iterable of token strings.
        vector_size: length of the returned vector; must match the vectors
            returned by ``model.get_word_vector``.

    Returns:
        A float ``numpy`` array of length ``vector_size``: the mean of the vectors
        of all tokens for which ``token in model`` holds, or all zeros when no
        token qualifies.
    """
    total = np.zeros(vector_size)
    hits = 0
    # Tokens the model does not contain are skipped and do not count towards the mean.
    for vector in (model.get_word_vector(tok) for tok in token_list if tok in model):
        total += vector
        hits += 1
    return total / hits if hits else total

In [59]:
# Builds four fastText-based feature variants for every noun term. All five lists
# below are filled in the same order, so index i refers to n_synset_list[i].
n_synset_list = []
n_synset_defonly_embed_list = []
n_synset_average_defonly_plus_lemma_embed_list = []
n_synset_weighted_average_defonly_plus_lemma_embed_list = []
n_synset_defonly_plus_lemma_with_fwfs_embed_list = []

for n_id in tqdm(n_synset2lemma):
    n_synset_list.append(n_id)
    
    # lemma embedding
    # Lemmas stored in n_synset2lemma join their words with "_" (e.g. 'never_event'),
    # so "_" has to be a separator too; otherwise a multiword lemma stays one token.
    lemma_tok_list = re.split(r"\s|, |-|_", n_synset2lemma[n_id].lower())
    lemma_embed = obtain_fasttext_embed(fasttext, lemma_tok_list)
    
    # definition embedding
    def_pos_list = []
    def_tok_list = []
    for token in nlp(n_synset2def[n_id]):
        def_pos_list.append(token.pos_)
        def_tok_list.append(token.text)
    def_embed = obtain_fasttext_embed(fasttext, def_tok_list)
    
    # first same pos tagged word embedding
    first_same_pos_tagged_word = ""
    for tok, pos in zip(def_tok_list, def_pos_list):
        if pos == "NOUN":
            first_same_pos_tagged_word = tok
            break
    if first_same_pos_tagged_word != "":
        first_same_pos_tagged_word_tok_list = re.split(r"\s|, |-", first_same_pos_tagged_word.lower())
        first_same_pos_tagged_word_tok_embed = obtain_fasttext_embed(fasttext, first_same_pos_tagged_word_tok_list)
    else:
        # None marks "no NOUN token in the definition". A string sentinel would later
        # be compared to a NumPy array with ==, which is an elementwise comparison
        # and cannot be used reliably as an `if` condition.
        first_same_pos_tagged_word_tok_embed = None
    
    # embedding based only on definition
    n_synset_defonly_embed_list.append(def_embed)
    
    # embedding based on half lemma name and half definition
    n_synset_average_defonly_plus_lemma_embed_list.append((lemma_embed+def_embed)/2)
    
    # embedding based on 0.25 lemma name and 0.75 definition
    n_synset_weighted_average_defonly_plus_lemma_embed_list.append(0.25*lemma_embed + 0.75*def_embed)

    # embedding based on lemma name, definition, and first same pos tagged word embedding
    if first_same_pos_tagged_word_tok_embed is None:
        # No NOUN token found: the definition takes the missing component's weight.
        n_synset_defonly_plus_lemma_with_fwfs_embed_list.append((lemma_embed + 2*def_embed) / 3)
    else:
        n_synset_defonly_plus_lemma_with_fwfs_embed_list.append((lemma_embed + def_embed + first_same_pos_tagged_word_tok_embed) / 3)

100%|██████████| 83073/83073 [43:21<00:00, 31.94it/s]  


In [62]:
# Builds four fastText-based feature variants for every verb term. All five lists
# below are filled in the same order, so index i refers to v_synset_list[i].
v_synset_list = []
v_synset_defonly_embed_list = []
v_synset_average_defonly_plus_lemma_embed_list = []
v_synset_weighted_average_defonly_plus_lemma_embed_list = []
v_synset_defonly_plus_lemma_with_fwfs_embed_list = []

for v_id in tqdm(v_synset2lemma):
    v_synset_list.append(v_id)
    
    # lemma embedding
    # Lemmas stored in v_synset2lemma join their words with "_", so "_" has to be a
    # separator too; otherwise a multiword lemma stays one token.
    lemma_tok_list = re.split(r"\s|, |-|_", v_synset2lemma[v_id].lower())
    lemma_embed = obtain_fasttext_embed(fasttext, lemma_tok_list)
    
    # definition embedding
    def_pos_list = []
    def_tok_list = []
    for token in nlp(v_synset2def[v_id]):
        def_pos_list.append(token.pos_)
        def_tok_list.append(token.text)
    def_embed = obtain_fasttext_embed(fasttext, def_tok_list)
    
    # first same pos tagged word embedding
    first_same_pos_tagged_word = ""
    for tok, pos in zip(def_tok_list, def_pos_list):
        if pos == "VERB":
            first_same_pos_tagged_word = tok
            break
    if first_same_pos_tagged_word != "":
        first_same_pos_tagged_word_tok_list = re.split(r"\s|, |-", first_same_pos_tagged_word.lower())
        first_same_pos_tagged_word_tok_embed = obtain_fasttext_embed(fasttext, first_same_pos_tagged_word_tok_list)
    else:
        # None marks "no VERB token in the definition". A string sentinel would later
        # be compared to a NumPy array with ==, which is an elementwise comparison
        # and cannot be used reliably as an `if` condition.
        first_same_pos_tagged_word_tok_embed = None
    
    # embedding based only on definition
    v_synset_defonly_embed_list.append(def_embed)
    
    # embedding based on half lemma name and half definition
    v_synset_average_defonly_plus_lemma_embed_list.append((lemma_embed+def_embed)/2)
    
    # embedding based on 0.25 lemma name and 0.75 definition
    v_synset_weighted_average_defonly_plus_lemma_embed_list.append(0.25*lemma_embed + 0.75*def_embed)

    # embedding based on lemma name, definition, and first same pos tagged word embedding
    if first_same_pos_tagged_word_tok_embed is None:
        # No VERB token found: the definition takes the missing component's weight.
        v_synset_defonly_plus_lemma_with_fwfs_embed_list.append((lemma_embed + 2*def_embed) / 3)
    else:
        v_synset_defonly_plus_lemma_with_fwfs_embed_list.append((lemma_embed + def_embed + first_same_pos_tagged_word_tok_embed) / 3)

100%|██████████| 13936/13936 [03:49<00:00, 60.84it/s]


In [64]:
# Convert every per-term list of vectors into a NumPy array (one row per term).
# The names are rebound in place, so they refer to arrays from here on.
(
    n_synset_defonly_embed_list,
    n_synset_average_defonly_plus_lemma_embed_list,
    n_synset_weighted_average_defonly_plus_lemma_embed_list,
    n_synset_defonly_plus_lemma_with_fwfs_embed_list,
) = map(np.array, (
    n_synset_defonly_embed_list,
    n_synset_average_defonly_plus_lemma_embed_list,
    n_synset_weighted_average_defonly_plus_lemma_embed_list,
    n_synset_defonly_plus_lemma_with_fwfs_embed_list,
))

(
    v_synset_defonly_embed_list,
    v_synset_average_defonly_plus_lemma_embed_list,
    v_synset_weighted_average_defonly_plus_lemma_embed_list,
    v_synset_defonly_plus_lemma_with_fwfs_embed_list,
) = map(np.array, (
    v_synset_defonly_embed_list,
    v_synset_average_defonly_plus_lemma_embed_list,
    v_synset_weighted_average_defonly_plus_lemma_embed_list,
    v_synset_defonly_plus_lemma_with_fwfs_embed_list,
))



In [65]:
dir_path = "/datadrive/structure_expan/data/semeval-2016-task-14/"

# File-name suffixes, in the same order as the matrices listed for each taxonomy below.
embed_suffices = ["fasttext_mode1", "fasttext_mode2", "fasttext_mode3", "fasttext_mode4"]

# (file stem, row ids, embedding matrices) for each taxonomy.
taxonomy_outputs = [
    # noun
    ("wordnet_noun", n_synset_list, [
        n_synset_defonly_embed_list,
        n_synset_average_defonly_plus_lemma_embed_list,
        n_synset_weighted_average_defonly_plus_lemma_embed_list,
        n_synset_defonly_plus_lemma_with_fwfs_embed_list,
    ]),
    # verb
    ("wordnet_verb", v_synset_list, [
        v_synset_defonly_embed_list,
        v_synset_average_defonly_plus_lemma_embed_list,
        v_synset_weighted_average_defonly_plus_lemma_embed_list,
        v_synset_defonly_plus_lemma_with_fwfs_embed_list,
    ]),
]

for stem, synset_ids, embed_lists in taxonomy_outputs:
    for suffix, embed_list in zip(embed_suffices, embed_lists):
        out_path = os.path.join(dir_path, f"{stem}.terms.{suffix}.embed")
        with open(out_path, "w") as fout:
            # Header line: "<number of rows> <number of columns>".
            fout.write(f"{embed_list.shape[0]} {embed_list.shape[1]}\n")
            # One line per term: the id followed by its space-separated vector values.
            for row, synset in enumerate(synset_ids):
                values = " ".join(map(str, embed_list[row, :]))
                fout.write(f"{synset} {values}\n")