In [1]:
from pytrips.ontology import get_ontology as tripsont
from pytrips.structures import TripsType
from nltk.corpus.reader.wordnet import Synset
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown.dat')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')
from collections import defaultdict
from collections import namedtuple
import random

from scipy.stats import spearmanr, pearsonr

nouns = list(wn.all_synsets(pos=wn.NOUN))
num = 100
noun_sample = random.sample(nouns, num)

In [44]:
"""
Default node weights, can be overrided for variations
"""
_node_weights = defaultdict(lambda: 1)

_node_weights["fakeroot"] = 1

def _fallback(x, func, nonzero=False, fbs=-1):
    """
    Basic null checks for selecting a value from a list
    """
    if (not x) or (sum(x) == 0 and nonzero):
        return fbs
    return func(x)

"""
Pick a value from a list
"""
ListStrategy = {
    'choose': lambda f: lambda b: lambda x: _fallback(x, f, fbs=-1)
    'min': lambda x: _fallback(x, min),
    'bmin': lambda b: lambda x: _fallback(x, min, fbs=b),
    'max': lambda x: _fallback(x, max),
    'bmax': lambda b: lambda x: _fallback(x, max, fbs=b),
    'average': lambda x: _fallback(x, lambda l: sum(l)/len(l), nonzero=True)
}


"""
Find the last common element between two lists
"""
def last_overlap(v1, v2, aligned=None):
    if not v1 or not v2:
        return aligned
    elif v1[0] != v2[0]:
        return aligned
    else:
        return last_overlap(v1[1:], v2[1:], aligned=v1[0])

class SemNode:
    def __init__(self, node):
        self._node = node
    
    def __eq__(self, other):
        if issubclass(type(other), SemNode):
            if type(other.content) == type(self.content):
                return other.content == self.content
        return False
    
    def __hash__(self):
        return hash(self.__repr__())
    
    def __repr__(self):
        return "<SemNode: {}>".format(self.content.__repr__())
    
    def __str__(self):
        return "<SemNode: {}>".format(self.name)
    
    @property
    def name(self):
        """
        return canonical name for node
        """
        return str(self.content)
    
    @property
    def adjacent(self, label=None):
        """
        Return connected elements by label.  Useful for following non-hypernym relations in wordnet
        Can be used to implement other connections.
        Override as necessary.
        """
        return []
    
    @property
    def root(self):
        """
        Check if the node is a root of some sort.  Override as necessary for different resources.
        """
        return not self.parents
    
    @property
    def parents(self):
        """
        Get all parents for a node.  Perform any cross-resource cutoffs.
        """
        pass
    
    @property
    def children(self):
        """
        Get all children for a node.  Perform any cross resource cutoffs.
        """
        pass
    
    @property
    def content(self):
        """
        Return the wrapped node.
        """
        return self._node
    
    @property
    def resource(self):
        """
        Return the name of the relevant resource
        """
        return "default"
    
    def weight(self, weights=None):
        """
        Get resource-based weight of node.  Pass a weight dictionary as necessary.
        TODO: add an argument to pass individual weights
        """
        if not weights:
            weights = _node_weights
        return weights[self.resource]
    
    @staticmethod
    def path_depth(path, weights=None):
        """
        Get the total depth of a path, defined as a list of nodes.
        Does not check validity of path
        """
        return sum([p.weight(weights=weights) for p in path])
    
    @staticmethod
    def depth(node, weights=None, strategy='min'):
        """
        Get the depth of a node to a root using a weight dictionary and selection strategy.
        Default is minimum depth from any root.
        """
        weighted = [SemNode.path_depth(p, weights=weights) for p in node.paths_to_root()]
        return ListStrategy[strategy](weighted)
    
    @staticmethod
    def make(node):
        """
        Make a node based on the input type.
        Should add parameter dictionary to pass on to children
        """
        if type(node) is TripsType:
            return TripsNode(node)
        elif type(node) is Synset:
            return WordNetNode(node)
        elif type(node) is str:
            return WordNode(node)
        elif type(node) is SemNode:
            return node
        else:
            return None
    
    def paths_to_root(self):
        """
        Find all paths to a root based on hierarchy rules.
        Some resources return only one (Trips), others may return multiple (WordNet)
        """
        if self.root:
            return [[self]]
        res = []
        for c in self.parents:
            ptrs = c.paths_to_root()
            res.extend([t + [self] for t in ptrs if self not in t])
        return res
    
    def lcs_set(self, other):
        """
        Find the set of Lowest Common Subsumers for a node.  
        Some resources have only one (Trips) other can have multiple (WordNet)
        """
        lcs = [last_overlap(p,q) for p in self.paths_to_root() for q in other.paths_to_root()]
        filtered = [x for x in lcs if x]
        if not filtered:
            return [TripsNode(tripsont()["root"])]
        return filtered
    
    def wupalmer(self, other, weights=None, depth_strategy='min', lcs_strategy='max'):
        """
        return cross-wupalmer measure using provided weights, depth_strategy and lcs_strategy
        depth_strategy: Choose max, min, or average depth over all paths
        lcs_strategy: Choose max, min, or average depth of lcs over all alternatives
        """
        if not issubclass(type(other), SemNode):
            other = SemNode.make(other) # this would break passing in an arbitrary maker object
        lcs_depth = ListStrategy[lcs_strategy]([SemNode.depth(d, weights, depth_strategy) for d in self.lcs_set(other)])
        sd = SemNode.depth(self, weights, depth_strategy)
        od = SemNode.depth(other, weights, depth_strategy)
        return 2*lcs_depth/(sd + od)
    
    def path_similarity(self, other, weights=None, depth_strategy='min', lcs_strategy='min'):
        """
        Like wupalmer, except (d(s1) + d(s2) - 2 * lcs(s1,s2))
        """
        if not issubclass(type(other), SemNode):
            other = SemNode.make(other) # this would break passing in an arbitrary maker object
        lcs_depth = ListStrategy[lcs_strategy]([SemNode.depth(d, weights, depth_strategy) for d in self.lcs_set(other)])
        sd = SemNode.depth(self, weights, depth_strategy)
        od = SemNode.depth(other, weights, depth_strategy)
        return 1/(2 + sd + od - 2*lcs_depth)

SyntaxError: invalid syntax (<ipython-input-44-f5e06b70ed2c>, line 21)

In [45]:
class WordNode(SemNode):
    """
    Take a "word.pos" element as a node in the generalized hierarchy.
    """
    def resource(self):
        return "word"
    
    @property
    def name(self):
        return self.content
    
    @property
    def children(self):
        return []
    
    def word_pos(self):
        if "." in self._node:
            return self._node.split(".")
        return self._node, None
    
    @property
    def parents(self):
        """
        Lookup all TripsTypes, lookup all Wordnet Types.
        """
        w, p = self.word_pos()
        wordnet = wn.synsets(w, p)
        trips = tripsont().get_word(w, p)
        return [SemNode(c) for c in wordnet+trips]

class FakeRoot(SemNode):
    """
    FakeRoot for completeness purposes
    """
    def __init__(self):
        super(FakeRoot, self).__init__("fakeroot")
        
    def resource(self):
        return "fakeroot"
    
    @property
    def name(self):
        return "fakeroot"
    
    @property
    def parents(self):
        return []
    
    @property
    def children(self):
        return []
        

class TripsNode(SemNode):
    @property
    def name(self):
        return self.content.name
    
    @property
    def parents(self):
        return [SemNode.make(self._node.parent)]

    @property
    def children(self):
        return [SemNode.make(c) for c in self._node.children] + [SemNode.make(c) for c in self._node.wordnet]
    
    @property
    def resource(self):
        return "trips"
    
    @property
    def root(self):
        return self._node.depth == 0

    
class WordNetNode(SemNode):
    @property
    def name(self):
        return self.content.name()
    
    @property
    def parents(self):  
        # NOTE: actually this is a little bit of a problem because we're not taking
        #       WN hypernyms
        tt = tripsont()[self._node]
        if not tt:
            tt = self._node.hypernyms()
        return [SemNode.make(p) for p in tt]

    @property
    def children(self):
        return [SemNode.make(c) for c in self._node.hyponyms()]
    
    @property
    def resource(self):
        return "wordnet"
    

# tests

In [46]:
# equality

cat1 = wn.synset("cat.v.1")
cat2 = wn.synset("cat.v.1")

wcat1 = WordNetNode(cat1)
wcat2 = WordNetNode(cat2)

assert issubclass(type(wcat1), SemNode)
assert wcat1 == wcat2

# hypernyms

animal = SemNode.make(tripsont()["nonhuman-animal"])
mammal = SemNode.make(tripsont()["mammal"])
assert [mammal] == animal.parents


abbess = SemNode.make(wn.synset("abbess.n.1"))
scand  = SemNode.make(wn.synset("scandinavia.n.2"))
print(abbess.paths_to_root())
print(scand.paths_to_root())

wn.synset("scandinavia.n.2").hypernyms()

[[<SemNode: ont::root>, <SemNode: ont::any-sem>, <SemNode: ont::referential-sem>, <SemNode: ont::phys-object>, <SemNode: ont::natural-object>, <SemNode: ont::organism>, <SemNode: ont::animal>, <SemNode: ont::vertebrate>, <SemNode: ont::mammal>, <SemNode: ont::person>, <SemNode: Synset('abbess.n.01')>]]
[[<SemNode: Synset('scandinavia.n.02')>]]


[]

# Evaluation tools

from collections import namedtuple, Counter
from tqdm import tnrange, tqdm_notebook

Comp = namedtuple("comp", ["n1", "n2", "normal", "cross"])
def comp_string(c):
    return """
    node1: {}
    node1: {}
    wordnet only wup: {}
    cross wupalmer:   {}
    """.format(c.n1, c.n2, c.normal, c.cross)

def compare_wup(n1, n2):
    cross = SemNode.make(n1).wupalmer(SemNode.make(n2))
    normal= n1.wup_similarity(n2)
    return Comp(n1, n2, normal, cross)

res = []
for i in tqdm_notebook(range(num), desc="i"):
    for j in range(i, num):
        res.append(compare_wup(noun_sample[i], noun_sample[j]))

# count data
ranges_normal = Counter()
ranges_cross = Counter()
for i in res:
    ranges_normal[i.normal] += 1
    ranges_cross[i.cross] += 1

def binning(data, n=10):
    bin_width = 1/n
    binned = []
    for d in data:
        try:
            binned.append(int(n*d)/n)
        except:
            print(n, d, int(n*d))
    return binned

def bin_count(data, n=10):
    b_counter = Counter()
    bin_width = 1/n
    binned = []
    for d in data:
        try:
            b_counter[int(n*d)/n] += 1
        except:
            print(n, d, int(n*d))
    return b_counter

import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

bins=25
data = binning([i.normal for i in res], bins)

counted = sorted(bin_count([i.normal for i in res], n=bins).items(), key=lambda x: x[0])
for c in counted:
    print(c)


plt.hist(data, density=True, bins=bins)
plt.ylabel('WordnetOnly');

data = binning([i.cross for i in res if i.cross <= 1], bins)

counted = sorted(bin_count([i.cross for i in res], n=bins).items(), key=lambda x: x[0])
for c in counted:
    print(c)

for i in res:
    if 0 > i.cross or 1 < i.cross:
        print(i)

plt.hist(data, density=True, bins=bins)
plt.ylabel('Trips-WUPalmer');

In [47]:
import pandas as pd

simlex = pd.read_csv("../SimLex-999/SimLex-999.txt", sep="\t")

In [50]:
node_weights = defaultdict(lambda: 1)
node_weights["fakeroot"] = 1
node_weights["wordnet"] = 1
node_weights["trips"] = 1
node_weights["word"] = 1

def pops(l):
    if l:
        return l[0]
    return None

def collects(l):
    return [y for y in l if y]

def n_or_v(synset):
    return synset.pos() in [wn.NOUN, wn.VERB]

def resnik(s1, s2, ic_corpus):
    if n_or_v(s1) and n_or_v(s2):
        return s1.res_similarity(s2, ic_corpus)
    return -1 #s1.wup_similarity(s2)

def _list_fallback(func, fallback_func, args):
    result = func(*args)
    if not result:
        result = fallback_func(*args)
    return result
    
sim_strategy = {
    "mfs": lambda x, p: [SemNode.make(pops(wn.synsets(x, p)))],
    "average": lambda x, p: [SemNode.make(s) for s in wn.synsets(x, p)],
    "word": lambda x, p: [SemNode.make(v) for v in tripsont().get_word(x, p)],
    "lookup": lambda x, p: _list_fallback(
        lambda x, p: [SemNode.make(v) for v in tripsont().get_word(x, p)], 
        lambda x, p: [SemNode.make(pops(wn.synsets(x, p)))],
        [x, p]
    ),
    "both": lambda x, p: [SemNode.make(v) for v in tripsont().get_word(x, p)] + [SemNode.make(pops(wn.synsets(x, p)))]
}

sim_metric = {
    "cross" : lambda x, y: x.wupalmer(y, node_weights),
    "tripspath" : lambda x, y: x.path_similarity(y, weights=node_weights),
    "normal": lambda x, y: x.content.wup_similarity(y.content),
    "resnik_brown": lambda x, y: resnik(x.content, y.content, brown_ic),
    "resnik_semcor": lambda x, y: resnik(x.content, y.content, semcor_ic)
}

def similarity_test(word1, word2, pos=None, metric="cross", strategy="average", select1="max", select2="max"):
    if not pos:
        pos = "nvar"
    else:
        pos = pos.lower()[0]
    metric = sim_metric[metric]
    strategy = sim_strategy[strategy]
    results = []
    for x in pos:
        word1_node = collects(strategy(word1, x))
        word2_node = collects(strategy(word2, x))
        if word1_node and word2_node:
            scores = collects([metric(x, y) for x in word1_node for y in word2_node])
            results.append(ListStrategy[select1](scores))
    if results:
        return ListStrategy[select2](results)
    return -1 # fallback of 0.5
    

In [64]:
ws353 = []
cross = []
regular=[]

def get_valid_scores(l1, l2, theta=1):
    res1 = []
    res2 = []
    for x, y in zip(l1, l2):
        if y > theta:
            y = 1
        if y >= 0:
            res1.append(x)
            res2.append(y)
    return res1, res2 

In [65]:
SimExperiment = namedtuple("SimExperiment", 
                           ["name", "data", "metric", "strategy", "select1", "select2"]
                          )
SimTask = namedtuple("SimTask", ["word1", "word2", "gold", "pos"])
SimTaskResults = namedtuple("SimTaskResults", ["experiment", "instances", "spearman","pearson"])

def ws353(name, metric, strategy, select1="max", select2="max"):
    wordsim2 = pd.read_csv("../wordsim353/set2.tab", sep="\t")
    wordsim1= pd.read_csv("../wordsim353/set1.tab", sep="\t")
    res = []
    for i, row in wordsim1.iterrows():
        res.append(SimTask(row[0], row[1], row[2], None))
    for i, row in wordsim2.iterrows():
        res.append(SimTask(row[0], row[1], row[2], None))
    return SimExperiment(name, res, metric, strategy, select1, select2)

def run_experiment(exp):
    results = []
    for d in exp.data:
        results.append(
            similarity_test(d.word1, d.word2, 
                            metric=exp.metric, 
                            strategy=exp.strategy, 
                            select1=exp.select1, select2=exp.select2)
        )
    
    results, gold = get_valid_scores([d.gold for d in exp.data], results)
    return SimTaskResults(exp, results, spearmanr(results, gold), pearsonr(results, gold))

def experiment_string(exp, pandas=True, dataframe=None):
    columns = "name metric candidate choice1 choice2 instances spearmanr spearmanp pearsonr pearsonp".split()
    if dataframe is None:
        dataframe = pd.DataFrame(columns=columns)
    e = exp.experiment
    if pandas:
        values = [e.name, e.metric, e.strategy, e.select1, e.select2, len(exp.instances),
        exp.spearman.correlation, exp.spearman.pvalue, 
        exp.pearson[0], exp.pearson[1]]
        res = pd.DataFrame({v: [r] for r, v in zip(values, columns)})
        return dataframe.append(res)
    return """
    ---
    name:      {}
    metric:    {}
    candidate: {}
    choice:    {}/{}
    --- 
    instances:    {}
    spearman rho: {}
    p-value:      {}
    
    pearson rho:  {}
    p-value:      {}
    ================
    """.format(
        e.name, e.metric, e.strategy, e.select1, e.select2, len(exp.instances),
        exp.spearman.correlation, exp.spearman.pvalue, 
        exp.pearson[0], exp.pearson[1]
    )
    

In [66]:
df = experiment_string(
        run_experiment(ws353("base", "normal", "mfs", select1="average", select2="average")))

df = experiment_string(
        run_experiment(ws353("base", "normal", "average", select1="average", select2="average")), dataframe=df)

df = experiment_string(
        run_experiment(ws353("base", "resnik_brown", "mfs", select1="average", select2="average")), dataframe=df)

df = experiment_string(
        run_experiment(ws353("base", "resnik_brown", "average", select1="average", select2="average")), dataframe=df)

df = experiment_string(
        run_experiment(ws353("base", "resnik_semcor", "mfs", select1="average", select2="average")), dataframe=df)

df = experiment_string(
        run_experiment(ws353("base", "resnik_semcor", "average", select1="average", select2="average")), dataframe=df)

df

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,candidate,choice1,choice2,instances,metric,name,pearsonp,pearsonr,spearmanp,spearmanr
0,mfs,average,average,345,normal,base,3.548461e-09,0.311068,1.954825e-07,0.275704
0,average,average,average,345,normal,base,6.395076e-09,0.306162,2.650805e-08,0.29393
0,mfs,average,average,247,resnik_brown,base,5.451604e-06,0.284728,6.457752e-07,0.310397
0,average,average,average,316,resnik_brown,base,0.01722821,0.133913,0.002426755,0.170012
0,mfs,average,average,247,resnik_semcor,base,3.29626e-07,0.318007,8.627637e-08,0.332575
0,average,average,average,316,resnik_semcor,base,0.02902721,0.122831,0.01830429,0.132665


In [67]:
# Tripswordnet
#df = None
df = experiment_string(
        run_experiment(ws353("Trips-Wordnet", "tripspath", "both", select1="average", select2="average")), dataframe=df)

df = experiment_string(
        run_experiment(ws353("Trips-Wordnet", "tripspath", "both", select1="max", select2="max")), dataframe=df)

df = experiment_string(
        run_experiment(ws353("Trips-Wordnet", "tripspath", "both", select1="max", select2="average")), dataframe=df)

df

Unnamed: 0,candidate,choice1,choice2,instances,metric,name,pearsonp,pearsonr,spearmanp,spearmanr
0,mfs,average,average,345,normal,base,3.548461e-09,0.311068,1.954825e-07,0.275704
0,average,average,average,345,normal,base,6.395076e-09,0.306162,2.650805e-08,0.29393
0,mfs,average,average,247,resnik_brown,base,5.451604e-06,0.284728,6.457752e-07,0.310397
0,average,average,average,316,resnik_brown,base,0.01722821,0.133913,0.002426755,0.170012
0,mfs,average,average,247,resnik_semcor,base,3.29626e-07,0.318007,8.627637e-08,0.332575
0,average,average,average,316,resnik_semcor,base,0.02902721,0.122831,0.01830429,0.132665
0,both,average,average,351,tripspath,Trips-Wordnet,1.334416e-07,0.27697,1.284464e-08,0.297758
0,both,max,max,351,tripspath,Trips-Wordnet,3.63324e-10,0.326563,5.391482e-09,0.305062
0,both,max,average,351,tripspath,Trips-Wordnet,3.902308e-11,0.343153,4.505489e-09,0.306548


In [68]:
# Tripswordnet

df = experiment_string(
        run_experiment(ws353("Trips-Wordnet", "cross", "mfs", select1="average", select2="average")), dataframe=df)

df = experiment_string(
        run_experiment(ws353("Trips-Wordnet", "cross", "average", select1="average", select2="average")), dataframe=df)

df = experiment_string(
        run_experiment(ws353("Trips-Wordnet", "cross", "word", select1="average", select2="average")), dataframe=df)

df = experiment_string(
        run_experiment(ws353("Trips-Wordnet", "cross", "lookup", select1="average", select2="average")), dataframe=df)

df = experiment_string(
        run_experiment(ws353("Trips-Wordnet", "tripspath", "mfs", select1="average", select2="average")), dataframe=df)

df = experiment_string(
        run_experiment(ws353("Trips-Wordnet", "tripspath", "average", select1="average", select2="average")), dataframe=df)

df = experiment_string(
        run_experiment(ws353("Trips-Wordnet", "tripspath", "word", select1="average", select2="average")), dataframe=df)

df = experiment_string(
        run_experiment(ws353("Trips-Wordnet", "tripspath", "lookup", select1="average", select2="average")), dataframe=df)

In [69]:
df

Unnamed: 0,candidate,choice1,choice2,instances,metric,name,pearsonp,pearsonr,spearmanp,spearmanr
0,mfs,average,average,345,normal,base,3.548461e-09,0.311068,1.954825e-07,0.275704
0,average,average,average,345,normal,base,6.395076e-09,0.306162,2.650805e-08,0.29393
0,mfs,average,average,247,resnik_brown,base,5.451604e-06,0.284728,6.457752e-07,0.310397
0,average,average,average,316,resnik_brown,base,0.01722821,0.133913,0.002426755,0.170012
0,mfs,average,average,247,resnik_semcor,base,3.29626e-07,0.318007,8.627637e-08,0.332575
0,average,average,average,316,resnik_semcor,base,0.02902721,0.122831,0.01830429,0.132665
0,both,average,average,351,tripspath,Trips-Wordnet,1.334416e-07,0.27697,1.284464e-08,0.297758
0,both,max,max,351,tripspath,Trips-Wordnet,3.63324e-10,0.326563,5.391482e-09,0.305062
0,both,max,average,351,tripspath,Trips-Wordnet,3.902308e-11,0.343153,4.505489e-09,0.306548
0,mfs,average,average,351,cross,Trips-Wordnet,6.782327e-05,0.210993,0.0002336983,0.195201
