## Explore the WordNet dataset using the `nltk` API

- https://www.nltk.org/howto/wordnet.html
- http://wordnetweb.princeton.edu/perl/webwn

In [1]:
import nltk
from nltk.corpus import wordnet as wn
# Short alias for the WordNet synset class; used below in type annotations
# and to pass unbound relation methods such as Synset.hypernyms to tree().
Synset = nltk.corpus.reader.wordnet.Synset

In [2]:
# Make sure the WordNet corpus is available locally; the log below shows that
# an already up-to-date package is left in place and the call evaluates to True.
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /home/nick/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# The noun part-of-speech constant is a plain one-letter string.
assert isinstance(wn.NOUN, str)
assert wn.NOUN == "n"

In [4]:
# Peek at the tail of the noun synset stream: everything before index 82000
# is skipped so the output stays short.
for i, s in enumerate(wn.all_eng_synsets(wn.NOUN)):
    if i >= 82000:
        print(i, s)

82000 Synset('kilohertz.n.01')
82001 Synset('kilometers_per_hour.n.01')
82002 Synset('megahertz.n.01')
82003 Synset('terahertz.n.01')
82004 Synset('metabolic_rate.n.01')
82005 Synset('miles_per_hour.n.01')
82006 Synset('pace.n.01')
82007 Synset('pulse.n.03')
82008 Synset('femoral_pulse.n.01')
82009 Synset('radial_pulse.n.01')
82010 Synset('rate_of_return.n.01')
82011 Synset('return_on_invested_capital.n.01')
82012 Synset('respiratory_rate.n.01')
82013 Synset('revolutions_per_minute.n.01')
82014 Synset('sampling_rate.n.01')
82015 Synset('nyquist_rate.n.01')
82016 Synset('solar_constant.n.01')
82017 Synset('spacing.n.01')
82018 Synset('speed.n.01')
82019 Synset('tempo.n.02')
82020 Synset('quick_time.n.01')
82021 Synset('double_time.n.01')
82022 Synset('airspeed.n.01')
82023 Synset('escape_velocity.n.01')
82024 Synset('groundspeed.n.01')
82025 Synset('hypervelocity.n.01')
82026 Synset('muzzle_velocity.n.01')
82027 Synset('peculiar_velocity.n.01')
82028 Synset('radial_velocity.n.01')
82029

In [5]:
s = wn.synset("underground_railroad.n.01")
print(s, s.definition())
# A very specific thing: it has neither hypernyms nor hyponyms.
# NOTE(review): it may be linked through a different relation (e.g. instance
# hypernyms) -- confirm against the NLTK WordNet documentation.
assert not s.hypernyms()
assert not s.hyponyms()

Synset('underground_railroad.n.01') secret aid to escaping slaves that was provided by abolitionists in the years before the American Civil War


In [6]:
# A more common word: take the first synset returned for "dog" and show its
# definition together with its direct hypernyms and hyponyms.
s = wn.synsets("dog")[0]
print(f"{s} {s.definition()}")
print(f"hypernyms: {s.hypernyms()}")
print(f"hyponyms: {s.hyponyms()}")

Synset('dog.n.01') a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds
hypernyms: [Synset('canine.n.02'), Synset('domestic_animal.n.01')]
hyponyms: [Synset('basenji.n.01'), Synset('corgi.n.01'), Synset('cur.n.01'), Synset('dalmatian.n.02'), Synset('great_pyrenees.n.01'), Synset('griffon.n.02'), Synset('hunting_dog.n.01'), Synset('lapdog.n.01'), Synset('leonberg.n.01'), Synset('mexican_hairless.n.01'), Synset('newfoundland.n.01'), Synset('pooch.n.01'), Synset('poodle.n.01'), Synset('pug.n.01'), Synset('puppy.n.01'), Synset('spitz.n.01'), Synset('toy_dog.n.01'), Synset('working_dog.n.01')]


In [7]:
# Follow the hypernym relation upwards from dog.n.01; the result is a nested
# list with one branch per direct hypernym, each ending at entity.n.01 in the
# output below.
wn.synset("dog.n.01").tree(Synset.hypernyms)

[Synset('dog.n.01'),
 [Synset('canine.n.02'),
  [Synset('carnivore.n.01'),
   [Synset('placental.n.01'),
    [Synset('mammal.n.01'),
     [Synset('vertebrate.n.01'),
      [Synset('chordate.n.01'),
       [Synset('animal.n.01'),
        [Synset('organism.n.01'),
         [Synset('living_thing.n.01'),
          [Synset('whole.n.02'),
           [Synset('object.n.01'),
            [Synset('physical_entity.n.01'),
             [Synset('entity.n.01')]]]]]]]]]]]]],
 [Synset('domestic_animal.n.01'),
  [Synset('animal.n.01'),
   [Synset('organism.n.01'),
    [Synset('living_thing.n.01'),
     [Synset('whole.n.02'),
      [Synset('object.n.01'),
       [Synset('physical_entity.n.01'), [Synset('entity.n.01')]]]]]]]]]

In [8]:
# Check that (B is a hyponym of A) implies (A is a hypernym of B)
def check_hyponyms(s: Synset):
    """Assert that every hyponym of ``s`` lists ``s`` among its hypernyms.

    Args:
        s: The synset whose direct hyponyms are inspected.

    Raises:
        AssertionError: For the first hyponym whose ``hypernyms()`` result
            does not contain ``s``.
    """
    for h in s.hyponyms():
        parents = h.hypernyms()
        assert s in parents, f"{h} is a hyponyms for {s}, but {s} is not a hypernym for {h}"

# Quick spot checks on two familiar synsets first.
for s in map(wn.synset, ("dog.n.01", "cat.n.01")):
    check_hyponyms(s)

# Then the exhaustive pass over all English noun synsets.
for s in wn.all_eng_synsets(wn.NOUN):
    check_hyponyms(s)
# It holds for every noun synset in WordNet, hooray

## Filtering and counting

In [12]:
# NOTE(review): filterlen and the predicates used here are defined in the cell
# that follows this one in the notebook; on a fresh kernel that cell has to be
# run first (consider moving it above this cell).
print("total", filterlen(iterable=wn.all_eng_synsets(wn.NOUN)))
# Each count is labelled with the name of the predicate that produced it.
for predicate in (contains_underscore, has_hyponyms, has_hypernyms, has_high_lemfreq):
    print(predicate.__name__, filterlen(predicate=predicate))

total 82115
contains_underscore 27165
has_hyponyms 16693
has_hypernyms 74389
has_high_lemfreq 3055


In [9]:
def contains_underscore(s: Synset) -> bool:
    """Return True when the string form of ``s`` contains an underscore.

    The test is performed on ``str(s)`` (e.g. ``Synset('sampling_rate.n.01')``),
    not on an individual lemma name.
    """
    return str(s).find("_") != -1
    
def has_hypernyms(s: Synset) -> bool:
    """Return True when ``s.hypernyms()`` yields at least one synset."""
    return bool(s.hypernyms())

def has_hyponyms(s: Synset) -> bool:
    """Return True when ``s.hyponyms()`` yields at least one synset."""
    return bool(s.hyponyms())

def has_high_lemfreq(s: Synset, threshold: int = 5) -> bool:
    """Return True when the first lemma's ``count()`` reaches ``threshold``.

    Only the first entry of ``s.lemmas()`` is consulted; the remaining lemmas
    do not influence the result.

    Args:
        s: Synset to inspect.
        threshold: Minimum (inclusive) count required.
    """
    first_lemma = s.lemmas()[0]
    return first_lemma.count() >= threshold

def filterlen(predicate = lambda _: True, iterable = None):
    """Count the items of ``iterable`` for which ``predicate`` is truthy.

    Args:
        predicate: Callable applied to each item; the default accepts
            everything, so the call returns the total number of items.
        iterable: Items to examine. When None, all English noun synsets from
            ``wn.all_eng_synsets(wn.NOUN)`` are used.

    Returns:
        The number of matching items as an int.
    """
    source = wn.all_eng_synsets(wn.NOUN) if iterable is None else iterable
    # Count lazily rather than materialising the filtered items in a list.
    return sum(1 for _ in filter(predicate, source))

In [None]:
# Is there a single root noun, that is a hypernym for everything else?
# entity.n.01 is the candidate: it has no hypernyms of its own, and the first
# three levels of its hyponym tree are displayed.
# NOTE(review): this only shows that entity.n.01 is a root, not that every
# noun synset descends from it.
assert not wn.synset("entity.n.01").hypernyms()
wn.synset("entity.n.01").tree(Synset.hyponyms, depth=3)

## Creating a flat dataset of direct (depth-1) hypernym–hyponym pairs

In [15]:
def to_str(s: Synset) -> str:
    """Return the name of the first lemma of ``s`` as a string."""
    first_lemma = s.lemmas()[0]
    return str(first_lemma.name())

# One (hypernym, hyponym) name pair for every direct hyponym link of every
# English noun synset; the list is rebuilt from scratch each time the cell runs.
hyper_hypo_pairs = [
    (to_str(hyper), to_str(hypo))
    for hyper in wn.all_eng_synsets(wn.NOUN)
    for hypo in hyper.hyponyms()
]


In [17]:
# Only the last bare expression of a cell is displayed, so the number of pairs
# is printed explicitly; the first pair remains the cell's displayed value.
print(len(hyper_hypo_pairs))
hyper_hypo_pairs[0]

('entity', 'abstraction')