# Explore the WordNet dataset using the `nltk` API

- https://www.nltk.org/howto/wordnet.html
- http://wordnetweb.princeton.edu/perl/webwn

In [1]:
import random
import nltk
from nltk.corpus import wordnet as wn
Synset = nltk.corpus.reader.wordnet.Synset

In [2]:
# Make sure the WordNet corpus is present in the local nltk_data directory
# (nltk reports "already up-to-date" when the package is already there).
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /home/nick/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Sanity check: the part-of-speech constant for nouns is the plain string "n".
assert isinstance(wn.NOUN, str) and wn.NOUN == "n"

In [4]:
# Peek at the tail of the noun synsets: entries at positions below 82000 are
# skipped, every later one is printed together with its position.
for i, s in enumerate(wn.all_eng_synsets(wn.NOUN)):
    if i >= 82000:
        print(i, s)

82000 Synset('kilohertz.n.01')
82001 Synset('kilometers_per_hour.n.01')
82002 Synset('megahertz.n.01')
82003 Synset('terahertz.n.01')
82004 Synset('metabolic_rate.n.01')
82005 Synset('miles_per_hour.n.01')
82006 Synset('pace.n.01')
82007 Synset('pulse.n.03')
82008 Synset('femoral_pulse.n.01')
82009 Synset('radial_pulse.n.01')
82010 Synset('rate_of_return.n.01')
82011 Synset('return_on_invested_capital.n.01')
82012 Synset('respiratory_rate.n.01')
82013 Synset('revolutions_per_minute.n.01')
82014 Synset('sampling_rate.n.01')
82015 Synset('nyquist_rate.n.01')
82016 Synset('solar_constant.n.01')
82017 Synset('spacing.n.01')
82018 Synset('speed.n.01')
82019 Synset('tempo.n.02')
82020 Synset('quick_time.n.01')
82021 Synset('double_time.n.01')
82022 Synset('airspeed.n.01')
82023 Synset('escape_velocity.n.01')
82024 Synset('groundspeed.n.01')
82025 Synset('hypervelocity.n.01')
82026 Synset('muzzle_velocity.n.01')
82027 Synset('peculiar_velocity.n.01')
82028 Synset('radial_velocity.n.01')
82029

In [5]:
# Look up a single synset by its full name and show its definition.
s = wn.synset("underground_railroad.n.01")
print(s, s.definition())
assert s.hypernyms() == []
assert s.hyponyms() == []
# A very specific thing, so it has no hypernyms and no hyponyms

Synset('underground_railroad.n.01') secret aid to escaping slaves that was provided by abolitionists in the years before the American Civil War


In [6]:
# Let's try something simpler: take the first synset returned for the word "dog"
s = wn.synsets("dog")[0]
print(s, s.definition())
# Immediate hypernyms and hyponyms of that synset
print("hypernyms:", s.hypernyms())
print("hyponyms:", s.hyponyms())

Synset('dog.n.01') a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds
hypernyms: [Synset('canine.n.02'), Synset('domestic_animal.n.01')]
hyponyms: [Synset('basenji.n.01'), Synset('corgi.n.01'), Synset('cur.n.01'), Synset('dalmatian.n.02'), Synset('great_pyrenees.n.01'), Synset('griffon.n.02'), Synset('hunting_dog.n.01'), Synset('lapdog.n.01'), Synset('leonberg.n.01'), Synset('mexican_hairless.n.01'), Synset('newfoundland.n.01'), Synset('pooch.n.01'), Synset('poodle.n.01'), Synset('pug.n.01'), Synset('puppy.n.01'), Synset('spitz.n.01'), Synset('toy_dog.n.01'), Synset('working_dog.n.01')]


In [7]:
# Expand hypernyms recursively into a nested list, starting at dog.n.01
wn.synset("dog.n.01").tree(Synset.hypernyms)

[Synset('dog.n.01'),
 [Synset('canine.n.02'),
  [Synset('carnivore.n.01'),
   [Synset('placental.n.01'),
    [Synset('mammal.n.01'),
     [Synset('vertebrate.n.01'),
      [Synset('chordate.n.01'),
       [Synset('animal.n.01'),
        [Synset('organism.n.01'),
         [Synset('living_thing.n.01'),
          [Synset('whole.n.02'),
           [Synset('object.n.01'),
            [Synset('physical_entity.n.01'),
             [Synset('entity.n.01')]]]]]]]]]]]]],
 [Synset('domestic_animal.n.01'),
  [Synset('animal.n.01'),
   [Synset('organism.n.01'),
    [Synset('living_thing.n.01'),
     [Synset('whole.n.02'),
      [Synset('object.n.01'),
       [Synset('physical_entity.n.01'), [Synset('entity.n.01')]]]]]]]]]

In [8]:
# Check that (A is a hypernym for B) implies (B is a hyponym for A)
def check_hyponyms(s: Synset):
    """Assert that every hyponym of ``s`` lists ``s`` among its own hypernyms.

    Args:
        s: the synset whose ``hyponyms()`` are inspected.

    Raises:
        AssertionError: for the first hyponym whose ``hypernyms()`` does not
            contain ``s``.
    """
    for h in s.hyponyms():
        parents = h.hypernyms()
        assert s in parents, f"{h} is a hyponyms for {s}, but {s} is not a hypernym for {h}"

# Spot-check two familiar synsets first
check_hyponyms(wn.synset("dog.n.01"))
check_hyponyms(wn.synset("cat.n.01"))

# Then run the same check over all English noun synsets
for s in wn.all_eng_synsets(wn.NOUN):
    check_hyponyms(s)
# It holds for every noun synset in WordNet, hooray

## Filtering and counting

In [9]:
def contains_underscore(s: Synset) -> bool:
    """Return True when the string form of ``s`` contains an underscore."""
    text = str(s)
    return text.find("_") >= 0
    
def has_hypernyms(s: Synset) -> bool:
    """Return True when ``s.hypernyms()`` holds at least one entry."""
    parents = s.hypernyms()
    return len(parents) > 0

def has_hyponyms(s: Synset) -> bool:
    """Return True when ``s.hyponyms()`` holds at least one entry."""
    children = s.hyponyms()
    return len(children) > 0

def has_high_lemfreq(s: Synset, threshold: int = 5) -> bool:
    """Return True when the first lemma of ``s`` has a count of at least ``threshold``.

    Only the first entry of ``s.lemmas()`` is inspected; any further lemmas
    are ignored.
    """
    first_lemma = s.lemmas()[0]
    frequency = first_lemma.count()
    return frequency >= threshold

def filterlen(predicate = lambda _: True, iterable = None):
    """Count the items of ``iterable`` that pass ``predicate``.

    Args:
        predicate: forwarded to the built-in ``filter``; the default accepts
            every item.
        iterable: items to test. When None, all English noun synsets
            (``wn.all_eng_synsets(wn.NOUN)``) are used.

    Returns:
        The number of items kept by ``filter(predicate, iterable)``.
    """
    source = wn.all_eng_synsets(wn.NOUN) if iterable is None else iterable
    return sum(1 for _ in filter(predicate, source))

In [10]:
# Count all noun synsets, then how many of them satisfy each predicate;
# the label printed next to each count is the predicate function's own name.
print("total", filterlen())
for predicate_fn in (contains_underscore, has_hyponyms, has_hypernyms, has_high_lemfreq):
    print(predicate_fn.__name__, filterlen(predicate=predicate_fn))

total 82115
contains_underscore 27165
has_hyponyms 16693
has_hypernyms 74389
has_high_lemfreq 3055


In [11]:
# Is there a single root noun, that is a hypernym for everything else?
print(wn.synset("dog.n.01").root_hypernyms())
# entity.n.01 itself has no hypernyms
assert wn.synset("entity.n.01").hypernyms() == []
# Show the top of the hierarchy: hyponyms of entity.n.01, expanded with depth=3
wn.synset("entity.n.01").tree(Synset.hyponyms, depth=3)

[Synset('entity.n.01')]


[Synset('entity.n.01'),
 [Synset('abstraction.n.06'),
  [Synset('attribute.n.02'),
   [Synset('ballast.n.03')],
   [Synset('character.n.09')],
   [Synset('cheerfulness.n.01')],
   [Synset('common_denominator.n.02')],
   [Synset('depth.n.06')],
   [Synset('eidos.n.01')],
   [Synset('ethos.n.01')],
   [Synset('human_nature.n.01')],
   [Synset('inheritance.n.04')],
   [Synset('personality.n.01')],
   [Synset('property.n.02')],
   [Synset('quality.n.01')],
   [Synset('shape.n.02')],
   [Synset('space.n.01')],
   [Synset('state.n.02')],
   [Synset('thing.n.09')],
   [Synset('time.n.05')],
   [Synset('trait.n.01')],
   [Synset('uncheerfulness.n.02')]],
  [Synset('communication.n.02'),
   [Synset('auditory_communication.n.01')],
   [Synset('contagion.n.03')],
   [Synset('didacticism.n.01')],
   [Synset('display.n.05')],
   [Synset('document.n.03')],
   [Synset('expressive_style.n.01')],
   [Synset('indication.n.01')],
   [Synset('language.n.01')],
   [Synset('message.n.01')],
   [Synset('mess

## Checking if synsets `s` and `t` are related

In [12]:
# car and vehicle are not *directly* linked: neither synset appears in the
# other's immediate hypernyms or hyponyms.
car = wn.synset('car.n.01')
veh = wn.synset('vehicle.n.01')

assert veh not in car.hypernyms()
# This check previously used `s`, a variable left over from an earlier cell,
# so it did not test the car/vehicle pair at all.
assert car not in veh.hypernyms()
assert car not in veh.hyponyms()
assert veh not in car.hyponyms()

In [13]:
# vehicle is among the lowest common hypernyms of car and vehicle
assert veh in car.lowest_common_hypernyms(veh)

In [14]:
# Lowest common hypernyms again, this time called on `veh` with `car` as the argument
veh.lowest_common_hypernyms(car)

[Synset('vehicle.n.01')]

In [15]:
# All hypernym paths of car; each path is listed from entity.n.01 down to car.n.01
car.hypernym_paths()

[[Synset('entity.n.01'),
  Synset('physical_entity.n.01'),
  Synset('object.n.01'),
  Synset('whole.n.02'),
  Synset('artifact.n.01'),
  Synset('instrumentality.n.03'),
  Synset('container.n.01'),
  Synset('wheeled_vehicle.n.01'),
  Synset('self-propelled_vehicle.n.01'),
  Synset('motor_vehicle.n.01'),
  Synset('car.n.01')],
 [Synset('entity.n.01'),
  Synset('physical_entity.n.01'),
  Synset('object.n.01'),
  Synset('whole.n.02'),
  Synset('artifact.n.01'),
  Synset('instrumentality.n.03'),
  Synset('conveyance.n.03'),
  Synset('vehicle.n.01'),
  Synset('wheeled_vehicle.n.01'),
  Synset('self-propelled_vehicle.n.01'),
  Synset('motor_vehicle.n.01'),
  Synset('car.n.01')]]

In [16]:
# Lowest common hypernyms of dog and cat
wn.synset("dog.n.01").lowest_common_hypernyms(wn.synset("cat.n.01"))

[Synset('carnivore.n.01')]

In [17]:
# car is an immediate hyponym of motor_vehicle
assert car in wn.synset('motor_vehicle.n.01').hyponyms()
# Lowest common hypernyms of motor_vehicle and car
wn.synset('motor_vehicle.n.01').lowest_common_hypernyms(car)

[Synset('motor_vehicle.n.01')]

In [18]:
def are_related(s: Synset, t: Synset) -> bool:
    """Return True when ``s`` or ``t`` is one of their lowest common hypernyms.

    The common hypernyms are obtained from ``s.lowest_common_hypernyms(t)``.
    """
    common = s.lowest_common_hypernyms(t)
    if s in common:
        return True
    return t in common

In [19]:
def check_if_everything_is_an_entity():
    """Assert that every English noun synset has ``entity.n.01`` among its root hypernyms.

    Raises:
        AssertionError: naming the first synset whose ``root_hypernyms()``
            does not include ``entity.n.01``.
    """
    # Look the root synset up once instead of once per iteration.
    entity = wn.synset('entity.n.01')
    for s in wn.all_eng_synsets(wn.NOUN):
        assert entity in s.root_hypernyms(), f"{s} is not rooted at {entity}"

check_if_everything_is_an_entity()
# Since everything is an entity, unrelated words are just coming from
# different paths in the directed acyclic graph

## Creating a flat dataset of hypernyms of depth 1

In [20]:
# Accumulates (hypernym name, hyponym name) tuples; filled by the loop later in this cell
hyper_hypo_pairs = []

def to_str(s: Synset) -> str:
    """Return the name of the first lemma of ``s`` as a plain ``str``."""
    first_lemma = s.lemmas()[0]
    return str(first_lemma.name())

# For every noun synset, record one (hypernym name, hyponym name) tuple per
# immediate hyponym.
for hyper in wn.all_eng_synsets(wn.NOUN):
    hyper_name = to_str(hyper)
    for hypo in hyper.hyponyms():
        pair = (hyper_name, to_str(hypo))
        hyper_hypo_pairs.append(pair)

len(hyper_hypo_pairs)

75850

In [21]:
# Eyeball a few pairs. NOTE(review): this uses the module-level `random`, and no
# seed is set in the visible cells, so the sample presumably differs between runs.
random.sample(hyper_hypo_pairs, k=10)

[('bird_genus', 'Apus'),
 ('wildflower', 'shrubby_penstemon'),
 ('conifer', 'keteleeria'),
 ('syndrome', 'fetal_alcohol_syndrome'),
 ('energy', 'juice'),
 ('additive', 'adjuvant'),
 ('sympathizer', 'fellow_traveler'),
 ('linguistics', 'historical_linguistics'),
 ('free_phagocyte', 'leukocyte'),
 ('woolly_bear', 'woolly_bear_moth')]

### Creating unrelated word pairs

In [22]:
# # Naive N^2 implementation, too slow, we don't need 80k^2 pairs
# unrelated_pairs = []
# for s in wn.all_eng_synsets(wn.NOUN):
#     for t in wn.all_eng_synsets(wn.NOUN):
#         if not are_related(s, t):
#             unrelated_pairs.append((s, t))
#
# len(unrelated_pairs)

In [23]:
# Smarter: to avoid unwanted patterns (e.g. repetition) and control the quantity, let's draw random pairs
%time

def generate_unrelated_pairs(how_many_pairs: int = 50000, rng_seed: int = 42) -> list[tuple[Synset, Synset]]:
    """Draw random pairs of noun synsets that ``are_related`` rejects.

    Args:
        how_many_pairs: number of pairs to collect before returning.
        rng_seed: seed for the private ``random.Random`` instance used for
            sampling.

    Returns:
        A list of ``(s, t)`` synset tuples for which ``are_related(s, t)`` is
        false. The same pair may occur more than once, since drawn pairs are
        not de-duplicated.
    """
    rng = random.Random(rng_seed)
    candidates = list(wn.all_eng_synsets(wn.NOUN))
    collected = []
    while len(collected) < how_many_pairs:
        first, second = rng.sample(candidates, k=2)
        if are_related(first, second):
            # Related pairs are discarded; keep drawing.
            continue
        collected.append((first, second))
    return collected

# Build 50000 unrelated pairs (default seed) and show the element at index 10
unrelated_pairs = generate_unrelated_pairs(50000)
print(unrelated_pairs[10])
# Display 20 pairs drawn with the module-level `random`, not the seeded
# generator used inside generate_unrelated_pairs
random.sample(unrelated_pairs, k=20)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs
(Synset('solidity.n.03'), Synset('reserve_account.n.01'))


[(Synset('parts_catalog.n.01'), Synset('crabgrass.n.01')),
 (Synset('latin_american.n.01'), Synset('dhu'l-qa'dah.n.01')),
 (Synset('zola.n.01'), Synset('crunch.n.02')),
 (Synset('short_shrift.n.01'), Synset('discomfiture.n.01')),
 (Synset('nissen_hut.n.01'), Synset('butt.n.01')),
 (Synset('slam.n.02'), Synset('trademark.n.02')),
 (Synset('compartment_pressure.n.01'), Synset('production.n.08')),
 (Synset('whiteface.n.02'), Synset('carving_knife.n.01')),
 (Synset('california_newt.n.01'), Synset('devonian.n.01')),
 (Synset('aunt.n.01'), Synset('algeripithecus.n.01')),
 (Synset('plate.n.10'), Synset('sotho.n.01')),
 (Synset('bone_china.n.01'), Synset('force.n.02')),
 (Synset('24/7.n.01'), Synset('interior_live_oak.n.01')),
 (Synset('aversion.n.02'),
  Synset('american_standard_code_for_information_interchange.n.01')),
 (Synset('skep.n.01'), Synset('swine.n.01')),
 (Synset('escape.n.07'), Synset('idolizer.n.01')),
 (Synset('flat.n.05'), Synset('boletus_zelleri.n.01')),
 (Synset('bombing_run