# WordNet Category Hyponyms
The goal of this notebook is to extract fitting hyponyms for two lists of categories that can then function as wordlists where I can sample from for the Codenames Boards.

In [1]:
import json
with open('../category lists/schroeder et al.json') as file:
    categories_schroeder = json.load(file)["categories"]
    print(categories_schroeder)

['animal', 'bird', 'fruit', 'vegetable', 'clothing', 'furniture', 'vehicle', 'tool', 'musical_instrument', 'profession', 'sport']


In [2]:
with open('../category lists/jaramillo et al cleaned.json') as file:
    categories_jaramillo = json.load(file)["categories"]
    print(categories_jaramillo)

['animal', 'body', 'clothes', 'color', 'day', 'dessert', 'food', 'relative', 'room', 'shape', 'sound', 'toy', 'beverage', 'bird', 'building', 'coin', 'collectable', 'condiment', 'container', 'dinosaur', 'direction', 'emotion', 'flower', 'fruit', 'holiday', 'ingredient', 'insect', 'instrument', 'job', 'jungle animal', 'liquid', 'measure', 'month', 'movie', 'story', 'pattern', 'planet', 'plant', 'reptile', 'rhyming', 'season', 'sense', 'silverware', 'size', 'solid', 'sport', 'transportation', 'tool', 'vegetable', 'writing', 'ability', 'businesses', 'city', 'country', 'communication', 'continent', 'currency', 'exercise', 'habitat', 'hazard', 'mammal', 'material', 'metal', 'ocean', 'president', 'school subject', 'seasoning', 'state', 'symbol', 'texture', 'tree', 'weather', 'colony', 'ancient civilization', 'constellation', 'cuisine', 'element', 'landmark', 'government type', 'gas', 'gem', 'organ', 'language', 'mineral', 'mountain', 'music type', 'precipitation', 'book', 'religion', 'tradit

In [4]:
len(categories_jaramillo)

90

In [3]:
set(categories_jaramillo).intersection(set(categories_schroeder))

{'animal', 'bird', 'fruit', 'sport', 'tool', 'vegetable'}

In [5]:
from nltk.corpus import wordnet

In [6]:
from typing import List, Dict

In [7]:
def inspect_synsets(word: str):
    synsets = wordnet.synsets(word)
    for synset in synsets:
        print(synset.lemma_names())
        print(synset.definition())
    return synsets

In [9]:
inspect_synsets('vegetable')

['vegetable', 'veggie', 'veg']
edible seeds or roots or stems or leaves or bulbs or tubers or nonsweet fruits of any of numerous herbaceous plant
['vegetable']
any of various herbaceous plants cultivated for an edible part such as the fruit or the root of the beet or the leaf of spinach or the seeds of bean plants or the flower buds of broccoli or cauliflower


[Synset('vegetable.n.01'), Synset('vegetable.n.02')]

In [10]:
def get_hyponyms(word: str):
    synsets = inspect_synsets(word)
    for synset in synsets:
        hyponyms = []
        for hyponym in synset.hyponyms():
            hyponyms.append(hyponym.lemma_names()[0])
        print(hyponyms)

In [11]:
get_hyponyms('vegetable')

['vegetable', 'veggie', 'veg']
edible seeds or roots or stems or leaves or bulbs or tubers or nonsweet fruits of any of numerous herbaceous plant
['vegetable']
any of various herbaceous plants cultivated for an edible part such as the fruit or the root of the beet or the leaf of spinach or the seeds of bean plants or the flower buds of broccoli or cauliflower
['artichoke', 'artichoke_heart', 'asparagus', 'bamboo_shoot', 'cardoon', 'celery', 'cruciferous_vegetable', 'cucumber', 'fennel', 'greens', 'gumbo', 'julienne', 'leek', 'legume', 'mushroom', 'onion', 'pieplant', 'plantain', 'potherb', 'pumpkin', 'raw_vegetable', 'root_vegetable', 'solanaceous_vegetable', 'squash', 'truffle']
['artichoke', 'beet', 'cardoon', 'spinach']


There seem to be a lot of sub-categories which would need to be expanded to get more good examples, e.g. root_vegetable or raw_vegetable, hyponyms are not on the same level so to speak... maybe concept net works better here

In [12]:
def get_hyponyms(word: str):
    synsets = wordnet.synsets(word)
    if len(synsets) == 0:
        print("Word does not exist in wordnet!")
        return []
    max_depth = wordnet.synsets(word)[0].min_depth() + 2
    hyponyms = []
    get_hyponyms_recursive(word, hyponyms)
    return hyponyms

def get_hyponyms_recursive(word: str, hyponyms, max_depth = 10, verbose = False):
    if verbose:
        print("word", word)
    synset = wordnet.synsets(word)[0] # just using the first synset of the word because 
    # it's probably going to be the the most frequent one, i.e. the one that I want
    if synset.min_depth() >= 20:
        return
    else:
        for hyponym in synset.hyponyms():
            lemmas = hyponym.lemma_names()
            if len(lemmas) == 0:
                return
            lemma = lemmas[0]
            if verbose:
                print("Lemma", lemma)
            if lemma in hyponyms:
                return
            hyponyms.append(lemma)
            more_hyponyms = get_hyponyms_recursive(lemma, hyponyms, max_depth)
                
            if verbose:
                print("hyponyms", hyponyms)

def remove_compound_words(list_of_words: List):
    return [word for word in list_of_words if not '_' in word and not '-' in word]

In [13]:
len(remove_compound_words(get_hyponyms('vegetable')))

68

In [14]:
wordlists = {}
for category in categories_schroeder:
    wordlists[category] = remove_compound_words(get_hyponyms(category))
    print(category, len(wordlists[category]))

animal 1263
bird 340
fruit 66
vegetable 68
clothing 658
furniture 109
vehicle 901
tool 183
musical_instrument 465
profession 9
sport 146


In [15]:
def collect_wordlists(categories: List) -> Dict:
    wordlists = {}
    for category in categories:
        wordlist = remove_compound_words(get_hyponyms(category))
        if len(wordlist) >= 25:
            wordlists[category] = wordlist
            print(category, len(wordlists[category]))
        else:
            print(f"Skipping {category}, only contains {len(wordlist)} words.")
    return wordlists

In [16]:
collect_wordlists(categories_schroeder)

animal 1263
bird 340
fruit 66
vegetable 68
clothing 658
furniture 109
vehicle 901
tool 183
musical_instrument 465
Skipping profession, only contains 9 words.
sport 146


{'animal': ['acrodont',
  'adult',
  'brachycephalic',
  'caregiver',
  'bonesetter',
  'electrologist',
  'dentist',
  'endodontist',
  'exodontist',
  'orthodontist',
  'pedodontist',
  'periodontist',
  'prosthodontist',
  'doctor',
  'abortionist',
  'allergist',
  'angiologist',
  'extern',
  'gastroenterologist',
  'hakim',
  'intern',
  'quack',
  'specialist',
  'attache',
  'canonist',
  'criminologist',
  'crystallographer',
  'dietician',
  'educationist',
  'enologist',
  'Germanist',
  'graphologist',
  'limnologist',
  'meteorologist',
  'weatherman',
  'optometrist',
  'orientalist',
  'Teutonist',
  'surgeon',
  'amputator',
  'neurosurgeon',
  'veterinarian',
  'inoculator',
  'nurse',
  'matron',
  'midwife',
  'probationer',
  'pharmacist',
  'pharmacologist',
  'catch',
  'centrist',
  'character',
  'conservative',
  'capitalist',
  'conformist',
  'hardliner',
  'minimalist',
  'mossback',
  'neoconservative',
  'reactionary',
  'Bourbon',
  'rightist',
  'fascist

In [18]:
collect_wordlists(categories_jaramillo)

animal 1263
body 1636
Skipping clothes, only contains 3 words.
color 226
Skipping day, only contains 4 words.
Skipping dessert, only contains 19 words.
food 392
relative 109
Skipping room, only contains 18 words.
shape 62
Skipping sound, only contains 6 words.
toy 47
beverage 200
bird 340
building 39
coin 109
Skipping collectable, only contains 1 words.
condiment 95
container 40
dinosaur 45
direction 153
emotion 85
Skipping flower, only contains 1 words.
fruit 66
Skipping holiday, only contains 2 words.
Skipping ingredient, only contains 3 words.
insect 1049
instrument 743
job 299
Word does not exist in wordnet!
Skipping jungle animal, only contains 0 words.
liquid 170
measure 313
month 59
movie 435
Skipping story, only contains 3 words.
Skipping pattern, only contains 6 words.
Skipping planet, only contains 0 words.
Skipping plant, only contains 22 words.
reptile 102
Skipping rhyming, only contains 2 words.
Skipping season, only contains 4 words.
Skipping sense, only contains 0 words.

{'animal': ['acrodont',
  'adult',
  'brachycephalic',
  'caregiver',
  'bonesetter',
  'electrologist',
  'dentist',
  'endodontist',
  'exodontist',
  'orthodontist',
  'pedodontist',
  'periodontist',
  'prosthodontist',
  'doctor',
  'abortionist',
  'allergist',
  'angiologist',
  'extern',
  'gastroenterologist',
  'hakim',
  'intern',
  'quack',
  'specialist',
  'attache',
  'canonist',
  'criminologist',
  'crystallographer',
  'dietician',
  'educationist',
  'enologist',
  'Germanist',
  'graphologist',
  'limnologist',
  'meteorologist',
  'weatherman',
  'optometrist',
  'orientalist',
  'Teutonist',
  'surgeon',
  'amputator',
  'neurosurgeon',
  'veterinarian',
  'inoculator',
  'nurse',
  'matron',
  'midwife',
  'probationer',
  'pharmacist',
  'pharmacologist',
  'catch',
  'centrist',
  'character',
  'conservative',
  'capitalist',
  'conformist',
  'hardliner',
  'minimalist',
  'mossback',
  'neoconservative',
  'reactionary',
  'Bourbon',
  'rightist',
  'fascist

Currently, the wordlists seem to get mixed up, there are a lot of professions in 'animal' for example. I believe the source of this issue lies within the wordlist building, as I convert synsets to lemmas and back to synsets and the original word sense is changed while doing this conversion. I should rewrite this to keep the original synset intact. But nevertheless, the issue of different levels remains, so I will see if ConceptNet is better suited.