# Using WordNet Hypernyms to Group Nouns Semantically #
Code by Anna Swigart, ANLP 2015

*** This code is intended to find main topics in the document by organizing nouns according to semantic class using WordNet.  It finds common hypernyms for many of the nouns, and squishes the hierarchy down for those shared hypernyms into one level. ***

In [2]:
import nltk
import re
from nltk.corpus import brown
from nltk.collocations import *
from string import punctuation
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from collections import defaultdict


### Raw Text Corpora

In [3]:
# Cookbook corpus: read the whole file into a single string.
# NOTE(review): no encoding is passed to open(), so the platform default is
# used -- confirm it matches the encoding of cookbooks.txt.
with open('cookbooks.txt', 'r') as text_file:
    cookbooks_corpus = text_file.read()

### Tokenize the Text

In [4]:
# Use default tokenizer to start with
def tokenize_text(corpus):
    """Tokenize raw text into sentences, each a list of word tokens.

    The text is first split into sentences with the English Punkt model
    loaded from NLTK's data directory; every sentence is then split into
    tokens with ``nltk.word_tokenize``.

    Args:
        corpus: The raw text as a single string.

    Returns:
        A list with one entry per sentence; each entry is the list of
        tokens for that sentence.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')

    tokenized_sents = []
    for sentence in punkt.tokenize(corpus):  # one raw sentence string at a time
        tokenized_sents.append(nltk.word_tokenize(sentence))
    return tokenized_sents

# Sentence- and word-tokenize the raw cookbook text.
cookbook_sents = tokenize_text(cookbooks_corpus)
# Sentences from the 'news' category of the Brown corpus; passed to the same
# functions as cookbook_sents (presumably already tokenized by NLTK -- verify).
brown_news_sents = brown.sents(categories='news')

### Algorithm: Frequent Unigrams with Hypernyms and Examples

This algorithm takes in tokenized sentences, tags them with the standard NLTK tagger, and then normalizes the words. (Note: in the original code, Anna used her special ngram tagger trained on her recipe collection.) Only nouns are included in the terms; stopwords, punctuation, and cardinal numbers are excluded. The words are normalized by casting them to lowercase and lemmatizing them (using the WordNet Lemmatizer). Then, the 25 most common hypernyms of a list of the most frequent unigrams are extracted, along with a set of corresponding examples from the normalized corpus.

In [31]:
def freq_normed_unigrams(sents, top_n=40):
    """Return the most frequent normalized nouns found in tokenized sentences.

    Every sentence is POS-tagged. A token is kept when its tag starts with
    ``N``, it is not an English stopword, and it does not consist only of
    punctuation. Kept tokens are lowercased and lemmatized with the WordNet
    lemmatizer before being counted.

    Args:
        sents: Iterable of sentences, each a sequence of token strings.
        top_n: Number of most frequent normalized nouns to return
            (defaults to 40, the value that used to be hard-coded).

    Returns:
        A list of at most ``top_n`` lemma strings, most frequent first.
    """
    wnl = WordNetLemmatizer()  # maps inflected forms to their WordNet lemma

    # Hoisted out of the filter below: previously stopwords.words() was
    # called once per token and the resulting list was scanned linearly.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    punct_only = re.compile(r'''^[\.,;"'?!():\-_`]+$''')

    # Tag all sentences through a single call instead of one call per sentence.
    tagged_POS_sents = nltk.pos_tag_sents(sents)

    normed_nouns = [
        wnl.lemmatize(token.lower())
        for sent in tagged_POS_sents
        for (token, tag) in sent
        if tag.startswith('N')                # include only nouns
        and token.lower() not in stopwords
        and token not in punctuation          # remove punctuation
        and not punct_only.search(token)      # remove multi-char punctuation runs
    ]

    return [word for (word, _count)
            in nltk.FreqDist(normed_nouns).most_common(top_n)]

def categories_from_hypernyms(sents, top_n=25):
    """Print the most common direct WordNet hypernyms of the frequent nouns.

    The frequent normalized nouns of ``sents`` are looked up in WordNet. For
    every noun synset of every term, each direct hypernym is counted once, so
    a single term can contribute several counts to the same hypernym through
    different synsets. The ``top_n`` hypernyms with the highest counts are
    printed, each followed by the distinct terms that led to it.

    Args:
        sents: Iterable of tokenized sentences, passed on to
            ``freq_normed_unigrams``.
        top_n: Number of hypernyms to print (defaults to 25, the value that
            used to be hard-coded).

    Returns:
        None. Results are written to standard output.
    """
    termlist = freq_normed_unigrams(sents)  # get top unigrams
    hypterms = []
    hypterms_dict = defaultdict(list)
    for term in termlist:
        for syn in wn.synsets(term.lower(), 'n'):  # nominal synsets of the term
            for hyp in syn.hypernyms():
                # Key on the name string itself rather than on the bound
                # method object, so keys are plain, comparable strings.
                hyp_name = hyp.name()
                hypterms.append(hyp_name)
                # Record each example term once, in first-seen order, so the
                # printed examples come out in a deterministic order.
                examples = hypterms_dict[hyp_name]
                if term not in examples:
                    examples.append(term)

    # Hypernyms that accumulated the most counts (have seen the most
    # descendants) are reported first.
    hypfd = nltk.FreqDist(hypterms)
    for (name, count) in hypfd.most_common(top_n):
        print(name, '({0})'.format(count))
        print('\t', ', '.join(hypterms_dict[name]))  # children found for this hypernym
        print()

In [32]:
# Print the top hypernym categories for the cookbook corpus.
categories_from_hypernyms(cookbook_sents)

time_period.n.01 (5)
	 day, hour, time

flavorer.n.01 (3)
	 salt, pepper, herb

helping.n.01 (3)
	 slice, round, piece

time_unit.n.01 (3)
	 day, hour

united_states_dry_unit.n.01 (2)
	 quart, pint

case.n.01 (2)
	 piece, time

force_unit.n.01 (2)
	 pound

united_states_liquid_unit.n.01 (2)
	 quart, pint

time.n.03 (2)
	 day, piece

soup.n.01 (2)
	 broth

attendant.n.01 (2)
	 page

part.n.09 (2)
	 round, half

thing.n.12 (2)
	 water, piece

british_capacity_unit.n.01 (2)
	 quart, pint

avoirdupois_unit.n.01 (2)
	 pound, ounce

happening.n.01 (2)
	 gravy, fire

share.n.01 (2)
	 slice, piece

vegetable.n.01 (2)
	 onion, mushroom

foodstuff.n.02 (2)
	 egg, flour

herb.n.01 (2)
	 carrot, parsley

element.n.05 (2)
	 fire, water

food.n.02 (2)
	 butter, meat

agaric.n.02 (2)
	 mushroom

distance.n.01 (2)
	 piece, hour

meat.n.01 (2)
	 beef, mutton



In [33]:
# Print the top hypernym categories for the Brown 'news' sentences.
categories_from_hypernyms(brown_news_sents)

time_period.n.01 (15)
	 month, day, week, school, year, night, time

building.n.01 (5)
	 school, house, club

administrative_district.n.01 (5)
	 state, country, county, city

unit.n.03 (5)
	 member, home, family, house, company

time_unit.n.01 (4)
	 night, day, month

educational_institution.n.01 (3)
	 school, university

compartment.n.02 (3)
	 car

body.n.02 (3)
	 school, administration, university

male.n.02 (3)
	 man

collection.n.01 (2)
	 family, law

association.n.01 (2)
	 family, club

activity.n.01 (2)
	 service, game

government.n.01 (2)
	 state, court

attribute.n.02 (2)
	 state, time

title.n.06 (2)
	 mrs., mr.

social_gathering.n.01 (2)
	 company, meeting

selling.n.01 (2)
	 sale

political_unit.n.01 (2)
	 state, country

system.n.04 (2)
	 program, government

social_control.n.01 (2)
	 administration, government

legal_document.n.01 (2)
	 law, bill

force.n.04 (2)
	 service, law

head_of_state.n.01 (2)
	 president

idea.n.01 (2)
	 program, plan

region.n.01 (2)
	 house, coun

 *** What works well? *** 

*** What are the problems, and how can this code be improved? ***