<a href="https://colab.research.google.com/github/mithunkumarsr/NLPMay22/blob/main/WordSenseWordNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WordNet
https://wordnet.princeton.edu

In [1]:
# NLTK provides direct access to wordnet. 
# WordNet corpus reader gives access to the Open Multilingual WordNet.
from nltk.corpus import wordnet as wn
from nltk import pos_tag,word_tokenize

# Synsets

In [3]:
# Download the WordNet corpora, then import the corpus reader under the
# name `wordnet`.
import nltk
for corpus_name in ('omw-1.4', 'wordnet'):
    nltk.download(corpus_name)
from nltk.corpus import wordnet

# A synset name has three parts, word.pos.nn: the word, its part of speech,
# and the number of the sense. Look up "education" and inspect its first sense.
synset = wn.synsets("education")
first_sense = synset[0]
print('Word and Type : ' + first_sense.name())
print('The meaning of the word : ' + first_sense.definition())
print('Example : ' + str(first_sense.examples()))

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Word and Type : education.n.01
The meaning of the word : the activities of educating or instructing; activities that impart knowledge or skill
Example : ['he received no formal education', 'our instruction was carefully programmed', 'good classroom teaching is seldom rewarded']


In [4]:
# A word such as "car" has several senses; list every synset it belongs to.
car_senses = wn.synsets('car')
print(car_senses)

# Example sentences recorded for the synset 'car.n.01':
car_examples = wn.synset('car.n.01').examples()
print(car_examples)

[Synset('car.n.01'), Synset('car.n.02'), Synset('car.n.03'), Synset('car.n.04'), Synset('cable_car.n.01')]
['he needs a car to get to work']


In [5]:
# A primary use of WordNet is judging how similar two words are.
# Consider these two sentences:
# 1. I learned natural language processing by resources found on the internet.
# 2. I learned natural language processing by resources found on the net.
#
# They differ only in the last word. Because "internet" and "net" are
# synonyms, both sentences mean the same thing whichever word is used.

# List the synsets (synonym sets) WordNet holds for the word "internet":
internet_senses = wn.synsets('internet')
print(internet_senses)

# 'internet.n.01' is a synset for the word "internet". The pairing of a
# synset with one of its words is called a **lemma**; lemma_names() lists
# the words that belong to the synset:
internet_synset = wn.synset('internet.n.01')
print(internet_synset.lemma_names())

[Synset('internet.n.01')]
['internet', 'net', 'cyberspace']


In [None]:
# According to WordNet, the word "internet" is a synonym of the word "net" and the word "cyberspace".

# Synonyms and Antonyms

In [6]:
# Collect synonyms and antonyms of "good" across all of its synsets.
# Flatten every lemma of every synset into one list first.
good_lemmas = [lemma
               for sense in wn.synsets("good")
               for lemma in sense.lemmas()]

# Every lemma name counts as a synonym (duplicates are kept).
syn = [lemma.name() for lemma in good_lemmas]

# Where a lemma has antonyms, keep the name of the first one only.
ant = [lemma.antonyms()[0].name()
       for lemma in good_lemmas
       if lemma.antonyms()]

print('Synonyms: ' + str(syn))

Synonyms: ['good', 'good', 'goodness', 'good', 'goodness', 'commodity', 'trade_good', 'good', 'good', 'full', 'good', 'good', 'estimable', 'good', 'honorable', 'respectable', 'beneficial', 'good', 'good', 'good', 'just', 'upright', 'adept', 'expert', 'good', 'practiced', 'proficient', 'skillful', 'skilful', 'good', 'dear', 'good', 'near', 'dependable', 'good', 'safe', 'secure', 'good', 'right', 'ripe', 'good', 'well', 'effective', 'good', 'in_effect', 'in_force', 'good', 'good', 'serious', 'good', 'sound', 'good', 'salutary', 'good', 'honest', 'good', 'undecomposed', 'unspoiled', 'unspoilt', 'good', 'well', 'good', 'thoroughly', 'soundly', 'good']


In [7]:
# Show the antonyms gathered in the previous cell.
antonym_report = 'Antonyms: ' + str(ant)
print(antonym_report)

Antonyms: ['evil', 'evilness', 'bad', 'badness', 'bad', 'evil', 'ill']


# Hyponym and Hypernym

In [8]:
# Hyponym: "a word of more specific meaning than a general or superordinate
# term applicable to it. For example, spoon is a hyponym of cutlery."

# Start by listing the synsets for the term "cat":
cat_senses = wn.synsets('cat')
print(cat_senses)

[Synset('cat.n.01'), Synset('guy.n.01'), Synset('cat.n.03'), Synset('kat.n.01'), Synset('cat-o'-nine-tails.n.01'), Synset('caterpillar.n.02'), Synset('big_cat.n.01'), Synset('computerized_tomography.n.01'), Synset('cat.v.01'), Synset('vomit.v.01')]


In [9]:
# "cat" belongs to several different synsets.
# Print the definition of the synset 'cat.n.01':
cat_definition = wn.synset('cat.n.01').definition()
print(cat_definition)

feline mammal usually having thick soft fur and no ability to roar: domestic cats; wildcats


In [10]:
# 'cat.n.01' is the feline sense of "cat".
# Fetch its hyponyms and keep them in `types_of_cats`.
cat = wn.synset('cat.n.01')
types_of_cats = cat.hyponyms()

# Gather the lemma names of every hyponym synset, in order,
# then print them one per line.
hyponym_names = [lemma.name()
                 for hyponym in types_of_cats
                 for lemma in hyponym.lemmas()]
for hyponym_name in hyponym_names:
    print(hyponym_name)

domestic_cat
house_cat
Felis_domesticus
Felis_catus
wildcat


In [11]:
# Terms such as "domestic_cat" and "house_cat" are more specific than
# the term "cat"; in other words, they are hyponyms of "cat".

# Hypernym: "a word with a broad meaning that more specific words fall
# under; a superordinate. For example, color is a hypernym of red."

# A hyponym moves down toward more specific terms, while a hypernym
# moves up toward more general ones.

# Example:
#   Cat <- hypernym
#       house_cat <- hyponym
house_cat = wn.synset('house_cat.n.01')
print(house_cat.hypernyms())

# One way to score the similarity of two words is by their distance in
# the hypernym/hyponym hierarchy: how many levels up or down one word
# lies from the other.

[Synset('cat.n.01'), Synset('domestic_animal.n.01')]


# WordNet Path Similarity

In [12]:
# WordNet offers metrics that tell us how closely two words are related.
# `path_similarity` scores two synsets by how far apart they are along
# hypernym/hyponym links.

# Compute this similarity for the words "car" and "automobile".

# First, look up the synset for each term:
car = wn.synset('car.n.01')
automobile = wn.synset('automobile.n.01')

# The score is at most 1. A score of 1 means both words resolve to the
# same node of the hypernym/hyponym hierarchy; the longer the path between
# the two synsets, the closer the score gets to 0.

print("Path similarity between CAR and AUTOMOBILE = ")
car_vs_automobile = car.path_similarity(automobile)
print(car_vs_automobile)

Path similarity between CAR and AUTOMOBILE = 
1.0


In [13]:
# "car" and "automobile" reach the highest possible similarity, 1.0:
# "automobile" is in fact one of the synonyms of "car".

# Now compare "car" with "boat":
boat = wn.synset('boat.n.01')
print("Path similarity between CAR and BOAT = ")
car_vs_boat = car.path_similarity(boat)
print(car_vs_boat)

Path similarity between CAR and BOAT = 
0.125


In [18]:
# The path similarity of "car" and "boat" is lower: the two synsets lie
# further apart in the hypernym/hyponym hierarchy, so the score is below 1.0.

# There are many ways to define a distance between words.
# Next, look at the Wu-Palmer similarity metric (`wup_similarity`).
print("Wu-Palmer similarity between CAR and AUTOMOBILE = ")
print(car.wup_similarity(automobile))

Wu-Palmber similarity between CAR and AUTOMOBILE = 
1.0


In [15]:
# Wu-Palmer similarity (`wup_similarity`) between "car" and "boat":
print("Wu-Palmer similarity between CAR and BOAT = ")
print(car.wup_similarity(boat))

Wu-Palmber similarity between CAR and BOAT = 
0.6956521739130435


In [16]:
# wup metric with "car" and "cat":
cat = wn.synset('cat.n.01')
print("Wu-Palmber similarity between CAR and CAT = ")
print(car.wup_similarity(cat))

# We see an even lower number here, as one may expect between the terms 
# "car" and "cat" under this metric of word similarity. 

Wu-Palmber similarity between CAR and CAT = 
0.32


# WordNet for word sense disambiguation

In [None]:
# Let us take an example sentence and find the exact sense of the word "bank" used in it.

In [17]:
# Resources needed for tokenizing and part-of-speech tagging.
# Recent NLTK releases (3.9+) load the tokenizer and tagger data under the
# names 'punkt_tab' and 'averaged_perceptron_tagger_eng', while older
# releases use 'punkt' and 'averaged_perceptron_tagger'. Request both sets
# so the cells below run on a fresh kernel regardless of the NLTK version.
import nltk
for resource in ('punkt', 'punkt_tab',
                 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng',
                 'universal_tagset'):
    nltk.download(resource)

# Sentence whose nouns are disambiguated in the following cells.
sentence='The river bank is beautiful'
#POS Tagging
def pos_tag_s(sent):
    """Tokenize `sent` and tag each token using the 'universal' tagset.

    Returns whatever `pos_tag` returns for the tokenized sentence; the
    notebook output shows a list of (token, tag) tuples.
    """
    return pos_tag(word_tokenize(sent), tagset='universal')

# Tag the example sentence and show the (token, tag) pairs.
pos_tag_sent = pos_tag_s(sentence)
print(pos_tag_sent)

# Keep only the tokens tagged as nouns; their senses are resolved in the
# cells that follow.
chk_Noun = [token for token, tag in pos_tag_sent if tag == 'NOUN']
print("Nouns :", chk_Noun)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...


[('The', 'DET'), ('river', 'NOUN'), ('bank', 'NOUN'), ('is', 'VERB'), ('beautiful', 'ADJ')]
Nouns : ['river', 'bank']


[nltk_data]   Unzipping taggers/universal_tagset.zip.


In [None]:
#Extract synsets 
synset_1=[ss for ss in wordnet.synsets(chk_Noun[0],'n')]
print(synset_1,"\n")

[Synset('river.n.01')] 



In [None]:
#Extract synsets 
synset_2=[ss for ss in wordnet.synsets(chk_Noun[1],'n')]
print(synset_2,"\n")

[Synset('bank.n.01'), Synset('depository_financial_institution.n.01'), Synset('bank.n.03'), Synset('bank.n.04'), Synset('bank.n.05'), Synset('bank.n.06'), Synset('bank.n.07'), Synset('savings_bank.n.02'), Synset('bank.n.09'), Synset('bank.n.10')] 



In [None]:
# Build a matrix of path similarities: one row per sense in synset_1,
# one column per sense in synset_2.
listpath = [
    [first_sense.path_similarity(second_sense) for second_sense in synset_2]
    for first_sense in synset_1
]

print("Path Similarities(Probabilities) : ", listpath)

Path Similarities(Probabilities) :  [[0.1111111111111111, 0.07692307692307693, 0.1, 0.09090909090909091, 0.05555555555555555, 0.08333333333333333, 0.1111111111111111, 0.09090909090909091, 0.09090909090909091, 0.0625]]


In [None]:
# For each row of `listpath`, record the highest score and the position
# where it first occurs in that row.
maxvalues = []
index_list = []
for scores in listpath:
    best_score = max(scores)
    index_list.append(scores.index(best_score))
    maxvalues.append(best_score)
# A stray bare `print` stood here; in Python 3 it is an expression with no
# effect, so it has been removed.
print("maxvalues :", maxvalues, " index list : ", index_list)

maxvalues : [0.1111111111111111]  index list :  [0]


In [None]:
# Pair each best-scoring index with its score, then report the match for
# the first sense in synset_1.
zipped = zip(index_list, maxvalues)
listzip = list(zipped)
best_index, best_score = listzip[0]
best_sense = synset_2[best_index]
# NOTE: the "probability" printed below is the path-similarity score taken
# from `maxvalues`, not a true probability.
print("The maximum probability of " + chk_Noun[0] + " matching with " + chk_Noun[1] + " is in the sense: "
      + str(synset_1[0].name()) + "\nThe definition of the same is : " + synset_1[0].definition()
      + "\nThe other noun " + str(best_sense.name()) + " matching with the probability of: " + str(best_score)
      + "\nIts definition is: " + best_sense.definition())

The maximum probability of river matching with bank is in the sense: river.n.01
The definition of the same is : a large natural stream of water (larger than a creek)
The other nounbank.n.01 matching with the propability of: 0.1111111111111111
Its definition is: sloping land (especially the slope beside a body of water)
