# WordNet and Word Sense Disambiguation (WSD) with NLTK

WordNet is a lexical database for the English language. It superficially resembles a thesaurus in that it groups words together based on their meanings. In the WordNet network, words are connected by linguistic relations, e.g. synonymy, meronymy and holonymy, entailment, similarity, etc.

NLTK includes the English WordNet, with 155,287 words and 117,659 synonym sets.
https://www.linkedin.com/pulse/wordnet-word-sense-disambiguation-wsd-nltk-aswathi-nambiar/

In [27]:
from nltk.corpus import wordnet

In [19]:
# Print the definition of every WordNet synset found for "jupiter".
syns = wordnet.synsets("jupiter")
for syn in syns:
    # Parenthesised single-argument print runs on both Python 2 and Python 3.
    print(syn.definition())

the largest planet and the 5th from the sun; has many satellites and is one of the brightest objects in the night sky
(Roman mythology) supreme god of Romans; counterpart of Greek Zeus


In [26]:
# Path similarity between the synsets 'dog.n.01' and 'cat.n.01'.
wordnet.synset('dog.n.01').path_similarity(wordnet.synset('cat.n.01'))

0.2

In [28]:
import nltk
import codecs
from nltk.tokenize import PunktSentenceTokenizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer


In [3]:
# Remove stop words, stem and lemmatize the remaining words, and add their WordNet synonyms. Return the new token list.
def filteredSentence(sentence):
    """Tokenise *sentence*, drop stop words and expand the rest with synonyms.

    Every token that is not an English stop word contributes its Porter stem
    passed through the WordNet lemmatizer, immediately followed by every name
    that ``synonymsCreator`` returns for the original (unstemmed) token.

    Returns the resulting flat list of strings.
    """
    english_stops = set(stopwords.words("english"))
    stemmer = PorterStemmer()
    wn_lemmatizer = WordNetLemmatizer()

    expanded = []
    for token in word_tokenize(sentence):
        if token in english_stops:
            continue
        expanded.append(wn_lemmatizer.lemmatize(stemmer.stem(token)))
        expanded.extend(synonymsCreator(token))
    return expanded

In [4]:
# Add synonyms to match list
def synonymsCreator(word):
    """Return the lemma names of every WordNet synset of *word*.

    Names are collected synset by synset, in the order WordNet yields them,
    and duplicates are kept. A word with no synsets gives an empty list.
    """
    return [
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    ]


In [5]:
# Check and return similarity
def simlilarityCheck(word1, word2):
    """Return the Wu-Palmer similarity of the first noun senses of two words.

    Each word is looked up as the synset ``<word>.n.01``.

    Returns the value of ``wup_similarity`` for the two synsets, or 0 when
    either word has no such synset or when WordNet reports no similarity
    (``None``), so callers can always add the result to a running total.
    """
    # Local import: the surrounding file only imports `wordnet` itself.
    from nltk.corpus.reader.wordnet import WordNetError

    try:
        w1 = wordnet.synset(word1 + ".n.01")
        w2 = wordnet.synset(word2 + ".n.01")
    except WordNetError:
        # Unknown word, or a word without a first noun sense (for example
        # punctuation tokens): treat the pair as unrelated instead of
        # swallowing every exception with a bare ``except``.
        return 0

    score = w1.wup_similarity(w2)
    # wup_similarity may return None; normalise it to 0 so sums stay numeric.
    return 0 if score is None else score

In [10]:
def simpleFilter(sentence):
    """Tokenise *sentence*, drop English stop words and lemmatize the rest.

    Returns the WordNet-lemmatized tokens in their original order. Unlike
    ``filteredSentence`` no stemming or synonym expansion is applied.
    """
    english_stops = set(stopwords.words("english"))
    lemmatize = WordNetLemmatizer().lemmatize
    return [
        lemmatize(token)
        for token in word_tokenize(sentence)
        if token not in english_stops
    ]

In [30]:
# Two reference passages, one per sense of "bank", and the query to classify.
sent1 = "the commercial banks are used for finance. all the financial matters are managed by financial banks and they have lots of money, user accounts like salary account and savings account, current account. money can also be withdrawn from this bank."
sent2 = "the river bank has water in it and it has fishes trees . lots of water is stored in the banks. boats float in it and animals come and drink water from it."
sent3 = "from which bank should i withdraw money"

# Pass 1: sum the pairwise Wu-Palmer similarity between every query token
# and every token of each reference passage (stop words removed, lemmatized).
filtered_sent1 = simpleFilter(sent1)
filtered_sent2 = simpleFilter(sent2)
filtered_sent3 = simpleFilter(sent3)

# NOTE: counter1/counter2 tally the number of compared pairs but are not used
# in the decision below, so the similarity totals are not normalised.
counter1 = 0
counter2 = 0
sent31_similarity = 0
sent32_similarity = 0

for i in filtered_sent3:

    for j in filtered_sent1:
        counter1 += 1
        sent31_similarity += simlilarityCheck(i, j)

    for j in filtered_sent2:
        counter2 += 1
        sent32_similarity += simlilarityCheck(i, j)

# Pass 2: count exact token matches after stemming and synonym expansion.
filtered_sent1 = filteredSentence(sent1)
filtered_sent2 = filteredSentence(sent2)
filtered_sent3 = filteredSentence(sent3)

sent1_count = 0
sent2_count = 0

for i in filtered_sent3:

    for j in filtered_sent1:
        if i == j:
            sent1_count += 1

    for j in filtered_sent2:
        if i == j:
            sent2_count += 1

# The sense whose passage has the larger combined score wins; ties go to
# "River Bank". Parenthesised print runs on both Python 2 and Python 3.
if (sent1_count + sent31_similarity) > (sent2_count + sent32_similarity):
    print("Money Bank")
else:
    print("River Bank")


Money Bank


## CountVectorizer, TfidfVectorizer DO NOT WORK

In [28]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.spatial.distance import cosine

In [42]:
# Two vectorizers over 1- to 3-grams with English stop words removed:
# raw counts, and TF-IDF with the additional limits passed below.
cntvec = CountVectorizer(ngram_range=(1, 3), stop_words='english')
tfidf = TfidfVectorizer(
    encoding='utf-8', lowercase=True, stop_words='english',
    ngram_range=(1, 3), min_df=1, max_df=0.5, max_features=150,
    sublinear_tf=True, norm='l2',
)

In [45]:
# Fit TF-IDF on the two reference passages; each row of the dense matrix is
# the vector of one passage.
v1, v2 = tfidf.fit_transform([sent1, sent2]).toarray()
# Take row 0 so v3 is a 1-D vector like v1 and v2 instead of a 1-row matrix,
# which keeps it usable with vector functions such as scipy's cosine().
v3 = tfidf.transform([sent3]).toarray()[0]

In [51]:
# Count vectors for all three sentences, one dense row per sentence.
[v1, v2, v3] = cntvec.fit_transform([sent1, sent2, sent3]).toarray()
 

In [52]:
# scipy's cosine() is a distance (1 - cosine similarity), so the smaller
# value marks the passage closer to the query.
(cosine(v1, v3), cosine(v2, v3))

(0.8613249509436927, 0.9410744349011211)