# We study about Stemming , n-grams,Part of Speech, NER - Name ENtity Recognition, Text Disambiguation, Stopping Word, Tokenization etc. using NLTK toolkit

In [2]:
import nltk

In [3]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [4]:
text = "Mary had a little lamb. Her fleece was white as snow"

In [5]:
sents=sent_tokenize(text)

In [6]:
print(sents)

['Mary had a little lamb.', 'Her fleece was white as snow']


In [7]:
words=[word_tokenize(sent) for sent in sents]
print(words)

[['Mary', 'had', 'a', 'little', 'lamb', '.'], ['Her', 'fleece', 'was', 'white', 'as', 'snow']]


# Remove Stop Words and Punctuations

In [8]:
from nltk.corpus import stopwords
from string import punctuation

In [9]:
customStopWords = set(stopwords.words('english')+list(punctuation))

In [10]:
len(customStopWords)

211

In [11]:
cleanWords=[word for word in word_tokenize(text) if word not in customStopWords]
print(cleanWords)

['Mary', 'little', 'lamb', 'Her', 'fleece', 'white', 'snow']


# Collocation

In [12]:
from nltk.collocations import *

In [14]:
finder=BigramCollocationFinder.from_words(cleanWords)

In [15]:
sorted(finder.ngram_fd.items())

[(('Her', 'fleece'), 1),
 (('Mary', 'little'), 1),
 (('fleece', 'white'), 1),
 (('lamb', 'Her'), 1),
 (('little', 'lamb'), 1),
 (('white', 'snow'), 1)]

# Stemming and Part of Speech

In [20]:
text2 ="Mary closed on closing night when she was in the mood to close"

In [21]:
from nltk.stem.lancaster import LancasterStemmer

In [22]:
st=LancasterStemmer()

In [24]:
stemmedWords = [st.stem(word) for word in word_tokenize(text2)]

In [25]:
print(stemmedWords)

['mary', 'clos', 'on', 'clos', 'night', 'when', 'she', 'was', 'in', 'the', 'mood', 'to', 'clos']


In [26]:
nltk.pos_tag(word_tokenize(text2))

[('Mary', 'NNP'),
 ('closed', 'VBD'),
 ('on', 'IN'),
 ('closing', 'NN'),
 ('night', 'NN'),
 ('when', 'WRB'),
 ('she', 'PRP'),
 ('was', 'VBD'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mood', 'NN'),
 ('to', 'TO'),
 ('close', 'VB')]

In [27]:
from nltk.corpus import wordnet as wn

In [43]:
for ss in wn.synsets('bass'):
    print(ss,ss.definition)

Synset('bass.n.01') <bound method Synset.definition of Synset('bass.n.01')>
Synset('bass.n.02') <bound method Synset.definition of Synset('bass.n.02')>
Synset('bass.n.03') <bound method Synset.definition of Synset('bass.n.03')>
Synset('sea_bass.n.01') <bound method Synset.definition of Synset('sea_bass.n.01')>
Synset('freshwater_bass.n.01') <bound method Synset.definition of Synset('freshwater_bass.n.01')>
Synset('bass.n.06') <bound method Synset.definition of Synset('bass.n.06')>
Synset('bass.n.07') <bound method Synset.definition of Synset('bass.n.07')>
Synset('bass.n.08') <bound method Synset.definition of Synset('bass.n.08')>
Synset('bass.s.01') <bound method Synset.definition of Synset('bass.s.01')>


# Explore Wordnet: Senses and Synonyms
- Retrives all synonym sets from the word
- gather list of words representing meaning from each set

In [56]:
for e in wn.synsets('computer'):
    print(f'{e}-->{e.lemma_names()}')

Synset('computer.n.01')-->['computer', 'computing_machine', 'computing_device', 'data_processor', 'electronic_computer', 'information_processing_system']
Synset('calculator.n.01')-->['calculator', 'reckoner', 'figurer', 'estimator', 'computer']


In [57]:
wn.synset('calculator.n.01').definition()

'an expert at calculation (or at operating calculating machines)'

In [59]:
wn.synset('calculator.n.01').lemmas()

[Lemma('calculator.n.01.calculator'),
 Lemma('calculator.n.01.reckoner'),
 Lemma('calculator.n.01.figurer'),
 Lemma('calculator.n.01.estimator'),
 Lemma('calculator.n.01.computer')]

In [50]:
wn.synset('bass.n.06').lemma_names()

['bass', 'bass_voice', 'basso']

# Word Sense Disambiguation

In [33]:
from nltk.wsd import lesk

In [35]:
sense1=lesk(word_tokenize("Sing in a lower tone, along with the bass"),"bass")
print(sense1,sense1.definition())

Synset('bass.n.07') the member with the lowest range of a family of musical instruments


In [40]:
sense2=lesk(word_tokenize("This sea bass was really hard to catch"),"bass")
print(sense2,sense2.definition())

Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae


In [None]:
sense1=lesk(word_tokenize("Sing in a lower tone, along with the bass"),"bass")
print(sense1,sense1.definition())

# Spam Detection