# Tasks in Natural Language Processing
Following are the Common tasks in NLP

![image.png](attachment:image.png)

# Vectorization

In [50]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/husnara/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [51]:
text = "Mary had a little lamb. Her fleece was white as snow"

In [52]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [53]:
sents = sent_tokenize(text)
print(sents)

['Mary had a little lamb.', 'Her fleece was white as snow']


In [54]:
words = [word_tokenize(sent) for sent in sents]
print(words)

[['Mary', 'had', 'a', 'little', 'lamb', '.'], ['Her', 'fleece', 'was', 'white', 'as', 'snow']]


# Stopword Removal

In [55]:
from nltk.corpus import stopwords
from string import punctuation

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/husnara/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [56]:
customStopWords = set(stopwords.words('english')+list(punctuation))
wordsWOStopwords = [word for word in word_tokenize(text) if word not in customStopWords]
print(wordsWOStopwords)

['Mary', 'little', 'lamb', 'Her', 'fleece', 'white', 'snow']


# N-Grams

In [57]:
from nltk.collocations import * 

In [58]:
Bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(wordsWOStopwords)

In [59]:
sorted(finder.ngram_fd.items())

[(('Her', 'fleece'), 1),
 (('Mary', 'little'), 1),
 (('fleece', 'white'), 1),
 (('lamb', 'Her'), 1),
 (('little', 'lamb'), 1),
 (('white', 'snow'), 1)]

# Word Sense Disambiguation

In [60]:
from nltk.corpus import wordnet as wn
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/husnara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [61]:
for ss in wn.synsets('bass'):
    print(ss,ss.definition())

(Synset('bass.n.01'), u'the lowest part of the musical range')
(Synset('bass.n.02'), u'the lowest part in polyphonic music')
(Synset('bass.n.03'), u'an adult male singer with the lowest voice')
(Synset('sea_bass.n.01'), u'the lean flesh of a saltwater fish of the family Serranidae')
(Synset('freshwater_bass.n.01'), u'any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)')
(Synset('bass.n.06'), u'the lowest adult male singing voice')
(Synset('bass.n.07'), u'the member with the lowest range of a family of musical instruments')
(Synset('bass.n.08'), u'nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes')
(Synset('bass.s.01'), u'having or denoting a low vocal or instrumental range')


In [62]:
from nltk.wsd import lesk

In [63]:
sense1 = lesk(word_tokenize('Sing in a lower tone, along with the bass'), 'bass')

In [64]:
print(sense1,sense1.definition())

(Synset('bass.n.07'), u'the member with the lowest range of a family of musical instruments')


# Parts of Speech & Stemming

In [65]:
text2 = "Mary closed on closing night when she was in the mood of close"

In [66]:
from nltk.stem.lancaster import LancasterStemmer

In [67]:
st = LancasterStemmer()
stemmedWords = [ st.stem(word) for word in word_tokenize(text2) ]
print(stemmedWords)

['mary', 'clos', 'on', 'clos', 'night', 'when', 'she', 'was', 'in', 'the', 'mood', 'of', 'clos']


In [68]:
nltk.download('averaged_perceptron_tagger')
nltk.pos_tag(word_tokenize(text2))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/husnara/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('Mary', 'NNP'),
 ('closed', 'VBD'),
 ('on', 'IN'),
 ('closing', 'NN'),
 ('night', 'NN'),
 ('when', 'WRB'),
 ('she', 'PRP'),
 ('was', 'VBD'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mood', 'NN'),
 ('of', 'IN'),
 ('close', 'NN')]