In [22]:
import nltk

# Fetch the NLTK data packages requested by this notebook, in the same order
# as before.
for resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Kept as the final bare expression so the cell still displays the return
# value of the last download call.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jutro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jutro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jutro\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jutro\AppData\Roaming\nltk_data...


True

**1. Tokenization**

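Split text into words. Note that `word_tokenize` does not split on the em dash here, so `wind—acting` and `kites—to` stay single tokens in the output below.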

In [10]:
from nltk.tokenize import word_tokenize

# Sample sentence reused by the following cells.
text = "Sailing employs the wind—acting on sails, wingsails or kites—to propel a craft on the surface of the water, on ice or on land over a chosen course, which is often part of a larger plan of navigation."

# Tokenize the whole text into a flat list of word and punctuation tokens.
text_tokens = word_tokenize(text)
print(text_tokens)

['Sailing', 'employs', 'the', 'wind—acting', 'on', 'sails', ',', 'wingsails', 'or', 'kites—to', 'propel', 'a', 'craft', 'on', 'the', 'surface', 'of', 'the', 'water', ',', 'on', 'ice', 'or', 'on', 'land', 'over', 'a', 'chosen', 'course', ',', 'which', 'is', 'often', 'part', 'of', 'a', 'larger', 'plan', 'of', 'navigation', '.']


In [11]:
from nltk.tokenize import sent_tokenize

# First split the text into sentences, then tokenize each sentence on its
# own, giving one list of tokens per sentence.
sentences = sent_tokenize(text)
words = list(map(word_tokenize, sentences))
print(words)

[['Sailing', 'employs', 'the', 'wind—acting', 'on', 'sails', ',', 'wingsails', 'or', 'kites—to', 'propel', 'a', 'craft', 'on', 'the', 'surface', 'of', 'the', 'water', ',', 'on', 'ice', 'or', 'on', 'land', 'over', 'a', 'chosen', 'course', ',', 'which', 'is', 'often', 'part', 'of', 'a', 'larger', 'plan', 'of', 'navigation', '.']]


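Split text into sentences. The `sentences` list was built by `sent_tokenize` in the previous cell; the sample text is a single sentence, so the list has one element.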

In [12]:
# Show the sentence list produced by sent_tokenize(text) in the cell above.
print(sentences)

['Sailing employs the wind—acting on sails, wingsails or kites—to propel a craft on the surface of the water, on ice or on land over a chosen course, which is often part of a larger plan of navigation.']


**2. Removing stop words** 

In [13]:
from nltk.corpus import stopwords
from string import punctuation

# Everything to drop: English stop words plus single ASCII punctuation marks.
custom_stop_words = set(stopwords.words('english')) | set(punctuation)

# NLTK's English stop-word list is lowercase, so compare the lowercased token.
# Otherwise capitalised stop words (e.g. a sentence-initial "The") would slip
# through the filter. The original casing of the kept tokens is preserved.
words_wo_stop_words = [
    word
    for word in word_tokenize(text)
    if word.lower() not in custom_stop_words
]
print(words_wo_stop_words)

['Sailing', 'employs', 'wind—acting', 'sails', 'wingsails', 'kites—to', 'propel', 'craft', 'surface', 'water', 'ice', 'land', 'chosen', 'course', 'often', 'part', 'larger', 'plan', 'navigation']


**3. Identify n-grams**

In [14]:
# Import only the names this cell uses instead of `import *`, so it is clear
# where each name comes from and the notebook namespace is not polluted.
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

# Association measures for scoring bigrams (not used further in this cell).
bigram_measures = BigramAssocMeasures()

# Count adjacent word pairs in the stop-word-filtered token list.
finder = BigramCollocationFinder.from_words(words_wo_stop_words)

# Display every bigram with its frequency, sorted by the bigram tuple.
sorted(finder.ngram_fd.items())

[(('Sailing', 'employs'), 1),
 (('chosen', 'course'), 1),
 (('course', 'often'), 1),
 (('craft', 'surface'), 1),
 (('employs', 'wind—acting'), 1),
 (('ice', 'land'), 1),
 (('kites—to', 'propel'), 1),
 (('land', 'chosen'), 1),
 (('larger', 'plan'), 1),
 (('often', 'part'), 1),
 (('part', 'larger'), 1),
 (('plan', 'navigation'), 1),
 (('propel', 'craft'), 1),
 (('sails', 'wingsails'), 1),
 (('surface', 'water'), 1),
 (('water', 'ice'), 1),
 (('wind—acting', 'sails'), 1),
 (('wingsails', 'kites—to'), 1)]

**4. Stemming and Part-Of-Speech tagging**

In [15]:
from nltk.stem.lancaster import LancasterStemmer

# Reduce every token of the sample text to its Lancaster stem.
st = LancasterStemmer()
stemmedWords = list(map(st.stem, word_tokenize(text)))
print(stemmedWords)

['sail', 'employ', 'the', 'wind—acting', 'on', 'sail', ',', 'wingsail', 'or', 'kites—to', 'propel', 'a', 'craft', 'on', 'the', 'surfac', 'of', 'the', 'wat', ',', 'on', 'ic', 'or', 'on', 'land', 'ov', 'a', 'chos', 'cours', ',', 'which', 'is', 'oft', 'part', 'of', 'a', 'larg', 'plan', 'of', 'navig', '.']


In [18]:
# Tokenize the sample text, then tag each token with its part of speech.
# The tagged list is the cell's last expression, so it is displayed.
tokens_to_tag = word_tokenize(text)
nltk.pos_tag(tokens_to_tag)

[('Sailing', 'VBG'),
 ('employs', 'VBZ'),
 ('the', 'DT'),
 ('wind—acting', 'VBG'),
 ('on', 'IN'),
 ('sails', 'NNS'),
 (',', ','),
 ('wingsails', 'NNS'),
 ('or', 'CC'),
 ('kites—to', 'VB'),
 ('propel', 'VB'),
 ('a', 'DT'),
 ('craft', 'NN'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('surface', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('water', 'NN'),
 (',', ','),
 ('on', 'IN'),
 ('ice', 'NN'),
 ('or', 'CC'),
 ('on', 'IN'),
 ('land', 'NN'),
 ('over', 'IN'),
 ('a', 'DT'),
 ('chosen', 'JJ'),
 ('course', 'NN'),
 (',', ','),
 ('which', 'WDT'),
 ('is', 'VBZ'),
 ('often', 'RB'),
 ('part', 'NN'),
 ('of', 'IN'),
 ('a', 'DT'),
 ('larger', 'JJR'),
 ('plan', 'NN'),
 ('of', 'IN'),
 ('navigation', 'NN'),
 ('.', '.')]

**5. Word sense disambiguation**

In [30]:
from nltk.corpus import wordnet  # lexical base

# List every WordNet sense of "flood" together with its definition.
flood_synsets = wordnet.synsets('flood')
for synset in flood_synsets:
    print(synset, synset.definition())

Synset('flood.n.01') the rising of a body of water and its overflowing onto normally dry land
Synset('flood.n.02') an overwhelming number or amount
Synset('flood.n.03') light that is a source of artificial illumination having a broad beam; used in photography
Synset('flood.n.04') a large flow
Synset('flood.n.05') the act of flooding; filling to overflowing
Synset('flood_tide.n.02') the occurrence of incoming water (between a low tide and the following high tide);  -Shakespeare
Synset('deluge.v.01') fill quickly beyond capacity; as with a liquid
Synset('flood.v.02') cover with liquid, usually water
Synset('flood.v.03') supply with an excess of
Synset('flood.v.04') become filled to overflowing


In [31]:
from nltk.wsd import lesk  # Lesk: a classical word sense disambiguation algorithm

# Ask Lesk which sense of "flood" fits this sentence.
context_tokens = word_tokenize("The flood took all they had.")
sense = lesk(context_tokens, 'flood')
print(sense, sense.definition())

Synset('flood_tide.n.02') the occurrence of incoming water (between a low tide and the following high tide);  -Shakespeare


In [33]:
# Same disambiguation, this time on a sentence where "flood" is used as a verb.
context_tokens = word_tokenize("We should not flood the investigators with all information, but make sure that the right people have access to the right data.")
sense = lesk(context_tokens, 'flood')
print(sense, sense.definition())

Synset('flood.n.05') the act of flooding; filling to overflowing
