In [1]:
import nltk

In [4]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Sample input: two sentences, the second one without closing punctuation.
text = "Do or do not. There is no try"

# Split the raw text into sentences; the bare name on the last line makes the
# notebook display the result.
sentences = sent_tokenize(text)
sentences

['Do or do not.', 'There is no try']

In [7]:
# Tokenize each sentence on its own, producing one list of tokens per sentence.
words = list(map(word_tokenize, sentences))
words

[['Do', 'or', 'do', 'not', '.'], ['There', 'is', 'no', 'try']]

In [11]:
# Removing stopwords
from nltk.corpus import stopwords
from string import punctuation

# Union of NLTK's English stopword list and every character in
# string.punctuation, stored as a set for fast membership tests.
custom_stop_words = set(stopwords.words('english')) | set(punctuation)

In [12]:
# Keep only the tokens of `text` that are not in the custom stopword set.
# NOTE(review): the membership test is case-sensitive, so capitalized tokens
# such as 'Do' and 'There' are kept while 'do' is dropped (see the recorded
# output). Confirm whether comparing on word.lower() is what is intended;
# changing it would also change the bigram result computed further down.
words_without_stopwords = list(
    filter(lambda token: token not in custom_stop_words, word_tokenize(text))
)
words_without_stopwords

['Do', 'There', 'try']

In [13]:
# Bigrams from a list of words
# Import the two collocation names this cell needs explicitly rather than with
# a wildcard import, so it is clear where each name comes from and the
# notebook namespace is not filled with unrelated names.
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

# Association-measure helper; instantiated here but not used by this cell.
bigrams = BigramAssocMeasures()
# Build the collocation finder from the stopword-filtered tokens.
finder = BigramCollocationFinder.from_words(words_without_stopwords)
# Display every bigram the finder counted, with its frequency, in sorted order.
sorted(finder.ngram_fd.items())

[(('Do', 'There'), 1), (('There', 'try'), 1)]

In [15]:
# Stemming and Part-of-Speech tagging
# The sample sentence uses the same word in several forms and roles
# ("closed", "closing", "close").
text2 = "Mary closed on closing night when she was in the mood to close."

from nltk.stem.lancaster import LancasterStemmer

# Reduce every token to its Lancaster stem; the recorded output shows all
# three forms of "close" collapsing to the same stem.
st = LancasterStemmer()
stemmed_words = list(map(st.stem, word_tokenize(text2)))
print(stemmed_words)

['mary', 'clos', 'on', 'clos', 'night', 'when', 'she', 'was', 'in', 'the', 'mood', 'to', 'clos', '.']


In [16]:
# Tag each token of text2 with its part of speech; the cell displays the
# resulting (token, tag) pairs.
text2_tokens = word_tokenize(text2)
nltk.pos_tag(text2_tokens)

[('Mary', 'NNP'),
 ('closed', 'VBD'),
 ('on', 'IN'),
 ('closing', 'NN'),
 ('night', 'NN'),
 ('when', 'WRB'),
 ('she', 'PRP'),
 ('was', 'VBD'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mood', 'NN'),
 ('to', 'TO'),
 ('close', 'VB'),
 ('.', '.')]

In [25]:
# wordnet: lexicon (something like a thesaurus).
from nltk.corpus import wordnet as wn

# Look up every synset WordNet returns for the word and print each one
# together with its definition.
word_senses = wn.synsets('bitch')
for ss in word_senses:
    print(ss, ss.definition())

Synset('bitch.n.01') an unpleasant difficulty
Synset('cunt.n.01') a person (usually but not necessarily a woman) who is thoroughly disliked
Synset('gripe.n.01') informal terms for objecting
Synset('bitch.n.04') female of any member of the dog family
Synset('gripe.v.01') complain
Synset('backbite.v.01') say mean things


In [33]:
# Disambiguated word meanings
# wsd: word sense disambiguation
from nltk.wsd import lesk

# Ask Lesk which sense of 'bass' fits the tokenized context sentence.
# NOTE: the recorded output picks a fish sense even though the sentence is
# about singing.
music_context = word_tokenize("Sing in the lower tone, along with the bass")
sense_1 = lesk(music_context, 'bass')
print(sense_1, sense_1.definition())

Synset('freshwater_bass.n.01') any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)


In [32]:
# Disambiguate 'bass' with Lesk, using a sentence about catching sea bass as
# the context, then print the chosen synset and its definition.
fish_context = word_tokenize("This sea bass was really hard to catch")
sense_2 = lesk(fish_context, 'bass')
print(sense_2, sense_2.definition())

Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
