In [1]:
# Tokenization, Stemming, Lemmatization

In [11]:
import nltk
from nltk.tokenize import word_tokenize

In [19]:
# Sample text used by every tokenization cell below: three sentences with a
# currency amount, a line break inside the first sentence, a double space
# after "York." and a blank line before "Thanks.".
s = (
    'Good muffins cost $3.88\n'
    'in New York.  Please buy me two of them.\n'
    '\n'
    'Thanks.'
)

In [20]:
# Split the sample text into word and punctuation tokens with NLTK.
# word_tokenize() uses TreebankWordTokenizer internally
# NOTE(review): NLTK's docs describe this as an "improved" Treebank tokenizer
# applied after Punkt sentence splitting -- confirm for the installed version.
word_tokenize(s)

['Good',
 'muffins',
 'cost',
 '$',
 '3.88',
 'in',
 'New',
 'York',
 '.',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them',
 '.',
 'Thanks',
 '.']

In [21]:
from nltk.tokenize import sent_tokenize

In [22]:
# Split the sample text into sentences with NLTK.
# sent_tokenize() uses PunktSentenceTokenizer internally
sent_tokenize(s)

['Good muffins cost $3.88\nin New York.',
 'Please buy me two of them.',
 'Thanks.']

In [23]:
import spacy

In [24]:
# Load the spaCy pipeline named 'en_core_web_sm'; `nlp` is reused by the
# spaCy cells below.
# NOTE(review): the model package must be installed separately
# (e.g. `python -m spacy download en_core_web_sm`) -- confirm environment setup.
nlp = spacy.load('en_core_web_sm')

In [26]:
# Run the spaCy pipeline over the sample text; the resulting `doc` is what the
# token and sentence cells below iterate over.
doc = nlp(s)

In [27]:
# Text of every token spaCy produced for the sample; as the recorded output
# shows, whitespace runs such as '\n' and ' ' appear as tokens of their own.
[tok.text for tok in doc]

['Good',
 'muffins',
 'cost',
 '$',
 '3.88',
 '\n',
 'in',
 'New',
 'York',
 '.',
 ' ',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them',
 '.',
 '\n\n',
 'Thanks',
 '.']

In [30]:
# Sentence segmentation with spaCy: iterate over doc.sents and strip the
# surrounding whitespace from each sentence's text.
# Background: https://github.com/explosion/spaCy/issues/93
# Span.text is used instead of the deprecated Span.string attribute, which was
# removed in spaCy v3 and raises AttributeError there.
[sent.text.strip() for sent in doc.sents]

['Good muffins cost $3.88\nin New York.',
 'Please buy me two of them.',
 'Thanks.']

In [31]:
from nltk.stem.porter import PorterStemmer

In [32]:
# Porter stemmer instance with default settings, reused by the stemming cell below.
stemmer = PorterStemmer()

In [56]:
# Inflected and derived word forms fed to the stemmer and both lemmatizers.
# 'siezing' is kept exactly as written (misspelling included) because the
# recorded outputs below reflect it.
words = [
    'caresses', 'flies', 'dies',
    'mules', 'denied', 'died',
    'agreed', 'owned', 'humbled',
    'sized', 'meetings', 'stating',
    'siezing', 'itemization', 'sensational',
    'traditional', 'reference', 'colonizer',
    'plotted',
]

In [57]:
# Porter stem of every entry in `words`, in the original order.
list(map(stemmer.stem, words))

['caress',
 'fli',
 'die',
 'mule',
 'deni',
 'die',
 'agre',
 'own',
 'humbl',
 'size',
 'meet',
 'state',
 'siez',
 'item',
 'sensat',
 'tradit',
 'refer',
 'colon',
 'plot']

In [58]:
from nltk.stem import WordNetLemmatizer
# WordNet-based lemmatizer instance, reused by the lemmatization cell below.
# NOTE(review): presumably needs the WordNet corpus downloaded
# (nltk.download('wordnet')) -- confirm environment setup.
lemmatizer = WordNetLemmatizer()

In [59]:
# Lemmatize the same word list that was stemmed above. The list is bound to
# `words`; no cell in this notebook defines `plurals`, so the old name raised
# NameError on a fresh kernel (Restart & Run All).
# lemmatize() is called without a POS tag; per NLTK's documented default the
# words are then treated as nouns, which is presumably why verb forms such as
# 'denied' come back unchanged in the recorded output.
[lemmatizer.lemmatize(word) for word in words]

['caress',
 'fly',
 'dy',
 'mule',
 'denied',
 'died',
 'agreed',
 'owned',
 'humbled',
 'sized',
 'meeting',
 'stating',
 'siezing',
 'itemization',
 'sensational',
 'traditional',
 'reference',
 'colonizer',
 'plotted']

In [60]:
# Run the spaCy pipeline over the word list joined into one space-separated
# string. Uses `words`, the list defined earlier; `plurals` is not defined
# anywhere in this notebook and raised NameError on a fresh kernel.
# Note: this rebinds `doc`, replacing the earlier parse of the sample text `s`.
doc = nlp(' '.join(words))

In [61]:
# Lemma string spaCy assigned to each token of the current `doc`.
[tok.lemma_ for tok in doc]

['caress',
 'fly',
 'die',
 'mule',
 'deny',
 'die',
 'agree',
 'own',
 'humble',
 'sized',
 'meeting',
 'state',
 'siezing',
 'itemization',
 'sensational',
 'traditional',
 'reference',
 'colonizer',
 'plot']