### 3.1 Introducing embeddings

##### 3.1.1 What are embeddings?

##### 3.1.2 Why are embeddings important?

### 3.2 Building blocks of language: Characters, words, and phrases

##### 3.2.1 Characters

##### 3.2.2 Words, tokens, morphemes, and phrases

##### 3.2.3 N-grams

### 3.3 Tokenization, stemming, and lemmatization

##### 3.3.1 Tokenization

##### 3.3.2 Stemming

##### 3.3.3 Lemmatization

In [1]:
import nltk
# Fetch the 'punkt' and 'wordnet' NLTK data packages (a no-op when they are
# already up to date).
# NOTE(review): newer NLTK releases may look for 'punkt_tab' rather than
# 'punkt' — confirm against the installed version.
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\longln3\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\longln3\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Sample text shared by all tokenizer demos: three sentences, with an
# embedded newline, a double space and a blank line between them.
s = (
    'Good muffins cost $3.88\n'
    'in New York.  Please buy me two of them.\n'
    '\n'
    'Thanks.'
)

In [3]:
# word_tokenize() uses TreebankWordTokenizer internally.
# Punctuation such as '$' and each sentence-final '.' comes out as its own
# token, and the newlines / double space in `s` leave no trace in the result.
word_tokenize(s)

['Good',
 'muffins',
 'cost',
 '$',
 '3.88',
 'in',
 'New',
 'York',
 '.',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them',
 '.',
 'Thanks',
 '.']

In [4]:
from nltk.tokenize import sent_tokenize

In [5]:
# sent_tokenize() uses PunktSentenceTokenizer internally.
# The newline inside the first sentence is preserved in the returned string.
sent_tokenize(s)

['Good muffins cost $3.88\nin New York.',
 'Please buy me two of them.',
 'Thanks.']

In [6]:
import spacy
# Load the 'en_core_web_sm' spaCy pipeline; the model package is assumed to be
# installed in this environment.
nlp = spacy.load('en_core_web_sm')

In [7]:
# Run the spaCy pipeline over the same sample text that was given to the NLTK
# tokenizers.
doc = nlp(s)

In [8]:
# Surface text of every spaCy token. Unlike the word_tokenize() result,
# whitespace runs ('\n', ' ', '\n\n') appear as tokens of their own.
[tok.text for tok in doc]

['Good',
 'muffins',
 'cost',
 '$',
 '3.88',
 '\n',
 'in',
 'New',
 'York',
 '.',
 ' ',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them',
 '.',
 '\n\n',
 'Thanks',
 '.']

In [24]:
# Sentence segmentation with spaCy. The empty strings in the result are
# sentence spans whose text is whitespace only, so nothing is left after strip().
[str(span).strip() for span in doc.sents]

['Good muffins cost $3.88\nin New York.',
 '',
 'Please buy me two of them.',
 '',
 'Thanks.']

In [25]:
from nltk.stem.porter import PorterStemmer

In [26]:
# Single Porter stemmer instance, reused for every word stemmed below.
stemmer = PorterStemmer()

In [27]:
# Sample vocabulary fed to the stemmer and to both lemmatizers.
# NOTE(review): 'siezing' looks like a misspelling of 'seizing'; it is kept
# as is because the recorded outputs below depend on it.
words = [
    # plural / third-person forms
    'caresses', 'flies', 'dies', 'mules',
    # past-tense forms
    'denied', 'died', 'agreed', 'owned', 'humbled', 'sized',
    # -ing / -ings forms
    'meetings', 'stating', 'siezing',
    # derived nouns and adjectives
    'itemization', 'sensational', 'traditional', 'reference', 'colonizer',
    # doubled final consonant
    'plotted',
]

In [28]:
# Stem every word with the Porter stemmer. The results are truncated stems
# (e.g. 'fli', 'agre'), not necessarily dictionary words.
list(map(stemmer.stem, words))

['caress',
 'fli',
 'die',
 'mule',
 'deni',
 'die',
 'agre',
 'own',
 'humbl',
 'size',
 'meet',
 'state',
 'siez',
 'item',
 'sensat',
 'tradit',
 'refer',
 'colon',
 'plot']

In [29]:
from nltk.stem import WordNetLemmatizer
# Lemmatizer instance; presumably backed by the 'wordnet' data downloaded in
# the first cell — confirm.
lemmatizer = WordNetLemmatizer()

In [30]:
# Lemmatize every word with WordNet.
# NOTE(review): no part-of-speech argument is passed, so the lemmatizer
# presumably falls back to its default (noun) — that would explain why verb
# forms such as 'denied' and 'plotted' come back unchanged. Confirm.
[lemmatizer.lemmatize(term) for term in words]

['caress',
 'fly',
 'dy',
 'mule',
 'denied',
 'died',
 'agreed',
 'owned',
 'humbled',
 'sized',
 'meeting',
 'stating',
 'siezing',
 'itemization',
 'sensational',
 'traditional',
 'reference',
 'colonizer',
 'plotted']

In [31]:
# Join the words into one space-separated string and run it through spaCy.
# Note: this rebinds `doc`, which until now held the parse of `s`; re-running
# the earlier spaCy cells after this one would show these words instead.
doc = nlp(' '.join(words))

In [32]:
# spaCy lemma of each token. Here several verb forms are reduced
# ('denied' -> 'deny', 'plotted' -> 'plot'), while 'died' and 'sized' are not.
[tok.lemma_ for tok in doc]

['caress',
 'fly',
 'die',
 'mule',
 'deny',
 'died',
 'agree',
 'own',
 'humble',
 'sized',
 'meeting',
 'state',
 'sieze',
 'itemization',
 'sensational',
 'traditional',
 'reference',
 'colonizer',
 'plot']