# Wordpiece tokenizers

# Chapter 2: Tokenization using NLTK


This notebook demonstrates various text tokenization techniques using NLTK and spaCy libraries. 

In [175]:
import re

import nltk
import pandas as pd
import requests
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import spacy

In [176]:
text = (
    "Trust me, though, the words were on their way, and when "
    "they arrived, Liesel would hold them in her hands like "
    "the clouds, and she would wring them out, like the rain."
)

# New interesting sentences with various linguistic features
new_sentences = [
    "The AI researcher's model achieved 99.9% accuracy - a groundbreaking result!",
    "Mr. Smith bought a Ph.D. degree from example.com for $9,999...",
    "She exclaimed, 'OMG! This can't be real!' while reading the email.",
    "The code runs fast (about 2.5x faster) than our previous implementation.",
]
texts = [text] + new_sentences

In [177]:
vectorizer = CountVectorizer(ngram_range=(1, 2), analyzer="char")
vectorizer.fit(texts)

In [178]:
bpe_vocab_list = [sorted((i, s) for i, s in enumerate(vectorizer.vocabulary_.items()))]
bpe_vocab_dict = dict(bpe_vocab_list[0])
list(bpe_vocab_dict.values())[:7]

[('t', 204),
 ('r', 183),
 ('u', 213),
 ('s', 196),
 (' ', 0),
 ('m', 155),
 ('e', 95)]

### Create a df of # of representation of each token in the corpus (each column is a sentence)

In [179]:
vectors = vectorizer.transform(texts)
df = pd.DataFrame(vectors.todense(), columns=vectorizer.get_feature_names_out())
df.index = [t[:8] + "..." for t in texts]
df = df.T
df["total"] = df.T.sum()
df

Unnamed: 0,Trust me...,The AI r...,Mr. Smit...,She excl...,The code...,total
,31,10,9,10,10,70
$,0,0,1,0,0,1
',0,0,0,1,0,1
(,0,0,0,0,1,1
-,0,1,0,0,0,1
...,...,...,...,...,...,...
xa,0,0,1,0,0,1
xc,0,0,0,1,0,1
y,2,1,0,0,0,3
y,1,1,0,0,0,2


### Show the 5 most frequest tokens accross all sentences

In [180]:
df["n"] = [len(token) for token in vectorizer.vocabulary_]
df[df["n"] == 2].sort_values("total").tail()  # noqa

Unnamed: 0,Trust me...,The AI r...,Mr. Smit...,She excl...,The code...,total,n
a,6,7,2,5,5,25,2
t,11,2,2,3,7,25,2
r,9,7,4,2,4,26,2
h,14,3,3,4,2,26,2
e,18,9,5,9,6,47,2


## Stop words and ngrams

In [181]:
url = "https://gitlab.com/tangibleai/nlpia/-/raw/master/" "src/nlpia/data/stopword_lists.json"
response = requests.get(url)
stop_words = response.json()["exhaustive"]
tokens = "the words were just as I remembered them".split()
tokens_without_stop_words = [x for x in tokens if x not in stop_words]
print(tokens_without_stop_words)

['I', 'remembered']


### other list of stop words (less exhaustive)

In [182]:
nltk.download("stopwords")
stop_words = nltk.corpus.stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\giloz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [183]:
len(stop_words), stop_words[:7]

(198, ['a', 'about', 'above', 'after', 'again', 'against', 'ain'])

In [184]:
[stop_word for stop_word in stop_words if len(stop_word) == 1]

['a', 'd', 'i', 'm', 'o', 's', 't', 'y']

## Case folding

Normalize the captialization of tokens in python

In [185]:
tokens = ["House", "Visitor", "Center"]
normalized_tokens = [x.lower() for x in tokens]
normalized_tokens

['house', 'visitor', 'center']

## Stemming

In [186]:
regex = r"^(.*ss|.*?)(s)?$"


def stem(phrase: str) -> str:
    """Stem a phrase's words."""
    return " ".join([re.findall(regex, word)[0][0].strip("'") for word in phrase.lower().split()])


In [187]:
stem("houses")

'house'

In [188]:
stem("Doctor House's calls")

'doctor house call'

In [189]:
stem("dishes") # clearly the regex above is lacking some features! we need a better stemmer

'dishe'

### Porter stemmer

In [190]:
stemmer = PorterStemmer()
" ".join([stemmer.stem(word).strip("'") for word in "dish washer's fairly washed dishes".split()])

'dish washer fairli wash dish'

### Snowball stemmer

In [191]:
snowball_stemmer = SnowballStemmer(language="english")
" ".join([snowball_stemmer.stem(word).strip("'") for word in "dish washer's fairly washed dishes".split()])

'dish washer fair wash dish'

## Lematizing

### Nltk lemmatizing

In [192]:
nltk.download("wordnet")
nltk.download("omw-1.4")
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\giloz\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\giloz\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [193]:

lemmatizer.lemmatize("better")

'better'

In [194]:
lemmatizer.lemmatize("better", pos="a")

'good'

In [195]:
lemmatizer.lemmatize("good", pos="a")

'good'

In [196]:
lemmatizer.lemmatize("goods", pos="a")

'goods'

In [197]:

lemmatizer.lemmatize("goods", pos="n")

'good'

In [198]:
lemmatizer.lemmatize("goodness", pos="n")

'goodness'

In [199]:
lemmatizer.lemmatize("best", pos="a")

'best'

## SpaCy lemmatizing

In [200]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("better good goods goodness best")
for token in doc:
    print(token.text, token.lemma_)

better well
good good
goods good
goodness goodness
best well
