# Supplemental Data Cleaning: Using a Lemmatizer

### Test out the WordNet lemmatizer (read more about WordNet [here](https://wordnet.princeton.edu/))

In [5]:
import nltk
import time

# Porter stemmer: the rule-based baseline the lemmatizer is compared against below.
ps = nltk.stem.PorterStemmer()
# Lemmatizer backed by the WordNet lexical database (nouns, verbs, etc.).
wn = nltk.stem.WordNetLemmatizer()

In [6]:
# List every attribute of the lemmatizer object; in the output below,
# `lemmatize` is the only name that is not a dunder attribute.
dir(wn)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'lemmatize']

In [16]:
# Stem two related words and time the pair of calls.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() can be too coarse for sub-millisecond work and
# may jump if the system clock is adjusted.
start = time.perf_counter()
print(ps.stem("meanness"))
print(ps.stem("meaning"))
elapsed = time.perf_counter() - start  # includes the time spent printing
print(f"Stemmer took {round(elapsed, 10)} seconds")

mean
mean
Stemmer took 0.0003530979 secounds


In [15]:
# Lemmatize the same two words and time the pair of calls.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() can be too coarse for sub-millisecond work and
# may jump if the system clock is adjusted.
start = time.perf_counter()
print(wn.lemmatize("meanness"))
print(wn.lemmatize("meaning"))
elapsed = time.perf_counter() - start  # includes the time spent printing
print(f"Lemmatizer took {round(elapsed, 10)} seconds")

meanness
meaning
Lemitizer took 0.0004200935 secounds


In [17]:
# Stem a singular/irregular-plural pair and time the pair of calls.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() can be too coarse for sub-millisecond work and
# may jump if the system clock is adjusted.
start = time.perf_counter()
print(ps.stem("goose"))
print(ps.stem("geese"))
elapsed = time.perf_counter() - start  # includes the time spent printing
print(f"Stemmer took {round(elapsed, 10)} seconds")

goos
gees
Stemmer took 0.0004448891 secounds


In [18]:
# Lemmatize the same singular/irregular-plural pair and time the pair of calls.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() can be too coarse for sub-millisecond work and
# may jump if the system clock is adjusted.
start = time.perf_counter()
print(wn.lemmatize("goose"))
print(wn.lemmatize("geese"))
elapsed = time.perf_counter() - start  # includes the time spent printing
print(f"Lemmatizer took {round(elapsed, 10)} seconds")

goose
goose
Lemitizer took 0.0008120537 secounds


### Read in raw text

In [None]:
import pandas as pd
import re
import string
# Let pandas display up to 100 characters per cell so message text is readable.
pd.set_option('display.max_colwidth', 100)

# English stopword list from NLTK; clean_text filters tokens against it.
stopwords = nltk.corpus.stopwords.words('english')

# header=None keeps the first line of the file as data. Without it, read_csv
# treats that line as column names and the rename below silently discards it,
# dropping one message from the dataset.
# NOTE(review): assumes the TSV has no header row (the UCI SMS Spam Collection
# is distributed without one) — confirm against the actual file.
data = pd.read_csv("SMSSpamCollection.tsv", sep='\t', header=None)
data.columns = ['label', 'body_text']

### Clean up text

In [None]:
def clean_text(text):
    """Remove punctuation, tokenize, and drop stopwords and empty tokens.

    Args:
        text: The string to clean. Stopword matching is an exact, case-sensitive
            comparison against the module-level ``stopwords`` collection, so
            callers normalise case beforehand if they need to.

    Returns:
        A list of the remaining tokens, in their original order. Empty input,
        or input consisting only of punctuation/whitespace, yields ``[]``.
    """
    # Delete every ASCII punctuation character in a single pass.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Raw string: '\W' in a plain literal is an invalid escape sequence and
    # triggers a warning on modern Python versions.
    tokens = re.split(r'\W+', text)
    # re.split yields '' when the text starts or ends with a separator (or is
    # empty); those are not real tokens, so drop them along with stopwords.
    return [token for token in tokens if token and token not in stopwords]

# Lower-case each message, then run it through clean_text; the resulting
# token lists are stored in a new column alongside the raw text.
data['body_text_nostop'] = data['body_text'].map(lambda message: clean_text(message.lower()))

### Lemmatize text