# Lemmatization

In [8]:
import nltk
from nltk.stem import WordNetLemmatizer

In [11]:
# Five inflected forms of the verb "go".
tokens = ["going", "gone", "go", "goes", "went"]
lemmatizer = WordNetLemmatizer()

# `pos` is the part-of-speech tag passed to the lemmatizer; 'v' means "verb".
tokens_new = [
    lemmatizer.lemmatize(word, pos='v')
    for word in tokens
]
tokens_new

['go', 'go', 'go', 'go', 'go']

In [19]:
# Nouns. By default, lemmatizer looks for nouns. As shown above, we have explicitly called Verb
tokens = ['man','woman','mans','womans','bat','bats','leg','legs']
x = [lemmatizer.lemmatize(token) for token in tokens]
print(x)

['man', 'woman', 'man', 'woman', 'bat', 'bat', 'leg', 'leg']


# Stemming

SNOWBALL STEMMER

In [27]:
# Stemming applies rule-based suffix stripping to reduce a word to its stem; the stem need not be a real word (e.g. 'flying' -> 'fli')
from nltk.stem import SnowballStemmer

In [29]:
# English Snowball stemmer applied to a mix of singular/plural nouns and -ing verbs.
snow = SnowballStemmer('english')
tokens = [
    'man', 'woman', 'mans', 'womans',
    'bat', 'bats', 'leg', 'legs',
    'hands', 'flying', 'running', 'getting',
]
x = list(map(snow.stem, tokens))
print(x)

['man', 'woman', 'man', 'woman', 'bat', 'bat', 'leg', 'leg', 'hand', 'fli', 'run', 'get']


PORTER STEMMER

In [31]:
# Porter is the original, older stemming algorithm; Snowball's English stemmer is its successor
from nltk.stem import PorterStemmer

In [32]:
# Stem the same `tokens` list with the Porter algorithm so the result can be
# compared with the Snowball output.
porter = PorterStemmer()
y = list(map(porter.stem, tokens))
print(y)

['man', 'woman', 'man', 'woman', 'bat', 'bat', 'leg', 'leg', 'hand', 'fli', 'run', 'get']


# N GRAMS

How would you classify or group your documents or tweets based on these stems? Single words on their own rarely make sense, because they lose the context in which they were used. To capture the context between words we use n-grams, which treat two, three, or four consecutive words as a single unit.

In [34]:
# Bi Gram
count = 0
for i in nltk.ngrams(y,2):
    print(i)
    count+=1

print(count)

('man', 'woman')
('woman', 'man')
('man', 'woman')
('woman', 'bat')
('bat', 'bat')
('bat', 'leg')
('leg', 'leg')
('leg', 'hand')
('hand', 'fli')
('fli', 'run')
('run', 'get')
11


In [35]:
# Tri Gram
count = 0
for i in nltk.ngrams(y,3):
    print(i)
    count+=1

print(count)

('man', 'woman', 'man')
('woman', 'man', 'woman')
('man', 'woman', 'bat')
('woman', 'bat', 'bat')
('bat', 'bat', 'leg')
('bat', 'leg', 'leg')
('leg', 'leg', 'hand')
('leg', 'hand', 'fli')
('hand', 'fli', 'run')
('fli', 'run', 'get')
10


To find the frequency of each bigram

In [38]:
# Count how often each bigram of `y` occurs. FreqDist tallies an iterable of
# samples itself, so no manual counting loop is needed.
frequency = nltk.FreqDist(nltk.ngrams(y, 2))

# The ten most frequent bigrams as (bigram, count) pairs.
print(frequency.most_common(10))

<class 'nltk.probability.FreqDist'>
[(('man', 'woman'), 2), (('woman', 'man'), 1), (('woman', 'bat'), 1), (('bat', 'bat'), 1), (('bat', 'leg'), 1), (('leg', 'leg'), 1), (('leg', 'hand'), 1), (('hand', 'fli'), 1), (('fli', 'run'), 1), (('run', 'get'), 1)]


In [39]:
# Test

In [40]:
# Sample passage fed to process_text() further down. Its first two sentences
# are repeated near the end, and the final sentence contains clock times.
txt = '''At Waterloo we were fortunate in catching a train for Leatherhead, where we hired a trap at the station inn and drove for four or five miles through the lovely Surrey lanes. 
It was a perfect day, with a bright sun and a few fleecy clouds in the heavens. The trees and wayside hedges were just throwing out their first green shoots, and the air was full of the pleasant smell of the moist earth. To me at least there was a strange contrast between the sweet promise of the spring and this sinister quest upon which we were engaged. 
My companion Mr. Alfred sat in the front of the trap, his arms folded, his hat pulled down over his eyes, and his chin sunk upon his breast, buried in the deepest thought. 
Suddenly, however, he started, tapped me on the shoulder, and pointed over the meadows.
At Waterloo we were fortunate in catching a train for Leatherhead, where we hired a trap at the station inn and drove for four or five miles through the lovely Surrey lanes. 
It was a perfect day, with a bright sun and a few fleecy clouds in the heavens. My flight is at 09:30 AM and I have to reach airport by 08:00 AM'''

In [55]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# RegexpTokenizer lives in nltk.tokenize; import it from its home module
# rather than relying on it being reachable through nltk.corpus.
from nltk.tokenize import RegexpTokenizer as regExToken

# English stop-word list used to filter tokens in process_text().
stop = stopwords.words('english')
lmtzr = WordNetLemmatizer()
# Raw string: '\w' in a normal string literal is an invalid escape sequence and
# triggers a warning on recent Python versions. The pattern itself is unchanged.
reg = regExToken(r'\w+')
def process_text(text):
    """Split ``text`` into sentences and return cleaned, lemmatized tokens.

    Each sentence found by ``nltk.tokenize.sent_tokenize`` is tokenized with the
    module-level ``reg`` tokenizer, lower-cased, filtered against the
    module-level ``stop`` word list and finally lemmatized with ``lmtzr``.
    No ``pos`` argument is passed to the lemmatizer, so its default part of
    speech applies.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    list of list of str
        One list of lower-cased, lemmatized tokens per sentence, in the order
        the sentences appear in ``text``.
    """
    # `stop` is a list; a set gives constant-time membership tests.
    stop_words = set(stop)

    tokens = []
    for sentence in nltk.tokenize.sent_tokenize(text):
        # Lower-case each word once, before the stop-word check.
        lowered = (word.lower() for word in reg.tokenize(sentence))
        tokens.append(
            [lmtzr.lemmatize(word) for word in lowered if word not in stop_words]
        )
    return tokens

def process_ngrams(input_sentence_tokens, n=2):
    """Collect the n-grams of every sentence into one flat list.

    N-grams are built per sentence, so no n-gram spans a sentence boundary.

    Parameters
    ----------
    input_sentence_tokens : iterable of list of str
        One token list per sentence.
    n : int, optional
        Size of each n-gram; defaults to 2 (bigrams).

    Returns
    -------
    list of tuple
        All n-grams, in sentence order and then in token order.
    """
    ngram_list = []
    for sentence in input_sentence_tokens:
        # extend() appends in place instead of rebuilding the list each time.
        ngram_list.extend(nltk.ngrams(sentence, n))
    return ngram_list

In [56]:
# Run the cleaning pipeline over the sample passage and show the tokens per sentence.
sentence_tokens = process_text(text=txt)
print(sentence_tokens)

[['waterloo', 'fortunate', 'catching', 'train', 'leatherhead', 'hired', 'trap', 'station', 'inn', 'drove', 'four', 'five', 'mile', 'lovely', 'surrey', 'lane'], ['perfect', 'day', 'bright', 'sun', 'fleecy', 'cloud', 'heaven'], ['tree', 'wayside', 'hedge', 'throwing', 'first', 'green', 'shoot', 'air', 'full', 'pleasant', 'smell', 'moist', 'earth'], ['least', 'strange', 'contrast', 'sweet', 'promise', 'spring', 'sinister', 'quest', 'upon', 'engaged'], ['companion', 'mr', 'alfred', 'sat', 'front', 'trap', 'arm', 'folded', 'hat', 'pulled', 'eye', 'chin', 'sunk', 'upon', 'breast', 'buried', 'deepest', 'thought'], ['suddenly', 'however', 'started', 'tapped', 'shoulder', 'pointed', 'meadow'], ['waterloo', 'fortunate', 'catching', 'train', 'leatherhead', 'hired', 'trap', 'station', 'inn', 'drove', 'four', 'five', 'mile', 'lovely', 'surrey', 'lane'], ['perfect', 'day', 'bright', 'sun', 'fleecy', 'cloud', 'heaven'], ['flight', '09', '30', 'reach', 'airport', '08', '00']]
