## Tokenization Pipeline

In [3]:
#libraries
import numpy as np
import re
import string
import collections
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.chunk import RegexpParser
from nltk.corpus import gutenberg
import pandas as pd

In [4]:
# Sample passage used as the input for every step in this notebook.
text = '''Joe waited for the train. The train was late. Mary and Samantha took the bus. 
I looked for Mary and Samantha at the bus stations. '''

### Sentence Level

In [5]:
# Split the passage into a list of sentence strings.
tokenized_sent = sent_tokenize(text)
print(tokenized_sent)

['Joe waited for the train.', 'The train was late.', 'Mary and Samantha took the bus.', 'I looked for Mary and Samantha at the bus stations.']


### Word Level

In [6]:
# Split the passage into word and punctuation tokens.
tokenized_word = word_tokenize(text)
print(tokenized_word)

['Joe', 'waited', 'for', 'the', 'train', '.', 'The', 'train', 'was', 'late', '.', 'Mary', 'and', 'Samantha', 'took', 'the', 'bus', '.', 'I', 'looked', 'for', 'Mary', 'and', 'Samantha', 'at', 'the', 'bus', 'stations', '.']


### POS Level
Do not remove punctuation or stop words before POS tagging: the tagger uses the full sentence context.

In [7]:
# Tag every token (punctuation included) with its part of speech.
pos_tagged = pos_tag(tokenized_word)
# Show the (word, tag) pairs with one column per token.
pd.DataFrame(pos_tagged, columns=['Word', 'POS tag']).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
Word,Joe,waited,for,the,train,.,The,train,was,late,...,looked,for,Mary,and,Samantha,at,the,bus,stations,.
POS tag,NNP,VBD,IN,DT,NN,.,DT,NN,VBD,JJ,...,VBD,IN,NNP,CC,NNP,IN,DT,NN,NNS,.


### NP-Chunking

In [8]:
# Chunk grammar with a single rule: an NP is an optional determiner (<DT>),
# an optional adjective (<JJ>), and exactly one tag that starts with NN.
# NOTE(review): only one noun tag is matched per chunk, so consecutive nouns
# land in separate NP chunks -- confirm that this is intended.
grammar = """
NP: {<DT>?<JJ>?<NN.*>}
"""

In [9]:
# Build a chunk parser from the grammar and apply it to the POS-tagged tokens.
rc = RegexpParser(grammar)
c = rc.parse(pos_tagged)
# Print the parse result in its text form.
print(c)

(S
  (NP Joe/NNP)
  waited/VBD
  for/IN
  (NP the/DT train/NN)
  ./.
  (NP The/DT train/NN)
  was/VBD
  late/JJ
  ./.
  (NP Mary/NNP)
  and/CC
  (NP Samantha/NNP)
  took/VBD
  (NP the/DT bus/NN)
  ./.
  I/PRP
  looked/VBD
  for/IN
  (NP Mary/NNP)
  and/CC
  (NP Samantha/NNP)
  at/IN
  (NP the/DT bus/NN)
  (NP stations/NNS)
  ./.)


### Remove Punctuation

In [10]:
# Keep only the purely alphabetic tokens (this drops punctuation) and
# lowercase each one that is kept.
words = [token.lower() for token in tokenized_word if token.isalpha()]
print(words)

['joe', 'waited', 'for', 'the', 'train', 'the', 'train', 'was', 'late', 'mary', 'and', 'samantha', 'took', 'the', 'bus', 'i', 'looked', 'for', 'mary', 'and', 'samantha', 'at', 'the', 'bus', 'stations']


In [11]:
# Count how often each lowercased word occurs and show the ten most frequent.
word_counts = collections.Counter(words)
word_counts.most_common(10)

[('the', 4),
 ('for', 2),
 ('train', 2),
 ('mary', 2),
 ('and', 2),
 ('samantha', 2),
 ('bus', 2),
 ('joe', 1),
 ('waited', 1),
 ('was', 1)]

In [12]:
# Load NLTK's English stop-word list.
stop_words = stopwords.words('english')
# Keep only the words that are not stop words.
filtered_words = [word for word in words if word not in stop_words]

In [13]:
# Recount using only the words that survived stop-word removal
# (this rebinds word_counts).
word_counts = collections.Counter(filtered_words)
word_counts.most_common(10)

[('train', 2),
 ('mary', 2),
 ('samantha', 2),
 ('bus', 2),
 ('joe', 1),
 ('waited', 1),
 ('late', 1),
 ('took', 1),
 ('looked', 1),
 ('stations', 1)]

In [14]:
custom_list = ['looked']  # add more stopwords
# Build a new list instead of extending in place, so that re-running this
# cell does not append the custom words a second time.
stop_words = stop_words + [word for word in custom_list if word not in stop_words]

In [15]:
# Filter again with the current stop-word list, then recount.
extended_words = [word for word in words if word not in stop_words]
word_counts = collections.Counter(extended_words)
word_counts.most_common(10)

[('train', 2),
 ('mary', 2),
 ('samantha', 2),
 ('bus', 2),
 ('joe', 1),
 ('waited', 1),
 ('late', 1),
 ('took', 1),
 ('stations', 1)]

In [16]:
# Remove a word from the stop-word list: start again from NLTK's English list
# and drop 'at'. Filtering with a comprehension keeps the list's original
# order, which a round trip through set() does not guarantee.
stop_words = [word for word in stopwords.words('english') if word != 'at']

### Stemmers

In [17]:
# Reduce every word (stop words included) to its Porter stem.
ps = PorterStemmer()
stemmed = list(map(ps.stem, words))
print(stemmed)

['joe', 'wait', 'for', 'the', 'train', 'the', 'train', 'wa', 'late', 'mari', 'and', 'samantha', 'took', 'the', 'bu', 'i', 'look', 'for', 'mari', 'and', 'samantha', 'at', 'the', 'bu', 'station']


### Lemmatizer

In [18]:
# Lemmatize the stop-word-filtered words. No part of speech is passed, so the
# lemmatizer's default applies to every word.
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(word) for word in filtered_words]
print(lemmatized[:10])

['joe', 'waited', 'train', 'train', 'late', 'mary', 'samantha', 'took', 'bus', 'looked']


In [19]:
#from nltk.tokenize import sent_tokenize
#from nltk.tokenize import word_tokenize
def normalize_corpus(corpus, text_lower_case=True,
                     text_lemmatization=True, text_stemmer=False, text_punct=True,
                     stopword_removal=True):
    """Normalize every document in ``corpus`` and return the cleaned strings.

    The steps run per document in this order; the flagged ones are optional:

    1. lowercase the text (``text_lower_case``)
    2. replace runs of line breaks with a single space (always)
    3. drop every token that is not purely alphabetic (``text_punct``)
    4. lemmatize each token with WordNet (``text_lemmatization``)
    5. collapse runs of spaces (always)
    6. remove English stop words, compared case-insensitively
       (``stopword_removal``)
    7. apply the Porter stemmer to each whitespace-separated word
       (``text_stemmer``)

    Parameters
    ----------
    corpus : iterable of str
        The documents to normalize.
    text_lower_case : bool, default True
        Lowercase each document. Letter case is changed only by this step
        (and by the stemmer, if enabled).
    text_lemmatization : bool, default True
        Lemmatize tokens with ``WordNetLemmatizer``.
    text_stemmer : bool, default False
        Stem words with ``PorterStemmer``.
    text_punct : bool, default True
        Remove punctuation and any other non-alphabetic token.
    stopword_removal : bool, default True
        Remove tokens found in NLTK's English stop-word list.

    Returns
    -------
    list of str
        One normalized string per input document, in input order.
    """
    # Create each NLTK helper once, and only if its step is enabled.
    lemmatizer = WordNetLemmatizer() if text_lemmatization else None
    # A set gives constant-time membership checks in the filtering step.
    stop_words = set(stopwords.words('english')) if stopword_removal else set()
    stemmer = PorterStemmer() if text_stemmer else None

    normalized_corpus = []
    for doc in corpus:
        if text_lower_case:
            doc = doc.lower()
        # Replace line breaks with a space. The class holds only \r and \n;
        # a '|' inside [...] would be matched literally.
        doc = re.sub(r'[\r\n]+', ' ', doc)
        if text_punct:
            # Keep alphabetic tokens only; case is left to text_lower_case.
            doc = ' '.join(token for token in word_tokenize(doc) if token.isalpha())
        if text_lemmatization:
            doc = ' '.join(lemmatizer.lemmatize(token) for token in word_tokenize(doc))
        # Collapse runs of spaces.
        doc = re.sub(' +', ' ', doc)
        if stopword_removal:
            tokens = (token.strip() for token in word_tokenize(doc))
            doc = ' '.join(token for token in tokens if token.lower() not in stop_words)
        if text_stemmer:
            doc = ' '.join(stemmer.stem(word) for word in doc.split())
        normalized_corpus.append(doc)
    return normalized_corpus

In [20]:
# Treat each sentence of the passage as one document.
corpus = sent_tokenize(text=text)

In [21]:
# Normalize each sentence: lowercase and remove stop words, with lemmatization
# and stemming switched off (text_punct is left at its default).
norm_corpus = normalize_corpus(
    corpus,
    text_lower_case=True,
    text_lemmatization=False,
    text_stemmer=False,
    stopword_removal=True,
)

In [22]:
# Display the normalized documents.
norm_corpus

['joe waited train',
 'train late',
 'mary samantha took bus',
 'looked mary samantha bus stations']