# Text Processing


## Tokenizing

In [1]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize

[nltk_data] Downloading package punkt to /home/nirmala/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# String to Tokenize
example_string = """Muad'Dib learned rapidly because his first training was in how to learn.
And the first lesson of all was the basic trust that he could learn.
It's shocking to find how many people do not believe they can learn,
and how many more believe learning to be difficult."""
# The example string has 3 sentences. Once we tokenize by sentence we should get 3 sentences as output

In [3]:
# Tokenize by sentence
sent_tokenize(example_string)

["Muad'Dib learned rapidly because his first training was in how to learn.",
 'And the first lesson of all was the basic trust that he could learn.',
 "It's shocking to find how many people do not believe they can learn,\nand how many more believe learning to be difficult."]

In [4]:
# Tokenize by word
word_tokenize(example_string)
# In the output, punctuation marks such as ',' and '.' are returned as separate tokens.
# Note:
# "It's" is split into two tokens, 'It' and "'s", because it is a contraction of "it" and "is".
# "Muad'Dib" stays a single token because it is not a recognized contraction.
# The output contains common words such as 'in', 'the', and 'of'. The next step is to remove them.

["Muad'Dib",
 'learned',
 'rapidly',
 'because',
 'his',
 'first',
 'training',
 'was',
 'in',
 'how',
 'to',
 'learn',
 '.',
 'And',
 'the',
 'first',
 'lesson',
 'of',
 'all',
 'was',
 'the',
 'basic',
 'trust',
 'that',
 'he',
 'could',
 'learn',
 '.',
 'It',
 "'s",
 'shocking',
 'to',
 'find',
 'how',
 'many',
 'people',
 'do',
 'not',
 'believe',
 'they',
 'can',
 'learn',
 ',',
 'and',
 'how',
 'many',
 'more',
 'believe',
 'learning',
 'to',
 'be',
 'difficult',
 '.']

## Filtering Stop Words



In [5]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nirmala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Text to filter
worf_quote = "Sir, I protest. I am not a merry man!"

In [7]:
# Tokenize worf_quote and store the result in words_in_quote
words_in_quote = word_tokenize(worf_quote)
words_in_quote
# The output has 12 tokens (words and punctuation marks), including 'I' and 'am'

['Sir', ',', 'I', 'protest', '.', 'I', 'am', 'not', 'a', 'merry', 'man', '!']

In [8]:
# Set stop words to filter out from the list in words_in_quote
stop_words = set(stopwords.words("english"))

In [9]:
# Create an empty list to hold the revised words from words_in_quote without the stop words
filtered_list = []

In [10]:
# Build filtered_list in a single expression, keeping only the tokens whose
# case-folded form is not in stop_words. Assigning the list here (instead of
# appending to an existing one) means re-running this cell does not duplicate entries.
filtered_list = [
    word for word in words_in_quote
    if word.casefold() not in stop_words
]
        

In [11]:
# The filtered list with the stop words removed from words_in_quote
filtered_list
# In filtered list, 'am', 'not', 'I', 'I' and 'a' -- 5 stop words removed from words_in_quote 

['Sir', ',', 'protest', '.', 'merry', 'man', '!']

## Stemming
Reducing words to their root form (the stem).

In [12]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


In [13]:
# The Porter stemming algorithm, which dates from 1979, is used here.
# The Snowball stemmer (also known as Porter2) is an improvement on the Porter stemming algorithm.
stemmer = PorterStemmer()

In [14]:
# Creating a string for stemming
# NOTE: the "..." at the start of each line is part of the string literal (it looks
# like leftover interactive-prompt text), so it shows up as '...' tokens once tokenized.
string_for_stemming = """
... The crew of the USS Discovery discovered many discoveries.
... Discovering is what explorers do."""

In [15]:
# Before stemming the string, tokenize it
words = word_tokenize(string_for_stemming)

In [16]:
words

['...',
 'The',
 'crew',
 'of',
 'the',
 'USS',
 'Discovery',
 'discovered',
 'many',
 'discoveries',
 '.',
 '...',
 'Discovering',
 'is',
 'what',
 'explorers',
 'do',
 '.']

In [17]:
# Stem every token and collect the results in a new list
stemmed_words = list(map(stemmer.stem, words))

In [18]:
stemmed_words
# In the output, 'Discovery' and 'discoveries' are both stemmed to 'discoveri',
# while 'Discovering' and 'discovered' are both stemmed to 'discov'.
# The Porter stemmer groups variant forms of a word under a common stem, but here the
# related words end up split across two different stems.

['...',
 'the',
 'crew',
 'of',
 'the',
 'uss',
 'discoveri',
 'discov',
 'mani',
 'discoveri',
 '.',
 '...',
 'discov',
 'is',
 'what',
 'explor',
 'do',
 '.']

In [19]:
stemmer.stem("scarves")

'scarv'

## Tagging Parts of Speech

In [20]:
from nltk.tokenize import word_tokenize

In [21]:
sagan_quote = """
If you wish to make an apple pie from scratch,
you must first invent the universe."""

In [22]:
words_in_sagan_quote = word_tokenize(sagan_quote)

In [23]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
nltk.pos_tag(words_in_sagan_quote)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/nirmala/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to /home/nirmala/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


[('If', 'IN'),
 ('you', 'PRP'),
 ('wish', 'VBP'),
 ('to', 'TO'),
 ('make', 'VB'),
 ('an', 'DT'),
 ('apple', 'NN'),
 ('pie', 'NN'),
 ('from', 'IN'),
 ('scratch', 'NN'),
 (',', ','),
 ('you', 'PRP'),
 ('must', 'MD'),
 ('first', 'VB'),
 ('invent', 'VB'),
 ('the', 'DT'),
 ('universe', 'NN'),
 ('.', '.')]

In [24]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [25]:
# Jabberwocky is a nonsense poem by Lewis Carroll
jabberwocky_excerpt = """
'Twas brillig, and the slithy toves did gyre and gimble in the wabe:
all mimsy were the borogoves, and the mome raths outgrabe."""

In [26]:
words_in_excerpt = word_tokenize(jabberwocky_excerpt)

In [27]:
 nltk.pos_tag(words_in_excerpt)
# The common words were tagged correctly: 'and' as a coordinating conjunction (CC) and 'the' as a determiner (DT).
# The nonsense words 'slithy' and 'gimble' were tagged as adjectives (JJ).

[("'Twas", 'CD'),
 ('brillig', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('the', 'DT'),
 ('slithy', 'JJ'),
 ('toves', 'NNS'),
 ('did', 'VBD'),
 ('gyre', 'NN'),
 ('and', 'CC'),
 ('gimble', 'JJ'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('wabe', 'NN'),
 (':', ':'),
 ('all', 'DT'),
 ('mimsy', 'NNS'),
 ('were', 'VBD'),
 ('the', 'DT'),
 ('borogoves', 'NNS'),
 (',', ','),
 ('and', 'CC'),
 ('the', 'DT'),
 ('mome', 'JJ'),
 ('raths', 'NNS'),
 ('outgrabe', 'RB'),
 ('.', '.')]

## Lemmatizing

Reducing words to their core meaning.

A lemma is a word that represents a whole group of words; that group of words is called a lexeme.

In [28]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /home/nirmala/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [29]:
lemmatizer = WordNetLemmatizer()


In [30]:
stemmer.stem("scarves")

'scarv'

In [31]:
stemmer.stem("worst")

'worst'

In [32]:
# Lemmatizing a plural noun
lemmatizer.lemmatize("scarves")

# The Porter stemmer outputs 'scarv' for "scarves", whereas the lemmatizer outputs 'scarf'.


'scarf'

In [33]:
# String for lemmatizing 
string_for_lemmatizing = "The friends of DeSoto love scarves."

In [34]:
words = word_tokenize(string_for_lemmatizing)

In [35]:
words

['The', 'friends', 'of', 'DeSoto', 'love', 'scarves', '.']

In [36]:
# Lemmatize every token (default part of speech) and collect the results in a new list
lemmatized_words = list(map(lemmatizer.lemmatize, words))

In [37]:
lemmatized_words
# The output shows that the plurals 'friends' and 'scarves' became the singulars 'friend' and 'scarf'.

['The', 'friend', 'of', 'DeSoto', 'love', 'scarf', '.']

In [38]:
lemmatizer.lemmatize("worst")
# The output is 'worst' because, with no pos given, the lemmatizer assumes "worst" is a noun.


'worst'

In [39]:
lemmatizer.lemmatize("worst", pos="a")
# The default value of pos is "n" (noun). Specifying pos="a" makes the lemmatizer treat "worst" as an adjective.
# "worst" is the superlative form of "bad"; lemmatizing reduces superlatives and comparatives to their lemmas.

'bad'