## Tokenizing words and sentences

In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
example_sentences = "Natural Language Processing is the task we give computers to read and understand (process) written text (natural language)"

In [3]:
tokenized_sentences = word_tokenize(example_sentences)
print(tokenized_sentences)

['Natural', 'Language', 'Processing', 'is', 'the', 'task', 'we', 'give', 'computers', 'to', 'read', 'and', 'understand', '(', 'process', ')', 'written', 'text', '(', 'natural', 'language', ')']


## Stop words

In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [5]:
example_sentences = "This is an example showing off stop word filtration."
stop_words = set(stopwords.words("english"))

In [6]:
words = word_tokenize(example_sentences)

In [7]:
filtered_sentences = [w for w in words if not w in stop_words]
filtered_sentences

['This', 'example', 'showing', 'stop', 'word', 'filtration', '.']

## Stemming

This is a form of data pre-processing with natural languae processing, called "stemming." The idea is we process words by removing its affices from the end of words. The reason we would this is so that we do not need to store the meaning of every single tense of a word. For example:

- Reader
- Reading
- Read

Or for another advanced instance:

- I was taking a ride in the car
- I was riding in the car

Aside from tense, and even one of these is a noun, they all have the same meaning for their "root" stem (read). By this way, we store one single value for the root stem of "read." Then, when we wish to learn more, we can look into the affices that were on the end, like "ing" is an active word, or in the past, then you have reader as someone who reads. Then just plain read as either past tense or current.

In [8]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [9]:
ps = PorterStemmer()

In [10]:
example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]

In [11]:
for w in example_words:
    print(ps.stem(w))

python
python
python
python
pythonli


In [12]:
new_text = "It is important to be pythonly while you are pythoning with python."
tokenized_new_text = word_tokenize(new_text)

In [15]:
for w in tokenized_new_text:
    print(ps.stem(w))

It
is
import
to
be
pythonli
while
you
are
python
with
python
.


## Part of Speech Tagging

Labelling words as noun, adjective, verb etc. PoS tagging also convers tenses of the parts of speech.

In [28]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [29]:
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")
sample_text

'PRESIDENT GEORGE W. BUSH\'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION\n \nJanuary 31, 2006\n\nTHE PRESIDENT: Thank you all. Mr. Speaker, Vice President Cheney, members of Congress, members of the Supreme Court and diplomatic corps, distinguished guests, and fellow citizens: Today our nation lost a beloved, graceful, courageous woman who called America to its founding ideals and carried on a noble dream. Tonight we are comforted by the hope of a glad reunion with the husband who was taken so long ago, and we are grateful for the good life of Coretta Scott King. (Applause.)\n\nPresident George W. Bush reacts to applause during his State of the Union Address at the Capitol, Tuesday, Jan. 31, 2006. White House photo by Eric DraperEvery time I\'m invited to this rostrum, I\'m humbled by the privilege, and mindful of the history we\'ve seen together. We have gathered under this Capitol dome in moments of national mourning and national achievement. We have serv

In [30]:
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
custom_sent_tokenizer

<nltk.tokenize.punkt.PunktSentenceTokenizer at 0x109d75da0>

In [31]:
tokenized = custom_sent_tokenizer.tokenize(sample_text)
tokenized

["PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION\n \nJanuary 31, 2006\n\nTHE PRESIDENT: Thank you all.",
 'Mr. Speaker, Vice President Cheney, members of Congress, members of the Supreme Court and diplomatic corps, distinguished guests, and fellow citizens: Today our nation lost a beloved, graceful, courageous woman who called America to its founding ideals and carried on a noble dream.',
 'Tonight we are comforted by the hope of a glad reunion with the husband who was taken so long ago, and we are grateful for the good life of Coretta Scott King.',
 '(Applause.)',
 'President George W. Bush reacts to applause during his State of the Union Address at the Capitol, Tuesday, Jan.',
 '31, 2006.',
 "White House photo by Eric DraperEvery time I'm invited to this rostrum, I'm humbled by the privilege, and mindful of the history we've seen together.",
 'We have gathered under this Capitol dome in moments of national mourning and national ach

In [32]:
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
            
    except Exception as e:
        print(str(e))

In [33]:
process_content()

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

[('Hindsight', 'NNP'), ('alone', 'RB'), ('is', 'VBZ'), ('not', 'RB'), ('wisdom', 'JJ'), (',', ','), ('and', 'CC'), ('second-guessing', 'NN'), ('is', 'VBZ'), ('not', 'RB'), ('a', 'DT'), ('strategy', 'NN'), ('.', '.')]
[('(', '('), ('Applause', 'NNP'), ('.', '.'), (')', ')')]
[('With', 'IN'), ('so', 'RB'), ('much', 'JJ'), ('in', 'IN'), ('the', 'DT'), ('balance', 'NN'), (',', ','), ('those', 'DT'), ('of', 'IN'), ('us', 'PRP'), ('in', 'IN'), ('public', 'JJ'), ('office', 'NN'), ('have', 'VBP'), ('a', 'DT'), ('duty', 'NN'), ('to', 'TO'), ('speak', 'VB'), ('with', 'IN'), ('candor', 'NN'), ('.', '.')]
[('A', 'DT'), ('sudden', 'JJ'), ('withdrawal', 'NN'), ('of', 'IN'), ('our', 'PRP$'), ('forces', 'NNS'), ('from', 'IN'), ('Iraq', 'NNP'), ('would', 'MD'), ('abandon', 'VB'), ('our', 'PRP$'), ('Iraqi', 'NNP'), ('allies', 'NNS'), ('to', 'TO'), ('death', 'NN'), ('and', 'CC'), ('prison', 'NN'), (',', ','), ('would', 'MD'), ('put', 'VB'), ('men', 'NNS'), ('like', 'IN'), ('bin', 'NN'), ('Laden', 'NNP'),

[('Here', 'RB'), ('at', 'IN'), ('home', 'NN'), (',', ','), ('America', 'NNP'), ('also', 'RB'), ('has', 'VBZ'), ('a', 'DT'), ('great', 'JJ'), ('opportunity', 'NN'), (':', ':'), ('We', 'PRP'), ('will', 'MD'), ('build', 'VB'), ('the', 'DT'), ('prosperity', 'NN'), ('of', 'IN'), ('our', 'PRP$'), ('country', 'NN'), ('by', 'IN'), ('strengthening', 'VBG'), ('our', 'PRP$'), ('economic', 'JJ'), ('leadership', 'NN'), ('in', 'IN'), ('the', 'DT'), ('world', 'NN'), ('.', '.')]
[('Our', 'PRP$'), ('economy', 'NN'), ('is', 'VBZ'), ('healthy', 'JJ'), ('and', 'CC'), ('vigorous', 'JJ'), (',', ','), ('and', 'CC'), ('growing', 'VBG'), ('faster', 'RBR'), ('than', 'IN'), ('other', 'JJ'), ('major', 'JJ'), ('industrialized', 'VBN'), ('nations', 'NNS'), ('.', '.')]
[('In', 'IN'), ('the', 'DT'), ('last', 'JJ'), ('two-and-a-half', 'JJ'), ('years', 'NNS'), (',', ','), ('America', 'NNP'), ('has', 'VBZ'), ('created', 'VBN'), ('4.6', 'CD'), ('million', 'CD'), ('new', 'JJ'), ('jobs', 'NNS'), ('--', ':'), ('more', 'JJR'

## Chunking
https://www.youtube.com/watch?v=imPpT2Qo2sk&index=5&list=PLQVvvaa0QuDf2JswnfiGkliBInZnIC4HL