In [104]:
import nltk
from nltk import pos_tag
from nltk.corpus import state_union, wordnet
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer

In [14]:
# Tokenize one example sentence and attach a part-of-speech tag to each token,
# using the names imported at the top of the notebook for both steps.
sentences = 'The quick brown fox jumps over the lazy dog.'
words = word_tokenize(sentences)
pos_tags = pos_tag(words)

In [16]:
# Report every (token, tag) pair held in pos_tags, one pair per line.
for word, pos in pos_tags:
    report_line = ' Word {word} , POS Is {pos} '.format(word=word, pos=pos)
    print(report_line)

 Word The , POS Is DT 
 Word quick , POS Is JJ 
 Word brown , POS Is NN 
 Word fox , POS Is NN 
 Word jumps , POS Is VBZ 
 Word over , POS Is IN 
 Word the , POS Is DT 
 Word lazy , POS Is JJ 
 Word dog , POS Is NN 
 Word . , POS Is . 


In [17]:
# Tokenize the sentence first, then tag it and print one line per token.
demo_tokens = nltk.word_tokenize("NLTK is a powerful library for natural language processing.")
for word, pos in pos_tag(demo_tokens):
    print(f' Word {word} , POS Is {pos} ')

 Word NLTK , POS Is NNP 
 Word is , POS Is VBZ 
 Word a , POS Is DT 
 Word powerful , POS Is JJ 
 Word library , POS Is NN 
 Word for , POS Is IN 
 Word natural , POS Is JJ 
 Word language , POS Is NN 
 Word processing , POS Is NN 
 Word . , POS Is . 


In [21]:
# Raw text of two files from NLTK's state_union corpus.
# train_text is later handed to PunktSentenceTokenizer as its training text;
# sample_text is the text that the resulting tokenizer splits into sentences.
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

In [27]:
# Build a Punkt sentence tokenizer, passing train_text as the constructor's
# training text (per the NLTK docs this trains the sentence-boundary model on it).
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

In [30]:
# Split sample_text into sentences with the tokenizer built from train_text;
# process_content() iterates over the first few entries of this sequence.
tokenized = custom_sent_tokenizer.tokenize(sample_text)

In [34]:
def process_content(sents=None, limit=5):
    """Print the part-of-speech tags of the leading sentences of a text.

    Each selected sentence is split with ``nltk.word_tokenize``, tagged with
    ``nltk.pos_tag`` and the resulting list of ``(token, tag)`` tuples is
    printed on its own line.

    Args:
        sents: Sequence of sentence strings to tag. When omitted (``None``),
            the notebook-level ``tokenized`` sequence is used, which is what
            the original zero-argument version always did.
        limit: How many sentences from the start of ``sents`` to process.
            ``None`` processes all of them. Defaults to 5.

    Returns:
        None. The tagged sentences are printed, not returned.

    Any exception raised while tokenizing or tagging is caught; only its
    message is printed and the remaining sentences are not processed.
    """
    try:
        # Resolve the default inside the ``try`` so that a missing
        # ``tokenized`` global is reported like any other failure.
        if sents is None:
            sents = tokenized
        for sentence in sents[:limit]:
            tokens = nltk.word_tokenize(sentence)
            print(nltk.pos_tag(tokens))

    except Exception as e:
        print(str(e))


process_content()

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

In [47]:
example_string = """
Muad'Dib learned rapidly because his first training was in how to learn.
And the first lesson of all was the basic trust that he could learn.
It's shocking to find how many people do not believe they can learn,
and how many more believe learning to be difficult."""

In [48]:
sent_tokenize(example_string)

["\nMuad'Dib learned rapidly because his first training was in how to learn.",
 'And the first lesson of all was the basic trust that he could learn.',
 "It's shocking to find how many people do not believe they can learn,\nand how many more believe learning to be difficult."]

In [49]:
word_tokenize(example_string)

["Muad'Dib",
 'learned',
 'rapidly',
 'because',
 'his',
 'first',
 'training',
 'was',
 'in',
 'how',
 'to',
 'learn',
 '.',
 'And',
 'the',
 'first',
 'lesson',
 'of',
 'all',
 'was',
 'the',
 'basic',
 'trust',
 'that',
 'he',
 'could',
 'learn',
 '.',
 'It',
 "'s",
 'shocking',
 'to',
 'find',
 'how',
 'many',
 'people',
 'do',
 'not',
 'believe',
 'they',
 'can',
 'learn',
 ',',
 'and',
 'how',
 'many',
 'more',
 'believe',
 'learning',
 'to',
 'be',
 'difficult',
 '.']

In [50]:
# Filtering stop words: remove very common words such as "I", "am", "not" and "a" from a token list.

In [51]:
from nltk.corpus import stopwords

In [55]:
# English stop-word entries from the NLTK stopwords corpus; the filtering cells
# test each case-folded token for membership in this collection.
# NOTE(review): stopwords.words() returns a list per the NLTK docs, so every
# `in` test scans it linearly — wrapping it in set(...) would speed lookups up
# if nothing else relies on it being a list; confirm before changing.
stop_words = stopwords.words('english')

In [57]:
worf_quote = "Sir, I protest. I am not a merry man!"
words_in_quote = word_tokenize(worf_quote)

In [58]:
words_in_quote

['Sir', ',', 'I', 'protest', '.', 'I', 'am', 'not', 'a', 'merry', 'man', '!']

In [65]:
filtered_list = []

In [66]:
# Start from an empty list inside this cell so that re-running the cell
# rebuilds the result instead of appending the same words a second time.
filtered_list = []
# casefold() is applied only for the membership test (so "I" is checked as "i");
# the token is stored with its original casing.
for word in words_in_quote:
    if word.casefold() not in stop_words:
        filtered_list.append(word)

In [67]:
filtered_list

['Sir', ',', 'protest', '.', 'merry', 'man', '!']

In [68]:
# Single-expression form of the stop-word filter: keep each token whose
# case-folded form is not in stop_words, then display the result.
filtered_list01 = [
    token
    for token in words_in_quote
    if token.casefold() not in stop_words
]
filtered_list01

['Sir', ',', 'protest', '.', 'merry', 'man', '!']

In [83]:
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer

In [85]:
stemmer = SnowballStemmer('english')

In [86]:
string_for_stemming = """
The crew of the USS Discovery discovered many discoveries.
Discovering is what explorers do."""

In [87]:
tokens_to_stem = word_tokenize(string_for_stemming)

In [88]:
# Apply the stemmer to every token and display the stemmed list.
stemmed_words = list(map(stemmer.stem, tokens_to_stem))
stemmed_words

['the',
 'crew',
 'of',
 'the',
 'uss',
 'discoveri',
 'discov',
 'mani',
 'discoveri',
 '.',
 'discov',
 'is',
 'what',
 'explor',
 'do',
 '.']

In [90]:
lemma = WordNetLemmatizer()

In [107]:
# Lemmatize every token, treating each one as an adjective (pos=wordnet.ADJ).
# NOTE(review): with a fixed adjective POS the displayed result still contains
# forms such as 'discovered' and 'discoveries' — confirm this is intended rather
# than choosing the POS per token.
lemma_words = [
    lemma.lemmatize(token, pos=wordnet.ADJ)
    for token in tokens_to_stem
]

In [108]:
lemma_words

['The',
 'crew',
 'of',
 'the',
 'USS',
 'Discovery',
 'discovered',
 'many',
 'discoveries',
 '.',
 'Discovering',
 'is',
 'what',
 'explorers',
 'do',
 '.']

In [94]:
sagan_quote = """
If you wish to make an apple pie from scratch,
you must first invent the universe."""
words_in_sagan_quote = word_tokenize(sagan_quote)

In [96]:
pos_tag(words_in_sagan_quote)

[('If', 'IN'),
 ('you', 'PRP'),
 ('wish', 'VBP'),
 ('to', 'TO'),
 ('make', 'VB'),
 ('an', 'DT'),
 ('apple', 'NN'),
 ('pie', 'NN'),
 ('from', 'IN'),
 ('scratch', 'NN'),
 (',', ','),
 ('you', 'PRP'),
 ('must', 'MD'),
 ('first', 'VB'),
 ('invent', 'VB'),
 ('the', 'DT'),
 ('universe', 'NN'),
 ('.', '.')]

In [97]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or