In [None]:
import nltk
import sys
import sklearn

In [None]:
# Fetch only the corpora/models this notebook uses. A bare nltk.download()
# opens the interactive downloader, which stalls "Restart Kernel -> Run All".
# NOTE(review): resource ids differ between NLTK releases (newer versions may
# additionally need e.g. 'punkt_tab' or 'averaged_perceptron_tagger_eng') -
# confirm against the installed version.
for resource_id in (
    'punkt',                       # sent_tokenize / word_tokenize
    'stopwords',                   # nltk.corpus.stopwords
    'averaged_perceptron_tagger',  # nltk.pos_tag
    'maxent_ne_chunker',           # nltk.ne_chunk
    'words',                       # nltk.ne_chunk
    'tagsets',                     # nltk.help.upenn_tagset
    'udhr',                        # nltk.corpus.udhr
    'state_union',                 # nltk.corpus.state_union
):
    nltk.download(resource_id)

# Corpus
Body of text, singular. Corpora is the plural of this. Example: a collection of medical journals.

# Lexicon
Words and their meanings. Example: an English dictionary. Consider, however, that various fields will have different lexicons.

# Token
Each "entity" that results from splitting text up according to some rules. For example, each word is a token when a sentence is "tokenized" into words. Each sentence can also be a token, if you tokenize the sentences out of a paragraph.


In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
# Demo paragraph; it is tokenized into sentences here and into words in the
# next cell.
text = 'Hello humans, I am here to rule you. Dont bother to fight. You will  loose'
# Sentence-level split: prints a list with one string per sentence.
print(sent_tokenize(text))

In [None]:
# Word-level split of the same demo paragraph.
print(word_tokenize(text))

In [None]:
# Remove stop words - very common words that carry little meaning on their own.
# First, print the English stop-word list as a set.
from nltk.corpus import stopwords
print(set(stopwords.words('english')))

In [None]:
example_stop = 'This is sample to showing off the stop words filtration'
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example_stop)
# The NLTK stop-word list is lowercase, so compare case-insensitively;
# otherwise a capitalised stop word such as "This" slips through the filter.
filtered_sentence = [w for w in word_tokens if w.lower() not in stop_words]
# Show the tokens before and after filtering.
print(word_tokens)
print(filtered_sentence)

* Filtering stop words is a very common step in NLP projects.

In [None]:
# Stemming words with NLTK
from nltk.stem import PorterStemmer

# One stemmer instance; the sentence-stemming cell reuses it.
ps = PorterStemmer()

# Four inflections of "ride": print the stem of each, one per line.
example_stem = ['ride','riding','rider','rides']
print(*(ps.stem(form) for form in example_stem), sep='\n')

In [None]:
# Stemming an entire sentence
new_text = 'When riders are riding their horses, they often think of how cowboys rode horses.'

# Tokenize first, then print the stem of every token, one per line.
words = word_tokenize(new_text)
print('\n'.join(ps.stem(token) for token in words))

In [None]:
# Part-of-speech tagging section: start by printing the complete raw text
# of the 'English-Latin1' file from the udhr corpus.
from nltk.corpus import udhr
print(udhr.raw('English-Latin1'))

In [None]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [None]:
# Two files from the state_union corpus: the 2005 address is used to train
# the sentence tokenizer, the 2006 address is the text that gets tokenized.
train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

In [None]:
# Now that we have some text, we can train the PunktSentenceTokenizer
# by passing the training text to its constructor.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

In [None]:
# Now let's tokenize the sample text into sentences with the trained tokenizer.
tokenized = custom_sent_tokenizer.tokenize(sample_text)

In [None]:
# Preview only the first five sentences; printing the full list dumps the
# entire address into the cell output.
print(tokenized[:5])

In [None]:
# Define a function that will tag each tokenized word with a part of speech
def process_content(sentences=None, limit=5):
    """Print the part-of-speech tags of the leading sentences.

    Args:
        sentences: Sentence strings to tag. When omitted, the notebook-level
            ``tokenized`` list is used, so the zero-argument call keeps working.
        limit: How many leading sentences to process (default 5).

    Each sentence is split with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; the tagged result is printed. Any exception is caught
    and its message printed instead of being raised, which ends the loop
    at the failing sentence.
    """
    try:
        # Resolve the default inside the try so a missing ``tokenized`` is
        # reported the same way as any other failure.
        if sentences is None:
            sentences = tokenized
        for sentence in sentences[:limit]:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))

In [None]:
# Tag and print the leading sentences of the tokenized sample text.
process_content()

In [None]:
# Print NLTK's built-in help for the UPenn tag set, i.e. the reference for
# tags such as NNP or VBD that appear in the tagged output.
nltk.help.upenn_tagset()

In [42]:
# Chunking with NLTK
# Same corpus files and tokenizer setup as the earlier part-of-speech cells,
# repeated here: train on the 2005 address, tokenize the 2006 address.
train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content(sentences=None, limit=10):
    """Print every ``Chunk`` subtree found in the leading sentences.

    Args:
        sentences: Sentence strings to chunk. When omitted, the notebook-level
            ``tokenized`` list is used, so the zero-argument call keeps working.
        limit: How many leading sentences to process (default 10).

    Each sentence is word-tokenized, part-of-speech tagged, and parsed with a
    regular-expression chunk grammar; only subtrees labelled ``Chunk`` are
    printed. Any exception is caught and its message printed instead of
    being raised.
    """
    try:
        if sentences is None:
            sentences = tokenized
        # Combine part-of-speech tags with a regular expression. The grammar
        # does not depend on the sentence, so the parser is built once rather
        # than once per loop iteration.
        chunkGram = r"""Chunk:{<RB.?>*<VB.?>*<NNP>+<NN>?}"""
        chunkParser = nltk.RegexpParser(chunkGram)

        for sentence in sentences[:limit]:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            chunked = chunkParser.parse(tagged)

            # Print only the chunks, not the whole sentence tree.
            for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
                print(subtree)
            # To draw the tree with nltk instead: chunked.draw()

    except Exception as e:
        print(str(e))
# <RB.?>* = "zero or more adverbs of any form (RB, RBR, RBS)," followed by:
# <VB.?>* = "zero or more verbs of any form (VB, VBD, VBG, ...)," followed by:
# <NNP>+ = "one or more proper nouns," followed by:
# <NN>? = "zero or one singular noun."

# Run the chunker and print the chunks it finds.
process_content()

(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
(Chunk ADDRESS/NNP)
(Chunk A/NNP JOINT/NNP SESSION/NNP)
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
(Chunk THE/NNP UNION/NNP January/NNP)
(Chunk THE/NNP PRESIDENT/NNP)
(Chunk Thank/NNP)
(Chunk Mr./NNP Speaker/NNP)
(Chunk Vice/NNP President/NNP Cheney/NNP)
(Chunk Congress/NNP)
(Chunk Supreme/NNP Court/NNP)
(Chunk called/VBD America/NNP)
(Chunk Coretta/NNP Scott/NNP King/NNP)
(Chunk Applause/NNP)
(Chunk President/NNP George/NNP W./NNP Bush/NNP)
(Chunk State/NNP)
(Chunk Union/NNP Address/NNP)
(Chunk Capitol/NNP)
(Chunk Tuesday/NNP)
(Chunk Jan/NNP)
(Chunk White/NNP House/NNP photo/NN)
(Chunk Eric/NNP DraperEvery/NNP time/NN)
(Chunk Capitol/NNP dome/NN)
(Chunk have/VBP served/VBN America/NNP)


In [49]:
# Chinking with NLTK
# Same corpus files and tokenizer setup as the chunking cell, repeated here:
# train on the 2005 address, tokenize the 2006 address.

train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content(sentences=None, limit=10):
    """Print the ``Chunk`` subtrees that remain after chinking.

    Args:
        sentences: Sentence strings to process. When omitted, the
            notebook-level ``tokenized`` list is used, so the zero-argument
            call keeps working.
        limit: How many leading sentences to process (default 10).

    Each sentence is word-tokenized, part-of-speech tagged, and parsed with a
    grammar that first chunks everything and then chinks (excludes) runs of
    VB*/IN/DT/TO tags; only subtrees labelled ``Chunk`` are printed. Any
    exception is caught and its message printed instead of being raised.
    """
    try:
        if sentences is None:
            sentences = tokenized
        # {...} chunks every token; }...{ then removes the listed tags from
        # the chunks. The grammar does not depend on the sentence, so the
        # parser is built once rather than once per loop iteration.
        chunkGram = r"""Chunk: {<.*>*}
                                        }<VB.?|IN|DT|TO>+{"""
        chunkParser = nltk.RegexpParser(chunkGram)

        for sentence in sentences[:limit]:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            chunked = chunkParser.parse(tagged)

            # Print only the chunks, not the whole sentence tree.
            for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
                print(subtree)
            # To draw the tree with nltk instead: chunked.draw()

    except Exception as e:
        print(str(e))
# Chunk: {<.*>*} = "first, chunk every token, whatever its tag."
# }<VB.?|IN|DT|TO>+{ = "then chink (remove from the chunks) each run of one or
#                       more verbs (VB*), prepositions (IN), determiners (DT),
#                       or 'to' (TO)."

# Run the chinking parser and print the chunks that remain.
process_content()

(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP 'S/POS ADDRESS/NNP)
(Chunk A/NNP JOINT/NNP SESSION/NNP)
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
(Chunk
  THE/NNP
  UNION/NNP
  January/NNP
  31/CD
  ,/,
  2006/CD
  THE/NNP
  PRESIDENT/NNP
  :/:
  Thank/NNP
  you/PRP)
(Chunk ./.)
(Chunk
  Mr./NNP
  Speaker/NNP
  ,/,
  Vice/NNP
  President/NNP
  Cheney/NNP
  ,/,
  members/NNS)
(Chunk Congress/NNP ,/, members/NNS)
(Chunk
  Supreme/NNP
  Court/NNP
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:)
(Chunk our/PRP$ nation/NN)
(Chunk ,/, graceful/JJ ,/, courageous/JJ woman/NN who/WP)
(Chunk America/NNP)
(Chunk its/PRP$ founding/NN ideals/NNS and/CC)
(Chunk noble/JJ dream/NN ./.)
(Chunk Tonight/NN we/PRP)
(Chunk hope/NN)
(Chunk glad/JJ reunion/NN)
(Chunk husband/NN who/WP)
(Chunk so/RB long/RB ago/RB ,/, and/CC we/PRP)
(Chunk grateful/JJ)
(Chunk good/JJ life/NN)
(Chunk Coretta/NNP Scott/NNP King/NNP ./.)
(Chunk

In [51]:
# Named entity recognition with NLTK

def process_content():
    """Draw the named-entity tree for each of the first ten sentences.

    Reads the notebook-level ``tokenized`` list. Every sentence is
    word-tokenized and part-of-speech tagged, the tagged tokens are passed to
    ``nltk.ne_chunk`` and the resulting tree's ``draw()`` is called. Any
    exception is caught and its message printed instead of being raised.
    """
    try:
        for sentence in tokenized[:10]:
            pos_pairs = nltk.pos_tag(nltk.word_tokenize(sentence))
            # NOTE(review): binary=False presumably asks for typed entity
            # labels rather than a single generic one - confirm in NLTK docs.
            entity_tree = nltk.ne_chunk(pos_pairs, binary=False)
            entity_tree.draw()

    except Exception as err:
        print(str(err))

# Run named-entity recognition; each processed sentence's tree is drawn.
process_content()