# Objective: <br>
<ol><li>Introduction to text processing and part-of-speech tagging</li>
<li>Introduction to the NLTK and TextBlob packages.</li>
<li>N-grams with TextBlob.</li></ol>

In [1]:
# NLP - Parts of speech tagging.
import nltk

In [3]:
# Sample passage (three sentences) reused by the tokenizing and tagging cells below.
# Adjacent string literals are concatenated by the parser, so no "+" is required.
text = (
    "Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29. "
    "Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group. "
    "Rudolph Agnew, 55 years old and former chairman of Consolidated Gold Fields PLC, "
    "was named a director of this British industrial conglomerate."
)

In [7]:
# word tokenizer
from nltk.tokenize import word_tokenize

# Split the sample text into word and punctuation tokens.
tokens = word_tokenize(text)
# Render each token in square brackets followed by a space, all on one line.
result = "".join("[" + tok + "] " for tok in tokens)
print(result)



[Pierre] [Vinken] [,] [61] [years] [old] [,] [will] [join] [the] [board] [as] [a] [nonexecutive] [director] [Nov.] [29] [.] [Mr.] [Vinken] [is] [chairman] [of] [Elsevier] [N.V.] [,] [the] [Dutch] [publishing] [group] [.] [Rudolph] [Agnew] [,] [55] [years] [old] [and] [former] [chairman] [of] [Consolidated] [Gold] [Fields] [PLC] [,] [was] [named] [a] [director] [of] [this] [British] [industrial] [conglomerate] [.] 


In [6]:
# sentence split
from nltk.tokenize import sent_tokenize

# Break the sample text into sentences.
sentences = sent_tokenize(text)
# Bracket every sentence and place them side by side on a single line.
result = "".join("[" + sent + "] " for sent in sentences)
print(result)


[Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.] [Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group.] [Rudolph Agnew, 55 years old and former chairman of Consolidated Gold Fields PLC, was named a director of this British industrial conglomerate.] 


**Many POS taggers are available; here we use the one that ships with the NLTK library.** <br>
**You can try a few taggers online and evaluate them yourself.** <br>
Stanford : http://nlp.stanford.edu:8080/parser/index.jsp


In [9]:
# Tag the tokens produced by the word tokenizer with their parts of speech.
from nltk import pos_tag

# pos_tag yields (word, tag) pairs; each one is shown as [word/TAG].
tagged_tokens = pos_tag(tokens)
result = "".join("[" + word + "/" + tag + "] " for word, tag in tagged_tokens)
print(result)

[Pierre/NNP] [Vinken/NNP] [,/,] [61/CD] [years/NNS] [old/JJ] [,/,] [will/MD] [join/VB] [the/DT] [board/NN] [as/IN] [a/DT] [nonexecutive/JJ] [director/NN] [Nov./NNP] [29/CD] [./.] [Mr./NNP] [Vinken/NNP] [is/VBZ] [chairman/NN] [of/IN] [Elsevier/NNP] [N.V./NNP] [,/,] [the/DT] [Dutch/NNP] [publishing/NN] [group/NN] [./.] [Rudolph/NNP] [Agnew/NNP] [,/,] [55/CD] [years/NNS] [old/JJ] [and/CC] [former/JJ] [chairman/NN] [of/IN] [Consolidated/NNP] [Gold/NNP] [Fields/NNP] [PLC/NNP] [,/,] [was/VBD] [named/VBN] [a/DT] [director/NN] [of/IN] [this/DT] [British/JJ] [industrial/JJ] [conglomerate/NN] [./.] 


In [10]:
# Lemmatizing (not stemming) with the WordNet lemmatizer
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# lemmatize() is called with the token only, so no part of speech is passed.
# NOTE(review): it presumably falls back to a default part of speech, which would
# explain lemmas such as "wa" for "was" in the recorded output -- confirm.
result = "".join("[" + lemmatizer.lemmatize(tok) + "] " for tok in tokens)
print(result)

[Pierre] [Vinken] [,] [61] [year] [old] [,] [will] [join] [the] [board] [a] [a] [nonexecutive] [director] [Nov.] [29] [.] [Mr.] [Vinken] [is] [chairman] [of] [Elsevier] [N.V.] [,] [the] [Dutch] [publishing] [group] [.] [Rudolph] [Agnew] [,] [55] [year] [old] [and] [former] [chairman] [of] [Consolidated] [Gold] [Fields] [PLC] [,] [wa] [named] [a] [director] [of] [this] [British] [industrial] [conglomerate] [.] 


In [11]:
# assembling everything together: tokenize -> POS-tag -> named-entity chunk, per sentence
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
# ne_chunk_sents hands back a lazy, one-shot iterator. Materialize it into a list so
# that cells looping over chunked_sentences can be re-run without re-running this
# cell first (an exhausted iterator would silently produce no output).
# binary=False keeps the specific entity labels (PERSON, ORGANIZATION, GPE, ...).
chunked_sentences = list(nltk.ne_chunk_sents(tagged_sentences, binary=False))

In [12]:
# Print each chunk tree on one line, swapping parentheses for square brackets.
for tree in chunked_sentences:
    # str(tree) may span several lines; split/join collapses every whitespace run.
    flat = ' '.join(str(tree).split())
    print(flat.replace('(', '[').replace(')', ']'))

[S [PERSON Pierre/NNP] [ORGANIZATION Vinken/NNP] ,/, 61/CD years/NNS old/JJ ,/, will/MD join/VB the/DT board/NN as/IN a/DT nonexecutive/JJ director/NN Nov./NNP 29/CD ./.]
[S [PERSON Mr./NNP] [PERSON Vinken/NNP] is/VBZ chairman/NN of/IN [ORGANIZATION Elsevier/NNP] N.V./NNP ,/, the/DT [GPE Dutch/NNP] publishing/NN group/NN ./.]
[S [PERSON Rudolph/NNP] [GPE Agnew/NNP] ,/, 55/CD years/NNS old/JJ and/CC former/JJ chairman/NN of/IN [ORGANIZATION Consolidated/NNP Gold/NNP Fields/NNP] PLC/NNP ,/, was/VBD named/VBN a/DT director/NN of/IN this/DT [GPE British/JJ] industrial/JJ conglomerate/NN ./.]


**We have tagged the sentences above with classes; the image below describes some of them. The list is quite old and may have changed since.** <br>
Image from Jurafsky and Martin<br>
<img src="classes.png">

In [19]:
# Check spellings using wordnet: a word counts as known only if WordNet has synsets for it.

from nltk.corpus import wordnet as wn

# Each entry is "<word1>.<word2>"; the two words are looked up independently.
some_text = ["komplex.complex","cloud.kloud","cat.cat"]
for pair in some_text:
    word1, word2 = pair.split(".")
    # A pair can only yield similarity scores when both words have at least one
    # synset, so testing that both synset lists are non-empty is sufficient; no
    # pairwise path_similarity calls are needed just to decide match / no match.
    if wn.synsets(word1) and wn.synsets(word2):
        print("match")
    else:
        # Always reports word1, even when word2 is the one without synsets.
        print(word1)
    

komplex
cloud
match


**This can sometimes be incorrect, depending on the corpus and the set of stop words being used.**
<br> **We will now use the TextBlob library, which is built on top of NLTK; it is easy and intuitive to use.**  

In [20]:
from textblob import TextBlob

In [21]:
# Wrap the sample text in a TextBlob; the following cells inspect its attributes.
TB = TextBlob(text)

In [22]:
# Tags: part-of-speech tags as (word, tag) tuples. Unlike the nltk pos_tag output
# shown earlier, the displayed list has no entries for punctuation tokens.
TB.tags

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('Mr.', 'NNP'),
 ('Vinken', 'NNP'),
 ('is', 'VBZ'),
 ('chairman', 'NN'),
 ('of', 'IN'),
 ('Elsevier', 'NNP'),
 ('N.V.', 'NNP'),
 ('the', 'DT'),
 ('Dutch', 'NNP'),
 ('publishing', 'NN'),
 ('group', 'NN'),
 ('Rudolph', 'NNP'),
 ('Agnew', 'NNP'),
 ('55', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 ('and', 'CC'),
 ('former', 'JJ'),
 ('chairman', 'NN'),
 ('of', 'IN'),
 ('Consolidated', 'NNP'),
 ('Gold', 'NNP'),
 ('Fields', 'NNP'),
 ('PLC', 'NNP'),
 ('was', 'VBD'),
 ('named', 'VBN'),
 ('a', 'DT'),
 ('director', 'NN'),
 ('of', 'IN'),
 ('this', 'DT'),
 ('British', 'JJ'),
 ('industrial', 'JJ'),
 ('conglomerate', 'NN')]

In [23]:
# Noun phrases found in the text; the displayed phrases are all lower-cased.
TB.noun_phrases

WordList(['pierre vinken', 'nonexecutive director', 'nov.', 'mr. vinken', 'elsevier n.v.', 'dutch', 'rudolph agnew', 'gold fields', 'plc', 'british industrial conglomerate'])

In [24]:
# Word tokens without punctuation; in the displayed list trailing periods are also
# dropped ('Nov', 'Mr', 'N.V'), whereas TB.tags shows 'Nov.', 'Mr.' and 'N.V.'.
TB.words

WordList(['Pierre', 'Vinken', '61', 'years', 'old', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov', '29', 'Mr', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N.V', 'the', 'Dutch', 'publishing', 'group', 'Rudolph', 'Agnew', '55', 'years', 'old', 'and', 'former', 'chairman', 'of', 'Consolidated', 'Gold', 'Fields', 'PLC', 'was', 'named', 'a', 'director', 'of', 'this', 'British', 'industrial', 'conglomerate'])

In [25]:
# The text split into Sentence objects, one per sentence.
TB.sentences

[Sentence("Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29."),
 Sentence("Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group."),
 Sentence("Rudolph Agnew, 55 years old and former chairman of Consolidated Gold Fields PLC, was named a director of this British industrial conglomerate.")]

In [26]:
# sentiment analysis on a longer product review
# The displayed result is a Sentiment tuple with `polarity` and `subjectivity` fields.
review = TextBlob("This is truly a fantastic iPad for the money. I was skeptical when comparing to the iPad Pro model. There are certain differences but if you can get passed aesthetics, this is most likely the iPad for you. Let's address what it doesn't do/have. It does not have the fully laminated display so there is a slight airgap between the glass the the display. This also means the display does not have the anti-glare coating. It has the A9 vs A9X chip. It has dual speakers on the bottom and not quad stereo speakers like on the Pro. It also does not have the smart connector for utilizing Apple's new Smart Cover Keyboard, and is not compatible with Apple Pencil. Other than that, I don't think you can beat this value.")
review.sentiment

Sentiment(polarity=0.08875541125541127, subjectivity=0.5753354978354979)

Exercise <br>
**If you read the review above, it seems like a positive one, so why is the polarity value so low?** 

In [27]:
# Now we will use a simple positive review
# (short, clearly positive wording; compare its polarity with the long review above)
testimonial = TextBlob("Textblob is amazingly simple to use. What great fun!")
testimonial.sentiment

Sentiment(polarity=0.39166666666666666, subjectivity=0.4357142857142857)

In [28]:
# Now we will use a simple negative review
# NOTE: rebinds `testimonial`, replacing the positive example from the previous cell.
testimonial = TextBlob("Textblob is the worst. What a waste!")
testimonial.sentiment

Sentiment(polarity=-0.625, subjectivity=0.5)

**From the above experiments you might have guessed that this happens because every word is given a score for being negative or positive, and what we see at the end is an overall score for the text.** <br>
**What went wrong with the earlier review is that its negative words offset most of its positive words, hence the score is positive but low.**

In [33]:
# Lemmatizing with Textblob
from textblob import Word

# Use an inflected form so the lemmatizer has something to do: the past tense "went"
# reduces to the lemma 'go' when it is treated as a verb.
w = Word("went")
w.lemmatize("v")  # "v" = WordNet part-of-speech code for verb


'go'

In [34]:
# pluralize/singularize
# pluralize() is applied to every word of the WordList; note the irregular plural
# it produces for "octopus" in the displayed output.
animals = TextBlob("cat dog octopus")
animals.words.pluralize()

WordList(['cats', 'dogs', 'octopodes'])

In [35]:
# spelling correction and spell check
# correct() produces the corrected text; print() shows it as a plain string.
b = TextBlob("I hadz a komplex situation!")
print(b.correct())

I had a complex situation!


In [36]:
# spellcheck() lists candidate corrections as (candidate, score) pairs
# (see the displayed output; the two scores shown there sum to 1).
w = Word('Kolonise')
w.spellcheck()

[('Colonies', 0.9764150943396226), ('Polonaise', 0.02358490566037736)]

In [40]:
# Translate!!
# NOTE(review): translate() presumably calls an external translation service and so
# needs network access; it is reportedly deprecated/removed in newer TextBlob
# releases -- confirm against the installed version before relying on this cell.
en_blob = TextBlob(u'Good day, to you sir!.')
en_blob.translate(to='zh-CN') # other target language codes can be used, e.g. 'es'


TextBlob("祝你好运，先生！")

**There are other useful features; see the documentation for more:** http://textblob.readthedocs.io/en/dev/index.html

In [41]:
# More on n-gram models in the class
# ngrams(n) returns every run of n consecutive words as a WordList
# (punctuation is not part of the n-grams in the displayed output).
blob = TextBlob("Now is better than never.")
blob.ngrams(n=3) # Trigram

[WordList(['Now', 'is', 'better']),
 WordList(['is', 'better', 'than']),
 WordList(['better', 'than', 'never'])]

In [42]:
# Same sentence, n=2: each pair of adjacent words.
blob = TextBlob("Now is better than never.")
blob.ngrams(n=2) # Bigram

[WordList(['Now', 'is']),
 WordList(['is', 'better']),
 WordList(['better', 'than']),
 WordList(['than', 'never'])]

In [43]:
# Same sentence, n=1: every word on its own.
blob = TextBlob("Now is better than never.")
blob.ngrams(n=1) # Unigram

[WordList(['Now']),
 WordList(['is']),
 WordList(['better']),
 WordList(['than']),
 WordList(['never'])]