In [1]:
import nltk

In [2]:
# Sample text used by the rest of the notebook: it is sentence- and word-tokenized below and
# then fed to the stemming, lemmatization, stop-word removal and POS-tagging cells.
paragraph = """Thank you all so very much. Thank you to the Academy. Thank you to all of you in this room. I have to congratulate the other incredible nominees this year. The Revenant was the product of the tireless efforts of an unbelievable cast and crew. First off, to my brother in this endeavor, Mr. Tom Hardy. Tom, your talent on screen can only be surpassed by your friendship off screen … thank you for creating a transcendent cinematic experience. Thank you to everybody at Fox and New Regency … my entire team. I have to thank everyone from the very onset of my career … To my parents; none of this would be possible without you. And to my friends, I love you dearly; you know who you are.

And lastly, I just want to say this: Making The Revenant was about man's relationship to the natural world. A world that we collectively felt in 2015 as the hottest year in recorded history. Our production needed to move to the southern tip of this planet just to be able to find snow. Climate change is real, it is happening right now. It is the most urgent threat facing our entire species, and we need to work collectively together and stop procrastinating. We need to support leaders around the world who do not speak for the big polluters, but who speak for all of humanity, for the indigenous people of the world, for the billions and billions of underprivileged people out there who would be most affected by this. For our children’s children, and for those people out there whose voices have been drowned out by the politics of greed. I thank you all for this amazing award tonight. Let us not take this planet for granted. I do not take tonight for granted. Thank you so very much."""

In [3]:
# Sentence tokenization: split the paragraph into a list of sentence strings.
sentences = nltk.sent_tokenize(paragraph)

In [4]:
# Number of sentences found in the paragraph.
len(sentences)

21

In [5]:
# Word tokenization: split the whole paragraph into word and punctuation tokens.
words = nltk.word_tokenize(paragraph)

In [6]:
# Number of tokens in the paragraph.
len(words)

346

# Stemming 


Stemming means reducing different forms of a word to a common stem, e.g. intelligent, intelligently → intelligen


>1.Word representations may not have any meaning

>2.Takes less time

>3.Use stemming when the meaning of words is not important for analysis, e.g. for spam detection

In [7]:
from nltk.stem import PorterStemmer#for stemming whole paragraph

In [8]:
# Create one Porter stemmer instance; the stemming cell below reuses it for every token.
stemmer = PorterStemmer()

In [9]:
# Stem every token of every sentence and store the re-joined text in `sentences`.
# The list is rebuilt from `paragraph` rather than mutated in place, so re-running this
# cell never stems already-stemmed text. Using a comprehension also keeps the
# paragraph-level `words` list (from the word-tokenizer cell) from being overwritten
# with the tokens of the last sentence.
sentences = [
    ' '.join(stemmer.stem(token) for token in nltk.word_tokenize(sentence))
    for sentence in nltk.sent_tokenize(paragraph)
]

In [10]:
# Display the stemmed sentences; stems such as 'veri' and 'thi' are not dictionary words.
sentences

['thank you all so veri much .',
 'thank you to the academi .',
 'thank you to all of you in thi room .',
 'I have to congratul the other incred nomine thi year .',
 'the reven wa the product of the tireless effort of an unbeliev cast and crew .',
 'first off , to my brother in thi endeavor , mr. tom hardi .',
 'tom , your talent on screen can onli be surpass by your friendship off screen … thank you for creat a transcend cinemat experi .',
 'thank you to everybodi at fox and new regenc … my entir team .',
 'I have to thank everyon from the veri onset of my career … To my parent ; none of thi would be possibl without you .',
 'and to my friend , I love you dearli ; you know who you are .',
 "and lastli , I just want to say thi : make the reven wa about man 's relationship to the natur world .",
 'A world that we collect felt in 2015 as the hottest year in record histori .',
 'our product need to move to the southern tip of thi planet just to be abl to find snow .',
 'climat chang is re

# Lemmatization

Lemmatization means converting different forms of a word to its root word (lemma)

>1.Word representations have meaning

>2.Takes more time than stemming

>3.Use lemmatization when the meaning of words is important for analysis, e.g. a question-answering app

In [11]:
from nltk.stem import WordNetLemmatizer

In [12]:
# Re-tokenize from the raw paragraph: the stemming cell overwrote `sentences` with stemmed text.
sentences = nltk.sent_tokenize(paragraph)

In [13]:
# Create the WordNet lemmatizer used by the lemmatization loop below.
lemmataizer = WordNetLemmatizer()

In [14]:
# Lemmatize each sentence token by token and write the re-joined text back into `sentences`.
# NOTE(review): no part of speech is passed to lemmatize(); the saved output shows
# 'was' -> 'wa' and 'as' -> 'a', presumably because tokens are treated as nouns by
# default -- confirm against the NLTK documentation.
for i, sentence in enumerate(sentences):
    words = nltk.word_tokenize(sentence)
    newwords = list(map(lemmataizer.lemmatize, words))
    sentences[i] = ' '.join(newwords)

In [15]:
# Display the lemmatized sentences.
sentences

['Thank you all so very much .',
 'Thank you to the Academy .',
 'Thank you to all of you in this room .',
 'I have to congratulate the other incredible nominee this year .',
 'The Revenant wa the product of the tireless effort of an unbelievable cast and crew .',
 'First off , to my brother in this endeavor , Mr. Tom Hardy .',
 'Tom , your talent on screen can only be surpassed by your friendship off screen … thank you for creating a transcendent cinematic experience .',
 'Thank you to everybody at Fox and New Regency … my entire team .',
 'I have to thank everyone from the very onset of my career … To my parent ; none of this would be possible without you .',
 'And to my friend , I love you dearly ; you know who you are .',
 "And lastly , I just want to say this : Making The Revenant wa about man 's relationship to the natural world .",
 'A world that we collectively felt in 2015 a the hottest year in recorded history .',
 'Our production needed to move to the southern tip of this pl

# Stop words removal

>Words that don't express much meaning on their own (e.g. "you", "to", "the")

In [16]:
# Fetch the NLTK stop-word corpus (reports "already up-to-date" when it is present locally).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/naimeshpatel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
from nltk.corpus import stopwords

In [18]:
# Start again from the raw paragraph so stop words are removed from the original sentences.
sentences = nltk.sent_tokenize(paragraph)

In [19]:
# Build the stop-word set once, outside the loop, so the corpus list is not re-fetched
# for every token and each membership test is a constant-time set lookup.
english_stopwords = set(stopwords.words('english'))

# Drop stop words from each sentence and write the filtered text back into `sentences`.
# NOTE(review): capitalised tokens such as 'I' and 'The' survive in the saved output,
# presumably because the comparison is case-sensitive; compare `word.lower()` instead
# if those should be removed as well.
for i, sentence in enumerate(sentences):
    kept_words = [
        word for word in nltk.word_tokenize(sentence)
        if word not in english_stopwords
    ]
    sentences[i] = ' '.join(kept_words)

In [20]:
# Display the sentences with stop words removed.
sentences

['Thank much .',
 'Thank Academy .',
 'Thank room .',
 'I congratulate incredible nominees year .',
 'The Revenant product tireless efforts unbelievable cast crew .',
 'First , brother endeavor , Mr. Tom Hardy .',
 'Tom , talent screen surpassed friendship screen … thank creating transcendent cinematic experience .',
 'Thank everybody Fox New Regency … entire team .',
 'I thank everyone onset career … To parents ; none would possible without .',
 'And friends , I love dearly ; know .',
 "And lastly , I want say : Making The Revenant man 's relationship natural world .",
 'A world collectively felt 2015 hottest year recorded history .',
 'Our production needed move southern tip planet able find snow .',
 'Climate change real , happening right .',
 'It urgent threat facing entire species , need work collectively together stop procrastinating .',
 'We need support leaders around world speak big polluters , speak humanity , indigenous people world , billions billions underprivileged people

# Parts of speech Tagging

>Assigns a part of speech to each word (and other tokens), such as noun, verb, adjective, etc.

In [21]:
# Tokenize the whole paragraph again: `words` was reused for per-sentence tokens in earlier cells.
words = nltk.word_tokenize(paragraph)

In [22]:
# Tag each token with its part of speech; the result is a list of (token, tag) tuples.
tagged_words = nltk.pos_tag(words)

In [31]:
# Display the (token, tag) pairs.
tagged_words

[('Thank', 'NNP'),
 ('you', 'PRP'),
 ('all', 'DT'),
 ('so', 'RB'),
 ('very', 'RB'),
 ('much', 'JJ'),
 ('.', '.'),
 ('Thank', 'VB'),
 ('you', 'PRP'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('Academy', 'NNP'),
 ('.', '.'),
 ('Thank', 'NNP'),
 ('you', 'PRP'),
 ('to', 'TO'),
 ('all', 'DT'),
 ('of', 'IN'),
 ('you', 'PRP'),
 ('in', 'IN'),
 ('this', 'DT'),
 ('room', 'NN'),
 ('.', '.'),
 ('I', 'PRP'),
 ('have', 'VBP'),
 ('to', 'TO'),
 ('congratulate', 'VB'),
 ('the', 'DT'),
 ('other', 'JJ'),
 ('incredible', 'JJ'),
 ('nominees', 'NNS'),
 ('this', 'DT'),
 ('year', 'NN'),
 ('.', '.'),
 ('The', 'DT'),
 ('Revenant', 'NNP'),
 ('was', 'VBD'),
 ('the', 'DT'),
 ('product', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('tireless', 'NN'),
 ('efforts', 'NNS'),
 ('of', 'IN'),
 ('an', 'DT'),
 ('unbelievable', 'JJ'),
 ('cast', 'NN'),
 ('and', 'CC'),
 ('crew', 'NN'),
 ('.', '.'),
 ('First', 'NNP'),
 ('off', 'RB'),
 (',', ','),
 ('to', 'TO'),
 ('my', 'PRP$'),
 ('brother', 'NN'),
 ('in', 'IN'),
 ('this', 'DT'),
 ('endeavor'

In [24]:
# Build "token_TAG" strings (e.g. 'Thank_NNP') from the (token, tag) pairs.
word_tags = [token + "_" + tag for token, tag in tagged_words]
    

In [25]:
# Display the combined token_TAG strings.
word_tags

['Thank_NNP',
 'you_PRP',
 'all_DT',
 'so_RB',
 'very_RB',
 'much_JJ',
 '._.',
 'Thank_VB',
 'you_PRP',
 'to_TO',
 'the_DT',
 'Academy_NNP',
 '._.',
 'Thank_NNP',
 'you_PRP',
 'to_TO',
 'all_DT',
 'of_IN',
 'you_PRP',
 'in_IN',
 'this_DT',
 'room_NN',
 '._.',
 'I_PRP',
 'have_VBP',
 'to_TO',
 'congratulate_VB',
 'the_DT',
 'other_JJ',
 'incredible_JJ',
 'nominees_NNS',
 'this_DT',
 'year_NN',
 '._.',
 'The_DT',
 'Revenant_NNP',
 'was_VBD',
 'the_DT',
 'product_NN',
 'of_IN',
 'the_DT',
 'tireless_NN',
 'efforts_NNS',
 'of_IN',
 'an_DT',
 'unbelievable_JJ',
 'cast_NN',
 'and_CC',
 'crew_NN',
 '._.',
 'First_NNP',
 'off_RB',
 ',_,',
 'to_TO',
 'my_PRP$',
 'brother_NN',
 'in_IN',
 'this_DT',
 'endeavor_NN',
 ',_,',
 'Mr._NNP',
 'Tom_NNP',
 'Hardy_NNP',
 '._.',
 'Tom_NNP',
 ',_,',
 'your_PRP$',
 'talent_NN',
 'on_IN',
 'screen_NN',
 'can_MD',
 'only_RB',
 'be_VB',
 'surpassed_VBN',
 'by_IN',
 'your_PRP$',
 'friendship_NN',
 'off_IN',
 'screen_JJ',
 '…_NNP',
 'thank_NN',
 'you_PRP',
 'for_I

In [26]:
# Join all token_TAG strings into one string, separated by " |  " so tokens are easy to
# tell apart (a plain-space alternative would be " ".join(word_tags)).
tagged_paragraph = " |  ".join(word_tags)

In [27]:
# Display the joined, tagged paragraph.
tagged_paragraph

"Thank_NNP |  you_PRP |  all_DT |  so_RB |  very_RB |  much_JJ |  ._. |  Thank_VB |  you_PRP |  to_TO |  the_DT |  Academy_NNP |  ._. |  Thank_NNP |  you_PRP |  to_TO |  all_DT |  of_IN |  you_PRP |  in_IN |  this_DT |  room_NN |  ._. |  I_PRP |  have_VBP |  to_TO |  congratulate_VB |  the_DT |  other_JJ |  incredible_JJ |  nominees_NNS |  this_DT |  year_NN |  ._. |  The_DT |  Revenant_NNP |  was_VBD |  the_DT |  product_NN |  of_IN |  the_DT |  tireless_NN |  efforts_NNS |  of_IN |  an_DT |  unbelievable_JJ |  cast_NN |  and_CC |  crew_NN |  ._. |  First_NNP |  off_RB |  ,_, |  to_TO |  my_PRP$ |  brother_NN |  in_IN |  this_DT |  endeavor_NN |  ,_, |  Mr._NNP |  Tom_NNP |  Hardy_NNP |  ._. |  Tom_NNP |  ,_, |  your_PRP$ |  talent_NN |  on_IN |  screen_NN |  can_MD |  only_RB |  be_VB |  surpassed_VBN |  by_IN |  your_PRP$ |  friendship_NN |  off_IN |  screen_JJ |  …_NNP |  thank_NN |  you_PRP |  for_IN |  creating_VBG |  a_DT |  transcendent_JJ |  cinematic_JJ |  experience_NN |  ._

# Named Entity Recognition

> Named entity recognition is a first step towards information extraction: it seeks to locate and classify named entities in text into pre-defined categories such as the names of persons, organizations, locations, expressions of time, quantities, monetary values, percentages, etc.

In [33]:
# Short example sentence for the named-entity recognition demo.
data = "The Taj Mahal was built by Emperor Shah Jahan"

In [34]:
# Tokenize the example sentence into words.
words = nltk.word_tokenize(data)

In [35]:
# Display the tokens of the example sentence.
words

['The', 'Taj', 'Mahal', 'was', 'built', 'by', 'Emperor', 'Shah', 'Jahan']

In [36]:
# POS-tag the tokens; the tagged tokens are the input to ne_chunk below.
tagged_words = nltk.pos_tag(words)

In [37]:
# Display the (token, tag) pairs for the example sentence.
tagged_words

[('The', 'DT'),
 ('Taj', 'NNP'),
 ('Mahal', 'NNP'),
 ('was', 'VBD'),
 ('built', 'VBN'),
 ('by', 'IN'),
 ('Emperor', 'NNP'),
 ('Shah', 'NNP'),
 ('Jahan', 'NNP')]

In [38]:
# Chunk the POS-tagged tokens into a named-entity tree (drawn in the next cell).
namedEnt = nltk.ne_chunk(tagged_words)

In [None]:
# Open a GUI window showing the named-entity tree.
# NOTE(review): this cell has no execution count ("In [None]"), so it was apparently not
# run when the notebook was saved; presumably it needs a desktop display -- confirm
# before running in a headless environment.
namedEnt.draw()
