In [44]:
import nltk
import pandas as pd
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\morri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\morri\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\morri\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\morri\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\morri\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [9]:
text = 'Apple acquired Zoom in China on Wednesday 6th May 2020.\
This news has made Apple and Google stock jump 5% on Dow Jones Index in the \
United States of America'

In [10]:
#Tokenize to words
words = nltk.word_tokenize(text)
words

['Apple',
 'acquired',
 'Zoom',
 'in',
 'China',
 'on',
 'Wednesday',
 '6th',
 'May',
 '2020.This',
 'news',
 'has',
 'made',
 'Apple',
 'and',
 'Google',
 'stock',
 'jump',
 '5',
 '%',
 'on',
 'Dow',
 'Jones',
 'Index',
 'in',
 'the',
 'United',
 'States',
 'of',
 'America']

In [14]:
#Part of speech tagging
pos_tags = nltk.pos_tag(words)
pos_tags

[('Apple', 'NNP'),
 ('acquired', 'VBD'),
 ('Zoom', 'NNP'),
 ('in', 'IN'),
 ('China', 'NNP'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('6th', 'CD'),
 ('May', 'NNP'),
 ('2020.This', 'CD'),
 ('news', 'NN'),
 ('has', 'VBZ'),
 ('made', 'VBN'),
 ('Apple', 'NNP'),
 ('and', 'CC'),
 ('Google', 'NNP'),
 ('stock', 'NN'),
 ('jump', 'NN'),
 ('5', 'CD'),
 ('%', 'NN'),
 ('on', 'IN'),
 ('Dow', 'NNP'),
 ('Jones', 'NNP'),
 ('Index', 'NNP'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('United', 'NNP'),
 ('States', 'NNPS'),
 ('of', 'IN'),
 ('America', 'NNP')]

In [17]:
#check nltk help for description of the tag
nltk.help.upenn_tagset('NNP')

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...


%%html
<font size=12 color='black'>ne_chunk</font>

Ne_chunk

Binary=True

In [38]:
chunks = nltk.ne_chunk(pos_tags, binary=True) #either NE or not NE #NE = entity #Entity could be considered a noun
for chunk in chunks: 
    print(chunk)

(NE Apple/NNP)
('acquired', 'VBD')
('Zoom', 'NNP')
('in', 'IN')
(NE China/NNP)
('on', 'IN')
('Wednesday', 'NNP')
('6th', 'CD')
('May', 'NNP')
('2020.This', 'CD')
('news', 'NN')
('has', 'VBZ')
('made', 'VBN')
(NE Apple/NNP)
('and', 'CC')
(NE Google/NNP)
('stock', 'NN')
('jump', 'NN')
('5', 'CD')
('%', 'NN')
('on', 'IN')
('Dow', 'NNP')
('Jones', 'NNP')
('Index', 'NNP')
('in', 'IN')
('the', 'DT')
(NE United/NNP States/NNPS)
('of', 'IN')
(NE America/NNP)


In [47]:
#this is putting "NE" into a data frome (df)
entities =[]
labels =[]
for chunk in chunks:
    if hasattr(chunk, 'label'):
        #print(chunk)
        entities.append(' '.join(c[0] for c in chunk))
        labels.append(chunk.label())
        
entities_labels = list(set(zip(entities, labels)))
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ['Entities', 'labels']
entities_df

Unnamed: 0,Entities,labels
0,China,NE
1,America,NE
2,United States,NE
3,Google,NE
4,Apple,NE


Why did it miss Zoom?

Binary = False

In [57]:
chunks = nltk.ne_chunk(pos_tags, binary=False)
for chunk in chunks:
    print(chunks)

entities =[]
labels =[]
for chunk in chunks:
    if hasattr(chunk, 'label'):
        #print(chunk)
        entities.append(' '.join(c[0] for c in chunk))
        labels.append(chunk.label())
            
entities_labels = list(set(zip(entities, labels)))
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Entities","Labels"]
entities_df

(S
  (PERSON Apple/NNP)
  acquired/VBD
  (PERSON Zoom/NNP)
  in/IN
  (GPE China/NNP)
  on/IN
  Wednesday/NNP
  6th/CD
  May/NNP
  2020.This/CD
  news/NN
  has/VBZ
  made/VBN
  (PERSON Apple/NNP)
  and/CC
  (ORGANIZATION Google/NNP)
  stock/NN
  jump/NN
  5/CD
  %/NN
  on/IN
  (PERSON Dow/NNP Jones/NNP Index/NNP)
  in/IN
  the/DT
  (GPE United/NNP States/NNPS)
  of/IN
  (GPE America/NNP))
(S
  (PERSON Apple/NNP)
  acquired/VBD
  (PERSON Zoom/NNP)
  in/IN
  (GPE China/NNP)
  on/IN
  Wednesday/NNP
  6th/CD
  May/NNP
  2020.This/CD
  news/NN
  has/VBZ
  made/VBN
  (PERSON Apple/NNP)
  and/CC
  (ORGANIZATION Google/NNP)
  stock/NN
  jump/NN
  5/CD
  %/NN
  on/IN
  (PERSON Dow/NNP Jones/NNP Index/NNP)
  in/IN
  the/DT
  (GPE United/NNP States/NNPS)
  of/IN
  (GPE America/NNP))
(S
  (PERSON Apple/NNP)
  acquired/VBD
  (PERSON Zoom/NNP)
  in/IN
  (GPE China/NNP)
  on/IN
  Wednesday/NNP
  6th/CD
  May/NNP
  2020.This/CD
  news/NN
  has/VBZ
  made/VBN
  (PERSON Apple/NNP)
  and/CC
  (ORGANIZATIO

Unnamed: 0,Entities,Labels
0,Google,ORGANIZATION
1,China,GPE
2,United States,GPE
3,Apple,PERSON
4,Zoom,PERSON
5,America,GPE
6,Dow Jones Index,PERSON


Basic Named Entity (NE) tagging using NLTK - Sentence based

In [58]:
entities = []
labels = []

sentence = nltk.sent_tokenize(text)
for sent in sentence:
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)), binary=False):
        if hasattr(chunk,'label'):
            entities.append(' '.join(c[0] for c in chunk))
            labels.append(chunk.label())

entities_labels = list(set(zip(entities,labels)))

entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Entities", "Labels"]
entities_df

Unnamed: 0,Entities,Labels
0,Google,ORGANIZATION
1,China,GPE
2,United States,GPE
3,Apple,PERSON
4,Zoom,PERSON
5,America,GPE
6,Dow Jones Index,PERSON
