In [1]:
# Load the libraries
import pandas as pd
import nltk
from nltk import ne_chunk
from nltk import pos_tag
# nltk.download('maxent_ne_chunker')
# nltk.download('words')

### Load in the data

In [2]:
# Load only the first 10 reviews. Using nrows avoids parsing the whole file and
# avoids assigning a column into a row slice of a larger frame, which can raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
data = pd.read_csv('../../Datasets/yelp_labelled_processed/yelp_labelled_processed.csv', nrows=10)

# Replace non-string reviews with strings (a quirk of this dataset: some reviews are np.nan)
data['text'] = data['text'].apply(str)

In [3]:
# Tokenize every review by splitting on single spaces (one list of tokens per row)
data['tokens'] = data['text'].map(lambda review: review.split(' '))

## Try nltk implementation

### Perform POS tagging

In [4]:
# Tag each token list with nltk's part-of-speech tagger -> list of (token, tag) pairs per review
data['tagged'] = data['tokens'].map(pos_tag)

### Apply NER function

In [5]:
# Chunk the tagged tokens into a named-entity tree; binary=False requests typed entity labels
data['NER'] = data['tagged'].apply(ne_chunk, binary=False)

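Note: Assigning `True` to the `binary` parameter tells the algorithm to just recognize the named entities and not classify them. Here we pass `binary=False`, so every recognized entity is also given a type (for example PERSON, ORGANIZATION or GPE).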

In [6]:
# Inspect the chunk tree built for the first review
print(repr(data['NER'].iloc[0]))

Tree('S', [('new', 'JJ'), ('rule', 'NN'), ('waitingtable', 'JJ'), ('almostalways', 'NNS'), ('cant', 'VBP'), ('wait', 'NN'), ('inside', 'RB'), ('posted', 'VBD'), ('sign', 'JJ'), ('upfront', 'JJ'), ('cause', 'NN'), ('concern', 'NN'), ('seated', 'VBN'), ('patron', 'RB'), ('awful', 'JJ'), ('like', 'IN'), ('included', 'JJ'), ('apology', 'NN'), ('along', 'IN'), ('especially', 'RB'), ('cold', 'JJ'), ('p.s', 'JJ'), ('try', 'NN'), ('calling', 'VBG'), ('ahead', 'RB'), ('reserve', 'NN'), ('table', 'JJ'), ('thats', 'NNS'), ('waiting', 'VBG'), ('list', 'NN'), ('short', 'JJ'), ('otherwise', 'RB'), ('show', 'VBP'), ('reserve', 'NN'), ('placecould', 'NN'), ('wrong', 'JJ'), ('eye', 'NN'), ('rattle', 'VB'), ('away', 'RP'), ('hot', 'JJ'), ('beverage', 'NN'), ('must', 'MD'), ('mention', 'VB'), ('obsessed', 'VBD'), ('mad', 'NN')])


In [7]:
# Inspect the chunk tree built for the second review
print(repr(data['NER'].iloc[1]))

Tree('S', [('giving', 'VBG'), ('twostar', 'NN'), ("'spretty", 'CD'), ('rating', 'NN'), ('might', 'MD'), ('night', 'NN'), ('new', 'JJ'), ('east', 'JJ'), ('side', 'NN'), ("n'tknow", 'RB'), ('many', 'JJ'), ('hiddengem', 'NNS'), ('fiance', 'VBP'), ('met', 'VBN'), ('friend', 'NN'), ('drink', 'NN'), ('endedgetting', 'VBG'), ('thing', 'NN'), ('nibble', 'JJ'), ('first', 'JJ'), ('service', 'NN'), ('pretty', 'RB'), ('slow', 'JJ'), ('unusual', 'JJ'), ('restaurant', 'NN'), ('pretty', 'RB'), ('small', 'JJ'), ('galley', 'NN'), ('style', 'NN'), ('wouldthink', 'NN'), ('would', 'MD'), ('easy', 'VB'), ('server', 'RB'), ('routinely', 'RB'), ('hit', 'VBN'), ('table', 'JJ'), ('pas', 'NN'), ('fiance', 'NN'), ('ordered', 'VBD'), ('quinoa', 'JJ'), ('salad', 'NN'), ('said', 'VBD'), ('prettygood', 'NN'), ('dry', 'NNS'), ("n'ttoo", 'RB'), ('hungry', 'JJ'), ('simply', 'RB'), ('ordered', 'VBD'), ('came', 'VBD'), ('burnt', 'RB'), ('ordered', 'JJ'), ('side', 'NN'), ('fry', 'NN'), ('either', 'RB'), ('hard', 'JJ'), ('

In [8]:
# Keep the chunk tree of the third review in `test` for visual inspection
test = data['NER'].iloc[2]
# test.draw()  # left disabled in this notebook; uncomment to render the tree

#### It seems like nltk's NER doesn't work so well.
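It's not labelling anything as a named entity. This is probably because the reviews were lowercased and heavily preprocessed (note fused tokens such as 'waitingtable' and 'almostalways'), so nothing is capitalized and the tokens no longer look like ordinary English text.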

## Try spacy implementation

In [9]:
# Import the libraries
import spacy
import en_core_web_sm
# !python -m spacy download en_core_web_sm

In [10]:
# Instantiate the small English spaCy pipeline by its package name
nlp = spacy.load('en_core_web_sm')

#### Run the spaCy pipeline (POS tagging and NER)

In [11]:
# Run each raw review text through the loaded spaCy pipeline; every row stores the
# resulting document object, whose `.ents` are read in the cells below
data['spacy_pos_tagged'] = data['text'].map(lambda review_text: nlp(review_text))

#### Apply NER tagger

In [12]:
# List the entities spaCy found in the first review, one "text label" pair per line
first_text = data['spacy_pos_tagged'].iloc[0]
for entity in first_text.ents:
    print(entity.text, entity.label_)

patron ORG
p.s try ORG


In [13]:
# List the entities spaCy found in the second review, one "text label" pair per line
first_text = data['spacy_pos_tagged'].iloc[1]
for entity in first_text.ents:
    print(entity.text, entity.label_)

night TIME
first ORDINAL
quinoa salad PERSON
n'ttoo hungry PERSON
can'tremember lasttime DATE
twostar decor good goodplace ORG


In [14]:
# List the entities spaCy found in the third review, one "text label" pair per line
first_text = data['spacy_pos_tagged'].iloc[2]
for entity in first_text.ents:
    print(entity.text, entity.label_)

hollywood GPE


In [15]:
# List the entities spaCy found in the fourth review, one "text label" pair per line
first_text = data['spacy_pos_tagged'].iloc[3]
for entity in first_text.ents:
    print(entity.text, entity.label_)

8 CARDINAL
carneasada burrito FAC
burrito n'tgetwrong ' PERSON
3 CARDINAL
n'tforget PERSON
tomato PERSON
one CARDINAL
cucumber quarter DATE
4 five CARDINAL


In [16]:
# List the entities spaCy found in the sixth review, one "text label" pair per line
first_text = data['spacy_pos_tagged'].iloc[5]
for entity in first_text.ents:
    print(entity.text, entity.label_)

hate mcdonalds ORG


#### This also doesn't seem to work so well.
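Apparently, the recognition and categorization of named entities strongly depends on the data that the recognizer has been trained on. Here the pre-trained model is fed lowercased, preprocessed reviews with fused tokens (e.g. "n'tgetwrong", "carneasada"), which presumably look very different from its training text. This is something to keep in mind when implementing named entity recognition; it is often better to train and develop your own recognizer for specific use cases.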

## Applying the nltk NER on a tagged corpus

In [20]:
# Load the POS-tagged sentences of nltk's treebank corpus and preview them
# NOTE(review): presumably needs a one-time nltk.download('treebank') — confirm
corpus = nltk.corpus.treebank.tagged_sents()
print(corpus)

[[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')], ...]


In [23]:
# Show the first sentence as a list of (word, POS tag) pairs
print(corpus[0])

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


#### Apply NER

In [24]:
# Try it out on the first sentence; binary=False asks for typed entity labels
tagged_sent_1 = ne_chunk(corpus[0], binary=False)

In [26]:
# Inspect the chunk tree of the first treebank sentence
print(repr(tagged_sent_1))

Tree('S', [Tree('PERSON', [('Pierre', 'NNP')]), Tree('ORGANIZATION', [('Vinken', 'NNP')]), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')])


In [27]:
# Draw the parse tree of the first sentence
# NOTE(review): draw() presumably opens a separate GUI window, which would stall a headless "Run All" — confirm
tagged_sent_1.draw()

In [28]:
# Try it out on the second sentence; binary=False asks for typed entity labels
tagged_sent_2 = ne_chunk(corpus[1], binary=False)

In [29]:
# Inspect the chunk tree of the second treebank sentence
print(repr(tagged_sent_2))

Tree('S', [Tree('PERSON', [('Mr.', 'NNP')]), Tree('PERSON', [('Vinken', 'NNP')]), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), Tree('ORGANIZATION', [('Elsevier', 'NNP')]), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), Tree('GPE', [('Dutch', 'NNP')]), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')])


In [30]:
# Draw the parse tree of the second sentence
# NOTE(review): draw() presumably opens a separate GUI window, which would stall a headless "Run All" — confirm
tagged_sent_2.draw()