In [1]:
# Load the libraries
import pandas as pd
import nltk

## Load in the data

In [2]:
# Load only the first 100 reviews. Passing nrows to read_csv (instead of
# slicing the loaded frame with [0:100]) stops parsing early and yields an
# independent DataFrame, so the column assignment below does not operate on
# a slice and cannot trigger SettingWithCopyWarning.
data = pd.read_csv('../../../Datasets/yelp_labelled_processed/yelp_labelled_processed.csv', nrows=100)

# Replace non-string reviews with strings (this is just a quirk of this dataset because some are np.nan)
data['text'] = data['text'].apply(str)

## Apply chunking with nltk

#### Convert string to array of tokens
The nltk pos tagger accepts a list of tokens

In [3]:
# Convert each review string into a list of tokens.
# split() with no argument splits on any run of whitespace, so repeated,
# leading or trailing spaces never produce empty-string tokens (split(' ')
# would). Empty tokens carry no information for the POS tagger and are
# reported to break some NLTK releases - TODO confirm for the installed version.
# No .copy() is needed: apply() builds a new Series and leaves 'text' untouched.
data['tokens'] = data['text'].apply(lambda review_text: review_text.split())

#### Apply POS tagger

In [4]:
# Tag every token list with part-of-speech labels; each row of the new column
# holds the (token, tag) pairs returned by nltk.pos_tag for that review.
data['nltk_pos_tagged'] = data['tokens'].map(nltk.pos_tag)

#### Apply chunking

The chunk parser in the NLTK library is rule-based, so it needs to be given a regular expression as a rule in order to output a chunk with its chunk tag, i.e. it requires a regular expression describing a phrase and the corresponding chunk tag. It then searches the corpus for this expression and assigns it the tag.

In [5]:
# Chunk grammar for a noun phrase, written as "<chunk label>: {<tag pattern>}":
#   <DT>?  an optional determiner
#   <JJ>*  any number of adjectives (including none)
#   <NN>   exactly one noun tagged NN
rule = "Noun Phrase: {<DT>?<JJ>*<NN>}"

This regular expression searches for an optional determiner, followed by zero or more adjectives and then a single noun. This will form a chunk called Noun Phrase.

In [6]:
# Build the rule-based chunker from the grammar string defined in `rule`.
chunkParser = nltk.RegexpParser(grammar=rule)

Give chunkParser the tagset containing the tokens with their respective POS tags so that it can perform chunking, and then draw the chunks:

In [7]:
# Run the chunker over every POS-tagged review; each row of the new column
# holds whatever chunkParser.parse returns for that review's (token, tag) list.
data['nltk_chunked'] = data['nltk_pos_tagged'].map(chunkParser.parse)

#### Have a look at the chunked phrases

In [8]:
# Render the chunk tree of the first review (row label 0).
# This will open a new window, so it needs a graphical display to be available.
first_tree = data['nltk_chunked'].loc[0]
first_tree.draw()

In [9]:
# Preview the chunked output for the first 5 reviews
data['nltk_chunked'].head(5)

0    [[(new, JJ), (rule, NN)], (waitingtable, JJ), ...
1    [(giving, VBG), [(twostar, NN)], ('spretty, CD...
2    [(staying, VBG), [(planet, NN)], [(hollywood, ...
3    [[(foodgood, NN)], [(price, NN)], (super, VBD)...
4    [(worse, JJR), [(company, NN)], [(deal, NN)], ...
Name: nltk_chunked, dtype: object

## Apply chunking with spacy
spaCy doesn't require us to formulate rules to recognize chunks; it identifies chunks on its own and tells us what the head word is, thus telling us what the chunk tag is.

In [10]:
# Import the libraries
import spacy
import en_core_web_sm
# !python -m spacy download en_core_web_sm

In [11]:
# Load spaCy's 'en_core_web_sm' model through the installed model package.
# `nlp` is the object the cells below call on each review string.
nlp = en_core_web_sm.load()

#### No need to convert from string to array of tokens
The spacy pos tagger accepts strings

#### Apply POS tagger

In [12]:
# Run the spaCy model on each raw review string (no manual tokenisation needed);
# each row of the new column holds the object returned by nlp(...) for that review.
data['spacy_pos_tagged'] = data['text'].map(lambda review_text: nlp(review_text))

### Apply the chunker
Apply noun_chunks on this model, and for each chunk, print the text of the chunk, the root word of the chunk, and the dependency relation that connects the root word to its head

In [13]:
# Inspect the noun chunks spaCy found in the first review (row label 0).
# For each chunk print: the chunk text, its root word, and the dependency
# relation that connects the root word to its head.
first_text = data['spacy_pos_tagged'].loc[0]
for noun_chunk in first_text.noun_chunks:
    root_token = noun_chunk.root
    print(noun_chunk.text, root_token.text, root_token.dep_)

new rule rule ROOT
waitingtable almostalways almostalways nsubj
posted sign upfront cause concern concern nsubj
patron patron dobj
apology apology dobj
reserve table table dobj
list short otherwise show reserve reserve dobj
placecould placecould ROOT
hot beverage beverage nsubj
