In [1]:
# Load the libraries
import pandas as pd
import nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('tagsets')

### Load the data

In [2]:
# Read the pre-processed Yelp reviews from disk
data = pd.read_csv('../../Datasets/yelp_labelled_processed/yelp_labelled_processed.csv')

# Cast every review to str: a quirk of this dataset is that some reviews
# are np.nan instead of text, which would break the string handling below
data['text'] = data['text'].map(str)

## With nltk

#### Convert string to array of tokens
The nltk pos tagger accepts a list of tokens

In [3]:
# Split every review on single spaces to obtain its list of tokens
data['tokens'] = data['text'].map(lambda review: review.split(' '))

In [4]:
# Have a look at the first 5 rows
data['tokens'].head(n=5)

0    [new, rule, waitingtable, almostalways, cant, ...
1    [giving, twostar, 'spretty, rating, might, nig...
2    [staying, planet, hollywood, acrossstreet, saw...
3    [foodgood, price, super, expensive, 8, buck, e...
4    [worse, company, deal, horrible, work, bring, ...
Name: tokens, dtype: object

#### Apply POS tagger

In [5]:
# Apply POS tagging to every review in one call. NLTK recommends
# pos_tag_sents() over repeated pos_tag() calls when tagging more than one
# sentence; the tags produced for each token list are the same.
data['nltk_pos_tagged'] = pd.Series(
    nltk.pos_tag_sents(data['tokens'].tolist()),
    index=data.index,   # keep the tagged rows aligned with their reviews
    dtype=object,       # each cell holds a list of (token, tag) tuples
)

In [6]:
# Have a look at the first text
print(data['nltk_pos_tagged'][0])

[('new', 'JJ'), ('rule', 'NN'), ('waitingtable', 'JJ'), ('almostalways', 'NNS'), ('cant', 'VBP'), ('wait', 'NN'), ('inside', 'RB'), ('posted', 'VBD'), ('sign', 'JJ'), ('upfront', 'JJ'), ('cause', 'NN'), ('concern', 'NN'), ('seated', 'VBN'), ('patron', 'RB'), ('awful', 'JJ'), ('like', 'IN'), ('included', 'JJ'), ('apology', 'NN'), ('along', 'IN'), ('especially', 'RB'), ('cold', 'JJ'), ('p.s', 'JJ'), ('try', 'NN'), ('calling', 'VBG'), ('ahead', 'RB'), ('reserve', 'NN'), ('table', 'JJ'), ('thats', 'NNS'), ('waiting', 'VBG'), ('list', 'NN'), ('short', 'JJ'), ('otherwise', 'RB'), ('show', 'VBP'), ('reserve', 'NN'), ('placecould', 'NN'), ('wrong', 'JJ'), ('eye', 'NN'), ('rattle', 'VB'), ('away', 'RP'), ('hot', 'JJ'), ('beverage', 'NN'), ('must', 'MD'), ('mention', 'VB'), ('obsessed', 'VBD'), ('mad', 'NN')]


#### Get information for POS tag

In [7]:
# Get information of tag
nltk.help.upenn_tagset("NN")

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...


## With spacy

In [8]:
# Import the libraries
import spacy
import en_core_web_sm
# !python -m spacy download en_core_web_sm

In [9]:
# Load spaCy's 'en_core_web_sm' model
nlp = en_core_web_sm.load()

#### No need to convert from string to array of tokens
The spacy pos tagger accepts strings

#### Apply POS tagger

In [10]:
# Run the loaded spaCy pipeline on each raw review string
data['spacy_pos_tagged'] = data['text'].apply(lambda review: nlp(review))

In [11]:
# Have a look at the first text: print one line per token with its text,
# its coarse-grained POS (token.pos_) and its fine-grained tag (token.tag_)
first_text = data['spacy_pos_tagged'][0]
for token in first_text:
    print(token.text, token.pos_, token.tag_)

new ADJ JJ
rule NOUN NN
waitingtable ADJ JJ
almostalways PROPN NNP
ca VERB MD
nt PART RB
wait VERB VB
inside ADV RB
posted VERB VBN
sign NOUN NN
upfront NOUN NN
cause NOUN NN
concern NOUN NN
seated VERB VBD
patron NOUN NN
awful ADJ JJ
like SCONJ IN
included VERB VBN
apology NOUN NN
along ADP IN
especially ADV RB
cold ADJ JJ
p.s PROPN NNP
try VERB VBP
calling VERB VBG
ahead ADV RB
reserve NOUN NN
table NOUN NN
that DET WDT
s VERB VBZ
waiting VERB VBG
list NOUN NN
short PROPN NNP
otherwise ADV RB
show VERB VBP
reserve PROPN NNP
placecould PROPN NNP
wrong PROPN NNP
eye NOUN NN
rattle VERB VB
away ADV RB
hot ADJ JJ
beverage NOUN NN
must VERB MD
mention VERB VB
obsessed VERB VBN
mad ADJ JJ


#### Get information for POS tag

In [13]:
# Get information of tag
spacy.explain("VBZ")

'verb, 3rd person singular present'

## Build your own POS tagger
The first thing to do is pick a corpus that we want to train our tagger on and import the necessary Python packages. Here, we use the NLTK treebank corpus.

In [17]:
# nltk.download('treebank')
tagged_sentences = nltk.corpus.treebank.tagged_sents()

In [18]:
print(tagged_sentences[0])

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


Tagged sentences:  3914


Tagged words: 100676


In [21]:
def features(sentence, index):
    """Build the feature dictionary describing one word of a sentence.

    Args:
        sentence: list of word strings, e.g. [w1, w2, ...].
        index: position in ``sentence`` of the word to describe.

    Returns:
        dict mapping feature names to values: the word itself, its
        neighbours, its prefixes/suffixes of up to three characters, and
        boolean flags about position, casing, hyphens and digits.

    Raises:
        IndexError: if ``index`` is out of range for ``sentence``.
    """
    word = sentence[index]
    last_index = len(sentence) - 1
    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == last_index,
        # Guarded so an empty token yields False instead of raising IndexError.
        # Note: this is also True when the first character has no uppercase
        # form (digits, punctuation), because upper() leaves it unchanged.
        'is_capitalized': bool(word) and word[0].upper() == word[0],
        'is_all_caps': word.upper() == word,
        'is_all_lower': word.lower() == word,
        # Slices instead of [0] / [-1] so an empty token gives '' rather than
        # raising IndexError; identical to indexing for non-empty words.
        'prefix-1': word[:1],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        # Sentence boundaries are encoded as an empty neighbour word
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == last_index else sentence[index + 1],
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        # True when any character after the first is uppercase
        'capitals_inside': word[1:].lower() != word[1:]
    }

In [22]:
import pprint

In [23]:
pprint.pprint(features(['This', 'is', 'a', 'sentence'], 2))

{'capitals_inside': False,
 'has_hyphen': False,
 'is_all_caps': False,
 'is_all_lower': True,
 'is_capitalized': False,
 'is_first': False,
 'is_last': False,
 'is_numeric': False,
 'next_word': 'sentence',
 'prefix-1': 'a',
 'prefix-2': 'a',
 'prefix-3': 'a',
 'prev_word': 'is',
 'suffix-1': 'a',
 'suffix-2': 'a',
 'suffix-3': 'a',
 'word': 'a'}


Create a function to strip the tagged words of their tags so that we can feed them into our tagger:

In [24]:
def untag(tagged_sentence):
    """Return the words of a tagged sentence with their tags removed.

    Args:
        tagged_sentence: iterable of (word, tag) pairs.

    Returns:
        list of the words, in their original order.
    """
    words = []
    for word, _tag in tagged_sentence:
        words.append(word)
    return words

Now we need to build our training set. Our tagger needs to take features individually for each word, but our corpus is actually in the form of sentences, so we need to do a little transforming. Split the data into training and testing sets. Apply this function on the training set.