In [None]:
# Mount Google Drive into the Colab runtime at /content/drive (requires Google Colab)
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# Named Entity Recognition

### Do necessary imports

In [None]:
%matplotlib inline
### INSTALL GHOSTSCRIPT (Required to display NLTK trees) ###
!apt install ghostscript python3-tk

### CREATE VIRTUAL DISPLAY ###
!apt-get install -y xvfb # Install X Virtual Frame Buffer
import os
os.system('Xvfb :1 -screen 0 1600x1200x16  &')    # create virtual display with size 1600x1200 and 16 bit color. Color can be changed to 24 or 8
os.environ['DISPLAY']=':1.0'    # tell X clients to use our virtual DISPLAY :1.0.

Reading package lists... Done
Building dependency tree       
Reading state information... Done
python3-tk is already the newest version (3.6.9-1~18.04).
The following additional packages will be installed:
  fonts-droid-fallback fonts-noto-mono gsfonts libcupsfilters1 libcupsimage2
  libgs9 libgs9-common libijs-0.35 libjbig2dec0 poppler-data
Suggested packages:
  fonts-noto ghostscript-x poppler-utils fonts-japanese-mincho
  | fonts-ipafont-mincho fonts-japanese-gothic | fonts-ipafont-gothic
  fonts-arphic-ukai fonts-arphic-uming fonts-nanum
The following NEW packages will be installed:
  fonts-droid-fallback fonts-noto-mono ghostscript gsfonts libcupsfilters1
  libcupsimage2 libgs9 libgs9-common libijs-0.35 libjbig2dec0 poppler-data
0 upgraded, 11 newly installed, 0 to remove and 31 not upgraded.
Need to get 14.1 MB of archives.
After this operation, 49.9 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 fonts-droid-fallback all 1:6.0.

In [None]:
# --- Standard library ---
import re
from collections import Counter
from pprint import pprint

# --- Data handling and notebook helpers ---
import numpy as np
import pandas as pd
from IPython.display import display
from tqdm import tqdm

# --- NLTK: tokenising, POS tagging, chunk tree <-> IOB conversion ---
import nltk
from nltk import pos_tag, word_tokenize
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.corpus import stopwords, wordnet

# --- spaCy: small English pipeline and the displaCy visualiser ---
import spacy
from spacy import displacy
import en_core_web_sm

# Download the NLTK resources used by the tokenizer, tagger and lemmatizer.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

# Shared NLP objects used by later cells.
lemmatizer = nltk.WordNetLemmatizer()
nlp = en_core_web_sm.load()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# COLAB CONFIG
# change colab flag to false if train using jupyter notebook
COLAB_FLAG = True
COLAB_FILEPATH = './drive/My Drive/4034-amazon-review-classification/' if COLAB_FLAG == True else './'
pd.options.mode.chained_assignment = None  # default='warn'
%matplotlib inline

### Loading of data

In [None]:
# Load the crawled review CSV from the project's data folder.
reviews_csv = COLAB_FILEPATH+'data/trip-advisor-comments.csv'
data_ = pd.read_csv(reviews_csv)
print(f'Shape of the dataset:{data_.shape}')
data_.head()

Shape of the dataset:(97190, 5)


Unnamed: 0,Restaurant Name,Restaurant Type,Reviewer's Name,Rating,Comment
0,Positano @ RP,"Italian, European",aisvslife98,5,I enjoyed my time here with my girlfriends! Fa...
1,Positano @ RP,"Italian, European",Odyssey44198198885,5,Wonderful and amazing service experience. Defi...
2,Positano @ RP,"Italian, European",Ninifazelin,5,Great food and wonderful service! Will definit...
3,Positano @ RP,"Italian, European",Amaliamazlan,5,Not my first time in Positano and definitely w...
4,Positano @ RP,"Italian, European",Shahzanstim,5,Excellent service from the staff. The beef was...


In [None]:
# Remove duplicate reviews: rows that share both the reviewer's name and the
# comment text are collapsed to their first occurrence.
# `subset` is passed as a list (the documented "sequence of labels" form)
# instead of a set, which has no defined order.
data = data_.drop_duplicates(subset=["Reviewer's Name", "Comment"], keep='first')
print(f'Shape of the dataset:{data.shape}')

Shape of the dataset:(88042, 5)


In [None]:
# Isolate the review text: `data_comment_` is the Series and `data_comment`
# is the same column wrapped as a one-column DataFrame.
data_comment_ = data.loc[:, 'Comment']
data_comment = data_comment_.to_frame()
print(data_comment.shape)
data_comment.head()

(88042, 1)


Unnamed: 0,Comment
0,I enjoyed my time here with my girlfriends! Fa...
1,Wonderful and amazing service experience. Defi...
2,Great food and wonderful service! Will definit...
3,Not my first time in Positano and definitely w...
4,Excellent service from the staff. The beef was...


In [None]:
# Materialise the comment Series as a plain Python list for positional indexing
data_comment_list = data_comment_.tolist()

In [None]:
# Spot-check the second entry of the converted list
data_comment_list[1]

'Wonderful and amazing service experience. Definitely will return for dining again next time in the future.'

### Text preprocessing: checking POS tags and IOB chunk tags

The chunking pattern consists of a single rule: a noun phrase (NP) is formed whenever the chunker finds an optional determiner (DT), followed by any number of adjectives (JJ), and then a noun (NN).

In [None]:
def preprocess(sent):
    """Tokenise a raw text string and tag every token with its part of speech.

    Returns whatever ``nltk.pos_tag`` produces for the tokens of ``sent``.
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

# Positions in data_comment_list of the sample reviews to inspect
selected_list = [5,7,88,777,1000]

# The grammar and parser do not depend on the review being processed, so they
# are built once here instead of on every loop iteration.
# NP rule: optional determiner, any number of adjectives, then a noun.
pattern = 'NP: {<DT>?<JJ>*<NN>}'
cp = nltk.RegexpParser(pattern)

for i in selected_list:
    # Tokenise and POS-tag the review, then group tokens into NP chunks
    sent = preprocess(data_comment_list[i])
    cs = cp.parse(sent)

    # Flatten the chunk tree into (word, POS tag, IOB tag) triples and show them
    iob_tagged = tree2conlltags(cs)
    pprint(iob_tagged)
    print()

[('Place', 'NN', 'B-NP'),
 ('has', 'VBZ', 'O'),
 ('great', 'JJ', 'B-NP'),
 ('food', 'NN', 'I-NP'),
 (',', ',', 'O'),
 ('great', 'JJ', 'B-NP'),
 ('ambience', 'NN', 'I-NP'),
 ('and', 'CC', 'O'),
 ('the', 'DT', 'B-NP'),
 ('staff', 'NN', 'I-NP'),
 ('are', 'VBP', 'O'),
 ('very', 'RB', 'O'),
 ('friendly', 'RB', 'O'),
 ('.', '.', 'O'),
 ('Shout', 'NN', 'B-NP'),
 ('out', 'IN', 'O'),
 ('to', 'TO', 'O'),
 ('Naufal', 'NNP', 'O'),
 ('and', 'CC', 'O'),
 ('Ain', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('their', 'PRP$', 'O'),
 ('pleasant', 'JJ', 'B-NP'),
 ('service', 'NN', 'I-NP'),
 ('.', '.', 'O')]

[('Love', 'VB', 'O'),
 ('their', 'PRP$', 'O'),
 ('risotto', 'NN', 'B-NP'),
 ('.', '.', 'O'),
 ('This', 'DT', 'O'),
 ('is', 'VBZ', 'O'),
 ('the', 'DT', 'B-NP'),
 ('third', 'JJ', 'I-NP'),
 ('time', 'NN', 'I-NP'),
 ('coming', 'VBG', 'O'),
 ('here', 'RB', 'O'),
 ('.', '.', 'O'),
 ('It', 'PRP', 'O'),
 ('wo', 'MD', 'O'),
 ("n't", 'RB', 'O'),
 ('be', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('last', 'JJ', 'O'),
 ('.', '.',

## Some examples of named entity recognition

In [None]:
# Run the spaCy pipeline on review 5 (also in selected_list above) and render its entities inline
doc = nlp(data_comment_list[5])
displacy.render(doc, jupyter=True, style='ent')

In [None]:
# Run the spaCy pipeline on review 7 (also in selected_list above) and render its entities inline
doc = nlp(data_comment_list[7])
displacy.render(doc, jupyter=True, style='ent')

In [None]:
# Run the spaCy pipeline on review 88 (also in selected_list above) and render its entities inline
doc = nlp(data_comment_list[88])
displacy.render(doc, jupyter=True, style='ent')

In [None]:
# Run the spaCy pipeline on review 777 (also in selected_list above) and render its entities inline
doc = nlp(data_comment_list[777])
displacy.render(doc, jupyter=True, style='ent')

In [None]:
# Run the spaCy pipeline on review 1000 (also in selected_list above) and render its entities inline
doc = nlp(data_comment_list[1000])
displacy.render(doc, jupyter=True, style='ent')