In [None]:
# connect to google colab
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
!pip install stanza

Collecting stanza
[?25l  Downloading https://files.pythonhosted.org/packages/50/ae/a70a58ce6b4e2daad538688806ee0f238dbe601954582a74ea57cde6c532/stanza-1.2-py3-none-any.whl (282kB)
[K     |█▏                              | 10kB 20.9MB/s eta 0:00:01[K     |██▎                             | 20kB 24.7MB/s eta 0:00:01[K     |███▌                            | 30kB 23.3MB/s eta 0:00:01[K     |████▋                           | 40kB 21.2MB/s eta 0:00:01[K     |█████▉                          | 51kB 21.6MB/s eta 0:00:01[K     |███████                         | 61kB 15.3MB/s eta 0:00:01[K     |████████▏                       | 71kB 16.5MB/s eta 0:00:01[K     |█████████▎                      | 81kB 16.8MB/s eta 0:00:01[K     |██████████▌                     | 92kB 15.1MB/s eta 0:00:01[K     |███████████▋                    | 102kB 16.2MB/s eta 0:00:01[K     |████████████▉                   | 112kB 16.2MB/s eta 0:00:01[K     |██████████████                  | 122kB 16.2MB/s

In [None]:
import pandas as pd
import numpy as np
import sqlite3
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer 
import stanza
from tqdm import tqdm
from bs4 import BeautifulSoup
import re

In [None]:
# download English model
stanza.download('en') 
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 26.7MB/s]                    
2021-04-10 08:04:52 INFO: Downloading default packages for language: en (English)...
Downloading http://nlp.stanford.edu/software/stanza/1.2.0/en/default.zip: 100%|██████████| 411M/411M [01:18<00:00, 5.24MB/s]
2021-04-10 08:06:17 INFO: Finished downloading models and saved to /root/stanza_resources.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# COLAB CONFIG
# change colab flag to false if train using jupyter notebook
COLAB_FLAG = True
COLAB_FILEPATH = './drive/My Drive/4034-amazon-review-classification/' if COLAB_FLAG == True else './'
pd.options.mode.chained_assignment = None  # default='warn'
%matplotlib inline

# Import crawled data and remove duplication

In [None]:
# read the test data
data_test_raw_ = pd.read_csv(COLAB_FILEPATH+'data/trip-advisor-comments.csv')
print(f'Shape of the dataset:{data_test_raw_.shape}')
data_test_raw_.head()

Shape of the dataset:(97190, 5)


Unnamed: 0,Restaurant Name,Restaurant Type,Reviewer's Name,Rating,Comment
0,Positano @ RP,"Italian, European",aisvslife98,5,I enjoyed my time here with my girlfriends! Fa...
1,Positano @ RP,"Italian, European",Odyssey44198198885,5,Wonderful and amazing service experience. Defi...
2,Positano @ RP,"Italian, European",Ninifazelin,5,Great food and wonderful service! Will definit...
3,Positano @ RP,"Italian, European",Amaliamazlan,5,Not my first time in Positano and definitely w...
4,Positano @ RP,"Italian, European",Shahzanstim,5,Excellent service from the staff. The beef was...


In [None]:
# remove duplication of entries
data_test_raw=data_test_raw_.drop_duplicates(subset={"Reviewer\'s Name","Comment"}, 
                                             keep='first', inplace=False)
print(f'Shape of the dataset:{data_test_raw.shape}')

Shape of the dataset:(88042, 5)


In [None]:
#data_test_raw = pd.read_csv(COLAB_FILEPATH + 'data/trip-advisor-comments-filtered.csv')
# create an empty dataframe
data_test = pd.DataFrame()

# to store only the required columns into the new dataframe
data_test['comments'] = data_test_raw['Comment']
data_test['ratings'] = data_test_raw['Rating']
data_test['type'] = "test"
print(data_test.shape)
print(data_test.dtypes)
data_test.head()

(88042, 3)
comments    object
ratings      int64
type        object
dtype: object


Unnamed: 0,comments,ratings,type
0,I enjoyed my time here with my girlfriends! Fa...,5,test
1,Wonderful and amazing service experience. Defi...,5,test
2,Great food and wonderful service! Will definit...,5,test
3,Not my first time in Positano and definitely w...,5,test
4,Excellent service from the staff. The beef was...,5,test


# Paritition the ratings to 3 classes only
-1 (negative) <- 1,2  
0 (neutral) <- 3  
1 (positive) <- 4,5

In [None]:
def partition(x):
    if x < 3:
        return -1
    elif x == 3:
        return 0
    else:   
        return 1

In [None]:
# append partitioned data to the test set
actualScore = data_test['ratings']
class_ = actualScore.map(partition) 
data_test['ratings_class'] = class_
print("Number of data points in test data", data_test.shape)
data_test.head(3)

Number of data points in test data (88042, 4)


Unnamed: 0,comments,ratings,type,ratings_class
0,I enjoyed my time here with my girlfriends! Fa...,5,test,1
1,Wonderful and amazing service experience. Defi...,5,test,1
2,Great food and wonderful service! Will definit...,5,test,1


### Preprocess the test dataset to separate the neutral data from the opinionated ones

In [None]:
data_opinionated_test = data_test[data_test['ratings_class'] != 0]
print(data_opinionated_test.shape)
print(data_opinionated_test['ratings_class'].value_counts())

(79205, 4)
 1    72280
-1     6925
Name: ratings_class, dtype: int64


In [None]:
data_opinionated_test.head()

Unnamed: 0,comments,ratings,type,ratings_class
0,I enjoyed my time here with my girlfriends! Fa...,5,test,1
1,Wonderful and amazing service experience. Defi...,5,test,1
2,Great food and wonderful service! Will definit...,5,test,1
3,Not my first time in Positano and definitely w...,5,test,1
4,Excellent service from the staff. The beef was...,5,test,1


In [None]:
# remove contractions
def contraction_removal(phrase):
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", "not", phrase)
    phrase = re.sub(r"\'re", "are", phrase)
    phrase = re.sub(r"\'s", "is", phrase)
    phrase = re.sub(r"\'d", "would", phrase)
    phrase = re.sub(r"\'ll", "will", phrase)
    phrase = re.sub(r"\'t", "not", phrase)
    phrase = re.sub(r"\'ve", "have", phrase)
    phrase = re.sub(r"\'m", "am", phrase)
    return phrase

In [None]:
# to do data cleaning here
preprocessed_reviews = []
# tqdm is for printing the status bar
for sentence in tqdm(data_opinionated_test['comments'].values):
    sentence = re.sub(r"http\S+", "", sentence)
    sentence = BeautifulSoup(sentence, 'lxml').get_text()
    sentence = contraction_removal(sentence)
    sentence = re.sub("\S*\d\S*", "", sentence).strip()
    sentence = re.sub('[^A-Za-z]+', ' ', sentence)
    # https://gist.github.com/sebleier/554280
    preprocessed_reviews.append(sentence.strip())

100%|██████████| 79205/79205 [00:22<00:00, 3561.20it/s]


In [None]:
type(preprocessed_reviews)

list

In [None]:
data_opinionated_test['comments_cleaned'] = preprocessed_reviews
print(data_opinionated_test.shape)
data_opinionated_test.head()

(79205, 5)


Unnamed: 0,comments,ratings,type,ratings_class,comments_cleaned
0,I enjoyed my time here with my girlfriends! Fa...,5,test,1,I enjoyed my time here with my girlfriends Faj...
1,Wonderful and amazing service experience. Defi...,5,test,1,Wonderful and amazing service experience Defin...
2,Great food and wonderful service! Will definit...,5,test,1,Great food and wonderful service Will definite...
3,Not my first time in Positano and definitely w...,5,test,1,Not my first time in Positano and definitely w...
4,Excellent service from the staff. The beef was...,5,test,1,Excellent service from the staff The beef was ...


In [None]:
data_opinionated_test.to_csv(COLAB_FILEPATH + 'data/data_test_absa.csv', 
                             index=False)

In [None]:
data_overall_ = pd.read_csv(COLAB_FILEPATH + 'data/data_test_absa.csv')
data_overall_.head()

Unnamed: 0,comments,ratings,type,ratings_class,comments_cleaned
0,I enjoyed my time here with my girlfriends! Fa...,5,test,1,I enjoyed my time here with my girlfriends Faj...
1,Wonderful and amazing service experience. Defi...,5,test,1,Wonderful and amazing service experience Defin...
2,Great food and wonderful service! Will definit...,5,test,1,Great food and wonderful service Will definite...
3,Not my first time in Positano and definitely w...,5,test,1,Not my first time in Positano and definitely w...
4,Excellent service from the staff. The beef was...,5,test,1,Excellent service from the staff The beef was ...


In [None]:
# replace na with - for the row where the comments cleaned are empty
data_overall_['comments_cleaned'] = data_overall_['comments_cleaned'].fillna('-')
data_overall_.count()

comments            79205
ratings             79205
type                79205
ratings_class       79205
comments_cleaned    79205
dtype: int64

In [None]:
comments_list = list(data_overall_['comments_cleaned'])
print(len(comments_list))
comments_list[:7]

79205


['I enjoyed my time here with my girlfriends Fajar our server gave recommendations for food and drinks which were fantastic and check up on our meals twice Ordered a dessert to celebrate two of my girlfriendis birthdays and exceeding my expectations came with another complimentary dessert Second time here and service is great as usual Thank you',
 'Wonderful and amazing service experience Definitely will return for dining again next time in the future',
 'Great food and wonderful service Will definitely return The calzone is recommended Friendly and attentive staff Good variety of food',
 'Not my first time in Positano and definitely will not be my last Such amazing service and such delicious food Despite the restaurant being full food will always be nice I will be back soon Canot wait to see the upgrade',
 'Excellent service from the staff The beef was so tender and the risotto was nice Would definitely recommend it',
 'Place has great food great ambience and the staff are very friend

In [None]:
# do a small test here
comments_list_small = comments_list[:10]
comments_list_small

['I enjoyed my time here with my girlfriends Fajar our server gave recommendations for food and drinks which were fantastic and check up on our meals twice Ordered a dessert to celebrate two of my girlfriendis birthdays and exceeding my expectations came with another complimentary dessert Second time here and service is great as usual Thank you',
 'Wonderful and amazing service experience Definitely will return for dining again next time in the future',
 'Great food and wonderful service Will definitely return The calzone is recommended Friendly and attentive staff Good variety of food',
 'Not my first time in Positano and definitely will not be my last Such amazing service and such delicious food Despite the restaurant being full food will always be nice I will be back soon Canot wait to see the upgrade',
 'Excellent service from the staff The beef was so tender and the risotto was nice Would definitely recommend it',
 'Place has great food great ambience and the staff are very friend

## Aspect based sentiment portion

In [None]:
stop_words = set(stopwords.words('english'))
nlp = stanza.Pipeline('en')

2021-04-10 08:08:19 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-04-10 08:08:19 INFO: Use device: cpu
2021-04-10 08:08:19 INFO: Loading: tokenize
2021-04-10 08:08:19 INFO: Loading: pos
2021-04-10 08:08:20 INFO: Loading: lemma
2021-04-10 08:08:20 INFO: Loading: depparse
2021-04-10 08:08:20 INFO: Loading: sentiment
2021-04-10 08:08:21 INFO: Loading: ner
2021-04-10 08:08:21 INFO: Done loading processors!


In [None]:
# a function to get the aspect based sentiments
def aspect_conversion(text):
    # lower case the text and tokenize the Sentence
    text = text.lower()
    sentList = nltk.sent_tokenize(text)

    # perform pos tagging
    for line in sentList:
        txt_list = nltk.word_tokenize(line)
        taggedList = nltk.pos_tag(txt_list)

    newwordList = []
    flag = 0
    for i in range(0,len(taggedList)-1):
        if (taggedList[i][1]=="NN" and taggedList[i+1][1]=="NN"):
            newwordList.append(taggedList[i][0]+taggedList[i+1][0])
            flag=1
        else:
            if(flag==1):
                flag=0
                continue
            newwordList.append(taggedList[i][0])
            if(i==len(taggedList)-2):
                newwordList.append(taggedList[i+1][0])
    finaltxt = ' '.join(word for word in newwordList)
    
    new_txt_list = nltk.word_tokenize(finaltxt)
    wordsList = [w for w in new_txt_list if not w in stop_words]
    taggedList = nltk.pos_tag(wordsList)
    
    doc = nlp(finaltxt)
    dep_node = []
    for dep_edge in doc.sentences[0].dependencies:
        dep_node.append([dep_edge[2].text, dep_edge[0].id, dep_edge[1]])
        #print(len(dep_node))
    for j in range(0, len(dep_node)):
        #print(j)
        try:
            if (int(dep_node[j][1]) != 0):
                dep_node[j][1] = newwordList[(int(dep_node[j][1]) - 1)]
        except:
            print("Error")

    # only select those sublists from the dep_node 
    # that could probably contain the features
    featureList = []
    categoriesList = []

    for i in taggedList:
        if(i[1]=='JJ' or i[1]=='NN' or i[1]=='JJR' or i[1]=='NNS' or i[1]=='RB'):
            featureList.append(list(i))
            categoriesList.append(i[0])

    # now using dep_node list and the featureList we will determine to 
    # which of the words these features in the feature list are related to
    fcluster = []
    for i in featureList:
        filist = []
        for j in dep_node:
            if((j[0]==i[0] or j[1]==i[0]) and (j[2] in ["nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod", "amod", "neg", "prep_of", "acomp", "xcomp", "compound"])):
                if(j[0]==i[0]):
                    filist.append(j[1])
                else:
                    filist.append(j[0])
        fcluster.append([i[0], filist])

    finalcluster = []
    dic = {}
    for i in featureList:
        dic[i[0]] = i[1]
    for i in fcluster:
        if(dic[i[0]]=="NN"):
            finalcluster.append(i)
    return finalcluster

In [None]:
# check on the sublist to see if code is working
for k in comments_list_small:
    x = aspect_conversion(k)
    print(x)

[['time', ['enjoyed', 'second']], ['food', ['fantastic']], ['check', []], ['dessert', ['ordered', 'complimentary']], ['celebrate', ['two']], ['girlfriendis', ['birthdays']], ['dessert', ['ordered', 'complimentary']], ['time', ['enjoyed', 'second']], ['service', ['great']], ['thank', ['you']]]
[['serviceexperience', ['wonderful', 'return']], ['time', ['next']], ['future', []]]
[['food', ['great', 'return']], ['service', ['wonderful']], ['calzone', []], ['staff', ['variety']], ['variety', ['staff', 'good']], ['food', ['great', 'return']]]
Error
[['time', ['not', 'first']], ['positano', []], ['service', ['definitely', 'not', 'last', 'such', 'amazing']], ['food', ['such', 'delicious', 'nice', 'full', 'nice']], ['restaurant', []], ['food', ['such', 'delicious', 'nice', 'full', 'nice']], ['wait', []], ['see', []]]
[['service', ['excellent']], ['staff', []], ['beef', ['tender']], ['tender', ['beef', 'so']], ['risotto', ['nice']]]
[['place', ['has']], ['food', ['great', 'has']], ['ambience', [

In [None]:
print(comments_list[200])
aspect_conversion(comments_list[200])

Went for lunch to celebrate my momis birthday Good food nice ambience excellent service by Manager Ruel and team


[['lunch', []],
 ['celebrate', ['birthday', 'ambience']],
 ['momis', ['birthday']],
 ['birthday', ['momis', 'celebrate']],
 ['food', ['good', 'ambience']],
 ['ambience', ['food', 'nice', 'celebrate']],
 ['service', ['excellent']],
 ['managerruel', []],
 ['team', []]]

In [None]:
print(comments_list[222])
aspect_conversion(comments_list[222])

Food quality is very good for both dishes and dim sum Staff are warm and provide excellent service


[['foodquality', ['good']],
 ['staff', ['sum', 'warm']],
 ['provide', ['service']],
 ['service', ['excellent', 'provide']]]

In [None]:
print(comments_list[777])
aspect_conversion(comments_list[777])

Food is nice and service is great My family enjoyed the dinner and the services rendered We have ordered Peking duck crispy roasted chicken prawns vegetables and other special recommendations by the staff Food served are fresh and most importantly no MSG


[['food', ['nice']],
 ['service', ['great']],
 ['family', ['enjoyed']],
 ['dinner', ['enjoyed']],
 ['peking', ['vegetables']],
 ['duckcrispy', ['vegetables']]]