In [127]:
import numpy as np
import pandas as pd

## Basic

In [128]:
# import the scraped data
data_raw_ = pd.read_csv('./data/trip-advisor-comments.csv')
data_raw_.head()

Unnamed: 0,Restaurant Name,Restaurant Type,Reviewer's Name,Rating,Comment
0,Positano @ RP,"Italian, European",aisvslife98,5,I enjoyed my time here with my girlfriends! Fa...
1,Positano @ RP,"Italian, European",Odyssey44198198885,5,Wonderful and amazing service experience. Defi...
2,Positano @ RP,"Italian, European",Ninifazelin,5,Great food and wonderful service! Will definit...
3,Positano @ RP,"Italian, European",Amaliamazlan,5,Not my first time in Positano and definitely w...
4,Positano @ RP,"Italian, European",Shahzanstim,5,Excellent service from the staff. The beef was...


In [129]:
data_raw_.shape

(97190, 5)

In [130]:
# remove duplicates in the data
data_raw = data_raw_.drop_duplicates(subset={"Reviewer\'s Name","Rating","Comment"}, keep='first', inplace=False)
data_raw.shape

(88045, 5)

In [131]:
#Checking to see how much % of data still remains after removing duplications
(data_raw['Reviewer\'s Name'].size*1.0)/(data_raw_['Reviewer\'s Name'].size*1.0)*100

90.5905957403025

In [132]:
# check distribution of the data's rating label
data_raw['Rating'].value_counts()

5    46304
4    25978
3     8837
1     3464
2     3462
Name: Rating, dtype: int64

## Choosing the data
We will want the comments to be longer as it will be more informative for the annontator

In [152]:
# a constant that removes the entries either less than this amount of words
N = 30

In [153]:
# remove those rows where the data entries are less than N words
data = data_raw[data_raw['Comment'].str.split().str.len().ge(N)]
data.head(5)

Unnamed: 0,Restaurant Name,Restaurant Type,Reviewer's Name,Rating,Comment
0,Positano @ RP,"Italian, European",aisvslife98,5,I enjoyed my time here with my girlfriends! Fa...
3,Positano @ RP,"Italian, European",Amaliamazlan,5,Not my first time in Positano and definitely w...
9,Positano @ RP,"Italian, European",faithfu7,5,"Great food . Love the pasta , milk shake . Sta..."
14,Positano @ RP,"Italian, European",Chuanhantravels,5,"Service was awesome very friendly staff, food ..."
18,Positano @ RP,"Italian, European",Matsayshi29,5,First time here and truly wowed by the quality...


In [154]:
data.shape

(65271, 5)

In [155]:
# check distribution of the data's rating label
data['Rating'].value_counts()

5    32169
4    19563
3     7327
1     3108
2     3104
Name: Rating, dtype: int64

## Preprocess the data
##### Partition the ratings to 3 classes only
-1 (negative) <- 1,2  
0 (neutral) <- 3  
1 (positive) <- 4,5

In [156]:
def partition(x):
    if x < 3:
        return -1
    elif x == 3:
        return 0
    else:   
        return 1

In [157]:
ratings = data['Rating']
rating_converted = ratings.map(partition)
data['rating_class_main'] = rating_converted
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Restaurant Name,Restaurant Type,Reviewer's Name,Rating,Comment,rating_class_main
0,Positano @ RP,"Italian, European",aisvslife98,5,I enjoyed my time here with my girlfriends! Fa...,1
3,Positano @ RP,"Italian, European",Amaliamazlan,5,Not my first time in Positano and definitely w...,1
9,Positano @ RP,"Italian, European",faithfu7,5,"Great food . Love the pasta , milk shake . Sta...",1
14,Positano @ RP,"Italian, European",Chuanhantravels,5,"Service was awesome very friendly staff, food ...",1
18,Positano @ RP,"Italian, European",Matsayshi29,5,First time here and truly wowed by the quality...,1


In [158]:
data.dtypes

Restaurant Name      object
Restaurant Type      object
Reviewer's Name      object
Rating                int64
Comment              object
rating_class_main     int64
dtype: object

In [159]:
# check distribution of the data's class label
data['rating_class_main'].value_counts()

 1    51732
 0     7327
-1     6212
Name: rating_class_main, dtype: int64

## Sample the data
Choose 250 from class 1  
Choose 250 from class -1  
Choose 500 from class 0

In [174]:
# sample 500 neutral comments
data_zero = data[data['rating_class_main'] == 0].sample(500, replace=False)
print(data_zero.shape)
data_zero.head(3)

(500, 6)


Unnamed: 0,Restaurant Name,Restaurant Type,Reviewer's Name,Rating,Comment,rating_class_main
70794,Ninethirty (by Awfully Chocolate),"Singaporean, Cafe",Esther L,3,Just go for their cakes and desserts since the...,0
18074,Wah Lok,$$ - $$$,panosh416,3,I suppose it’s the same as to expect to go to ...,0
70760,Ninethirty (by Awfully Chocolate),"Singaporean, Cafe",charmie2015,3,Chef must be trained to know the doneness of m...,0


In [175]:
# sample 250 negative comments
data_negative = data[data['rating_class_main'] == -1].sample(250, replace=False)
print(data_negative.shape)
data_negative.head(3)

(250, 6)


Unnamed: 0,Restaurant Name,Restaurant Type,Reviewer's Name,Rating,Comment,rating_class_main
25062,Kilo Kitchen,"Bar, International",Jo B,2,We headed to Kilo Kallang following a recommen...,-1
45307,Mondo Mio,"Italian, Mediterranean",lifesaholidayy,2,We ordered both the pasta and pizza and were d...,-1
32050,Brewerkz (Riverside Point),"American, Bar",Alex R,1,After a long walk through the city I went to B...,-1


In [176]:
# sample 250 positive comments
data_positive = data[data['rating_class_main'] == 1].sample(250, replace=False)
print(data_positive.shape)
data_positive.head(3)

(250, 6)


Unnamed: 0,Restaurant Name,Restaurant Type,Reviewer's Name,Rating,Comment,rating_class_main
87484,Penang Culture @ NEX,"Chinese, Asian",chloeopy,5,Went to Penang Culture for lunch with my famil...,1
70641,Knots Cafe and Living,Cafe,Rachel_PRL,4,We happened to be around the area and decided ...,1
42874,Two Chefs Bar Mexican & Italian,"Italian, Mexican",thesaint9,4,I had lunch there with my 10 year old son. We ...,1


In [177]:
# fit new dataframe to form the final sample
data_sample_ = pd.concat([data_zero, data_negative, data_positive])
print(data_sample_.shape)
data_sample_.head()

(1000, 6)


Unnamed: 0,Restaurant Name,Restaurant Type,Reviewer's Name,Rating,Comment,rating_class_main
70794,Ninethirty (by Awfully Chocolate),"Singaporean, Cafe",Esther L,3,Just go for their cakes and desserts since the...,0
18074,Wah Lok,$$ - $$$,panosh416,3,I suppose it’s the same as to expect to go to ...,0
70760,Ninethirty (by Awfully Chocolate),"Singaporean, Cafe",charmie2015,3,Chef must be trained to know the doneness of m...,0
14050,Bistecca Tuscan Steakhouse,"Italian, Steakhouse",Flashpacker_Dubai,3,Patchy service.. responsive and considerate by...,0
26274,Song Fa Bak Kut Teh The Centrepoint,"Asian, Chinese",Jahja Hendrawan S,3,Pork Pork Soup is pretty tasty and Braised Por...,0


In [178]:
# randomise the order of the sample dataframe
data_sample = data_sample_.sample(frac=1)
data_sample.head()

Unnamed: 0,Restaurant Name,Restaurant Type,Reviewer's Name,Rating,Comment,rating_class_main
45274,Salt tapas & bar,"Bar, Mediterranean",ntan002,3,Easily accessible as it is next to one of the ...,0
8210,Peach Blossoms,"Chinese, Asian",Willowsinoz,3,We went here thinking it would bea fine dining...,0
87946,Three Buns Quayside,"American, Bar",Tazziejosh,1,Had a horrible experience here. Stopped in for...,-1
68505,Ikoi Japanese Restaurant,"Japanese, Sushi",Faith L,3,Had a dinner here with my colleagues. It is al...,0
13115,Waku Ghin,"Seafood, Asian",zephyrtan,5,"If You're In A Rush For Meeting , Theatre Or S...",1


In [179]:
# final check on the sampled data
print(len(data_sample[data_sample['rating_class_main'] == 0]) == 500)
print(len(data_sample[data_sample['rating_class_main'] == -1]) == 250)
print(len(data_sample[data_sample['rating_class_main'] == 1]) == 250)

True
True
True


## Export the sampled data

In [180]:
data_sample.to_csv('./data/sampled-data-for-annontation.csv', index=False)