In [17]:
import numpy as np
import pandas as pd

## Basic

In [18]:
# import the scraped data
data_raw_ = pd.read_csv('./data/trip-advisor-comments.csv')
data_raw_.head()

Unnamed: 0,Restaurant Name,Restaurant Type,Reviewer's Name,Rating,Comment
0,Positano @ RP,"Italian, European",aisvslife98,5,I enjoyed my time here with my girlfriends! Fa...
1,Positano @ RP,"Italian, European",Odyssey44198198885,5,Wonderful and amazing service experience. Defi...
2,Positano @ RP,"Italian, European",Ninifazelin,5,Great food and wonderful service! Will definit...
3,Positano @ RP,"Italian, European",Amaliamazlan,5,Not my first time in Positano and definitely w...
4,Positano @ RP,"Italian, European",Shahzanstim,5,Excellent service from the staff. The beef was...


In [19]:
data_raw_.shape

(97190, 5)

In [20]:
# remove duplicates in the data
data_raw = data_raw_.drop_duplicates(subset={"Reviewer\'s Name","Comment"}, keep='first', inplace=False)
data_raw.shape

(88042, 5)

In [21]:
#Checking to see how much % of data still remains after removing duplications
(data_raw['Reviewer\'s Name'].size*1.0)/(data_raw_['Reviewer\'s Name'].size*1.0)*100

90.58750900298385

In [22]:
# check distribution of the data's rating label
data_raw['Rating'].value_counts()

5    46304
4    25976
3     8837
1     3464
2     3461
Name: Rating, dtype: int64

## Preprocess the data
##### Partition the ratings to 3 classes only
-1 (negative) <- 1,2  
0 (neutral) <- 3  
1 (positive) <- 4,5

In [23]:
def partition(x):
    if x < 3:
        return -1
    elif x == 3:
        return 0
    else:   
        return 1

In [24]:
ratings = data_raw['Rating']
rating_converted = ratings.map(partition)
data_raw['rating_class_main'] = rating_converted
data_raw.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Restaurant Name,Restaurant Type,Reviewer's Name,Rating,Comment,rating_class_main
0,Positano @ RP,"Italian, European",aisvslife98,5,I enjoyed my time here with my girlfriends! Fa...,1
1,Positano @ RP,"Italian, European",Odyssey44198198885,5,Wonderful and amazing service experience. Defi...,1
2,Positano @ RP,"Italian, European",Ninifazelin,5,Great food and wonderful service! Will definit...,1
3,Positano @ RP,"Italian, European",Amaliamazlan,5,Not my first time in Positano and definitely w...,1
4,Positano @ RP,"Italian, European",Shahzanstim,5,Excellent service from the staff. The beef was...,1


In [25]:
data_raw.dtypes

Restaurant Name      object
Restaurant Type      object
Reviewer's Name      object
Rating                int64
Comment              object
rating_class_main     int64
dtype: object

In [26]:
# check distribution of the data's class label
data_raw['rating_class_main'].value_counts()

 1    72280
 0     8837
-1     6925
Name: rating_class_main, dtype: int64

In [27]:
# export non-duplicated patitioned csv for SOLR
data_raw.to_csv('./data/trip-advisor-comments-no-duplicate-partitioned.csv', index=False)

## Choosing the data
We will want the comments to be longer as it will be more informative for the annontator

In [28]:
# import data
data_raw = pd.read_csv('./data/trip-advisor-comments-no-duplicate-partitioned.csv')
data_raw.head()

Unnamed: 0,Restaurant Name,Restaurant Type,Reviewer's Name,Rating,Comment,rating_class_main
0,Positano @ RP,"Italian, European",aisvslife98,5,I enjoyed my time here with my girlfriends! Fa...,1
1,Positano @ RP,"Italian, European",Odyssey44198198885,5,Wonderful and amazing service experience. Defi...,1
2,Positano @ RP,"Italian, European",Ninifazelin,5,Great food and wonderful service! Will definit...,1
3,Positano @ RP,"Italian, European",Amaliamazlan,5,Not my first time in Positano and definitely w...,1
4,Positano @ RP,"Italian, European",Shahzanstim,5,Excellent service from the staff. The beef was...,1


In [33]:
# check distribution of the data's class label
data_raw['rating_class_main'].value_counts()

 1    72280
 0     8837
-1     6925
Name: rating_class_main, dtype: int64

In [29]:
# a constant that removes the entries either less than this amount of words
N = 30

In [30]:
# remove those rows where the data entries are less than N words
data = data_raw[data_raw['Comment'].str.split().str.len().ge(N)]
data.head(5)

Unnamed: 0,Restaurant Name,Restaurant Type,Reviewer's Name,Rating,Comment,rating_class_main
0,Positano @ RP,"Italian, European",aisvslife98,5,I enjoyed my time here with my girlfriends! Fa...,1
3,Positano @ RP,"Italian, European",Amaliamazlan,5,Not my first time in Positano and definitely w...,1
9,Positano @ RP,"Italian, European",faithfu7,5,"Great food . Love the pasta , milk shake . Sta...",1
14,Positano @ RP,"Italian, European",Chuanhantravels,5,"Service was awesome very friendly staff, food ...",1
18,Positano @ RP,"Italian, European",Matsayshi29,5,First time here and truly wowed by the quality...,1


In [31]:
data.shape

(65270, 6)

In [34]:
# check distribution of the data's rating label
data['rating_class_main'].value_counts()

 1    51732
 0     7327
-1     6211
Name: rating_class_main, dtype: int64

## Sample the data
Choose 250 from class 1  
Choose 250 from class -1  
Choose 500 from class 0

In [42]:
# sample 500 neutral comments
data_zero = data[data['rating_class_main'] == 0].sample(500, replace=False)
print(data_zero.shape)
data_zero.head(3)

(500, 6)


Unnamed: 0,Restaurant Name,Restaurant Type,Reviewer's Name,Rating,Comment,rating_class_main
44414,Salt tapas & bar,"Bar, Mediterranean",MelbourneFrog69,3,We went there for a quick bite and a drink. Th...,0
80817,Gudetama Cafe,Cafe,Wiwan,3,This is our 2nd visit. We came not long after ...,0
62612,The Coastal Settlement,"Bar, Diner",AlvinSohYK,3,Nice relax ambience tucked away in a quiet par...,0


In [36]:
# sample 250 negative comments
data_negative = data[data['rating_class_main'] == -1].sample(250, replace=False)
print(data_negative.shape)
data_negative.head(3)

(250, 6)


Unnamed: 0,Restaurant Name,Restaurant Type,Reviewer's Name,Rating,Comment,rating_class_main
57955,Raffles Courtyard,"Bar, European",Mgn12,1,Major disappointment. I booked for 7 pm. The s...,-1
68342,Bee Cheng Hiang,"Asian, Singaporean",Aw1009,1,I bought 500g minced pork bakkwa from your Nor...,-1
81583,Dim Sum Haus,"Chinese, Asian",anastasiabY320YC,1,A restaurant that does not treat their custome...,-1


In [37]:
# sample 250 positive comments
data_positive = data[data['rating_class_main'] == 1].sample(250, replace=False)
print(data_positive.shape)
data_positive.head(3)

(250, 6)


Unnamed: 0,Restaurant Name,Restaurant Type,Reviewer's Name,Rating,Comment,rating_class_main
59401,Gokul Vegetarian,"Indian, Asian",Lucy J,4,"I had masala dosa here, which was so tasty.a s...",1
57603,Southbridge,"Bar, Fusion",Lars R,5,Good food and superb drinks in a small hideawa...,1
18016,Garibaldi Italian Restaurant & Bar,"Italian, European",shirley2h,5,These were the 2nd time i visited the restaura...,1


In [38]:
# fit new dataframe to form the final sample
data_sample_ = pd.concat([data_zero, data_negative, data_positive])
print(data_sample_.shape)
data_sample_.head()

(1000, 6)


Unnamed: 0,Restaurant Name,Restaurant Type,Reviewer's Name,Rating,Comment,rating_class_main
45485,The Refinery Restaurant & Bar,"Asian, Bar",jadeylf,3,The Refinery is located in a fun and relaxing ...,0
85514,The Soup Spoon,$$ - $$$,PKA3300,3,Avoidable food. Avoidable service. \n\nWe went...,0
23993,Bread Street Kitchen,$$ - $$$,OSwander,3,We visited in January 2020 after reading about...,0
24900,Lagnaa...barefoot dining,"Indian, Asian",timothy k,3,"Dinner for four, we all considered the food to...",0
13563,The Song of India,"Indian, Asian",pradeepc406,3,Hi \n\nWe were a group of 7 people and I was c...,0


In [39]:
# randomise the order of the sample dataframe
data_sample = data_sample_.sample(frac=1)
data_sample.head()

Unnamed: 0,Restaurant Name,Restaurant Type,Reviewer's Name,Rating,Comment,rating_class_main
11970,CE LA VI,"Asian, Fusion",mmuazhiim,5,Situated on the top of Marina Bay sands this r...,1
39026,I am...,"Dutch, European",PCSim,5,Only learnt of this restaurant from my Friend....,1
16109,The National Kitchen by Violet Oon at the Nati...,"Asian, Singaporean",otnielaldi,5,"Great restaurant ambience, with much attention...",1
72948,Ginger Thai,"Thai, Bar",Sharijer,3,"Giving it this rating, because there was no di...",0
51623,Brotzeit (Raffles City),"German, Bar",Mglap,2,Visited the place as we need to eat some weste...,-1


In [40]:
# final check on the sampled data
print(len(data_sample[data_sample['rating_class_main'] == 0]) == 500)
print(len(data_sample[data_sample['rating_class_main'] == -1]) == 250)
print(len(data_sample[data_sample['rating_class_main'] == 1]) == 250)

True
True
True


## Export the sampled data

In [None]:
data_sample.to_csv('./data/sampled-data-for-annontation.csv', index=False)