In [2]:
# imports
import nltk
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import json

from nltk.stem import PorterStemmer
from nltk import word_tokenize, sent_tokenize
try:
    from nltk.corpus import stopwords
    from string import punctuation
    stop = stopwords.words('english')
    modals = ['can', 'will', 'must', 'should', 'might']
except:
    nltk.download('stopwords')

In [122]:
json_data = {}

with open('../../data/raw/yp_leilanis-lahaina-2_rws.json') as f:
    json_data = json.loads(f.read())

dataset = json_normalize(json_data['reviews'])
dataset.head()

Unnamed: 0,author,datePublished,description,ratingValue
0,Giorgio C.,2018-10-13,"Try good service, beach front so a bit loud. M...",4
1,Maxx C.,2018-10-05,When we arrived they gave us a choice of eatin...,5
2,Al D.,2018-10-04,Stopped in here on a Tuesday evening around 8p...,4
3,Zachary D.,2018-09-29,Hawaiian chain type restaurant with pretty dec...,4
4,Chilly P.,2018-07-10,Oh my. Where do I even begin...\n\nLet's start...,1


In [3]:
dataset1 = pd.read_csv('../../data/raw/yp_competitors_rws_0001_0050.csv')
dataset2 = pd.read_csv('../../data/raw/yp_competitors_rws_0051_2506.csv')
dataset = pd.concat([dataset1, dataset2])
dataset.head()

Unnamed: 0,alias,ratingValue,dataPublished,description,author
0,kimos-maui-lahaina,5,2019-01-06,I stumbled across this great restaurant overlo...,Bella L.
1,kimos-maui-lahaina,5,2019-01-04,Excellent view on the ocean at sunset.\nExcell...,Rachou A.
2,kimos-maui-lahaina,3,2018-12-25,This place was not what the reviews portrayed ...,Ozzetta B.
3,kimos-maui-lahaina,2,2018-12-08,We were excited to repeat our Keoki's (in Kaua...,Arleen C.
4,kimos-maui-lahaina,3,2018-11-29,"If you're looking for a tourist spot, this is ...",Carol B.


In [4]:
dataset.shape

(454035, 5)

In [5]:
reviews = dataset.description
reviews.head()

0    I stumbled across this great restaurant overlo...
1    Excellent view on the ocean at sunset.\nExcell...
2    This place was not what the reviews portrayed ...
3    We were excited to repeat our Keoki's (in Kaua...
4    If you're looking for a tourist spot, this is ...
Name: description, dtype: object

In [6]:
dataset['sentiment'] = dataset.ratingValue.apply(lambda x: 1 if x > 3 else 0)

In [7]:
dataset['word_count'] = reviews.apply(lambda x: len(word_tokenize(x)))

In [8]:
dataset['sent_count'] = reviews.apply(lambda x: len(sent_tokenize(x)))

In [9]:
dataset['chr_count'] = reviews.str.len()

In [10]:
dataset['avg_word_len'] = reviews.apply(lambda x: (sum(len(w) for w in word_tokenize(x))) / len(word_tokenize(x)))

In [11]:
dataset['avg_sent_len'] = reviews.apply(lambda x: len(word_tokenize(x)) / len(sent_tokenize(x)))

In [12]:
dataset['num_of_stopwords'] = reviews.apply(lambda x: len([x for x in word_tokenize(x) if x in stop]))

In [13]:
dataset['num_of_modals'] = reviews.apply(lambda x: len([x for x in word_tokenize(x) if x in stop]))

In [14]:
dataset['hashtags'] = reviews.apply(lambda x: len([x for x in x.split() if x.startswith('#')]))

In [15]:
dataset['mentions'] = reviews.apply(lambda x: len([x for x in x.split() if x.startswith('@')]))

In [16]:
dataset['numerics'] = reviews.apply(lambda x: len([x for x in word_tokenize(x) if x.isdigit()]))

In [17]:
dataset['uppercase_cnt'] = reviews.apply(lambda x: len([x for x in word_tokenize(x) if x.isupper()]))

In [18]:
dataset['punctuation_cnt'] = reviews.apply(lambda x: len([x for x in word_tokenize(x) if x in punctuation]))

In [19]:
dataset['vocab_cnt'] = reviews.apply(lambda x: len(set(i.lower() for i in word_tokenize(x) if i.isalpha())))

In [20]:
dataset['ratio_lexical'] = reviews.apply(lambda x: len(set(word_tokenize(x))) / len(word_tokenize(x)))

In [21]:
dataset['ratio_content'] = dataset.apply(lambda x: (x.word_count - x.num_of_stopwords) / x.word_count, axis=1)

In [22]:
dataset.head()

Unnamed: 0,alias,ratingValue,dataPublished,description,author,sentiment,word_count,sent_count,chr_count,avg_word_len,...,num_of_stopwords,num_of_modals,hashtags,mentions,numerics,uppercase_cnt,punctuation_cnt,vocab_cnt,ratio_lexical,ratio_content
0,kimos-maui-lahaina,5,2019-01-06,I stumbled across this great restaurant overlo...,Bella L.,1,135,11,664,4.022222,...,44,44,0,0,0,10,17,76,0.637037,0.674074
1,kimos-maui-lahaina,5,2019-01-04,Excellent view on the ocean at sunset.\nExcell...,Rachou A.,1,36,5,160,3.611111,...,13,13,0,0,0,0,6,25,0.777778,0.638889
2,kimos-maui-lahaina,3,2018-12-25,This place was not what the reviews portrayed ...,Ozzetta B.,0,275,14,1229,3.596364,...,119,119,0,0,2,5,31,130,0.534545,0.567273
3,kimos-maui-lahaina,2,2018-12-08,We were excited to repeat our Keoki's (in Kaua...,Arleen C.,0,475,34,2226,3.783158,...,188,188,0,0,2,7,43,206,0.484211,0.604211
4,kimos-maui-lahaina,3,2018-11-29,"If you're looking for a tourist spot, this is ...",Carol B.,0,168,10,776,3.732143,...,61,61,0,0,2,1,21,97,0.64881,0.636905


In [23]:
dataset.to_csv('../../data/processed/yp_competitors_rws_0001_0256_basicfeatures.csv', index=False)

In [24]:
dataset_test = pd.read_csv('../../data/processed/yp_competitors_rws_0001_0256_basicfeatures.csv')
dataset_test.head()

Unnamed: 0,alias,ratingValue,dataPublished,description,author,sentiment,word_count,sent_count,chr_count,avg_word_len,...,num_of_stopwords,num_of_modals,hashtags,mentions,numerics,uppercase_cnt,punctuation_cnt,vocab_cnt,ratio_lexical,ratio_content
0,kimos-maui-lahaina,5,2019-01-06,I stumbled across this great restaurant overlo...,Bella L.,1,135,11,664,4.022222,...,44,44,0,0,0,10,17,76,0.637037,0.674074
1,kimos-maui-lahaina,5,2019-01-04,Excellent view on the ocean at sunset.\nExcell...,Rachou A.,1,36,5,160,3.611111,...,13,13,0,0,0,0,6,25,0.777778,0.638889
2,kimos-maui-lahaina,3,2018-12-25,This place was not what the reviews portrayed ...,Ozzetta B.,0,275,14,1229,3.596364,...,119,119,0,0,2,5,31,130,0.534545,0.567273
3,kimos-maui-lahaina,2,2018-12-08,We were excited to repeat our Keoki's (in Kaua...,Arleen C.,0,475,34,2226,3.783158,...,188,188,0,0,2,7,43,206,0.484211,0.604211
4,kimos-maui-lahaina,3,2018-11-29,"If you're looking for a tourist spot, this is ...",Carol B.,0,168,10,776,3.732143,...,61,61,0,0,2,1,21,97,0.64881,0.636905


In [141]:
dataset.to_csv('../../data/processed/yp_leilanis-lahaina-2_rws_basicfeatures.csv', index=False)

In [30]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19018 entries, 0 to 19017
Data columns (total 21 columns):
alias               19018 non-null object
ratingValue         19018 non-null int64
dataPublished       19018 non-null object
description         19018 non-null object
author              19018 non-null object
sentiment           19018 non-null int64
word_count          19018 non-null int64
sent_count          19018 non-null int64
chr_count           19018 non-null int64
avg_word_len        19018 non-null float64
avg_sent_len        19018 non-null float64
num_of_stopwords    19018 non-null int64
num_of_modals       19018 non-null int64
hashtags            19018 non-null int64
mentions            19018 non-null int64
numerics            19018 non-null int64
uppercase_cnt       19018 non-null int64
punctuation_cnt     19018 non-null int64
vocab_cnt           19018 non-null int64
ratio_lexical       19018 non-null float64
ratio_content       19018 non-null float64
dtypes: float64(4)

In [142]:
dataset_test = pd.read_csv('../../data/processed/yp_leilanis-lahaina-2_rws_basicfeatures.csv')
dataset_test.head()

Unnamed: 0,author,datePublished,description,ratingValue,sentiment,word_count,sent_count,chr_count,avg_word_len,avg_sent_len,num_of_stopwords,num_of_modals,hashtags,mentions,numerics,uppercase_cnt,punctuation_cnt,vocab_cnt,ratio_lexical,ratio_content
0,Giorgio C.,2018-10-13,"Try good service, beach front so a bit loud. M...",4,1,83,8,447,4.53012,10.375,18,18,0,0,0,1,13,60,0.771084,0.783133
1,Maxx C.,2018-10-05,When we arrived they gave us a choice of eatin...,5,1,190,9,889,3.731579,21.111111,81,81,0,0,2,2,18,110,0.642105,0.573684
2,Al D.,2018-10-04,Stopped in here on a Tuesday evening around 8p...,4,1,109,5,503,3.688073,21.8,53,53,0,0,0,2,6,72,0.724771,0.513761
3,Zachary D.,2018-09-29,Hawaiian chain type restaurant with pretty dec...,4,1,149,8,728,3.973154,18.625,50,50,0,0,3,0,15,101,0.778523,0.66443
4,Chilly P.,2018-07-10,Oh my. Where do I even begin...\n\nLet's start...,1,0,463,41,2134,3.717063,11.292683,146,146,0,0,3,20,68,217,0.555076,0.684665


In [143]:
dataset_test.columns

Index(['author', 'datePublished', 'description', 'ratingValue', 'sentiment',
       'word_count', 'sent_count', 'chr_count', 'avg_word_len', 'avg_sent_len',
       'num_of_stopwords', 'num_of_modals', 'hashtags', 'mentions', 'numerics',
       'uppercase_cnt', 'punctuation_cnt', 'vocab_cnt', 'ratio_lexical',
       'ratio_content'],
      dtype='object')

In [161]:
dataset.groupby(by='sentiment').mean()

Unnamed: 0_level_0,ratingValue,word_count,sent_count,chr_count,avg_word_len,avg_sent_len,num_of_stopwords,num_of_modals,hashtags,mentions,numerics,uppercase_cnt,punctuation_cnt,vocab_cnt,ratio_lexical,ratio_content
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,2.342513,126.919105,8.199656,596.316695,3.821712,15.766697,47.459552,47.459552,0.0,0.001721,0.876076,2.781411,13.889845,71.710843,0.713926,0.637807
1,4.529978,91.528497,6.698742,432.162842,3.867571,13.600866,32.367876,32.367876,0.002961,0.002221,0.470022,1.755736,10.738712,55.247224,0.746751,0.664668


In [162]:
dataset.groupby(by='ratingValue').mean()

Unnamed: 0_level_0,sentiment,word_count,sent_count,chr_count,avg_word_len,avg_sent_len,num_of_stopwords,num_of_modals,hashtags,mentions,numerics,uppercase_cnt,punctuation_cnt,vocab_cnt,ratio_lexical,ratio_content
ratingValue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,0.0,154.79646,9.690265,732.982301,3.835064,16.656746,59.424779,59.424779,0.0,0.0,1.283186,3.548673,15.734513,83.442478,0.697509,0.62723
2,0.0,144.974359,9.141026,681.423077,3.810474,15.863303,54.762821,54.762821,0.0,0.0,0.910256,3.262821,15.551282,80.410256,0.694039,0.633846
3,0.0,107.794872,7.189103,504.266026,3.822495,15.396037,39.474359,39.474359,0.0,0.003205,0.711538,2.262821,12.391026,63.112179,0.729815,0.643618
4,1.0,95.341732,6.692913,450.182677,3.871286,14.221212,34.048819,34.048819,0.001575,0.00315,0.544882,1.746457,11.080315,57.072441,0.740863,0.661342
5,1.0,88.146648,6.703911,416.181564,3.864276,13.050699,30.877095,30.877095,0.00419,0.001397,0.403631,1.763966,10.435754,53.628492,0.751973,0.667618
