In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Root directory holding one sub-folder per dataset. It defaults to the layout
# this notebook was originally written against; set the EXTRACTIONS_DATA_DIR
# environment variable to run it elsewhere without editing this cell.
DATA_DIR = os.environ.get('EXTRACTIONS_DATA_DIR', '/Users/khalil/data')

# Define paths to extraction files.
extraction_paths = {'ta': os.path.join(DATA_DIR, 'tripadvisor', 'extractions.csv'),
                    'ba': os.path.join(DATA_DIR, 'beeradvocate', 'extractions-clean.csv'),
                    'yp': os.path.join(DATA_DIR, 'yelp', 'extractions-clean.csv')}

# The TripAdvisor file uses `member_id` / `hotel_id`; rename them so all three
# frames expose the same `user_id` / `item_id` key columns.
df_extractions_ta = pd.read_csv(extraction_paths['ta']).rename(columns={'member_id': 'user_id', 'hotel_id': 'item_id'})
df_extractions_ba = pd.read_csv(extraction_paths['ba'])
df_extractions_yp = pd.read_csv(extraction_paths['yp'])

# Dataset short code -> extraction frame; the summary cells below iterate over this.
datasets = {
    'ta': df_extractions_ta, 'ba': df_extractions_ba, 'yp': df_extractions_yp
}

# Fail fast, with a readable message, if a file lacks a column that the
# per-review / per-item / per-user summaries group by or count.
REQUIRED_COLUMNS = {'review_id', 'item_id', 'user_id', 'amenity'}
for name, frame in datasets.items():
    missing = REQUIRED_COLUMNS - set(frame.columns)
    assert not missing, '{}: missing columns {}'.format(name, sorted(missing))

In [3]:
# Preview the first five rows of the TripAdvisor extractions (after the column rename).
df_extractions_ta.head(n=5)

Unnamed: 0,feature,review_id,user_id,item_id,city,sentiment,amenity
0,breakfast,review_174376340,5B020C0FB46F812D76FA2EB9A4B4125F,Lynam_s_Hotel,Dublin,0.2,free breakfast
1,money,review_174376340,5B020C0FB46F812D76FA2EB9A4B4125F,Lynam_s_Hotel,Dublin,0.0,room service
2,jacket,review_174376340,5B020C0FB46F812D76FA2EB9A4B4125F,Lynam_s_Hotel,Dublin,0.0,room service
3,trip,review_174376340,5B020C0FB46F812D76FA2EB9A4B4125F,Lynam_s_Hotel,Dublin,0.4,business center
4,appeal,review_174282794,7E6A02ECD950A5D66EEDE5D0F9518CC8,Lynam_s_Hotel,Dublin,0.5,room service


In [4]:
# Preview the first five rows of the BeerAdvocate extractions.
df_extractions_ba.head(n=5)

Unnamed: 0,feature,feature_index,item_id,opinion,opinion_index,opinion_pattern,original_feature,rating,rating_date,review_id,sentence_idx,sentence_str,sentiment,user_id,amenity
0,taste,1,773,beautiful,3.0,FEATURE_VBZ_JJ,taste,4.0,2004-01-23 22:13:24,r87066,7,"The taste is beautiful, it is impressively lig...",0.85,bditty187,taste
1,flavor,19,773,other,23.0,FEATURE_VBZ_CC_DT_JJ,flavor,4.0,2004-01-23 22:13:24,r87066,7,"The taste is beautiful, it is impressively lig...",-0.125,bditty187,taste
2,end,25,773,other,23.0,JJ_CD_FEATURE,ends,4.0,2004-01-23 22:13:24,r87066,7,"The taste is beautiful, it is impressively lig...",-0.125,bditty187,taste
3,body,1,773,thin,3.0,FEATURE_VBZ_JJ,body,4.0,2004-01-23 22:13:24,r87066,8,"The body is thin, medium-light; the sensation ...",-0.4,bditty187,palate
4,sensation,6,773,thin,3.0,JJ_JJ_DT_FEATURE,sensation,4.0,2004-01-23 22:13:24,r87066,8,"The body is thin, medium-light; the sensation ...",-0.4,bditty187,palate


In [5]:
# Preview the first five rows of the Yelp extractions.
df_extractions_yp.head(n=5)

Unnamed: 0,feature,feature_index,item_id,opinion,opinion_index,opinion_pattern,original_feature,rating,rating_date,review_id,sentence_idx,sentence_str,sentiment,user_id,amenity
0,food,1.0,5UmKMjUEUNdYWqANhGckJw,excellent,0.0,JJ_FEATURE,food,5.0,2014-02-13,KPvLNJ21_4wbYNctrOwWdQ,0.0,Excellent food.,1.0,Iu6AxdBYGR4A0wspR9BYHA,service
1,food,3.0,PdWe3jF9YEu5fWBiQXZIgg,boring,2.0,JJ_FEATURE,food,1.0,2014-11-27,RITnsXg_bhrtbvSj9XQ4fg,0.0,Over priced boring food.,-1.0,0Hmez6GVD2c_mGkhCgP9PQ,service
2,customer service,1.0,5UmKMjUEUNdYWqANhGckJw,superb,0.0,JJ_FEATURE_NN,customer,5.0,2014-02-13,KPvLNJ21_4wbYNctrOwWdQ,1.0,Superb customer service.,1.0,Iu6AxdBYGR4A0wspR9BYHA,service
3,burrito,3.0,VwCYKGji0en-y4ebOXNsnA,super,2.0,JJ_FEATURE,burrito,5.0,2014-02-15,TufeAX-p-b-dDQQPVCnpSw,2.0,The carnitas super burrito is amazing,0.333333,fhWhD-DmDBTmrBP0icp0_Q,food
4,rcvd,2.0,PdWe3jF9YEu5fWBiQXZIgg,little,4.0,FEATURE_IN_JJ,rcvd,1.0,2014-11-27,RITnsXg_bhrtbvSj9XQ4fg,2.0,Where we rcvd 4 little sliders & 4wings.,-0.1875,0Hmez6GVD2c_mGkhCgP9PQ,food


## Average number of features per review

In [17]:
# For every dataset: count the distinct `amenity` values mentioned in each review,
# save the per-review counts to CSV, and print their mean and standard deviation.
print('Average features per review:')
for dataset, df in datasets.items():
    # The built-in 'nunique' aggregation counts distinct non-null values directly,
    # avoiding a Python-level lambda around np.unique (which has to sort the values
    # and therefore cannot handle a column mixing strings with NaN).
    df_g = (df.groupby('review_id', as_index=False)
              .agg({'amenity': 'nunique'})
              .rename(columns={'amenity': 'n_features'}))
    df_g.to_csv('../data/num-features-per-review-{}.csv'.format(dataset), index=False)
    print('- {dataset}: μ={mean:.1f} ± {stddev:.1f}'.format(dataset=dataset.upper(),
                                                          mean=df_g.n_features.mean(),
                                                          stddev=df_g.n_features.std()))

Average features per review:
- TA: μ=2.5 ± 1.3
- BA: μ=3.8 ± 1.1
- YP: μ=3.2 ± 1.2


## Average number of features per item


In [18]:
# For every dataset: count the distinct `amenity` values mentioned for each item,
# save the per-item counts to CSV, and print their mean and standard deviation.
print('Average features per item:')
for dataset, df in datasets.items():
    # The built-in 'nunique' aggregation counts distinct non-null values directly,
    # avoiding a Python-level lambda around np.unique (which has to sort the values
    # and therefore cannot handle a column mixing strings with NaN).
    df_g = (df.groupby('item_id', as_index=False)
              .agg({'amenity': 'nunique'})
              .rename(columns={'amenity': 'n_features'}))
    df_g.to_csv('../data/num-features-per-item-{}.csv'.format(dataset), index=False)
    print('- {dataset}: μ={mean:.1f} ± {stddev:.1f}'.format(dataset=dataset.upper(),
                                                          mean=df_g.n_features.mean(),
                                                          stddev=df_g.n_features.std()))

Average features per item:
- TA: μ=10.8 ± 3.5
- BA: μ=4.4 ± 1.0
- YP: μ=4.9 ± 1.6


## Average number of features per user 

In [None]:
# For every dataset: count the distinct `amenity` values mentioned by each user,
# save the per-user counts to CSV, and print their mean and standard deviation.
print('Average features per user:')
for dataset, df in datasets.items():
    # The built-in 'nunique' aggregation counts distinct non-null values directly,
    # avoiding a Python-level lambda around np.unique (which has to sort the values
    # and therefore cannot handle a column mixing strings with NaN).
    df_g = (df.groupby('user_id', as_index=False)
              .agg({'amenity': 'nunique'})
              .rename(columns={'amenity': 'n_features'}))
    df_g.to_csv('../data/num-features-per-user-{}.csv'.format(dataset), index=False)
    print('- {dataset}: μ={mean:.1f} ± {stddev:.1f}'.format(dataset=dataset.upper(),
                                                          mean=df_g.n_features.mean(),
                                                          stddev=df_g.n_features.std()))

Average features per user:
- TA: μ=2.9 ± 1.7
- BA: μ=4.7 ± 1.0
