In [1]:
# import libraries
from statsmodels.stats.multicomp import pairwise_tukeyhsd, MultiComparison
import scipy.stats as stats
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# import location and price df
df_location_price = pd.read_pickle('data/yelp_price_location.pkl')

In [3]:
df_location_price.head()

Unnamed: 0,categories,price,rating,review_count,Lat,Lon,state
0,Salad|Seafood|American (Traditional),2,3.5,372,38.997397,-77.026797,MD
1,Pizza|American (New)|Salad,2,3.5,192,38.919506,-77.224311,VA
2,Breakfast & Brunch|American (Traditional)|Burgers,2,2.5,125,39.01419,-76.92827,MD
3,Seafood,1,4.0,13,38.854145,-76.897896,MD
4,Donuts|Coffee & Tea|Breakfast & Brunch,1,4.5,2,38.79054,-77.076455,VA


In [4]:
# look at review count and rating vs state first split into 3 groups
# DC, VA, MD
DC = df_location_price[df_location_price.state == 'DC'][[
    'state', 'review_count', 'rating']].reset_index()
MD = df_location_price[df_location_price.state == 'MD'][[
    'state', 'review_count', 'rating']].reset_index()
VA = df_location_price[df_location_price.state == 'VA'][[
    'state', 'review_count', 'rating']].reset_index()

In [18]:
# set up one way anova test and tukey hsd 
def one_way_anove(groups, colname):
    '''
    groups: a list of dataframes for comparision
    filters: a string, colname for comparison
    return: datas with compare data in a list of arrays and anova result 
    '''
    datas = []
    n = len(groups)
    for i in range(n):
        data_i = groups[i][colname]
        datas.append(data_i)
    return (stats.f_oneway(*datas), datas)

In [34]:
# set up tukey hsd for post anova with significance 
def tukey_hsd(datas, group_names, colname):
    '''
    datas: from one_way_anova function, a list of data for each group
    group_names: a list of strings with group names, datas order should be same as group_names
    colname: string, the category to compare 
    return tukeyhsd table result and stacked table 
    '''
    # arrange to proper df format for tukey hsd
    df = pd.DataFrame()
    for i in range(len(group_names)):
        df[group_names[i]] = datas[i]
    stacked_df = df.stack().reset_index()
    stacked_df = stacked_df.rename(columns ={'level_0':'id','level_1':'state',0:colname})
    MultiComp = MultiComparison(stacked_df[colname],
                            stacked_df['state'])
    return (stacked_df, MultiComp.tukeyhsd().summary())

In [38]:
# one way anova of 3 samples_review count 
datas_review = one_way_anove([DC,MD,VA],'review_count')[1]
anova_result_review = one_way_anove([DC,MD,VA],'review_count')[0]
anova_result
# p value is way below 0.05 so we will assume there is significance 

F_onewayResult(statistic=70.58696711367261, pvalue=4.589897527593301e-31)

In [42]:
# tukey to figure out what is going on in details 
stacked_review = tukey_hsd(datas_review,['DC','MD','VA'],'review_count')[0]
tukey_result_review = tukey_hsd(datas_review,['DC','MD','VA'],'review_count')[1]
tukey_result_review
# MD has the most traffic then VA then DC 

group1,group2,meandiff,lower,upper,reject
DC,MD,-113.3569,-135.8076,-90.9062,True
DC,VA,-39.189,-61.2855,-17.0926,True
MD,VA,74.1679,50.3903,97.9454,True


In [43]:
# check for rating 
datas_rating = one_way_anove([DC,MD,VA],'rating')[1]
anova_result_rating = one_way_anove([DC,MD,VA],'rating')[0]
anova_result_rating
# p value is way below 0.05 so we will assume there is significance 

F_onewayResult(statistic=70.06362167068008, pvalue=7.663599803048584e-31)

In [44]:
# tukey to figure out what is going on in details 
stacked_rating = tukey_hsd(datas_review,['DC','MD','VA'],'rating')[0]
tukey_result_rating = tukey_hsd(datas_review,['DC','MD','VA'],'rating')[1]
tukey_result_rating
# MD has the most traffic then VA then DC 

group1,group2,meandiff,lower,upper,reject
DC,MD,-113.3569,-135.8076,-90.9062,True
DC,VA,-39.189,-61.2855,-17.0926,True
MD,VA,74.1679,50.3903,97.9454,True
