In [21]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
import scipy.stats

In [2]:
from jupyterthemes import jtplot
jtplot.style(theme='onedork', context='notebook', ticks=True, grid=False)

Fake some data:

In [3]:
A = [12.6, 12, 11.8, 11.9, 13, 12.5, 14]
B = [10, 10.2, 10, 12, 14, 13]
C = [10.1, 13, 13.4, 12.9, 8.9, 10.7, 13.6, 12]
all_scores = A + B + C
company_names = (['A'] * len(A)) +  (['B'] * len(B)) +  (['C'] * len(C))
data = pd.DataFrame({'company': company_names, 'score': all_scores})

In [4]:
data.groupby('company').mean()

Unnamed: 0_level_0,score
company,Unnamed: 1_level_1
A,12.542857
B,11.533333
C,11.825


## ANOVA using statsmodels

In [5]:
lm = ols('score ~ company',data=data).fit()
table = sm.stats.anova_lm(lm)
table

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
company,2.0,3.606905,1.803452,0.821297,0.455683
Residual,18.0,39.525476,2.19586,,


## 3. 1-Way ANOVA by hand (from scratch)

### compute overall mean

In [6]:
overall_mean = data['score'].mean()
overall_mean

11.980952380952381

### compute *Sum of Squares Total*

In [7]:
data['overall_mean'] = overall_mean
ss_total = sum((data['score'] - data['overall_mean'])**2)
ss_total

43.132380952380956

### compute group means

In [8]:
group_means = data.groupby('company').mean()
group_means = group_means.rename(columns = {'score': 'group_mean'})
group_means

Unnamed: 0_level_0,group_mean,overall_mean
company,Unnamed: 1_level_1,Unnamed: 2_level_1
A,12.542857,11.980952
B,11.533333,11.980952
C,11.825,11.980952


In [13]:
# add group means and overall mean to the original data frame
data = data.merge(group_means, left_on = 'company', right_index = True, suffixes=('','_y'))

### compute *Sum of Squares Residual*

In [15]:
ss_residual = sum((data['score'] - data['group_mean'])**2)
ss_residual

39.52547619047619

### compute *Sum of Squares Model*

In [16]:

ss_explained = sum((data['overall_mean'] - data['group_mean'])**2)
ss_explained

3.6069047619047634

### compute *Mean Square Residual*

In [17]:

n_groups = len(set(data['company']))
n_obs = data.shape[0]
df_residual = n_obs - n_groups
ms_residual = ss_residual / df_residual
ms_residual

2.1958597883597886

### compute *Mean Square Explained*

In [18]:

df_explained = n_groups - 1
ms_explained = ss_explained / df_explained
ms_explained

1.8034523809523817

### compute *F-Value*

In [19]:

f = ms_explained / ms_residual
f

0.8212966923081559

### compute *p-value*

In [22]:
p_value = 1 - scipy.stats.f.cdf(f, df_explained, df_residual)
p_value

0.4556832940515234

In [23]:
table

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
company,2.0,3.606905,1.803452,0.821297,0.455683
Residual,18.0,39.525476,2.19586,,
