In [135]:
import pandas as pd
from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

In [93]:
def get_ss_by_factor(df, factor_main, factor_second, target):
    df_means = df.groupby(by = [factor_main, factor_second]).mean()
    g_m = df[target].mean()
    total = 0
    for i in df[factor_main].unique():
        mean_g = df[df[factor_main]==i][target].mean()
        total +=  sum([(g_m - mean_g)**2 for x in df[df[factor_main]==i][target]])
    return total  


In [129]:
def get_ss_within(df, first_factor, second_factor, target):
    s = 0
    def get_combos():
        first = df[first_factor].unique()
        second = df[second_factor].unique()
        combos = []
        for i in first:
            for j in second:
                combos.append((i,j))
        return combos
    def get_mean(age, gender):
        f = df[df[first_factor]==age]
        s = f[f[second_factor]==gender]
        return s
    for i in get_combos():
        df_first = get_mean(i[0], i[1])
        mean = df_first[target].mean()
        print(mean)
        print([x  for x in df_first[target]])
        s += sum([(x - mean)**2 for x in df_first[target]])
    return s

In [95]:
def get_ss_total(df, target):
    g_m = df[target].mean()
    return sum([(x-g_m)**2 for x in df[target]])

In [96]:
def get_ss_both_factors(df, ss_first, ss_second, ss_within, ss_total):
    return ss_total - (ss_gender + ss_age + ss_within)


In [97]:
def get_df(df, target):
    cols = list(df.columns)
    cols.remove(target)
    df_first =  len(list(df[cols[0]].unique())) - 1
    df_second = len(list(df[cols[1]].unique())) - 1
    df_within = len(df[target]) - (len(df[cols[0]].unique())*len(df[cols[1]].unique()))
    df_both_factors = df_first * df_second
    return df_first, df_second, df_within, df_both_factors


In [98]:
def do_anova(df, target):
    cols = list(df.columns)
    cols.remove(target)
    ss_first = get_ss_by_factor(df, cols[0], cols[1], target)
    print(ss_first)
    ss_second = get_ss_by_factor(df, cols[1], cols[0], target)
    print(ss_second)
    ss_within = get_ss_within(df, cols[0], cols[1], target)
    print(ss_within)
    ss_total  = get_ss_total(df, target)
    ss_both_factors = ss_total - (ss_first + ss_second + ss_within)
    df_first, df_second, df_within, df_both_factors = get_df(df, target)
    mean_first = ss_first/df_first
    mean_second = ss_second/df_second
    mean_ss_within = ss_within/df_within
    mean_both_factors = ss_both_factors/df_both_factors
    first_f_score = mean_first/mean_ss_within
    second_f_score = mean_second/mean_ss_within
    both_factors_f_score = mean_both_factors/mean_ss_within
    p_first = stats.f.sf(first_f_score, df_first, df_within)
    p_second = stats.f.sf(second_f_score, df_second, df_within)
    p_both_factors = stats.f.sf(both_factors_f_score, df_both_factors, df_within)
    return ('within', float(both_factors_f_score), float(p_both_factors)),\
        (cols[0], float(first_f_score), float(p_first)),\
        (cols[1], float(second_f_score), float(p_second))

In [140]:
"""TEST IT"""
d = [
    ['b', 4, '10'],
    ['b', 6, '10'],
    ['b', 8, '10'],
    ['g', 4, '10'],
    ['g', 8, '10'],
    ['g', 9, '10'],
    ['b', 6, '11'],
    ['b', 6, '11'],
    ['b', 9, '11'],
    ['g', 7, '11'],
    ['g', 10, '11'],
    ['g', 13, '11'],
    ['b', 8, '12'],
    ['b', 9, '12'],
    ['b', 13, '12'],
    ['g', 12, '12'],
    ['g', 14, '12'],
    ['g', 16, '12'],

    ]
df = pd.DataFrame(d,   columns = ['gender', 'score', 'age'])
do_anova(df, 'score')

32.0
93.0
6.0
[4, 6, 8]
7.0
[6, 6, 9]
10.0
[8, 9, 13]
7.0
[4, 8, 9]
10.0
[7, 10, 13]
14.0
[12, 14, 16]
68.0


(('within', 0.6176470588235293, 0.5555023440711991),
 ('gender', 5.647058823529411, 0.03499435061989512),
 ('age', 8.205882352941176, 0.005676729758203095))

In [141]:
formula = 'score ~ C(age) + C(gender) '
model = ols(formula, df).fit()
aov_table = anova_lm(model, typ=2)
aov_table

Unnamed: 0,sum_sq,df,F,PR(>F)
C(age),93.0,2.0,8.68,0.003534
C(gender),32.0,1.0,5.973333,0.028367
Residual,75.0,14.0,,


In [128]:
def get_data_2():
    d = [
        ['1', '1', 65],
        ['1', '2', 49],
        ['1', '3', 50],
        ['2', '1', 53],
        ['2', '2', 51],
        ['2', '3', 48],
        ['3', '1', 47],
        ['3', '2', 45],
        ['3', '3', 50],
        ['4', '1', 51],
        ['4', '2', 43],
        ['4', '3', 52],
    ]
    df = pd.DataFrame(d,   
                     columns = ['coating', 'soil', 'corrosion'])
    return df
do_anova(get_data_2(), 'corrosion')

91.99999999999991
98.66666666666667
65.0
[65]
  coating soil  corrosion
0       1    1         65
49.0
[49]
  coating soil  corrosion
1       1    2         49
50.0
[50]
  coating soil  corrosion
2       1    3         50
53.0
[53]
  coating soil  corrosion
3       2    1         53
51.0
[51]
  coating soil  corrosion
4       2    2         51
48.0
[48]
  coating soil  corrosion
5       2    3         48
47.0
[47]
  coating soil  corrosion
6       3    1         47
45.0
[45]
  coating soil  corrosion
7       3    2         45
50.0
[50]
  coating soil  corrosion
8       3    3         50
51.0
[51]
  coating soil  corrosion
9       4    1         51
43.0
[43]
   coating soil  corrosion
10       4    2         43
52.0
[52]
   coating soil  corrosion
11       4    3         52
0.0


  from ipykernel import kernelapp as app


(('within', nan, nan), ('coating', nan, nan), ('soil', nan, nan))

In [138]:
def get_data_pen():
    d = [
        ['1', '1', .97],
        ['1', '2', .48],
        ['1', '3', .48],
        ['1', '4', .46],
        ['2', '1', .77],
        ['2', '2', .14],
        ['2', '3', .22],
        ['2', '4', .25],
        ['3', '1', .67],
        ['3', '2', .39],
        ['3', '3', .57],
        ['3', '4', .19],
    ]
    df = pd.DataFrame(d,   
                     columns = ['brand', 'washing', 'color_change'])
    return df
df = get_data_pen()
df.head()
formula = 'color_change ~ C(brand) + C(washing) '
model = ols(formula, df).fit()
aov_table = anova_lm(model, typ=2)

In [139]:
aov_table

Unnamed: 0,sum_sq,df,F,PR(>F)
C(brand),0.128217,2.0,4.432303,0.065765
C(washing),0.479692,3.0,11.054926,0.007399
Residual,0.086783,6.0,,
