In [16]:
import pandas as pd
from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

In [2]:
def get_age_data():
    d = [
    ['b', 4, '10'],
    ['b', 6, '10'],
    ['b', 8, '10'],
    ['g', 4, '10'],
    ['g', 8, '10'],
    ['g', 9, '10'],
    ['b', 6, '11'],
    ['b', 6, '11'],
    ['b', 9, '11'],
    ['g', 7, '11'],
    ['g', 10, '11'],
    ['g', 13, '11'],
    ['b', 8, '12'],
    ['b', 9, '12'],
    ['b', 13, '12'],
    ['g', 12, '12'],
    ['g', 14, '12'],
    ['g', 16, '12'],

    ]
    df = pd.DataFrame(d,   
                     columns = ['gender', 'score', 'age'])
    
    return df
def get_means_table():
    df = get_age_data()
    df_means = df.groupby(by = ['age','gender']).mean()
    return df_means



In [3]:
def get_ss_gender():
    s = 0
    def by_gender(gender):
        df = get_age_data()
        df_means = df.groupby(by = ['gender','age']).mean()
        g_means = df_means.loc[gender].mean()
        grand_mean = df.mean()
        s_gender = sum([(g_means - grand_mean)**2 for x in df[df.gender==gender].score])
        return s_gender
    s += by_gender('b')
    s += by_gender('g')
    return s

In [4]:
def get_ss_by_factor(df, factor_main, factor_second):
    df_means = df.groupby(by = [factor_main, factor_second]).mean()
    g_m = df.mean()
    total = 0
    for i in df[factor_main].unique():
        mean_age = df[df[factor_main]==i].mean()
        total +=  sum([(g_m - mean_age)**2 for x in df[df[factor_main]==i].score])
    return total  


In [5]:
def get_ss_age():
    df = get_age_data()
    df_means = df.groupby(by = ['gender','age']).mean()
    g_m = df.mean()
    total = 0
    for i in df['age'].unique():
        mean_age = df[df.age==i].mean()
        total +=  sum([(g_m - mean_age)**2 for x in df[df.age==i].score])
    return total  



In [6]:
def get_ss_within():
    s = 0
    df = get_age_data()
    def get_mean(age, gender):
        f = df[df['age']==age]
        s = f[f['gender']==gender]
        return s
    data = [('10', 'b'), ('11', 'b'), ('12','b'), ('10', 'g'), ('11', 'g'), ('12', 'g')]
    for i in data:
        df_age = get_mean(i[0], i[1])
        mean = df_age.mean()
        s += sum([(x - mean)**2 for x in df_age.score])
    return s


In [None]:
def get_ss_within_gen(df, first_factor, second_factor, target):
    s = 0
    def get_combos():
        first = df[first_factor].unique()
        second = df[second_factor].unique()
        combos = []
        for i in first:
            for j in second:
                combos.append((i,j))
        return combos
    def get_mean(age, gender):
        f = df[df[first_factor]==age]
        s = f[f[second_factor]==gender]
        return s
    data = get_combos()
    for i in data:
        df_first = get_mean(i[0], i[1])
        mean = df_first.mean()
        s += sum([(x - mean)**2 for x in df_first[target]])
    return s


In [7]:
def get_sum_squares_total():
    df = get_age_data()
    g_m = df.mean()
    return sum([(x-g_m)**2 for x in df.score])

In [None]:
def get_sum_squares_total_gen(df, target):
    g_m = df.mean()
    return sum([(x-g_m)**2 for x in df[target]])

In [22]:
ss_gender = get_ss_gender()
ss_age = get_ss_age()
ss_total  = get_sum_squares_total()
ss_within = get_ss_within()
ss_both_factors = ss_total - (ss_gender + ss_age + ss_within)
df_gender = len(['b', 'g']) - 1
df_age = len(['10', '11', '12']) -1
df_within = 12
df_both_factors = df_gender * df_age
df = get_age_data()
df_total = len(df.score) -1
mean_ss_age = ss_age/df_age
mean_ss_gender = ss_gender/df_gender
mean_ss_within = ss_within/df_within
mean_both_factors = ss_both_factors/df_both_factors
gender_f_score = mean_ss_gender/mean_ss_within
age_f_score = mean_ss_age/mean_ss_within
both_factors_f_score = mean_both_factors/mean_ss_within
gender_f_score
#F(1, 12 )
p_gender = stats.f.sf(gender_f_score, df_gender, df_within)
p_age = stats.f.sf(age_f_score, df_age, df_within)
p_both_factors = stats.f.sf(both_factors_f_score, df_both_factors, df_within)
print(ss_gender)
print(ss_total)
print(mean_ss_within)
print('p gender {p}'.format(p= p_gender))
print('p age {p}'.format( p = p_age))

2


AssertionError: 

In [17]:
def lib_test():
    data = get_age_data()
    formula = 'score ~ C(age) + C(gender)'
    model = ols(formula, data).fit()
    aov_table = anova_lm(model, typ=2)
    return aov_table
lib_test()

Unnamed: 0,sum_sq,df,F,PR(>F)
C(age),93.0,2.0,8.68,0.003534
C(gender),32.0,1.0,5.973333,0.028367
Residual,75.0,14.0,,


In [None]:
ss_gender = get_ss_by_factor(get_age_data(), 'gender', 'age')
ss_age = get_ss_by_factor(get_age_data(), 'age', 'gender')
ss_within = get_ss_within_gen(get_age_data(), 'age', 'gender', 'score')
ss_total  = get_sum_squares_total()

ss_gender_2 = get_ss_gender()
ss_age_2 = get_ss_age()
ss_within_2 = get_ss_within()
ss_total_2 = get_sum_squares_total_gen(df, 'score')
print(ss_age, ss_age_2)
print(ss_gender, ss_gender_2)
print(ss_within, ss_within_2)
print(ss_total, ss_total_2)

