<a href="https://colab.research.google.com/github/mikexcohen/Statistics_book/blob/main/stats_ch15_anova.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Modern statistics: Intuition, Math, Python, R
## Mike X Cohen (sincxpress.com)
### https://www.amazon.com/dp/B0CQRGWGLY
#### Code for Chapter 14 (ANOVA)

---

# About this code file:

### This notebook will reproduce most of the figures in this chapter (some figures were made in Inkscape), and illustrate the statistical concepts explained in the text. The point of providing the code is not just for you to recreate the figures, but for you to modify, adapt, explore, and experiment with the code.

### Solutions to all exercises are at the bottom of the notebook.

#### This code was written in Google Colab. The notebook may require some modifications if you use a different IDE.

In [None]:
# import libraries and set settings
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from IPython.display import display
from matplotlib.font_manager import FontProperties # for making tables

# pingouin isn't pre-installed on colab
!pip install pingouin
import pingouin as pg
import pandas as pd
import seaborn as sns

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import AnovaRM


# global figure styling used for the publication figures
import matplotlib_inline.backend_inline

# show inline figures in vector (svg) format
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# matplotlib defaults applied to every figure created below
figure_defaults = {
    'font.size'          : 14,      # font size
    'savefig.dpi'        : 300,     # resolution of exported files
    'axes.titlelocation' : 'left',  # titles are left-aligned unless a cell overrides it
    'axes.spines.right'  : False,   # no right edge on the axis box
    'axes.spines.top'    : False,   # no top edge on the axis box
}
plt.rcParams.update(figure_defaults)

# Figure 14.3: Critical F by df's

In [None]:
# Degrees of freedom to tabulate: numerator (columns) and denominator (rows)
df1_values = np.arange(1,10)
df2_values = np.arange(5,30)

# Critical F values (95th percentile of each F distribution) for all df pairs.
# Broadcasting a row vector of df1 against a column vector of df2 fills the
# whole matrix in one call; element [j,i] belongs to (df1_values[i], df2_values[j]).
critFvals = stats.f.ppf(.95, df1_values[np.newaxis,:], df2_values[:,np.newaxis])


# Show the matrix as a grayscale heatmap (color range clipped to [2,5])
fig,ax = plt.subplots(figsize=(4,6))
heatmap = ax.imshow(critFvals,
                    extent=[df1_values[0],df1_values[-1],df2_values[0],df2_values[-1]],
                    origin='lower', aspect='auto', interpolation='nearest',
                    cmap='gray', vmin=2, vmax=5)
fig.colorbar(heatmap, ax=ax, label='Critical F Value')
ax.set_xlabel(r'Numerator $df$')
ax.set_ylabel(r'Denominator $df$')
ax.set_xticks(df1_values[::2])
ax.set_title('Critical F values\nfor df pairs',loc='center')

fig.tight_layout()
fig.savefig('anova_fCritBydf.png')
plt.show()

# Figure 14.4: F-distributions

In [None]:
# Grid of F values at which the densities are evaluated
x = np.linspace(0,3.5,1000)

# (numerator df, denominator df) pairs; one curve is drawn per pair
df_pairs = [(6,30), (5,25), (4,22), (4,15), (2,30)]


fig,ax = plt.subplots(figsize=(10,6))
for i,(df1,df2) in enumerate(df_pairs):

  # gray level of this curve: black for the first pair, lighter for later pairs
  c = i/len(df_pairs)
  gray = (c,c,c)

  # F distribution frozen at this df pair, and its density over the grid
  fdist = stats.f(df1,df2)
  F = fdist.pdf(x)
  ax.plot(x,F,linewidth=3,color=gray,label=fr'F({df1},{df2})')

  # critical F for p=.05 (x-coordinate) and the density there (y-coordinate)
  crit_f_x = fdist.ppf(.95)
  crit_f_y = fdist.pdf(crit_f_x)

  # vertical label with an arrow pointing at the critical value on the curve
  ax.annotate(text=fr'F$_C$({df1},{df2}) = {crit_f_x:.2f}',
              xy=(crit_f_x,crit_f_y), xytext=(crit_f_x,crit_f_y*3),
              color=gray, rotation=90, fontsize=18,
              arrowprops=dict(color=gray, arrowstyle='->',linewidth=2),
              ha='center', va='bottom')

# axis cosmetics
ax.set_title('F-distributions for various df pairs',loc='center')
ax.set_xlabel('F')
ax.set_ylabel('Probability density')
ax.set_xlim([0,x[-1]])
ax.set_ylim([0,1.2])
ax.legend()

fig.tight_layout()
fig.savefig('anova-FDists.png')
plt.show()

# Figure 14.5: One-way ANOVA table

In [None]:
# Contents of the one-way ANOVA table (formulas are LaTeX strings)
rows = ['Between', 'Within', 'Total']
columns = ['Source', 'SS', 'df', 'MS', 'F']

# one list per table row, ordered like `columns`
between_row = ['Between', r'$\sum_{j=1}^{k}n_j(\overline{x_j}-\overline{x})^2$', r'$k-1$', r'$\frac{SS_{Between}}{k-1}$', r'$\frac{MS_{Between}}{MS_{Within}}$']
within_row  = ['Within', r'$\sum_{j=1}^{k}\sum_{i=1}^{n_j}(x_{ij}-\overline{x_j})^2$', r'$N-k$', r'$\frac{SS_{Within}}{N-k}$','']
total_row   = ['Total', r'$\sum_{j=1}^{k}\sum_{i=1}^{n_j}(x_{ij}-\overline{x})^2$', r'$N-1$', '', '']
cell_text = [between_row, within_row, total_row]

# Draw the table on an otherwise empty axis
fig, ax = plt.subplots()
ax.axis('off')
header_colors = [(.8,.8,.8) for _ in columns]
table = ax.table(loc        = 'center',
                 cellLoc    = 'center',
                 colLabels  = columns,
                 colColours = header_colors,
                 cellText   = cell_text)

# Fonts: bold header row, larger font for the MS and F formulas, serif elsewhere
for (row, col), cell in table.get_celld().items():
  if row==0:
    cellfont = FontProperties(weight='bold',size=16)
  elif col>2:
    cellfont = FontProperties(size=20)
  else:
    cellfont = FontProperties(family='serif')
  cell.set_text_props(fontproperties=cellfont)

# keep the font sizes chosen above, then enlarge the cells
table.auto_set_font_size(False)
table.scale(1.8,4)

# export
fig.savefig('anova_ANOVAtable.png',bbox_inches='tight')
plt.show()

# Figure 14.6: Bar plot used for Tukey test description

In [None]:
# toy outcome values for four conditions
y = [ 5,5,10,11]
L = ['A','B','C','D']
bar_positions = range(len(L))

fig,ax = plt.subplots(figsize=(6,3))
ax.bar(bar_positions,y,color=(.3,.3,.3),edgecolor='k')

# label each bar with its condition letter
ax.set_xticks(bar_positions)
ax.set_xticklabels(L)
ax.set_xlabel('Condition (level)')
ax.set_ylabel('Outcome variable')

fig.tight_layout()
fig.savefig('anova-4tukey.png')
plt.show()

# Figure 14.7: Q-distributions with various df pairs

In [None]:
# Grid of Q values at which the studentized-range densities are evaluated
x = np.linspace(0,6,100)

# Parameter pairs, one curve per pair. Each pair is passed positionally to
# stats.studentized_range; NOTE(review): scipy documents those two shape
# parameters as (k, df), so the first number acts as the number of groups.
df_pairs = [(6,30), (5,25), (4,22), (4,15), (2,30)]



plt.figure(figsize=(10,6))
for i,(df1,df2) in enumerate(df_pairs):

  # Q pdf over the grid
  Q = stats.studentized_range.pdf(x,df1,df2)

  # gray level: black for the first pair, lighter for later pairs
  c = i/len(df_pairs)

  # plot the distribution
  plt.plot(x,Q,linewidth=3,color=(c,c,c),label=fr'Q({df1},{df2})')

  # critical Q value for p=.05
  crit_q_x = stats.studentized_range.ppf(.95,df1,df2) # this is the Q value
  crit_q_y = stats.studentized_range.pdf(crit_q_x,df1,df2) # this is the y-axis coordinate (prob density)


  # Add annotation for the critical Q value: the arrow tip sits on the curve,
  # the vertical text is placed at three times that height
  plt.annotate(text=fr'Q$_C$({df1},{df2}) = {crit_q_x:.2f}',color=(c,c,c),xy=(crit_q_x,crit_q_y),rotation=90,
                xytext=(crit_q_x,crit_q_y*3),fontsize=18,
                arrowprops=dict(color=(c,c,c), arrowstyle='->',linewidth=2),
                ha='center', va='bottom')

# some niceties
plt.title('Q-distributions for various df pairs',loc='center')
plt.xlabel('Q')
plt.xlim([0,x[-1]])
plt.ylim([0,.7])
plt.ylabel('Probability density')
plt.legend()

plt.tight_layout()
plt.savefig('anova-QDists.png')
plt.show()

# Figure 14.14: rmANOVA table

In [None]:
# Contents of the repeated-measures ANOVA table (formulas are LaTeX strings).
# Notation used in the formulas: N subjects, each measured in all k conditions.
from matplotlib.font_manager import FontProperties

rows = ['Between', 'Subjects', 'Within', 'Total']
columns = ['Source', 'SS', 'df', 'MS', 'F']
cell_text = [
    ['Between', r'$N\sum_{j=1}^{k} (\overline{x_j} - \overline{x})^2$', r'$k-1$', r'$\frac{SS_{Between}}{k-1}$', r'$\frac{MS_{Between}}{MS_{Within}}$'],
    # each subject mean is an average over k conditions, so the squared deviations
    # of the subject means are weighted by k (mirroring the N weight in the row above)
    ['Subjects', r'$k\sum_{i=1}^{N}(\overline{x_i}-\overline{x})^2$', r'$N-1$', r'$\frac{SS_{Subjects}}{N-1}$', r'$\frac{MS_{Subjects}}{MS_{Within}}$'],
    ['Within', r'$SS_{T} - SS_{B} - SS_{S}$', r'$(N-1)(k-1)$', r'$\frac{SS_{Within}}{(N-1)(k-1)}$',''],
    # inner sum runs over the N subjects, matching the notation of the other rows
    ['Total', r'$\sum_{j=1}^{k}\sum_{i=1}^{N}(x_{ij}-\overline{x})^2$', r'$Nk-1$', '', '']
]


# Create table on an otherwise empty axis
fig, ax = plt.subplots()
ax.axis('off')
table = ax.table(cellText   = cell_text,
                 colLabels  = columns,
                 colColours = [(.8,.8,.8)] * len(columns),
                 cellLoc    = 'center',
                 loc        = 'center')

# adjustments: each later call replaces the font set by the earlier one, so the
# header row ends up bold, the MS/F formulas large, and all other cells serif
for (row, col), cell in table.get_celld().items():
  cell.set_text_props(fontproperties=FontProperties(family='serif'))
  if row==0: cell.set_text_props(fontproperties=FontProperties(weight='bold',size=16))
  if row>0 and col>2: cell.set_text_props(fontproperties=FontProperties(size=20))

# keep the font sizes chosen above, then enlarge the cells
table.auto_set_font_size(False)
table.scale(1.8,4)

# export
plt.savefig('anova_rmANOVAtable.png',bbox_inches='tight')
plt.show()

# Figures 14.15 - 14.18: Example rmANOVA (the "snacks study")

In [None]:
# Mood ratings of the same 8 participants under each of 4 snack conditions
participant_ids = [f'P{num}' for num in range(1,9)]
mood_by_snack = {
    'Baseline'  : [5, 7, 6, 6, 5, 8, 7, 6],
    'Chocolate' : [6, 8, 8, 7, 8, 9, 8, 7],
    'Chips'     : [5, 7, 6, 5, 4, 6, 4, 6],
    'Ice Cream' : [7, 9, 7, 8, 7, 9, 8, 9],
}

# stack the conditions into "long" format: one row per participant-snack pair
data = {
    'Participant': participant_ids*len(mood_by_snack),
    'Snack': [snack for snack,scores in mood_by_snack.items() for _ in scores],
    'Mood': [score for scores in mood_by_snack.values() for score in scores],
}

df = pd.DataFrame(data)

# show the data in "long" format (every 4th row)
df[::4]

In [None]:
# show the data in "wide" format: one row per participant, one column per snack
mood_wide = df.pivot(values='Mood', columns='Snack', index='Participant')
mood_wide

In [None]:
# Box plot of the mood scores, one box per snack
fig,ax = plt.subplots(figsize=(8,4))
sns.boxplot(data=df, x='Snack', y='Mood', palette='BuPu', ax=ax)
ax.set_title('Mood scores by Snack type',loc='center')

fig.tight_layout()
fig.savefig('anova_rmSnackRes.png')
plt.show()

In [None]:
# one-way repeated-measures ANOVA: Mood by Snack, with Participant as the repeated unit
rmANOVA = pg.rm_anova(dv='Mood', within='Snack', subject='Participant',
                      data=df, detailed=True)
rmANOVA

In [None]:
# pairwise comparisons between snacks, with Bonferroni adjustment
pairwise_tests = pg.pairwise_tests(dv='Mood', within='Snack', subject='Participant',
                                   padjust='bonferroni', data=df)

print(pairwise_tests)

In [None]:
# FYI: a Tukey test implemented with statsmodels.
# The Tukey test is not appropriate for repeated-measures factors,
# although the conclusions here are the same as in the previous cell.
mood_scores  = df['Mood']
snack_labels = df['Snack']
m_comp = sm.stats.multicomp.MultiComparison(mood_scores,snack_labels)
tukey_result = m_comp.tukeyhsd()

print(tukey_result)

In [None]:
# mean mood per snack
snack_groups = df.groupby('Snack')['Mood']
group_means = snack_groups.mean()

# predicted value of each row = the mean of that row's snack group
df['Predicted'] = snack_groups.transform('mean')

# residual = observed minus predicted
df['Residual'] = df['Mood'].sub(df['Predicted'])

# show a few rows (every 4th)
df[::4]

# Figure 14.21: Inspecting ANOVA results

In [None]:
# Three diagnostic views of the residuals, side by side
fig,axs = plt.subplots(1,3,figsize=(10,3.5))
ax_hist,ax_fit,ax_qq = axs
gray = (.7,.7,.7)

# A) histogram of the residuals
ax_hist.hist(df['Residual'],bins=5,facecolor=gray,edgecolor='k')
ax_hist.set_xlabel('Residuals')
ax_hist.set_ylabel('Count')
ax_hist.set_title(r'$\bf{A}$)  Residuals histogram')

# B) residuals against the predicted values, with a zero line drawn underneath
ax_fit.plot(df['Predicted'], df['Residual'],'ko',markersize=10,markerfacecolor=gray)
ax_fit.axhline(y=0, color='k', linestyle='-', zorder=-2)
ax_fit.set_xlabel('Predicted values')
ax_fit.set_ylabel('Residuals')
ax_fit.set_xlim([5,8.5])
ax_fit.set_ylim([-2.5,2.5])
ax_fit.set_title(r'$\bf{B}$)  Residuals vs. $\hat{y}$')

# C) QQ plot against a normal distribution; probplot draws the points first
# and the fitted line second, so restyle them in that order
stats.probplot(df['Residual'],dist='norm',plot=ax_qq)
qq_points,qq_line = ax_qq.get_lines()[:2]
qq_points.set(markerfacecolor=gray,
              markeredgecolor='k',
              markersize=10,
              alpha=.7)
qq_line.set(zorder=-1,color='k')
ax_qq.set_title(r'$\bf{C}$)  QQ-plot')


fig.tight_layout()
fig.savefig('anova_residuals.png')
plt.show()

# Figure 14.23: 2-way ANOVA table

In [None]:
# Contents of the two-way ANOVA table (formulas are LaTeX strings)
from matplotlib.font_manager import FontProperties

rows = ['Between A', 'Between B', 'Interaction AB', 'Within', 'Total']
columns = ['Source', 'SS', 'df', 'MS', 'F']

# one list per table row, ordered like `columns`
factorA_row     = ['Between A', r'$SS_A$', r'$A-1$', r'$\frac{SS_A}{df_A}$', r'$\frac{MS_A}{MS_W}$']
factorB_row     = ['Between B', r'$SS_B$', r'$B-1$', r'$\frac{SS_B}{df_B}$', r'$\frac{MS_B}{MS_W}$']
interaction_row = ['Interaction AB', r'$SS_{AB}$', r'$(A-1)(B-1)$', r'$\frac{SS_{AB}}{df_{AB}}$', r'$\frac{MS_{AB}}{MS_W}$']
within_row      = ['Within', r'$SS_W$', r'$N-AB$', r'$\frac{SS_W}{df_W}$', '']
total_row       = ['Total', r'$SS_T$', r'$N-1$', '', '']
cell_text = [factorA_row, factorB_row, interaction_row, within_row, total_row]


# Draw the table on an otherwise empty axis
fig, ax = plt.subplots()
ax.axis('off')
header_colors = [(.8,.8,.8) for _ in columns]
table = ax.table(loc        = 'center',
                 cellLoc    = 'center',
                 colLabels  = columns,
                 colColours = header_colors,
                 cellText   = cell_text)

# adjustments: bold header row, larger font for the MS and F formulas, serif elsewhere
for (row, col), cell in table.get_celld().items():
  if row==0:
    cellfont = FontProperties(weight='bold',size=16)
  elif col>2:
    cellfont = FontProperties(size=20)
  else:
    cellfont = FontProperties(family='serif')
  cell.set_text_props(fontproperties=cellfont)

# keep the font sizes chosen above, then enlarge the cells
table.auto_set_font_size(False)
table.scale(1.8,4)

# export
fig.savefig('anova_2ANOVAtable.png',bbox_inches='tight')
plt.show()

# Figure 14.24: Simulate data for a one-way ANOVA

In [None]:
# population mean of each level (the number of levels follows from this list)
level_means = [ 0,.1,.5 ]
nLevels = len(level_means)

# observations per level, and total number of rows in the dataset
samplesize = 34
nDataRows = samplesize*nLevels

# group assignment of each row: 0,1,2,0,1,2,...
group_column = np.tile(np.arange(nLevels), samplesize)


# Build the data column one level at a time: draw a full-length vector of random
# numbers around that level's mean, and keep only the rows belonging to the level
# (the boolean mask zeroes out all other rows).
col_data = np.zeros(nDataRows)
for i,cellMean in enumerate(level_means):
  whichrows = group_column==i
  col_data += whichrows*np.random.normal(loc=cellMean,scale=1,size=nDataRows)



# collect both columns in a dataframe
df = pd.DataFrame({'Group':group_column, 'Value':col_data})

In [None]:
# two panels: an excerpt of the dataset drawn as a table, and box plots per group
fig,axs = plt.subplots(1,2,figsize=(10,4))

### panel A: example data showing formatting

# string-formatted copy of the data (integer group labels, values to 2 decimals)
dfd = df.copy()
for colname,fmt in {'Group':'{:.0f}', 'Value':'{:.2f}'}.items():
  dfd[colname] = dfd[colname].map(fmt.format)

# first 9 rows as a table with a gray header
header_colors = [(.8,.8,.8) for _ in dfd.columns]
table = axs[0].table(loc        = 'center',
                     cellLoc    = 'center',
                     colLabels  = dfd.columns,
                     colColours = header_colors,
                     cellText   = dfd.iloc[:9].to_numpy())

# fonts: bold header row, serif everywhere else
for (row, col), cell in table.get_celld().items():
  if row==0:
    cellfont = FontProperties(weight='bold',size=14)
  else:
    cellfont = FontProperties(family='serif')
  cell.set_text_props(fontproperties=cellfont)

# cell geometry, then one fixed font size for the whole table
table.scale(.7,1.8)
table.auto_set_font_size(False)
table.set_fontsize(14)
axs[0].axis('off')
axs[0].set_title(r'$\bf{A}$)  Data format')


### panel B: boxplots of data
sns.boxplot(data=df, x='Group', y='Value', palette='BuPu', ax=axs[1])
axs[1].set_title(r'$\bf{B}$)  Data box plots')

fig.tight_layout()
fig.savefig('anova_sim1b.png')
plt.show()

In [None]:
# One-way between-subjects ANOVA on the simulated data (full table)
oneway_table = pg.anova(data=df, dv='Value', between='Group', detailed=True)
oneway_table

# Figure 14.25: Parametric experiment on a one-way ANOVA

In [None]:
# per-group sample sizes to test: 5, 6, ..., 150
samplesizes = np.arange(5,151)

# population mean of each level (fixed across the experiment)
level_means = [ 0,.2,.4 ]
nLevels = len(level_means)


## run the experiment: simulate one dataset per sample size and store its p-value
pvals = np.zeros(len(samplesizes))

for expi,N in enumerate(samplesizes):

  # total rows in this dataset, and the group assignment of each row (0,1,2,0,1,2,...)
  nDataRows = N*nLevels
  group_column = np.tile(np.arange(nLevels), N)

  # data column: for each level, draw a full-length random vector around the
  # level mean and keep only the rows that belong to that level
  col_data = np.zeros(nDataRows)
  for i,levelMean in enumerate(level_means):
    levelMask = group_column==i
    col_data += np.random.normal(loc=levelMean,size=nDataRows)*levelMask

  # import data into a dataframe
  df = pd.DataFrame({ 'Group':group_column, 'Value':col_data })

  # run the ANOVA and store the (single) p-value of the table
  anova = pg.anova(data=df, dv='Value', between='Group')
  pvals[expi] = anova['p-unc'].item()



## visualization: log(p) by sample size, with the p=.05 threshold as a dashed line
fig,ax = plt.subplots(figsize=(8,4))
ax.plot(samplesizes,np.log(pvals),'ks',markersize=10,markerfacecolor=(.8,.8,.8))
ax.axhline(y=np.log(.05),color='k',linestyle='--',zorder=-1)
ax.set_xlabel('Sample size')
ax.set_ylabel('log(p)')
ax.set_xlim([samplesizes[0]-2,samplesizes[-1]+2])

fig.tight_layout()
fig.savefig('anova_sim1b_exp.png')
plt.show()

# Figure 14.26: Simulate data for a one-way repeated-measures ANOVA

In [None]:
# population mean of each level (the number of levels follows from this list)
level_means = [ 0,.1,.5 ]
nLevels = len(level_means)

# number of subjects, and total number of rows in the dataset
samplesize = 34
nDataRows = samplesize*nLevels


# every subject contributes one row per level:
# subjects run 0,0,0,1,1,1,... while groups cycle 0,1,2,0,1,2,...
subject_column = np.repeat(np.arange(samplesize), nLevels)
group_column = np.tile(np.arange(nLevels), samplesize)

# Build the data column one level at a time: draw a full-length vector of random
# numbers around that level's mean, and keep only the rows belonging to the level.
col_data = np.zeros(nDataRows)
for i,cellMean in enumerate(level_means):
  whichrows = (group_column==i)
  col_data += whichrows*np.random.normal(loc=cellMean,scale=1,size=nDataRows)


# collect the three columns in a dataframe
df = pd.DataFrame({'Subject':subject_column, 'Group':group_column, 'Value':col_data})

In [None]:
# two panels: an excerpt of the dataset drawn as a table, and box plots per group
fig,axs = plt.subplots(1,2,figsize=(10,4))

### panel A: example data showing formatting

# string-formatted copy of the data (integer labels, values to 2 decimals)
dfd = df.copy()
for colname,fmt in {'Subject':'{:.0f}', 'Group':'{:.0f}', 'Value':'{:.2f}'}.items():
  dfd[colname] = dfd[colname].map(fmt.format)

# first 9 rows as a table with a gray header
header_colors = [(.8,.8,.8) for _ in dfd.columns]
table = axs[0].table(loc        = 'center',
                     cellLoc    = 'center',
                     colLabels  = dfd.columns,
                     colColours = header_colors,
                     cellText   = dfd.iloc[:9].to_numpy())

# fonts: bold header row, serif everywhere else
for (row, col), cell in table.get_celld().items():
  if row==0:
    cellfont = FontProperties(weight='bold',size=14)
  else:
    cellfont = FontProperties(family='serif')
  cell.set_text_props(fontproperties=cellfont)

# cell geometry, then one fixed font size for the whole table
table.scale(.7,1.8)
table.auto_set_font_size(False)
table.set_fontsize(14)
axs[0].axis('off')

axs[0].set_title(r'$\bf{A}$)  Data format')


### panel B: boxplots of data
sns.boxplot(data=df, x='Group', y='Value', palette='BuPu', ax=axs[1])
axs[1].set_title(r'$\bf{B}$)  Data box plots')

fig.tight_layout()
fig.savefig('anova_sim1r.png')
plt.show()

In [None]:
# One-way repeated-measures ANOVA on the simulated data (full table)
rm_table = pg.rm_anova(data=df, dv='Value', within='Group', subject='Subject', detailed=True)
rm_table

# Figure 14.27: Simulate data for a two-way between-subjects ANOVA

In [None]:
# subjects per group (i.e., per cell of the design)
n = 30

# population cell means:
# rows are the levels of "factor A", columns are the levels of "factor B"
group_means = [ [ 1,1,1.5,.5 ],
                [ 1,1,.5,1.5 ] ]

# number of levels per factor, and total number of rows in the dataset
factA,factB = np.shape(group_means)
nDataRows = n*factA*factB

# factor labels per row: A changes slowest; within each level of A,
# B steps through its levels in runs of n rows
colA = np.repeat(np.arange(factA), n*factB)
colB = np.repeat(np.tile(np.arange(factB), factA), n)


# Build the data column one cell at a time: draw a full-length vector of random
# numbers around that cell's mean, and keep only the rows belonging to the cell.
col_data = np.zeros(nDataRows)
for a,b in np.ndindex(factA,factB):
  whichrows = (colA==a) & (colB==b)
  cellMean = group_means[a][b]
  col_data += whichrows*np.random.normal(loc=cellMean,scale=1,size=nDataRows)



# collect the three columns in a dataframe
df = pd.DataFrame({'A':colA, 'B':colB, 'y':col_data})

# (uncomment to print the entire dataframe)
#print(df.to_string())

In [None]:
# two panels: an excerpt of the dataset drawn as a table, and box plots per cell
fig,axs = plt.subplots(1,2,figsize=(10,4))

### panel A: example data showing formatting

# string-formatted copy of the data (integer factor labels, values to 2 decimals)
dfd = df.copy()
for colname,fmt in {'A':'{:.0f}', 'B':'{:.0f}', 'y':'{:.2f}'}.items():
  dfd[colname] = dfd[colname].map(fmt.format)

# first 11 rows as a table with a gray header
header_colors = [(.8,.8,.8) for _ in dfd.columns]
table = axs[0].table(loc        = 'center',
                     cellLoc    = 'center',
                     colLabels  = dfd.columns,
                     colColours = header_colors,
                     cellText   = dfd.iloc[:11].to_numpy())

# fonts: bold header row, serif everywhere else
for (row, col), cell in table.get_celld().items():
  if row==0:
    cellfont = FontProperties(weight='bold',size=14)
  else:
    cellfont = FontProperties(family='serif')
  cell.set_text_props(fontproperties=cellfont)

# cell geometry, then one fixed font size for the whole table
table.scale(.7,1.6)
table.auto_set_font_size(False)
table.set_fontsize(13)
axs[0].axis('off')
axs[0].set_title(r'$\bf{A}$)  Data format')


### panel B: boxplots of data, grouped by A and colored by B
sns.boxplot(data=df, x='A', y='y', hue='B', palette='BuPu', ax=axs[1])
axs[1].set_title(r'$\bf{B}$)  Data box plots')

fig.tight_layout()
fig.savefig('anova_sim2b.png')
plt.show()

In [None]:
# two-way between-subjects ANOVA (factors A and B), printed as plain text
twoway_table = pg.anova(dv='y', between=['A','B'], data=df, detailed=True)
print(twoway_table)

# Figure 14.28: Experiment: Interaction by standard deviation

In [None]:
# population standard deviations to test, from 2 down to .2
stdevs = np.linspace(2,.2,43)

# subjects per group
n = 30

# population cell means
# "factor A" is the number of rows, "factor B" is the number of columns
group_means = [ [ 1,1,1.3,.7 ],
                [ 1,1,.7,1.3 ] ]

factA,factB = np.shape(group_means)
nDataRows = n*factA*factB # total rows in the dataset

# create the column subject and group assignments
colA = np.repeat(np.arange(factA), n*factB)
colB = np.repeat(np.tile(np.arange(factB), factA), n)




### run the experiment
# one row per standard deviation;
# column 0 holds the p-value of the main effect of B, column 1 that of the interaction
intpvals = np.zeros((len(stdevs),2))

for expi,std in enumerate(stdevs):

  # column data (initialize as zeros, then fill in cell by cell): a full-length
  # random vector is drawn per cell and masked to the rows of that cell
  col_data = np.zeros(nDataRows)
  for a in range(factA):
    for b in range(factB):
      whichrows = (colA==a) & (colB==b)
      cellMean = group_means[a][b]
      col_data += np.random.normal(loc=cellMean,scale=std, # modulate the standard deviation
                                   size=nDataRows)*whichrows

  # Create dataframe
  df = pd.DataFrame({
        'A' : colA,
        'B' : colB,
        'y' : col_data
  })

  # Store the p-values from the 2nd and 3rd rows of the ANOVA table (the main effect
  # of B and the interaction term). .iloc makes the slice explicitly positional,
  # so it cannot be read as a label-based slice on the table's integer index.
  anova_table = pg.anova(data=df,dv='y',between=['A','B'])
  intpvals[expi,:] = anova_table['p-unc'].iloc[1:3].to_numpy()

  # keep the dataset from the middle of the experiment for plotting
  if expi==len(stdevs)//2: df2plot=df.copy()


## visualization
_,axs = plt.subplots(1,2,figsize=(11,4))

# bar plot of the example dataset
sns.barplot(x='A', y='y', hue='B', data=df2plot, palette='BuPu',ax=axs[0])
axs[0].set_title(fr'$\bf{{A}}$)  Bar plot of data (std={stdevs[len(stdevs)//2]:.2f})')

# plot the p-values with a + for p<.05
axs[1].plot(stdevs,np.log(intpvals[:,0]),'ks',markersize=10,markerfacecolor=(.4,.4,.4),label='Main effect of "B"')
axs[1].plot(stdevs[intpvals[:,0]<.05],np.log(intpvals[intpvals[:,0]<.05,0]),'w+',markersize=10)
axs[1].plot(stdevs,np.log(intpvals[:,1]),'ko',markersize=10,markerfacecolor=(.9,.9,.9),label='Interaction')
axs[1].plot(stdevs[intpvals[:,1]<.05],np.log(intpvals[intpvals[:,1]<.05,1]),'k+',markersize=10)

# some other adjustments (dashed line marks p=.05)
axs[1].axhline(y=np.log(.05),color='k',linestyle='--')
axs[1].set(xlabel='Population standard deviation',ylabel='log(p)')
axs[1].legend()
axs[1].set_title(r'$\bf{B}$)  P-values')

plt.tight_layout()
plt.savefig('anova_sim2b_std.png')
plt.show()

# Figure 14.29: Two-way mixed-effects ANOVA

In [None]:
# subjects per group (i.e., per level of the between-subjects factor)
n = 30

# population cell means
# "factor A" is the number of rows, "factor B" is the number of columns
# Factor B is repeated-measures; Factor A is between-subjects
group_means = [ [1.1,1.2,1.3],
                [2,2.2,2.5] ]

factA,factB = np.shape(group_means)
nDataRows = n*factA*factB # total rows in the dataset

# create the column subject and group assignments:
# each subject occupies factB consecutive rows (one per level of B),
# and all subjects of one level of A come before those of the next level
colA = np.repeat(np.arange(factA), n*factB)
colB = np.tile(np.arange(factB), n*factA)
colS = np.arange(nDataRows)//factB # integer subject IDs (0,0,0,1,1,1,...)



# column data: for each cell, draw a full-length random vector around the
# cell mean and keep only the rows belonging to that cell
col_data = np.zeros(nDataRows)
for a in range(factA):
  for b in range(factB):

    # row selection
    whichrows = (colA==a) & (colB==b)

    # population cell mean
    cellMean = group_means[a][b]

    # random data for those rows
    col_data += np.random.normal(loc=cellMean,scale=1,size=nDataRows)*whichrows



# Create data
df = pd.DataFrame({
          'A' : colA, # between-subjects levels
          'B' : colB, # within-subjects level
         'ID' : colS, # subject ID (to know which data values are repeated)
          'y' : col_data
})

# show the first rows of the dataframe (use print(df.to_string()) to see all rows)
df.head(10)

In [None]:
# Mixed-design ANOVA: A varies between subjects, B within subjects (identified by ID)
mixed_table = pg.mixed_anova(dv='y', between='A', within='B', subject='ID', data=df)
mixed_table

In [None]:
# two panels: an excerpt of the dataset drawn as a table, and box plots per cell
fig,axs = plt.subplots(1,2,figsize=(10,4))

### panel A: example data showing formatting

# string-formatted copy of the data (integer labels and IDs, values to 2 decimals)
dfd = df.copy()
for colname,fmt in {'A':'{:.0f}', 'B':'{:.0f}', 'ID':'{:.0f}', 'y':'{:.2f}'}.items():
  dfd[colname] = dfd[colname].map(fmt.format)

# first 11 rows as a table with a gray header
header_colors = [(.8,.8,.8) for _ in dfd.columns]
table = axs[0].table(loc        = 'center',
                     cellLoc    = 'center',
                     colLabels  = dfd.columns,
                     colColours = header_colors,
                     cellText   = dfd.iloc[:11].to_numpy())

# fonts: bold header row, serif everywhere else
for (row, col), cell in table.get_celld().items():
  if row==0:
    cellfont = FontProperties(weight='bold',size=14)
  else:
    cellfont = FontProperties(family='serif')
  cell.set_text_props(fontproperties=cellfont)

# cell geometry, then one fixed font size for the whole table
table.scale(.7,1.6)
table.auto_set_font_size(False)
table.set_fontsize(13)
axs[0].axis('off')
axs[0].set_title(r'$\bf{A}$)  Data format')


### panel B: boxplots of data, grouped by A and colored by B
sns.boxplot(data=df, x='A', y='y', hue='B', palette='BuPu', ax=axs[1])
axs[1].set_title(r'$\bf{B}$)  Data box plots')

fig.tight_layout()
fig.savefig('anova_sim2w.png')
plt.show()

# Exercise 1

In [None]:
### the raw data (one array per creature type; the samples are not all the same size)
elves  = np.array([17, 20, 16, 22, 20, 12, 15, 23,  9, 22, 21, 19, 12    ])
dwarfs = np.array([15, 14, 15, 25, 19, 16, 20, 18, 18, 15, 18, 13, 14, 15])
trolls = np.array([14, 16, 11, 17, 12, 13, 10, 12, 10, 18, 13, 14, 11, 20])

### descriptive statistics

# sample sizes
Nelves,Ndwarfs,Ntrolls = elves.size,dwarfs.size,trolls.size

# means
mean_elves,mean_dwarfs,mean_trolls = elves.mean(),dwarfs.mean(),trolls.mean()

# standard errors of the mean: sample standard deviation (ddof=1) over sqrt(N)
sem_elves  = elves.std(ddof=1)  / np.sqrt(Nelves)
sem_dwarfs = dwarfs.std(ddof=1) / np.sqrt(Ndwarfs)
sem_trolls = trolls.std(ddof=1) / np.sqrt(Ntrolls)

In [None]:
# Bar plot of the group means with standard-error bars
creature_names = ['Elves', 'Dwarfs', 'Trolls']
creature_means = [mean_elves,mean_dwarfs,mean_trolls]
creature_sems  = [sem_elves,sem_dwarfs,sem_trolls]
creature_Ns    = [Nelves,Ndwarfs,Ntrolls]
bar_positions  = range(3)

fig,ax = plt.subplots(figsize=(8,5))

# the bars, with error bars (and a marker at each mean) drawn on top
ax.bar(bar_positions,creature_means,color=(.7,.7,.7))
ax.errorbar(bar_positions,creature_means,yerr=creature_sems,fmt='ko')

# text inside each bar, placed at half the bar height
for xpos,creatureMean,creatureN in zip(bar_positions,creature_means,creature_Ns):
  ax.text(xpos,creatureMean/2,f'Mean={creatureMean:.1f}\nN={creatureN}',ha='center')

ax.set_xticks(bar_positions)
ax.set_xticklabels(creature_names)
ax.set_yticks(np.arange(19,step=3))
ax.set_ylabel('Number of spells per minute')
ax.set_title('Spell-casting speeds of elves, dwarfs, and trolls',loc='center')

fig.tight_layout()
fig.savefig('anova_magicalMeans.png')
plt.show()

In [None]:
# The groups to compare; the number of groups is taken from this list
groups = [elves,dwarfs,trolls]
nGroups = len(groups)

# Stack the data into a single array for convenience
all_data = np.hstack(groups)

# Calculate the overall mean
total_mean = np.mean(all_data)


# Calculate SS_Between: squared distance of each group mean to the overall mean,
# weighted by the group's sample size
ss_between = 0
for group in groups:
  ss_between += len(group) * (group.mean() - total_mean)**2

# Could also use list comprehension, but I think a loop is more readable.
#ss_between = np.sum([len(group) * (group.mean() - total_mean)**2 for group in groups])



# Calculate SS_Within: squared distance of each data value to its own group mean
ss_within = 0
for group in groups:
  ss_within += np.sum( (group - group.mean())**2 )


# Calculate SS Total (the between and within sums of squares add up to the total)
ss_total = ss_between + ss_within

# Calculate degrees of freedom for between, within, and total
df_between = nGroups - 1  # number of groups minus 1
df_within = len(all_data) - nGroups  # number of observations minus number of groups
df_total = len(all_data) - 1  # number of observations minus 1

# Calculate MS_Between and MS_Within
ms_between = ss_between / df_between
ms_within = ss_within / df_within

# Calculate F statistic and associated p-value.
# The survival function (sf) is the upper-tail probability 1-cdf, computed
# directly so that very small p-values are not lost to rounding in "1 - cdf".
f_stat = ms_between / ms_within
p_value = stats.f.sf(f_stat, df_between, df_within)


# Print out the ANOVA table
print('Source\t|    SS\t\tdf\t  MS\t F\tp-value')
print('-'*56)
print(f'Between\t| {ss_between:6.2f}\t {df_between}\t{ms_between:.2f}\t{f_stat:.2f}\t{p_value:.4f}')
print(f'Within\t| {ss_within:6.2f}\t{df_within}\t{ms_within:.2f}')
print(f'Total\t| {ss_total:6.2f}\t{df_total}')

In [None]:
# effect sizes computed from the ANOVA quantities above:
#   eta^2   = SS_between / SS_total
#   omega^2 = (SS_between - df_between*MS_within) / (SS_total + MS_within)
eta2   = ss_between / ss_total
omega2 = (ss_between - df_between*ms_within) / (ss_total+ms_within)

# report both with aligned labels
for label,value in (('eta^2  ',eta2), ('omega^2',omega2)):
  print(f'{label} = {value:.3f}')

# Exercise 2

In [None]:
# Build a long-format table: one row per observation plus its creature label

# group labels, repeated once per observation and in the same order as the data
group_labels = []
for labelName,labelCount in (('Elves',Nelves), ('dwarfs',Ndwarfs), ('trolls',Ntrolls)):
  group_labels.extend([labelName]*labelCount)

# all observations in one numpy array (elves, then dwarfs, then trolls)
data = np.hstack((elves,dwarfs,trolls))

# the DataFrame used by the ANOVA functions
df = pd.DataFrame({'Spells':data, 'Creature':group_labels})

# show every 6th row
df.iloc[::6]

In [None]:
# One-way ANOVA of Spells by Creature, called as the DataFrame method that
# pingouin registers (so `data` does not need to be passed); detailed table
result = df.anova(dv='Spells', between='Creature', detailed=True)
result

In [None]:
# Same ANOVA with detailed=False, printed as plain text for comparison
result = df.anova(dv='Spells', between='Creature', detailed=False)
print(result)

In [None]:
# Tukey post-hoc test on every pair of creature types (rounded to 3 decimals)
pg.pairwise_tukey(data=df, dv='Spells', between='Creature').round(3)

In [None]:
## FYI, the same one-way ANOVA in statsmodels (not part of this exercise):

# specify the linear model (Creature as a categorical predictor), then fit it
model_spec = ols('Spells ~ C(Creature)', data=df)
model = model_spec.fit()

# ANOVA table from the fitted model, using type-2 sums of squares
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table

# Exercise 3

In [None]:
## data parameters: population mean and sample size of each group
mean1,N1 = 4,30
mean2,N2 = 6,35

## simulate the data (both groups use a standard deviation of 2;
## group 1 is drawn first, then group 2)
data1 = np.random.normal(loc=mean1,scale=2,size=N1)
data2 = np.random.normal(loc=mean2,scale=2,size=N2)

## long format: one column of values, one column of group labels
datacolumn = np.concatenate((data1,data2))
groups = ['1']*N1 + ['2']*N2

# convert to a pandas dataframe
df = pd.DataFrame({'TheData':datacolumn,'Group':groups})
df

In [None]:
# analyze the same two groups twice: a one-way ANOVA and an independent-samples t-test

# boolean row selectors for each group
isGroup1 = df['Group']=='1'
isGroup2 = df['Group']=='2'

anova = pg.anova(data=df,dv='TheData',between='Group')
ttest = stats.ttest_ind( df['TheData'][isGroup1], df['TheData'][isGroup2] )

In [None]:
# report the ANOVA and the t-test side by side; t^2 is printed for comparison with F

# pull the scalars out of the one-row ANOVA table
anovaDFs  = (anova['ddof1'].item(),anova['ddof2'].item())
anovaF    = anova['F'].item()
anovaPval = anova['p-unc'].item()

print(f"ANOVA: F{anovaDFs} = {anovaF:.3f}, p = {anovaPval:.3f}")

print(f'\nT-test: t({N1+N2-2}) = {ttest.statistic:.2f}, p = {ttest.pvalue:.3f}')

print(f'\nt^2 = {ttest.statistic**2:.3f}')

# Exercise 4

In [None]:
## one simulated dataset: three groups of N drawn from the same distribution,
## with two outliers placed at the end of group 3

# per-group sample size and the matching group labels
N = 20
groups = ['1']*N + ['2']*N + ['3']*N

# standard-normal data for all three groups
data = np.random.normal(0,1,size=3*N)

# overwrite the final two data points with outliers (fixed to 10)
data[-2:] = 10

# dataframe and one-way ANOVA
df = pd.DataFrame({'TheData':data,'Group':groups})
pg.anova(data=df,dv='TheData',between='Group')

In [None]:
## repeat the outlier simulation many times and count the significant ANOVAs

# simulation parameters
N = 50          # samples per group
nOutliers = 3   # number of outliers at the end of group 3
nTests = 300    # number of tests to simulate

# group labels (identical on every test)
groups = ['1']*N + ['2']*N + ['3']*N

# running count of tests with p<.05
isSig = 0

for testi in range(nTests):

  # standard-normal data for all groups; the last nOutliers values are redrawn around 10
  data = np.random.normal(0,1,size=3*N)
  data[-nOutliers:] = np.random.normal(10,1,size=nOutliers)

  # one-way ANOVA on this dataset
  df = pd.DataFrame({'TheData':data,'Group':groups})
  anova = pg.anova(data=df,dv='TheData',between='Group')

  # tally the significant tests
  if anova['p-unc'].item()<.05:
    isSig += 1


# print the results
print(f'{isSig} of {nTests} tests ({isSig*100/nTests:.2f}%) had p<.05 with N={N} and {nOutliers} outliers in group 3.')

# Exercise 5

In [None]:
# Here is one possible way to do it:
# 10 groups (levels of the factor), each with only 1 sample, and one additional group with 20 samples.

# Numerator (between-group) df: (number of groups - 1) = (10+1 - 1) = 10
# Denominator (within-group) df: (total number of observations - number of groups) = (10 + 20 - 11) = 19
# So in this contrived example, the numerator df (10) is smaller than the denominator df (19).

# Exercise 6

In [None]:
## one simulated dataset: two large groups whose population means differ by .1

# per-group sample size and group labels
N = 10000
groups = ['1']*N + ['2']*N

# unit-variance normal data (group 1 mean = 0, group 2 mean = .1)
data1 = np.random.normal(0,1,size=N)
data2 = np.random.normal(.1,1,size=N)
data  = np.hstack((data1,data2))

# dataframe and one-way ANOVA
df = pd.DataFrame({'TheData':data,'Group':groups})
pg.anova(data=df,dv='TheData',between='Group')

In [None]:
## repeat the two-group simulation with a tiny mean difference (.01),
## keeping the p-value and effect size of every test

# simulation parameters
N = 10000     # samples per group
nTests = 300  # number of tests to simulate
groups = ['1']*N + ['2']*N

# per-test results
pvals = np.zeros(nTests)  # p-values
peta2 = np.zeros(nTests)  # partial eta^2, in percent

for i in range(nTests):

  # simulate the data
  data1 = np.random.normal(0,1,size=N)
  data2 = np.random.normal(.01,1,size=N)
  data  = np.hstack((data1,data2))

  # run an ANOVA
  df = pd.DataFrame({'TheData':data,'Group':groups})
  anova = pg.anova(data=df,dv='TheData',between='Group')

  # store the p-value and the effect size (scaled to percent)
  pvals[i] = anova['p-unc'].item()
  peta2[i] = 100*anova['np2'].item()


# print the results
nSig = np.sum(pvals<.05)
print(f'{nSig} of {nTests} tests ({nSig*100/nTests:.2f}%) had p<.05 with N={N}.')

In [None]:
# effect sizes plotted against significance (panel A) and against the p-values (panel B)
fig,axs = plt.subplots(1,2,figsize=(10,4))

# marker styling and y-axis label shared by both panels
marker_kw = dict(markersize=10,markerfacecolor=(.7,.7,.7),alpha=.5)
eta_label = r'Partial $\eta^2$ (%)'

# panel A: x is the boolean "p<.05" (False plots at 0, True at 1)
axA = axs[0]
axA.plot(pvals<.05,peta2,'ko',**marker_kw)
axA.set(xlim=[-.5,1.5],xticks=[0,1],xticklabels=['p>.05','p<.05'],ylabel=eta_label)
axA.set_title(r'$\bf{A}$)  Effect sizes by significance')

# panel B: x is the p-value itself
axB = axs[1]
axB.plot(pvals,peta2,'ks',**marker_kw)
axB.set(xlabel='P-values',ylabel=eta_label)
axB.set_title(r'$\bf{B}$)  Effect sizes by p-values')

plt.tight_layout()
plt.savefig('anova_ex6.png')
plt.show()

In [None]:
### repeat for random sample size
### (the per-group sample size and the group-2 population mean are redrawn on every test)

# experiment params
nTests = 300 # number of tests to simulate
pvals = np.zeros(nTests)  # p-value of each test
peta2 = np.zeros(nTests)  # partial eta^2 of each test, in percent
sampleSizes = np.zeros(nTests,dtype=int)  # per-group N used in each test


# now for the experiment!
for i in range(nTests):

  # sample size (random integer from 10 up to, but not including, 10000)
  N = np.random.randint(10,10000)
  sampleSizes[i] = N
  groups = ['1']*N + ['2']*N

  ##simulate the data (group-2 mean is a squared uniform random number, so between 0 and 1)
  data1 = np.random.normal(0,1,size=N)
  data2 = np.random.normal(np.random.rand()**2,1,size=N)
  data  = np.concatenate((data1,data2),axis=0)

  # run an ANOVA
  df = pd.DataFrame({'TheData':data,'Group':groups})
  anova = pg.anova(data=df,dv='TheData',between='Group')

  # store the p-value and the effect size (scaled to percent)
  pvals[i] = anova['p-unc'].item()
  peta2[i] = 100*anova['np2'].item()


# print the results
# (N differs across tests, so report the range that was used rather than only the last draw)
nSig = np.sum(pvals<.05)
print(f'{nSig} of {nTests} tests ({nSig*100/nTests:.2f}%) had p<.05 with N between {sampleSizes.min()} and {sampleSizes.max()}.')

In [None]:
# effect sizes plotted against significance (panel A) and against log p-values (panel B)
_,axs = plt.subplots(1,2,figsize=(10,4))

# panel A: x is the boolean "p<.05" (False plots at 0, True at 1)
axs[0].plot(pvals<.05,peta2,'ko',markersize=10,markerfacecolor=(.7,.7,.7),alpha=.5)
axs[0].set(xlim=[-.5,1.5],xticks=[0,1],xticklabels=['p>.05','p<.05'],ylabel=r'Partial $\eta^2$ (%)')
axs[0].set_title(r'$\bf{A}$)  Effect sizes by significance')

# panel B: natural log of the p-values.
# A p-value may underflow to exactly 0 when the effect and the sample size are both
# large; log(0) is -inf (with a numpy RuntimeWarning) and such a point would not be
# drawn. Exact zeros are therefore raised to the smallest positive float, which keeps
# them visible at the far left and leaves all other p-values unchanged.
logPvals = np.log(np.maximum(pvals,np.nextafter(0,1)))
axs[1].plot(logPvals,peta2,'ks',markersize=10,markerfacecolor=(.7,.7,.7),alpha=.5)
axs[1].set(xlabel='log(p-values)',ylabel=r'Partial $\eta^2$ (%)')
axs[1].set_title(r'$\bf{B}$)  Effect sizes by p-values')

plt.tight_layout()
plt.savefig('anova_ex6b.png')
plt.show()

# Exercise 7

In [None]:
# simulate one repeated-measures dataset (rows = subjects, columns = conditions)
n_subjects = 30
n_conditions = 3
data = np.random.normal(size=(n_subjects,n_conditions))

# small offsets to measurements #2 (+.25) and #3 (+.5); measurement #1 is left as is
data[:,1:3] += [.25,.5]

# wide-format DataFrame (one column per condition)
condNames = ['Cond1','Cond2','Cond3']
df1 = pd.DataFrame(data, columns=condNames)

# long format: the row index becomes the subject identifier
df = df1.reset_index().melt(id_vars=['index'], value_vars=condNames)
df.columns = ['Subject', 'Condition', 'Value']

# repeated-measures ANOVA (uses the subject identifier)
rmANOVA = pg.rm_anova(data=df, dv='Value', within='Condition', subject='Subject', detailed=True)
print('Results of a repeated-measures ANOVA:')
display(rmANOVA)

# between-subjects ANOVA on the same values (ignores the subject identifier)
ANOVA = pg.anova(data=df,dv='Value', between='Condition',detailed=True)
print('\n\nResults of a between-subjects ANOVA')
display(ANOVA)

In [None]:
# FYI, the same two analyses in statsmodels (not part of the exercise)

# repeated-measures ANOVA: Condition is the within-subject factor
rm_anova = AnovaRM(data=df, depvar='Value', subject='Subject', within=['Condition'])
results = rm_anova.fit()
print(results)

# between-subjects ANOVA: OLS with Condition as a categorical predictor, type-1 table
model_spec = ols('Value ~ C(Condition)', data=df)
model = model_spec.fit()
anova_results = sm.stats.anova_lm(model, typ=1)
print(anova_results)

In [None]:
# the experiment: run both ANOVAs on many simulated datasets and keep the p-values
nReps = 200

# p-values, one row per repetition; column 0 = repeated-measures, column 1 = between-subjects
pvals = np.zeros((nReps,2))

# condition (column) names used for every dataset
condNames = ['Cond1','Cond2','Cond3']

for i in range(nReps):

  # generate the data
  # (NOTE for exercise 8: append "+ np.arange(n_subjects)[:,None]" to the next line)
  data = np.random.normal(size=(n_subjects,n_conditions))
  data[:,1:3] += [.25,.5]

  # wide DataFrame -> long format with Subject/Condition/Value columns
  df1 = pd.DataFrame(data, columns=condNames)
  df = df1.reset_index().melt(id_vars=['index'], value_vars=condNames)
  df.columns = ['Subject', 'Condition', 'Value']

  # the two ANOVAs on the same data
  rmANOVA = pg.rm_anova(data=df, dv='Value', within='Condition', subject='Subject')
  ANOVA = pg.anova(data=df,dv='Value', between='Condition')

  # store both p-values for this repetition
  pvals[i] = [ rmANOVA['p-unc'].item(), ANOVA['p-unc'].item() ]


In [None]:
# visualize the p-values
# (pvals has one row per test; column 0 = repeated-measures, column 1 = between-subjects)
_,axs = plt.subplots(1,2,figsize=(10,4))

# x-axis values are derived from pvals, so the plot still works if the number of tests changes
testNumbers = np.arange(pvals.shape[0])

# panel A: both sets of p-values, test by test
axs[0].plot(testNumbers,pvals[:,0],'ks',markersize=10,markerfacecolor=(.2,.2,.2),alpha=.5,label='Repeated')
axs[0].plot(testNumbers,pvals[:,1],'ko',markersize=10,markerfacecolor=(.8,.8,.8),alpha=.5,label='Between')
axs[0].set(xlabel='Test number',ylabel='P-value')
axs[0].set_title(r'$\bf{A}$)  P-values from both tests')
axs[0].legend()

# panel B: np.diff along axis 1 gives column 1 minus column 0, i.e., between minus repeated
axs[1].hist(np.diff(pvals,axis=1),bins='fd',color=(.5,.5,.5))
axs[1].set_title(r'$\bf{B}$)  Histogram of p-value differences')
axs[1].set(xlabel=r'$p_{between}-p_{repeated}$',ylabel='Count')
axs[1].set(xlim=[-.5,.5])


plt.tight_layout()
plt.savefig('anova_ex7b.png')
plt.show()

# Exercise 8

In [None]:
# This cell shows the one-line change that turns the Exercise 7 simulation into
# the Exercise 8 simulation; it relies on n_subjects and n_conditions already being defined.

# adapt (or copy/paste) the code from Exercise 7, and replace
data = np.random.normal(size=(n_subjects, n_conditions))

# with
data = np.random.normal(size=(n_subjects, n_conditions)) + np.arange(n_subjects)[:,None]

# The idea is to use 'broadcasting' to add the index number to each data row.
# np.arange(n_subjects)[:,None] is a column (n_subjects-by-1), so row k of the data
# is shifted by k in every condition.
# The bare expression below displays the resulting matrix as the cell output.
data

In [None]:
# figure comparing the Exercise 7 and Exercise 8 simulations

# Ex.7-style data: noise plus the condition offsets
data1 = np.random.normal(size=(n_subjects, n_conditions))
data1[:,1:3] += [.25,.5]

# Ex.8-style data: the same recipe, plus the row index added to every row
data2 = np.random.normal(size=(n_subjects, n_conditions)) + np.arange(n_subjects)[:,None]
data2[:,1:3] += [.25,.5]


fig,axs = plt.subplots(1,3,figsize=(10,4))

# panels A and B: the values in each of the three columns, one dataset per panel
panelTitles = [ fr'$\bf{{A}}$)  Ex.7 data (std={np.std(data1):.1f})',
                fr'$\bf{{B}}$)  Ex.8 data  (std={np.std(data2):.1f})' ]
for ax,dat,ttl in zip(axs[:2],(data1,data2),panelTitles):
  ax.plot(dat.T,'o')
  ax.set_title(ttl)
  ax.set(xlim=[-.5,2.5],xticks=[0,1,2],xlabel='Group')

# panel C: within-row differences (column 2 minus column 0) for each dataset
for xi,dat in enumerate((data1,data2)):
  axs[2].plot(np.full(n_subjects,float(xi)),dat[:,2]-dat[:,0],'ko')
axs[2].set(xlim=[-.5,1.5],xticks=[0,1],xticklabels=['Ex.7 data','Ex.8 data'],xlabel='"2"-"0" diffs')
axs[2].set_title(r'$\bf{C}$)  Differences')

plt.tight_layout()
plt.savefig('anova_ex8.png')
plt.show()

# Exercise 9

In [None]:
# population cell means
# rows correspond to the levels of "factor A", columns to the levels of "factor B"
group_means = [ [ 1,1,1.3,.7 ],
                [ 1,1,.7,1.3 ] ]

# number of levels of each factor
factA,factB = np.shape(group_means)

# per-cell sample sizes (random integers from 25 to 35), so the design is unbalanced
cellCounts = np.random.choice(range(25,36),factA*factB)
nDataRows = np.sum(cellCounts) # total rows in the dataset


# build the data one cell at a time; each block has the columns [A level, B level, value]
cellBlocks = []
for (a,b),nCell in zip(np.ndindex(factA,factB),cellCounts):

  # random data around this cell's population mean
  celldata = np.random.normal(loc=group_means[a][b],scale=1,size=nCell)

  # label every row of the block with its factor levels
  cellBlocks.append( np.column_stack((np.full(nCell,a),np.full(nCell,b),celldata)) )

# stack the blocks into one (nDataRows x 3) matrix
datamat = np.vstack(cellBlocks)


# Create dataframe
df = pd.DataFrame(datamat,columns=['A','B','val'])


# two-way ANOVAs, once for each type of sums of squares
for ssType in (1,2,3):
  print(f'Type-{ssType} ANOVA table:')
  print(pg.anova(data=df, dv='val', between=['A','B'], ss_type=ssType))
  print('\n\n')

# Exercise 10

In [None]:
# Load the ToothGrowth dataset into a DataFrame.
# Original source: https://www.rdocumentation.org/packages/datasets/versions/3.6.2/topics/ToothGrowth

# NOTE: the file is downloaded from this URL each time the cell runs (needs internet access)
url = "https://sincxpress.com/ToothGrowth.csv"

# the following cells use the columns 'len', 'supp', and 'dose'
data = pd.read_csv(url)
data

In [None]:
# box plots of tooth length by dose and supplement, with the individual data points on top
plt.figure(figsize=(8,5))

# boxplot
sns.boxplot(x='dose', y='len', hue='supp', data=data, palette='gray')

# x-axis centers for the individual points (manually coded),
# listed in the same order as the conditions below
offsets = [.2,-.2, 1.2,.8, 2.2,1.8 ]

# all dose-by-supplement conditions (dose varies slowest)
conditions = [ (d,s) for d in np.unique(data['dose']) for s in np.unique(data['supp']) ]

# plot the individual data points of each condition
for condi,(d,s) in enumerate(conditions):

  # the data just from this condition
  inCondition = (data['dose']==d) & (data['supp']==s)
  tmpY = data[inCondition]['len']

  # x-axis positions: this condition's center plus a bit of random jitter
  tmpX = np.random.normal(loc=offsets[condi],scale=.02,size=len(tmpY))

  plt.plot(tmpX,tmpY,'ko',markerfacecolor='w')


# more informative axis labels
plt.ylabel('Tooth length (mm)')
plt.xlabel('Vitamin C dose (mg/day)')

plt.tight_layout()
plt.savefig('anova_ex10.png')
plt.show()

In [None]:
# two-way ANOVA of tooth length with supplement and dose as between-subjects factors
# (called as the DataFrame method that pingouin registers)
data.anova(dv='len', between=['supp','dose'])

# Exercise 11

In [None]:
# predicted data: every row gets the mean tooth length of its own dose-by-supplement group
cellMeans = data.groupby(['dose','supp'])['len'].transform('mean')
data['predictions'] = cellMeans

# residuals: observed data (the DV) minus the predicted data (the group means)
data['residuals'] = data['len'] - data['predictions']

# show every 4th row
data.iloc[::4]

In [None]:
# correlation between the predictions and the residuals, shown in the plot title
r = stats.pearsonr(data['predictions'],data['residuals'])
corrTitle = f'Pearson r={r.statistic:.4f}, p={r.pvalue:.4f}'

# marker styling for the scatter plot
scatter_kw = dict(markerfacecolor=(.8,.8,.8), markersize=12, alpha=.5)

# scatter plot of residuals against predictions
plt.figure(figsize=(8,4))
plt.plot(data['predictions'], data['residuals'],'ko',**scatter_kw)

plt.xlabel('Predicted length')
plt.ylabel('Residuals')
plt.title(corrTitle,loc='center')

plt.tight_layout()
plt.savefig('anova_ex11.png')
plt.show()