In [1]:
import numpy as np 
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Two-Way ANOVA

#### Example
A physiologist was interested in learning whether smoking history and different types of stress tests influence the timing of a subject's maximum oxygen uptake, as measured in minutes. The researcher classified a subject's smoking history as either heavy smoking, moderate smoking, or non-smoking. He was interested in seeing the effects of three different types of stress tests — a test performed on a bicycle, a test on a treadmill, and a test on steps. The physiologist recruited 9 non-smokers, 9 moderate smokers, and 9 heavy smokers to participate in his experiment, for a total of $n = 27$ subjects. He then randomly assigned each of his recruited subjects to undergo one of the three types of stress tests.

Is there sufficient evidence at the $\alpha = 0.05$ significance level to conclude that smoking history has an effect on the time to maximum oxygen uptake? Is there sufficient evidence at the $\alpha = 0.05$ significance level to conclude that the type of stress test has an effect on the time to maximum oxygen uptake? And, is there evidence of an interaction between smoking history and the type of stress test? (Don't forget to define the null hypothesis $H_0$ and the alternative hypothesis $H_1$ for main effects and interaction effect.)

## Define Hypotheses:
...

Here are the data: time to maximum oxygen uptake (in minutes) by smoking history for the bicycle, treadmill, and step tests:

Bicycle Test  
bicycle_nonsmoker = [12.8, 13.5, 11.2]  
bicycle_moderate = [10.9, 11.1, 9.8]  
bicycle_heavy = [8.7, 9.2, 7.5]  


Treadmill Test  
treadmill_nonsmoker = [16.2, 18.1, 17.8]  
treadmill_moderate = [15.5, 13.8, 16.2]  
treadmill_heavy = [14.7, 13.2, 8.1]  


Step Test  
step_nonsmoker = [22.6, 19.3, 18.9]  
step_moderate = [20.1, 21.0, 15.9]   
step_heavy = [16.2, 16.1, 17.8]  

### Reorganize data

In [2]:
# Time to maximum oxygen uptake (minutes), keyed by
# (stress test, smoking history). Each cell holds its three subjects,
# in the same order as the listing in the data description.
cell_scores = {
    ("Bicycle", "Nonsmoker"): [12.8, 13.5, 11.2],
    ("Bicycle", "Moderate"): [10.9, 11.1, 9.8],
    ("Bicycle", "Heavy"): [8.7, 9.2, 7.5],
    ("Treadmill", "Nonsmoker"): [16.2, 18.1, 17.8],
    ("Treadmill", "Moderate"): [15.5, 13.8, 16.2],
    ("Treadmill", "Heavy"): [14.7, 13.2, 8.1],
    ("Step", "Nonsmoker"): [22.6, 19.3, 18.9],
    ("Step", "Moderate"): [20.1, 21.0, 15.9],
    ("Step", "Heavy"): [16.2, 16.1, 17.8],
}

# Long layout: one row per subject, with the two factors as label columns
# and the measured response in "Score".
data = pd.DataFrame(
    [
        {"Smoking": smoking, "Test": test, "Score": score}
        for (test, smoking), scores in cell_scores.items()
        for score in scores
    ]
)
data

Unnamed: 0,Smoking,Test,Score
0,Nonsmoker,Bicycle,12.8
1,Nonsmoker,Bicycle,13.5
2,Nonsmoker,Bicycle,11.2
3,Moderate,Bicycle,10.9
4,Moderate,Bicycle,11.1
5,Moderate,Bicycle,9.8
6,Heavy,Bicycle,8.7
7,Heavy,Bicycle,9.2
8,Heavy,Bicycle,7.5
9,Nonsmoker,Treadmill,16.2


In [None]:


# From-scratch two-way ANOVA template. Every `...` below is an exercise
# placeholder that must be replaced before the cell can run.
# Notation used in the hints: a, b = number of Smoking / Test levels,
# n = replicates per cell, ybar = a mean of Score.

# Calculate the means
# (grand mean, then one marginal mean per factor level; the df lines below
# call len() on smoking_means and test_means, so each should hold one entry
# per level)
overall_mean = ...
smoking_means = ...
test_means = ...


# Calculate the sums of squares for each source of variation

# Smoking
# Hint: SS_A = b * n * sum_i (ybar_i.. - ybar...)^2
smoking_ss = ...

# Test
# Hint: SS_B = a * n * sum_j (ybar_.j. - ybar...)^2
test_ss = ...

# Interaction Smoking * Test
# Hint: SS_AB = n * sum_ij (ybar_ij. - ybar_i.. - ybar_.j. + ybar...)^2
interaction_ss =  ...


# Total sum of squares
# Hint: SS_T = sum over all observations of (y - ybar...)^2
total_ss =  ...

# Error (Residual) sum of squares
# Obtained by subtraction: whatever the two main effects and the interaction
# do not explain.
error_ss = total_ss - smoking_ss - test_ss - interaction_ss

# Calculate degrees of freedom (df)
# Main effects: (number of levels - 1); interaction: product of the two.
df_smoking = len(smoking_means) - 1
df_test = len(test_means) - 1
df_interaction = df_smoking * df_test
# Observations minus all model terms; the trailing "+ 1" accounts for the
# overall mean.
df_error = len(data) - (df_smoking + df_test + df_interaction + 1)

# Calculate Mean Squares (MS)
# Hint: MS = SS / df for each source.
ms_smoking = ...
ms_test = ...
ms_interaction = ...
ms_error = ...

# Calculate F-statistics
# Hint: each effect's MS divided by ms_error.
f_smoking = ...
f_test = ...
f_interaction = ...

# Calculate p-values
# Hint: upper-tail probability of the F distribution with
# (df_effect, df_error) degrees of freedom.
p_smoking = ...
p_test = ...
p_interaction = ...

# Create an ANOVA table
# One row per source; the Error row has no F statistic or p-value, hence NaN.
anova_table = pd.DataFrame({
    'Source': ['Smoking', 'Test', 'Smoking * Test', 'Error'],
    'SS': [smoking_ss, test_ss, interaction_ss, error_ss],
    'df': [df_smoking, df_test, df_interaction, df_error],
    'MS': [ms_smoking, ms_test, ms_interaction, ms_error],
    'F': [f_smoking, f_test, f_interaction, np.nan],
    'p-value': [p_smoking, p_test, p_interaction, np.nan]
})

print(anova_table)


### Compare your from-scratch results with two-way ANOVA from the library statsmodels

In [None]:

# Fit the full two-way model: both main effects plus their interaction
# (`Smoking*Test` in the formula expands to Smoking + Test + Smoking:Test).
model = ols("Score ~ Smoking*Test", data=data).fit()

# Keep the library result under its own name so that the from-scratch
# `anova_table` built in the previous cell is not overwritten and the two
# tables can still be compared. typ=2 requests Type II sums of squares.
anova_table_sm = sm.stats.anova_lm(model, typ=2)
print(anova_table_sm)

In [None]:
# Report, in the order Smoking / Test / interaction, whether each
# from-scratch p-value falls below the 0.05 significance level.
for p_value in (p_smoking, p_test, p_interaction):
    print(p_value < 0.05)


### Please explain your results using $\alpha = 0.05$ significance level.

- Smoking: 



- Test: 



- Interaction:




# General Factorial Design

The quality control department of a fabric finishing plant is studying the effect of several factors on the dyeing of cotton-synthetic cloth used to manufacture men's shirts. Three operators, three cycle times, and two temperatures were selected, and three small specimens of cloth were dyed under each set of conditions. The finished cloth was compared to a standard, and a numerical score was assigned. 

Define the levels of each factor as the following:
- Temperature takes values 300 and 350.
- Cycle Time  takes values 40, 50, 60. 
- Operator takes values 1, 2, 3.

Define the model for this experiment and estimate its parameters: the main effects $\alpha_i, \beta_j, \gamma_k$ and the interaction effects. Find a 95% confidence interval (significance level $0.05$) for each parameter. Then, create the ANOVA table. Explain your results.

### Data

In [3]:
# Factor levels of the fabric-dyeing experiment.
temperature_levels = [300, 350]
cycle_time_levels = [40, 50, 60]
operator_levels = [1, 2, 3]
replicates_per_cell = 3

# Scores in run order: each line is one replicate at a fixed cycle time,
# listing operators 1-3 at temperature 300 followed by operators 1-3 at 350.
scores = [
    23, 27, 31, 24, 38, 34,
    24, 28, 32, 23, 36, 36,
    25, 26, 29, 28, 35, 39,
    36, 34, 33, 37, 34, 34,
    35, 38, 34, 39, 38, 36,
    36, 39, 35, 35, 36, 31,
    28, 35, 26, 26, 36, 28,
    24, 35, 27, 29, 37, 26,
    27, 34, 25, 25, 34, 24,
]

# Design rows generated in the same order as `scores`: cycle time varies
# slowest, then replicate, then temperature, then operator.
design_rows = [
    (temperature, cycle_time, operator)
    for cycle_time in cycle_time_levels
    for _ in range(replicates_per_cell)
    for temperature in temperature_levels
    for operator in operator_levels
]

data = pd.DataFrame(
    design_rows, columns=['Temperature', 'CycleTime', 'Operator']
).assign(Score=scores)


print(data)


    Temperature  CycleTime  Operator  Score
0           300         40         1     23
1           300         40         2     27
2           300         40         3     31
3           350         40         1     24
4           350         40         2     38
5           350         40         3     34
6           300         40         1     24
7           300         40         2     28
8           300         40         3     32
9           350         40         1     23
10          350         40         2     36
11          350         40         3     36
12          300         40         1     25
13          300         40         2     26
14          300         40         3     29
15          350         40         1     28
16          350         40         2     35
17          350         40         3     39
18          300         50         1     36
19          300         50         2     34
20          300         50         3     33
21          350         50      

In [None]:
# Effect estimates for the three-factor model. Every `...` below is an
# exercise placeholder to be filled in.
# Notation used in the hints: ybar = a mean of Score; subscripts i, j, k index
# Temperature, Cycle Time and Operator; a dot means "averaged over that index".

# Calculate the means
# (grand mean, then one marginal mean per level of each factor)
overall_mean = ...
temperature_means = ...
cycle_time_means = ...
operator_means = ...

# Main effects
# Hint: marginal mean minus the grand mean, one value per level.
# A later cell takes len(alpha), len(beta) and len(gamma), so each should
# hold exactly one entry per factor level.
# Temperature
alpha = ...

# Cycle Time
beta = ...

# Operator
gamma = ...

# Interaction effects
# Hint (two-way): (ab)_ij = ybar_ij. - ybar_i.. - ybar_.j. + ybar...
# The printing code in the confidence-interval cell iterates `.items()` on the
# matching CI objects and indexes each key positionally (key[0], key[1], ...).

# Interaction effect Temperature * Cycle Time
interaction_alpha_beta = ...

# Interaction effect Temperature * Operator
interaction_alpha_gamma = ... 

# Interaction effect  Cycle Time * Operator
interaction_beta_gamma = ...


# Interaction effect Temperature * Cycle Time * Operator
# Hint: cell mean minus the three two-factor means, plus the three
# one-factor means, minus the grand mean.
interaction_alpha_beta_gamma = ...


# Model Results
print("Main Effects for Temperature")
print(alpha)

print("Main Effects for Cycle Time")
print(beta)

print("Main Effects for Operator")
print(gamma)

print("Interaction Effects for Temperature * Cycle Time")
print(interaction_alpha_beta)


print("Interaction Effects for Temperature * Operator")
print(interaction_alpha_gamma)

print("Interaction Effects for Cycle Time * Operator")
print(interaction_beta_gamma)

print("Interaction Effects for Temperature * Cycle Time * Operator")
print(interaction_alpha_beta_gamma)


### Confidence Intervals

In [None]:
# Residual variance (MSE) for the full three-factor factorial model.
#
# The model in this section contains every two- and three-way interaction
# (confidence intervals for all of them are produced below), so the fitted
# value of an observation is the mean of its
# (Temperature, CycleTime, Operator) cell. Predicting from the main effects
# alone would push all interaction variation into the residuals and inflate
# the MSE that every interval below depends on.
data2 = data.copy()
data2['Predicted'] = (
    data2.groupby(['Temperature', 'CycleTime', 'Operator'], observed=True)['Score']
    .transform('mean')
)
data2['Residual'] = data2['Score'] - data2['Predicted']


# Calculate mean square error
# Hint: sum of squared residuals divided by the error degrees of freedom.
MSE = ...

# Calculate standard errors
# n is the number of replicates per treatment cell:
# total observations / (number of level combinations).
n = len(data2) / (len(alpha) * len(beta) * len(gamma))  
se_alpha = ...
se_beta = ...
se_gamma = ...
se_interaction = ...

# 95% confidence level
significance_level = 0.05  
# Find the degrees of freedom for error
# Hint: for the full factorial model this is
# (number of cells) * (replicates per cell - 1).
df_error = ...

# Find the critical value
# Hint: two-sided t quantile at 1 - significance_level / 2 with df_error
# degrees of freedom.
t_crit = ...

# Compute the confidence intervals
# Each ci_* object is iterated with `.items()` below, so it should map a
# level (or a tuple of levels for interactions) to its interval.

# CI for Main Effects
ci_alpha = ...
ci_beta = ...
ci_gamma = ...

# CI for Interaction Effects
ci_interaction_alpha_beta = ...
ci_interaction_alpha_gamma = ...
ci_interaction_beta_gamma = ...
ci_interaction_alpha_beta_gamma = ...

print("Confidence Intervals for Temperature")
for temp, ci in ci_alpha.items():
    print(f"Temperature {temp}: {ci}")

print("Confidence Intervals for Cycle Time")
for cycle, ci in ci_beta.items():
    print(f"Cycle Time {cycle}: {ci}")

print("Confidence Intervals for Operator")
for op, ci in ci_gamma.items():
    print(f"Operator {op}: {ci}")

print("Confidence Intervals for Interaction - Temperature * Cycle Time")
for key, ci in ci_interaction_alpha_beta.items():
    print(f"Temperature  {key[0]}, Cycle Time {key[1]}: {ci}")

print("Confidence Intervals for Interaction - Temperature * Operator")
for key, ci in ci_interaction_alpha_gamma.items():
    print(f"Temperature {key[0]}, Operator {key[1]}: {ci}")

print("Confidence Intervals for Interaction - Cycle Time * Operator")
for key, ci in ci_interaction_beta_gamma.items():
    print(f"Cycle Time  {key[0]}, Operator {key[1]}: {ci}")

print("Confidence Intervals for Interaction - Temperature * Cycle Time * Operator")
for key, ci in ci_interaction_alpha_beta_gamma.items():
    print(f"Temperature  {key[0]}, Cycle Time {key[1]}, Operator {key[2]}: {ci}")

### Add your comments for the confidence intervals
...

### Get three-way ANOVA-table and conclude your results

In [None]:
# Three-way ANOVA for the fabric-dyeing experiment.
# The factor levels are stored as numbers, so each factor column is recast as
# a pandas categorical first (presumably so the model fitted below treats
# them as discrete levels rather than numeric covariates).
for factor in ('Temperature', 'CycleTime', 'Operator'):
    data[factor] = data[factor].astype('category')

# Exercise placeholders: fit the model, then build its ANOVA table.
model = ...
anova_table = ...
print(anova_table)

In [None]:
# Threshold each p-value in the following cells is to be compared against
# (same value as used for the confidence intervals).
significance_level = 0.05
# TODO (exercise): p-value for factor A, taken from the three-way ANOVA table.
# Presumably A = Temperature, matching alpha above; confirm.
p_A = ...

In [None]:
# TODO (exercise): p-value for factor B, taken from the three-way ANOVA table.
# Presumably B = Cycle Time, matching beta above; confirm.
p_B = ...

In [None]:
# TODO (exercise): p-value for factor C, taken from the three-way ANOVA table.
# Presumably C = Operator, matching gamma above; confirm.
p_C = ...

In [None]:
# TODO (exercise): p-value for the A x B interaction
# (presumably Temperature x Cycle Time; confirm).
p_AB = ...

In [None]:
# TODO (exercise): p-value for the A x C interaction
# (presumably Temperature x Operator; confirm).
p_AC = ...

In [None]:
# TODO (exercise): p-value for the B x C interaction
# (presumably Cycle Time x Operator; confirm).
p_BC = ...

In [None]:
# TODO (exercise): p-value for the three-way A x B x C interaction.
p_ABC = ...