[구글 코랩(Colab)에서 실행하기](https://colab.research.google.com/github/lovedlim/bigdata_analyst_cert/blob/main/part3/ch2/ch2_anova.ipynb)

# 1. 일원 분산 분석

### 1. 기본학습

In [1]:
import pandas as pd
# Four groups (A-D) with ten measurements each; one column per group.
group_values = {
    'A': [10.5, 11.3, 10.8, 9.6, 11.1, 10.2, 10.9, 11.4, 10.5, 10.3],
    'B': [11.9, 12.4, 12.1, 13.2, 12.5, 11.8, 12.2, 12.9, 12.4, 12.3],
    'C': [11.2, 11.7, 11.6, 10.9, 11.3, 11.1, 10.8, 11.5, 11.4, 11.0],
    'D': [9.8, 9.4, 9.1, 9.5, 9.6, 9.9, 9.2, 9.7, 9.3, 9.4],
}
df = pd.DataFrame(group_values)

# Preview the first two rows only.
print(df.head(n=2))

      A     B     C    D
0  10.5  11.9  11.2  9.8
1  11.3  12.4  11.7  9.4


In [2]:
from scipy import stats

# Collect the four group samples once; every test below uses them in A-D order.
groups = [df[name] for name in ('A', 'B', 'C', 'D')]

# Shapiro-Wilk normality test, one result line per group.
print("=== 정규성 검정 ===")
for sample in groups:
    print(stats.shapiro(sample))

# Levene test for equal variances across the four groups.
print("\n === 등분산 검정 ===")
print(stats.levene(*groups))

# One-way ANOVA across the four groups.
print("\n === 일원 분산 분석 ===")
print(stats.f_oneway(*groups))

=== 정규성 검정 ===
ShapiroResult(statistic=0.9649055004119873, pvalue=0.840017557144165)
ShapiroResult(statistic=0.9468040466308594, pvalue=0.63086998462677)
ShapiroResult(statistic=0.9701647162437439, pvalue=0.8923683762550354)
ShapiroResult(statistic=0.9752339720726013, pvalue=0.9346861243247986)

 === 등분산 검정 ===
LeveneResult(statistic=1.9355354288758708, pvalue=0.14127835331346628)

 === 일원 분산 분석 ===
F_onewayResult(statistic=89.12613851177174, pvalue=1.001838152252373e-16)


### 2. 심화학습

In [3]:
# Reshape the wide table into long format: one row per (group, measurement),
# with the group label in `variable` and the measurement in `value`.
df_melt = pd.melt(df)
print(df_melt.head(5))

  variable  value
0        A   10.5
1        A   11.3
2        A   10.8
3        A    9.6
4        A   11.1


In [4]:
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
# Fit an OLS model with the group label as the only predictor, then print
# its ANOVA table.
model = ols(formula='value ~ variable', data=df_melt).fit()
anova_table = anova_lm(model)
print(anova_table)

            df    sum_sq    mean_sq          F        PR(>F)
variable   3.0  43.21875  14.406250  89.126139  1.001838e-16
Residual  36.0   5.81900   0.161639        NaN           NaN


In [5]:
import pandas as pd
from scipy import stats

# Data: same groups as the first cell, except that the last value of D is
# 11.4 here instead of 9.4.
group_values = {
    'A': [10.5, 11.3, 10.8, 9.6, 11.1, 10.2, 10.9, 11.4, 10.5, 10.3],
    'B': [11.9, 12.4, 12.1, 13.2, 12.5, 11.8, 12.2, 12.9, 12.4, 12.3],
    'C': [11.2, 11.7, 11.6, 10.9, 11.3, 11.1, 10.8, 11.5, 11.4, 11.0],
    'D': [9.8, 9.4, 9.1, 9.5, 9.6, 9.9, 9.2, 9.7, 9.3, 11.4],
}
df = pd.DataFrame(group_values)

# Normality test (Shapiro-Wilk) for each group, in column order.
for name in df.columns:
    print(stats.shapiro(df[name]))

# Kruskal-Wallis test across all groups. Left as a bare expression so the
# notebook displays the result as the cell's output.
stats.kruskal(*(df[name] for name in df.columns))

ShapiroResult(statistic=0.9649055004119873, pvalue=0.840017557144165)
ShapiroResult(statistic=0.9468040466308594, pvalue=0.63086998462677)
ShapiroResult(statistic=0.9701647162437439, pvalue=0.8923683762550354)
ShapiroResult(statistic=0.7489405274391174, pvalue=0.0034642363898456097)


KruskalResult(statistic=30.96597802610577, pvalue=8.64185335838648e-07)

# 2. 이원 분산 분석

### 1. 기본학습

In [6]:
import pandas as pd
# Load the tree dataset from its copy hosted on GitHub.
# With a local copy available, pd.read_csv("tree.csv") can be used instead.
tree_csv_url = "https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part3/ch2/tree.csv"
df = pd.read_csv(tree_csv_url)

# Print ten randomly sampled rows (no seed is set, so rows vary between runs).
print(df.sample(n=10))

    나무  비료        성장률
94   D   1  61.078918
33   B   1  44.422891
42   B   2  56.843517
37   B   1  35.403299
117  D   3  59.313220
105  D   2  72.040509
114  D   3  69.076390
99   D   1  62.654129
116  D   3  70.652882
0    A   1  54.967142


In [7]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Two-way model with an explicit interaction term. Neither factor is wrapped
# in C() here; in the printed table the fertilizer term gets df = 1.0,
# i.e. it enters the model as a single numeric term.
formula = '성장률 ~ 나무 + 비료 + 나무:비료'
model = ols(formula, data=df).fit()
anova_table = sm.stats.anova_lm(model)
print(anova_table)

             df       sum_sq      mean_sq          F        PR(>F)
나무          3.0  4783.353938  1594.451313  18.391274  9.016693e-10
비료          1.0   873.322002   873.322002  10.073374  1.942421e-03
나무:비료       3.0   394.801585   131.600528   1.517952  2.137666e-01
Residual  112.0  9709.960792    86.696078        NaN           NaN


In [8]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Same two-way model, but with both factors wrapped in C() so they are
# treated as categorical (the fertilizer term now gets df = 2.0 in the table).
formula = '성장률 ~ C(나무) + C(비료) + C(나무):C(비료)'
model = ols(formula, data=df).fit()
anova_table = sm.stats.anova_lm(model)
print(anova_table)

                df       sum_sq      mean_sq          F        PR(>F)
C(나무)          3.0  4783.353938  1594.451313  18.855528  6.600012e-10
C(비료)          2.0  1127.924259   563.962129   6.669256  1.857612e-03
C(나무):C(비료)    6.0   717.520672   119.586779   1.414199  2.157357e-01
Residual     108.0  9132.639448    84.561476        NaN           NaN


In [9]:
# Print the three p-values from the ANOVA table in fixed-point notation
# (11 decimal places) instead of scientific notation.
for p_value in (6.600012e-10, 1.857612e-03, 2.157357e-01):
    print(f"{p_value:.11f}")

0.00000000066
0.00185761200
0.21573570000


In [10]:
# `a * b` in the formula is shorthand for both main effects plus their
# interaction; the printed table matches the explicit C(...) + C(...) +
# C(...):C(...) form used in the previous cell.
formula = '성장률 ~ C(나무) * C(비료)'
model = ols(formula, data=df).fit()
anova_table = sm.stats.anova_lm(model)
print(anova_table)

                df       sum_sq      mean_sq          F        PR(>F)
C(나무)          3.0  4783.353938  1594.451313  18.855528  6.600012e-10
C(비료)          2.0  1127.924259   563.962129   6.669256  1.857612e-03
C(나무):C(비료)    6.0   717.520672   119.586779   1.414199  2.157357e-01
Residual     108.0  9132.639448    84.561476        NaN           NaN


### 2. 심화학습

In [11]:
from scipy.stats import shapiro
# Boolean row masks, one per tree type. Later cells reuse these names.
cond_tree_A = df['나무'].eq('A')
cond_tree_B = df['나무'].eq('B')
cond_tree_C = df['나무'].eq('C')
cond_tree_D = df['나무'].eq('D')

# Boolean row masks, one per fertilizer level.
cond_fert_1 = df['비료'].eq(1)
cond_fert_2 = df['비료'].eq(2)
cond_fert_3 = df['비료'].eq(3)

# Shapiro-Wilk normality test on the growth rate within every
# tree x fertilizer combination, printed in the order A1, A2, A3, B1, ... D3.
for tree_mask in (cond_tree_A, cond_tree_B, cond_tree_C, cond_tree_D):
    for fert_mask in (cond_fert_1, cond_fert_2, cond_fert_3):
        print(shapiro(df.loc[tree_mask & fert_mask, '성장률']))

ShapiroResult(statistic=0.9076584577560425, pvalue=0.26529115438461304)
ShapiroResult(statistic=0.9507257342338562, pvalue=0.6770924925804138)
ShapiroResult(statistic=0.9587481617927551, pvalue=0.7715094685554504)
ShapiroResult(statistic=0.9670253992080688, pvalue=0.8619790077209473)
ShapiroResult(statistic=0.9694889783859253, pvalue=0.8860675096511841)
ShapiroResult(statistic=0.9038048386573792, pvalue=0.2410866618156433)
ShapiroResult(statistic=0.9532947540283203, pvalue=0.7075414657592773)
ShapiroResult(statistic=0.8972557783126831, pvalue=0.20435576140880585)
ShapiroResult(statistic=0.9268559217453003, pvalue=0.41768908500671387)
ShapiroResult(statistic=0.9675582647323608, pvalue=0.8673287630081177)
ShapiroResult(statistic=0.926199734210968, pvalue=0.41156959533691406)
ShapiroResult(statistic=0.9591564536094666, pvalue=0.7762126326560974)


In [12]:
from scipy.stats import levene
growth = df['성장률']

# Levene test for equal variances of the growth rate across the tree types.
tree_masks = (cond_tree_A, cond_tree_B, cond_tree_C, cond_tree_D)
print(levene(*(growth[mask] for mask in tree_masks)))

# Levene test for equal variances of the growth rate across fertilizer levels.
fert_masks = (cond_fert_1, cond_fert_2, cond_fert_3)
print(levene(*(growth[mask] for mask in fert_masks)))

LeveneResult(statistic=0.2803571398775752, pvalue=0.8394849531848961)
LeveneResult(statistic=1.7817228766588382, pvalue=0.1728849870829638)
