<a href="https://colab.research.google.com/github/rajdeepbanerjee-git/JNCLectures_Intro_to_ML/blob/main/Week2/2025/week2_1way_ANOVA_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from scipy.stats import f


In [4]:
# Load the crop-yield dataset from the CSV file expected next to this notebook
data = pd.read_csv('./crop-data_ANOVA.csv')
# Preview the last rows to check the column names and value formats
data.tail()

Unnamed: 0,crop_density,fertilizer_types,crop_yield
91,high,synthetic,177.405292
92,low,synthetic,178.141644
93,high,synthetic,177.710612
94,low,synthetic,177.687264
95,high,synthetic,177.118176


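#### Note:
- The dataset has two categorical variables (`crop_density`, `fertilizer_types`) and one continuous variable (`crop_yield`).
- The two categorical variables are assumed to be independent, so each one is analysed with its own one-way ANOVA.
- Groups are formed from the unique categories of each of `crop_density` and `fertilizer_types`.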

In [9]:
# Define a function to calculate One-Way ANOVA manually
def one_way_anova(df, group_col, value_col):
    """Compute a one-way ANOVA table from scratch.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per observation.
    group_col : str
        Name of the column whose distinct values define the groups.
    value_col : str
        Name of the numeric column holding the response.

    Returns
    -------
    dict
        Keys 'SSB', 'SSW', 'DF_between', 'DF_within', 'MS_between',
        'MS_within', 'F-statistic' and 'p-value'.

    Notes
    -----
    Rows with a missing group label or a missing response value are left
    out of every quantity, so group sizes, means and degrees of freedom
    are all based on the same set of observations.
    """
    # Drop incomplete rows up front: pandas' mean() and groupby() silently
    # skip missing values, while len() would still count those rows.
    observed = df.dropna(subset=[group_col, value_col])
    values = observed[value_col]
    grouped = observed.groupby(group_col)[value_col]

    overall_mean = values.mean()
    n_groups = observed[group_col].nunique()

    # Between-group sum of squares (SSB): size-weighted squared distance of
    # each group mean from the overall mean
    ssb = (grouped.size() * (grouped.mean() - overall_mean) ** 2).sum()

    # Within-group sum of squares (SSW): squared distance of each observation
    # from its own group mean
    residuals = values.to_numpy() - grouped.transform('mean').to_numpy()
    ssw = (residuals ** 2).sum()

    # Degrees of freedom
    df_between = n_groups - 1
    df_within = len(observed) - n_groups

    # Mean squares
    ms_between = ssb / df_between
    ms_within = ssw / df_within

    # F-statistic
    f_stat = ms_between / ms_within

    # p-value: use the survival function rather than 1 - cdf, which rounds
    # to exactly 0.0 once the cdf is within machine precision of 1
    p_value = f.sf(f_stat, df_between, df_within)

    return {
        'SSB': ssb,
        'SSW': ssw,
        'DF_between': df_between,
        'DF_within': df_within,
        'MS_between': ms_between,
        'MS_within': ms_within,
        'F-statistic': f_stat,
        'p-value': p_value
    }





In [12]:
# Run the manual One-Way ANOVA once per factor, with crop yield as the response
anova_density = one_way_anova(df=data, group_col='crop_density', value_col='crop_yield')
anova_fertilizer = one_way_anova(df=data, group_col='fertilizer_types', value_col='crop_yield')

# Lay both result dicts out as one table: one row per factor, one column per statistic
anova_results_one_way_manual = pd.DataFrame({
    'Source': ['Crop Density', 'Fertilizer Type'],
    **{
        stat: [anova_density[stat], anova_fertilizer[stat]]
        for stat in (
            'SSB', 'SSW',
            'DF_between', 'DF_within',
            'MS_between', 'MS_within',
            'F-statistic', 'p-value',
        )
    },
})

In [13]:
# Show the table of manually computed one-way ANOVA results
anova_results_one_way_manual

Unnamed: 0,Source,SSB,SSW,DF_between,DF_within,MS_between,MS_within,F-statistic,p-value
0,Crop Density,5.121681,36.832552,1,94,5.121681,0.391836,13.070994,0.000485
1,Fertilizer Type,6.068047,35.886186,2,93,3.034023,0.385873,7.862752,0.0007


#### Same analysis, using the `statsmodels` package

In [16]:
# Import necessary libraries for ANOVA using a package
import statsmodels.api as sm
from statsmodels.formula.api import ols

# One-Way ANOVA for Crop Density
model_density = ols('crop_yield ~ C(crop_density)', data=data).fit()
anova_density_pkg = sm.stats.anova_lm(model_density, typ=2)

# One-Way ANOVA for Fertilizer Type
model_fertilizer = ols('crop_yield ~ C(fertilizer_types)', data=data).fit()
anova_fertilizer_pkg = sm.stats.anova_lm(model_fertilizer, typ=2)

# Combine the results into a single DataFrame.
# As in the original lookups, row 0 of each ANOVA table is the factor term
# (between-group) and row 1 is the residual (within-group). The rows are
# selected with .iloc because plain series[0] on a label-indexed Series is a
# deprecated positional lookup that emits a FutureWarning in recent pandas.
anova_results_one_way_pkg = pd.DataFrame({
    'Source': ['Crop Density', 'Fertilizer Type'],
    'SSB': [anova_density_pkg['sum_sq'].iloc[0], anova_fertilizer_pkg['sum_sq'].iloc[0]],
    'SSW': [anova_density_pkg['sum_sq'].iloc[1], anova_fertilizer_pkg['sum_sq'].iloc[1]],
    'DF_between': [anova_density_pkg['df'].iloc[0], anova_fertilizer_pkg['df'].iloc[0]],
    'DF_within': [anova_density_pkg['df'].iloc[1], anova_fertilizer_pkg['df'].iloc[1]],
    'F-statistic': [anova_density_pkg['F'].iloc[0], anova_fertilizer_pkg['F'].iloc[0]],
    'p-value': [anova_density_pkg['PR(>F)'].iloc[0], anova_fertilizer_pkg['PR(>F)'].iloc[0]]
})


  'SSB': [anova_density_pkg['sum_sq'][0], anova_fertilizer_pkg['sum_sq'][0]],
  'SSW': [anova_density_pkg['sum_sq'][1], anova_fertilizer_pkg['sum_sq'][1]],
  'DF_between': [anova_density_pkg['df'][0], anova_fertilizer_pkg['df'][0]],
  'DF_within': [anova_density_pkg['df'][1], anova_fertilizer_pkg['df'][1]],
  'F-statistic': [anova_density_pkg['F'][0], anova_fertilizer_pkg['F'][0]],
  'p-value': [anova_density_pkg['PR(>F)'][0], anova_fertilizer_pkg['PR(>F)'][0]]


In [17]:
# Show the package-based ANOVA table for comparison with the manual results
anova_results_one_way_pkg

Unnamed: 0,Source,SSB,SSW,DF_between,DF_within,F-statistic,p-value
0,Crop Density,5.121681,36.832552,1.0,94.0,13.070994,0.000485
1,Fertilizer Type,6.068047,35.886186,2.0,93.0,7.862752,0.0007
