# Effect Coding

## Data

In [1]:
import pandas as pd

# Toy data set: four observations of the response `y` in each of the
# four groups A-D, listed group by group.
observations = {
    'A': [1, 3, 2, 2],
    'B': [2, 3, 4, 3],
    'C': [5, 6, 4, 5],
    'D': [10, 10, 9, 11],
}

# Flatten to one [group, y] record per observation, keeping group order.
data = [
    [label, value]
    for label, values in observations.items()
    for value in values
]

df = pd.DataFrame(data, columns=['group', 'y'])
df

Unnamed: 0,group,y
0,A,1
1,A,3
2,A,2
3,A,2
4,B,2
5,B,3
6,B,4
7,B,3
8,C,5
9,C,6


## Means

In [3]:
# Grand mean of the response over all observations, ignoring group.
df.loc[:, 'y'].mean()

5.0

In [2]:
# Mean of the response within each group; these are the values the
# regression coefficients below are compared against.
df.groupby('group').mean()

Unnamed: 0_level_0,y
group,Unnamed: 1_level_1
A,2.0
B,3.0
C,5.0
D,10.0


## Dummy coding

The intercept is the mean of the reference group `D`; each coefficient is that group's mean minus the mean of `D`.

In [19]:
from sklearn.preprocessing import OneHotEncoder

# Predictors are every column except the response `y`.
X = df.drop(columns=['y'])

# Learn the category levels so each one gets its own indicator column.
encoder = OneHotEncoder()
encoder.fit(X)

In [21]:
# Build the dummy-coded design matrix.
#
# The encoder is applied to the raw `group` column rather than to `X`:
# this cell overwrites `X`, so reading `X` here would make the cell fail
# when re-run (the encoder would be handed the already-encoded frame).
# `.toarray()` yields a plain ndarray instead of the legacy `np.matrix`
# that `.todense()` returns.
X = pd.DataFrame(
    encoder.transform(df[['group']]).toarray(),
    columns=encoder.get_feature_names_out()
).drop(columns=['group_D'])  # D is the reference level, so it gets no column
X

Unnamed: 0,group_A,group_B,group_C
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,1.0,0.0
5,0.0,1.0,0.0
6,0.0,1.0,0.0
7,0.0,1.0,0.0
8,0.0,0.0,1.0
9,0.0,0.0,1.0


In [22]:
# Response vector for the regressions below.
y = df.loc[:, 'y']
y

0      1
1      3
2      2
3      2
4      2
5      3
6      4
7      3
8      5
9      6
10     4
11     5
12    10
13    10
14     9
15    11
Name: y, dtype: int64

In [23]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the dummy-coded predictors; `fit` returns the
# fitted estimator, so construction and fitting can be chained.
m = LinearRegression().fit(X, y)

m.intercept_, m.coef_

(10.000000000000002, array([-8., -7., -5.]))

## Effect coding

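The intercept is the grand mean (the mean of the group means, which equals the overall mean here because every group has the same size). Each coefficient is that group's mean minus the grand mean; the effect of the omitted group `D` is the negative of the sum of the other coefficients.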

In [40]:
def encode(v, ref, neg):
    """Return the effect-coded value of `v` for one indicator column.

    Parameters
    ----------
    v : the observed category level.
    ref : the level this column indicates; coded as 1.
    neg : the level coded as -1 in every column.

    Returns
    -------
    int
        1 if `v == ref`, otherwise -1 if `v == neg`, otherwise 0.
        `ref` takes precedence when `ref` and `neg` are equal.
    """
    return 1 if v == ref else (-1 if v == neg else 0)

# Effect-coded design matrix: one column per non-omitted level, in which
# that level is 1, the omitted level `D` is -1, and every other level is 0.
X = pd.DataFrame({
    f'group_{level}': df['group'].apply(encode, args=(level, 'D'))
    for level in ['A', 'B', 'C']
})
X

Unnamed: 0,group_A,group_B,group_C
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,0,1,0
5,0,1,0
6,0,1,0
7,0,1,0
8,0,0,1
9,0,0,1


In [29]:
# Refit ordinary least squares, this time on the effect-coded predictors.
m = LinearRegression().fit(X, y)

m.intercept_, m.coef_

(5.0, array([-3.00000000e+00, -2.00000000e+00, -2.22044605e-16]))