# Goal
The goal is simple: demonstrate similarities and differences among different categorical coding systems for using in regression, either OLS or Ridge/Regularization.

# packages

In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import patsy

# data
generate random 100 observations, 4 categories in a group variable, code regression predictors using different designs: treatment (aka, dummy), Helmert, onehot (aka, dummy or treatment without an intercept so all group means are estimated) 

In [3]:
y = np.random.normal(loc = 0, scale = 1, size = 100)
group = np.repeat(["A", "B", "C", "D"], repeats = 25)
X_treatment = patsy.dmatrix("~ C(group, Treatment)")[:, 1:]

# for comparison to Treatment
X_get_dummies_drop = pd.get_dummies(group, drop_first = True)
X_helmert = patsy.dmatrix("~ C(group, Helmert)")[:, 1:]

# the -1 in the formula drops the intercept (aka, "bias term")
X_onehot = patsy.dmatrix("~ C(group, [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]) - 1")[:, :]

# for comparison to onehot
X_get_dummies_nodrop = pd.get_dummies(group, drop_first = False)

# ridge fits
fit ridge regressions with same alpha, same random state

In [4]:
np.random.RandomState(519083475)
ridge_fit_treatment = Ridge(fit_intercept = True, alpha = 0.5).fit(X_treatment, y)
ridge_fit_get_dummies_drop = Ridge(fit_intercept = True, alpha = 0.5).fit(X_get_dummies_drop, y)
ridge_fit_helmert = Ridge(fit_intercept = True, alpha = 0.5).fit(X_helmert, y)
ridge_fit_onehot = Ridge(fit_intercept = False, alpha = 0.5).fit(X_onehot, y)
ridge_fit_get_dummies_nodrop = Ridge(fit_intercept = False, alpha = 0.5).fit(X_get_dummies_nodrop, y)

## intercept and coefficients
Different coefficients

In [5]:
pd.DataFrame({"coef": ["b0", "b1", "b2", "b3"], "treatment": np.append(ridge_fit_treatment.coef_, ridge_fit_treatment.intercept_), "get_dummies_drop": np.append(ridge_fit_get_dummies_drop.coef_, ridge_fit_get_dummies_drop.intercept_), "helmert": np.append(ridge_fit_helmert.coef_, ridge_fit_helmert.intercept_), "onehot": ridge_fit_onehot.coef_, "get_dummies_nodrop": ridge_fit_get_dummies_nodrop.coef_})

Unnamed: 0,coef,treatment,get_dummies_drop,helmert,onehot,get_dummies_nodrop
0,b0,0.099775,0.099775,0.052764,0.000538,0.000538
1,b1,0.287645,0.287645,0.081368,0.105031,0.105031
2,b2,-0.146806,-0.146806,-0.069849,0.292901,0.292901
3,b3,0.005361,0.005361,0.065514,-0.14155,-0.14155


## RMSE
Same RMSE (with rounding errors)

In [7]:
for fit, X in zip([ridge_fit_treatment, ridge_fit_get_dummies_drop, ridge_fit_helmert, ridge_fit_onehot, ridge_fit_get_dummies_nodrop], [X_treatment, X_get_dummies_drop, X_helmert, X_onehot, X_get_dummies_nodrop]):
    print(mean_squared_error(y_true = y, y_pred = fit.predict(X), squared = False))

0.9128284151706533
0.9128284151706533
0.9128191450325455
0.9128253870575815
0.9128253870575815
