# Example of 3 encoding strategies and coefficient interpretation
- using both statsmodels linear regression and sklearn linear regression to show coefficient outputs

In [1]:
import pandas as pd
import numpy as np

# Creating a dataset
- color mapping
    - one is reference (green)
    - two is red
    - three is blue

In [59]:
# Fixed seed so that a fresh "Restart & Run All" regenerates the same sample;
# the previous unseeded draw produced different data on every run.
SEED = 42
rng = np.random.RandomState(SEED)

# y values for the three color groups. Each group is the previous one shifted
# up by exactly 1, so the differences between group levels are known in advance.
one = rng.random_sample(size=10)  # green (reference)
two = one + 1                     # red
three = two + 1                   # blue

In [139]:
# One frame per color group, each holding its y values plus the red/blue codes.
# The reference (green) group is coded -1 on both columns; the red and blue
# groups get a 1 on their own column and a 0 on the other.
one_df = pd.DataFrame({'y': one}).assign(red=-1, blue=-1)

two_df = pd.DataFrame({'y': two}).assign(blue=0, red=1)

three_df = pd.DataFrame({'y': three}).assign(blue=1, red=0)

In [140]:
# Stack the three groups (green, red, blue) and renumber the rows 0..n-1.
new = (
    pd.concat([one_df, two_df, three_df], sort=False)
    .reset_index(drop=True)
)

In [141]:
# new = 1-n-1 encoding: no green column; the reference (green) rows are
# coded -1 in both red and blue
new

Unnamed: 0,y,red,blue
0,0.810654,-1,-1
1,0.983621,-1,-1
2,0.442504,-1,-1
3,0.20233,-1,-1
4,0.760286,-1,-1
5,0.531391,-1,-1
6,0.053199,-1,-1
7,0.954313,-1,-1
8,0.953323,-1,-1
9,0.639992,-1,-1


In [142]:
# N encoding: add an explicit green indicator (the first 10 rows are the green
# group) and reset the green rows' red/blue codes from -1 to 0.
old = new.assign(green=[1] * 10 + [0] * 20)
old.loc[:9, ['red', 'blue']] = 0

In [143]:
# old = n encoding: one 0/1 indicator column per color (red, blue, green)
old

Unnamed: 0,y,red,blue,green
0,0.810654,0,0,1
1,0.983621,0,0,1
2,0.442504,0,0,1
3,0.20233,0,0,1
4,0.760286,0,0,1
5,0.531391,0,0,1
6,0.053199,0,0,1
7,0.954313,0,0,1
8,0.953323,0,0,1
9,0.639992,0,0,1


In [144]:
# N - 1 encoding: the N-encoded frame without the green indicator.
n1 = old.drop(columns=['green'])

In [145]:
# n1 = n - 1 encoding: same as old but without the green column, so the
# green rows are 0 in both red and blue
n1

Unnamed: 0,y,red,blue
0,0.810654,0,0
1,0.983621,0,0
2,0.442504,0,0
3,0.20233,0,0
4,0.760286,0,0
5,0.531391,0,0
6,0.053199,0,0
7,0.954313,0,0
8,0.953323,0,0
9,0.639992,0,0


In [43]:
import statsmodels.api as sm

In [84]:
from sklearn.linear_model import LinearRegression

In [127]:
def make_coef_df(lin_model, X: pd.DataFrame) -> pd.DataFrame:
    """Tabulate a fitted linear model's coefficients next to its intercept.

    Parameters
    ----------
    lin_model : fitted estimator exposing ``coef_`` and ``intercept_``
        ``coef_`` may be 1-D (single target) or 2-D with one row per target.
    X : pandas.DataFrame
        Feature frame the model was fitted on. Only ``X.columns`` is read,
        to label the coefficient columns.

    Returns
    -------
    pandas.DataFrame
        One row per target and one column per feature, followed by an
        ``'intercept'`` column.
    """
    # ndmin=2 turns a 1-D coef_ into a single row and leaves a 2-D
    # (multi-target) coef_ as is. Wrapping coef_ in a list, as before, made a
    # 2-D coef_ into 3-D input, which DataFrame rejects. np.array also copies,
    # so the returned frame never aliases the model's own coefficient array.
    coefs = np.array(lin_model.coef_, ndmin=2)
    df = pd.DataFrame(coefs, columns=X.columns)
    # A scalar intercept is broadcast; a per-target array fills one row each.
    df['intercept'] = lin_model.intercept_
    return df

# Encoding: N

  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,y,R-squared:,0.876
Model:,OLS,Adj. R-squared:,0.867
Method:,Least Squares,F-statistic:,95.25
Date:,"Tue, 16 Jul 2019",Prob (F-statistic):,5.86e-13
Time:,10:36:57,Log-Likelihood:,-7.1785
No. Observations:,30,AIC:,20.36
Df Residuals:,27,BIC:,24.56
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.2249,0.044,27.608,0.000,1.134,1.316
red,0.4083,0.085,4.806,0.000,0.234,0.583
blue,1.4083,0.085,16.577,0.000,1.234,1.583
green,-0.5917,0.085,-6.965,0.000,-0.766,-0.417

0,1,2,3
Omnibus:,3.74,Durbin-Watson:,2.013
Prob(Omnibus):,0.154,Jarque-Bera (JB):,2.678
Skew:,-0.573,Prob(JB):,0.262
Kurtosis:,2.089,Cond. No.,1.91e+16


In [173]:
# N strategy with statsmodels and no constant column: with one indicator per
# color and no intercept, the fitted coefficients serve as the reference
# importances for the other encodings.
y_old = old['y']
X_old = old[['red', 'blue', 'green']]

est_old = sm.OLS(endog=y_old, exog=X_old).fit()
est_old.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.876
Model:,OLS,Adj. R-squared:,0.867
Method:,Least Squares,F-statistic:,95.25
Date:,"Tue, 16 Jul 2019",Prob (F-statistic):,5.86e-13
Time:,10:32:20,Log-Likelihood:,-7.1785
No. Observations:,30,AIC:,20.36
Df Residuals:,27,BIC:,24.56
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
red,1.6332,0.102,15.939,0.000,1.423,1.843
blue,2.6332,0.102,25.699,0.000,2.423,2.843
green,0.6332,0.102,6.179,0.000,0.423,0.843

0,1,2,3
Omnibus:,3.74,Durbin-Watson:,2.013
Prob(Omnibus):,0.154,Jarque-Bera (JB):,2.678
Skew:,-0.573,Prob(JB):,0.262
Kurtosis:,2.089,Cond. No.,1.0


#### scikit learn (most similar to our models)

In [174]:
# N strategy with scikit-learn: all three color indicators as features.
y_old = old['y']
X_old = old[['red', 'blue', 'green']]

# fit() returns the estimator itself, so construction and fitting can be chained.
lin_old = LinearRegression().fit(X_old, y_old)

old_details = make_coef_df(lin_old, X_old)
old_details

Unnamed: 0,red,blue,green,intercept
0,1.147423e-16,1.0,-1.0,1.633161


#### importance is coefficients plus intercept

In [185]:
# Importance of each color = its coefficient + the model intercept.
old_importance = (
    old_details
    .drop(columns=['intercept'])
    .add(old_details['intercept'], axis='index')
)
old_importance

Unnamed: 0,red,blue,green
0,1.633161,2.633161,0.633161


Note: if you run the same statsmodels code with an added intercept (the commented-out cell below), the same strategy as above gives the importances: add the intercept (`const`) to each coefficient.

In [None]:
# # with intercept added - redundant for the N strategy: on every row
# # const == red + blue + green, so the design matrix is perfectly collinear
# # and the individual coefficients are not uniquely determined
# X_old = old[['red', 'blue', 'green']]
# y_old = old['y']

# X_old = sm.add_constant(X_old)

# est_old = sm.OLS(y_old, X_old).fit()
# est_old.summary()

# Encoding: N - 1
- Green is left out. This means green is the reference level and is captured in the intercept, so the coefficient/importance of each of the other variables is relative to green. For the absolute importance of each variable, add the intercept (this reproduces the importances from the N encoding above):
    - green = intercept
    - red = intercept + red
    - blue = intercept + blue

In [157]:
# N - 1 strategy with statsmodels: red and blue indicators plus an explicit
# constant column (sm.OLS does not add one on its own).
y_n1 = n1['y']
X_n1 = sm.add_constant(n1[['red', 'blue']])

est_n1 = sm.OLS(endog=y_n1, exog=X_n1).fit()
est_n1.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.876
Model:,OLS,Adj. R-squared:,0.867
Method:,Least Squares,F-statistic:,95.25
Date:,"Tue, 16 Jul 2019",Prob (F-statistic):,5.86e-13
Time:,10:13:10,Log-Likelihood:,-7.1785
No. Observations:,30,AIC:,20.36
Df Residuals:,27,BIC:,24.56
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.6332,0.102,6.179,0.000,0.423,0.843
red,1.0000,0.145,6.901,0.000,0.703,1.297
blue,2.0000,0.145,13.802,0.000,1.703,2.297

0,1,2,3
Omnibus:,3.74,Durbin-Watson:,2.013
Prob(Omnibus):,0.154,Jarque-Bera (JB):,2.678
Skew:,-0.573,Prob(JB):,0.262
Kurtosis:,2.089,Cond. No.,3.73


In [158]:
# N - 1 strategy with scikit-learn: only the red and blue indicators are features.
y_n1 = n1['y']
X_n1 = n1[['red', 'blue']]

# fit() returns the estimator itself, so construction and fitting can be chained.
lin_n1 = LinearRegression().fit(X_n1, y_n1)

n1_details = make_coef_df(lin_n1, X_n1)
n1_details

Unnamed: 0,red,blue,intercept
0,1.0,2.0,0.633161


In [159]:
# Green is the reference level, so its importance is the intercept itself;
# red and blue are their coefficients shifted by that intercept.
n1_importance = n1_details.rename(columns={'intercept': 'green'})
n1_importance[['red', 'blue']] = (
    n1_details[['red', 'blue']].add(n1_details['intercept'], axis='index')
)
n1_importance

Unnamed: 0,red,blue,green
0,1.633161,2.633161,0.633161


# Encoding: 1 - n - 1
- Green is left out as a variable, and green observations are coded `-1` for red and blue instead of `0`. From Steve: this centers the effects of all the dummy variables around the middle value, instead of choosing one variable as zero and measuring all the others relative to it (e.g. green as the reference in the N - 1 example above). With equally sized groups, as here, the intercept is the average of the three color levels.

In [160]:
# 1 - n - 1 strategy with statsmodels: red and blue codes (green rows are -1
# on both) plus an explicit constant column.
y_new = new['y']
X_new = sm.add_constant(new[['red', 'blue']])

est_new = sm.OLS(endog=y_new, exog=X_new).fit()
est_new.summary()

  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,y,R-squared:,0.876
Model:,OLS,Adj. R-squared:,0.867
Method:,Least Squares,F-statistic:,95.25
Date:,"Tue, 16 Jul 2019",Prob (F-statistic):,5.86e-13
Time:,10:16:44,Log-Likelihood:,-7.1785
No. Observations:,30,AIC:,20.36
Df Residuals:,27,BIC:,24.56
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.6332,0.059,27.608,0.000,1.512,1.755
red,-1.665e-16,0.084,-1.99e-15,1.000,-0.172,0.172
blue,1.0000,0.084,11.953,0.000,0.828,1.172

0,1,2,3
Omnibus:,3.74,Durbin-Watson:,2.013
Prob(Omnibus):,0.154,Jarque-Bera (JB):,2.678
Skew:,-0.573,Prob(JB):,0.262
Kurtosis:,2.089,Cond. No.,1.73


In [165]:
# 1 - n - 1 strategy with scikit-learn: red and blue codes as the only features.
y_new = new['y']
X_new = new[['red', 'blue']]

# fit() returns the estimator itself, so construction and fitting can be chained.
lin_new = LinearRegression().fit(X_new, y_new)

new_details = make_coef_df(lin_new, X_new)
new_details

Unnamed: 0,red,blue,intercept
0,2.095702e-16,1.0,1.633161


#### importances
- red = red + intercept
- blue = blue + intercept
- green = intercept - red - blue

In [190]:
# Red and blue: coefficient + intercept. Green rows are coded -1 on both
# columns, so green's level is the intercept minus both coefficients.
new_importance = pd.DataFrame({
    'red': new_details['red'] + new_details['intercept'],
    'blue': new_details['blue'] + new_details['intercept'],
    'green': new_details['intercept'] - new_details['red'] - new_details['blue'],
})
new_importance

Unnamed: 0,red,blue,green
0,1.633161,2.633161,0.633161
