In [383]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression as lr
from statsmodels.regression.mixed_linear_model import MixedLM as mlm
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [378]:
# Inject a CSS rule that sets the `.container` element to full width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
'''
ToDo:
Review HLM theory
Understand random effects
Read up on demeaning and test with continuous feature
Understand why coefficient for cooler is sensitive to sample size by group but not to cooler assignment probability
'''

In [359]:
# Simulate profit for two customer tiers with a binary "cooler" treatment whose
# effect differs by tier. The draws are unseeded, so values change on every run.
n1=50000        # number of non-gold rows
n2=n1 // 2      # number of gold rows (half as many)

# Candidate assignment distributions, given as [P(w_cooler=0), P(w_cooler=1)].
# Both groups below currently draw from p1; p2 and p3 are alternatives to swap in.
p1=[2/3,1/3]
p2=[1/3,2/3]
p3=[0.5,0.5]

# nongold=poor
# effect of cooler = 3
data_nongold = pd.DataFrame({
    'y_profit': 20 - 0.5*np.random.randn(n1),
    'X_gold':0,
    'w_cooler': np.random.choice([0,1], size=(n1,), p=p1)
    }).assign(y_profit=lambda df: np.where(df.w_cooler, df.y_profit+3, df.y_profit))

# gold=rich
# effect of cooler is 5
data_gold = pd.DataFrame({
    'y_profit': 25 - 0.5*np.random.randn(n2),
    'X_gold':1,
    'w_cooler': np.random.choice([0,1], size=(n2,), p=p1)
    }).assign(y_profit=lambda df: np.where(df.w_cooler, df.y_profit+5, df.y_profit))

# DataFrame.append was removed in pandas 2.0. pd.concat stacks gold rows first,
# then non-gold, and keeps each frame's own row labels (no ignore_index), which
# is exactly what the old append call produced.
data = pd.concat([data_gold, data_nongold])
print(data.shape)

(75000, 3)


In [349]:
### Interaction terms ###
## TODO: try continuous variables to determine whether demeaning is necessary

# With the standard interaction below, the coefficients for w_cooler and for the
# interaction column are the same regardless of sample-size imbalance across
# groups, and are the desired values (poor / rich).
# Standard interaction: plain product of the treatment and group indicators.
# The column is still called `demeaned_interaction` because later cells refer to
# it by that name, whichever variant is active.
data_reg = data.assign(
    demeaned_interaction=data.w_cooler.mul(data.X_gold))

# 'CORRECT' (demeaned) variant:
# data_reg = data.assign(
#     demeaned_interaction=data.w_cooler.mul(data.X_gold - data.X_gold.mean()))

In [350]:
# Naive estimate: mean profit of treated rows minus mean profit of untreated rows.
# When treatment assignment is imbalanced across groups (p1 for one, p2 for the
# other), this difference belies the true average effect, 4 (it yields 6 when
# the sample size is equal for both groups).
# If more coolers are assigned to the 'rich' group, the effect of the coolers is
# artificially inflated (using equal probabilities for both cooler
# distributions yields 4).
# NOTE(review): the data cell as shown draws w_cooler with p1 for both groups;
# use p2 for one group to reproduce the imbalance described here.
(data_reg.loc[data_reg.w_cooler == 1, 'y_profit'].mean()
 - data_reg.loc[data_reg.w_cooler == 0, 'y_profit'].mean())

5.506057320884718

In [351]:
# Main-effects-only OLS: profit on the group and treatment indicators.
# Open question: the w_cooler coefficient is the mean of the two treatment
# effects when the groups have equal sample sizes - why?
lm = lr().fit(X=data_reg.loc[:, ['X_gold', 'w_cooler']],
              y=data_reg['y_profit'])

In [352]:
# Coefficient vector first, intercept on the following line.
print(lm.coef_, lm.intercept_, sep='\n')

[6.11679798 3.66120352]
19.777056942949343


In [353]:
# OLS with the treatment-by-group interaction column added.
# w_cooler coef is mean of groups when samples are equal size
lm = lr().fit(X=data_reg.loc[:, ['X_gold', 'w_cooler', 'demeaned_interaction']],
              y=data_reg['y_profit'])

In [354]:
# Coefficients follow the order of the feature list passed to fit; assuming `lm`
# is the three-feature fit, that is [X_gold, w_cooler, demeaned_interaction].
print(lm.coef_)
# Left as a bare last expression so the notebook shows it as the cell output.
lm.intercept_

[4.99605183 2.99395619 2.00861648]


19.99994424125624

In [310]:
# Store fitted values on data_reg (mutates the frame in place), then list the
# distinct predictions. All three features are binary, so there is at most one
# fitted value per (X_gold, w_cooler) combination.
data_reg['pred'] = lm.predict(
    data_reg.loc[:, ['X_gold', 'w_cooler', 'demeaned_interaction']])
data_reg['pred'].unique()

array([29.99061876, 25.00330994, 20.0006572 , 22.99105811])

In [311]:
# First ten rows of data_reg, for a quick look at the columns.
data_reg.head(10)

Unnamed: 0,y_profit,X_gold,w_cooler,demeaned_interaction,pred
0,29.843811,1,1,1,29.990619
1,29.821441,1,1,1,29.990619
2,29.638247,1,1,1,29.990619
3,29.778221,1,1,1,29.990619
4,29.636182,1,1,1,29.990619
5,30.639803,1,1,1,29.990619
6,24.607369,1,0,0,25.00331
7,24.571288,1,0,0,25.00331
8,30.243053,1,1,1,29.990619
9,30.805136,1,1,1,29.990619


In [312]:
# Last ten rows of data_reg.
data_reg.tail(10)

Unnamed: 0,y_profit,X_gold,w_cooler,demeaned_interaction,pred
49990,19.744814,0,0,0,20.000657
49991,19.174592,0,0,0,20.000657
49992,22.877663,0,1,0,22.991058
49993,20.24799,0,0,0,20.000657
49994,19.759925,0,0,0,20.000657
49995,20.199899,0,0,0,20.000657
49996,19.875817,0,0,0,20.000657
49997,23.827834,0,1,0,22.991058
49998,23.479412,0,1,0,22.991058
49999,22.425026,0,1,0,22.991058


# HLM (hierarchical linear model / mixed-effects model)

In [366]:
# Mixed model built from raw arrays: profit on the treatment indicator, grouped
# by X_gold. exog holds only w_cooler (no constant column), so no fixed
# intercept is estimated here.
mod = mlm(endog=data_reg.y_profit.to_numpy(),
          exog=data_reg.w_cooler.to_numpy(),
          groups=data_reg.X_gold)
res = mod.fit()

In [367]:
# Render the fit summary: model header, coefficient table and group variance.
res.summary()

0,1,2,3
Model:,MixedLM,Dependent Variable:,y
No. Observations:,75000,Method:,REML
No. Groups:,2,Scale:,0.4483
Min. group size:,25000,Likelihood:,-76350.1439
Max. group size:,50000,Converged:,Yes
Mean group size:,37500.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
x1,3.661,0.005,705.742,0.000,3.651,3.671
Group Var,102.628,50.147,,,,


In [370]:
# Raw parameter vector: the x1 coefficient followed by the variance parameter.
# NOTE(review): in the recorded output the last entry (228.95) is about the
# summary's Group Var (102.628) divided by Scale (0.4483) - presumably params
# stores the variance relative to the residual scale; confirm in statsmodels docs.
res.params

array([  3.6612098 , 228.95212851])

In [372]:
# Per-group random-effect estimates, keyed by the X_gold value (0 / 1).
# This model has no fixed intercept, so in the recorded output these sit near
# each group's baseline profit level rather than near zero.
res.random_effects

{0: Group Var    19.777053
 dtype: float64, 1: Group Var    25.893846
 dtype: float64}

In [374]:
# Same grouping via the formula interface; the formula adds a fixed Intercept
# term alongside w_cooler (see the summary table).
md = smf.mixedlm(formula="y_profit ~ w_cooler",
                 data=data_reg,
                 groups=data_reg.X_gold)

mdf = md.fit()

print(mdf.summary())

          Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: y_profit   
No. Observations: 75000   Method:             REML       
No. Groups:       2       Scale:              0.4482     
Min. group size:  25000   Likelihood:         -76342.3102
Max. group size:  50000   Converged:          Yes        
Mean group size:  37500.0                                
---------------------------------------------------------
              Coef.  Std.Err.    z    P>|z| [0.025 0.975]
---------------------------------------------------------
Intercept     22.835    1.666  13.707 0.000 19.570 26.101
w_cooler       3.661    0.005 705.770 0.000  3.651  3.671
Group Var      5.551    4.894                            



In [375]:
# Fixed-effect estimates followed by the Group Var parameter.
# NOTE(review): the recorded Group Var here (12.38) is about the summary's value
# (5.551) divided by Scale (0.4482) - presumably stored relative to the
# residual scale; confirm in statsmodels docs.
mdf.params

Intercept    22.835452
w_cooler      3.661208
Group Var    12.383877
dtype: float64

In [376]:
# Deviation from 'grand intercept' for each group
# (dictionary keys are the X_gold values used as the grouping variable)
mdf.random_effects

{0: Group   -3.058392
 dtype: float64, 1: Group    3.05839
 dtype: float64}

In [398]:
# Add an explicit constant column (mutates data_reg in place) and build the
# two-column design matrix [intercept, w_cooler].
# NOTE(review): despite its name, `endog` holds regressors; the next cell passes
# it in the exog and exog_re positions of MixedLM. The name is kept because
# that cell references it.
data_reg['intercept'] = 1
endog = data_reg.loc[:, ['intercept', 'w_cooler']]

In [399]:
# Random intercept and random w_cooler slope per X_gold group: the same design
# matrix is used for the fixed effects (exog) and the random effects (exog_re).
md = sm.MixedLM(endog=data_reg.y_profit,
                exog=endog,
                groups=data_reg.X_gold,
                exog_re=endog)
mdf = md.fit()
print(mdf.summary())

               Mixed Linear Model Regression Results
Model:                MixedLM    Dependent Variable:    y_profit   
No. Observations:     75000      Method:                REML       
No. Groups:           2          Scale:                 0.2494     
Min. group size:      25000      Likelihood:            -54370.1622
Max. group size:      50000      Converged:             Yes        
Mean group size:      37500.0                                      
-------------------------------------------------------------------
                         Coef.  Std.Err.   z    P>|z| [0.025 0.975]
-------------------------------------------------------------------
intercept                22.498    1.496 15.039 0.000 19.566 25.430
w_cooler                  3.998    0.699  5.716 0.000  2.627  5.369
intercept Var             4.476    6.117                           
intercept x w_cooler Cov  1.405    2.227                           
w_cooler Var              0.978    1.299                       



In [400]:
# Per-group deviations for both random terms (intercept and w_cooler slope),
# keyed by the X_gold value.
mdf.random_effects

{0: intercept   -2.498026
 w_cooler    -1.004303
 dtype: float64, 1: intercept    2.498026
 w_cooler     1.004302
 dtype: float64}