In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression as lr
from statsmodels.regression.mixed_linear_model import MixedLM as mlm
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [3]:
# Inject a CSS rule that stretches elements with the `.container` class to the full
# page width. `display` and `HTML` come from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [4]:
# Open questions for this notebook, kept as a bare string literal. Because the string
# is the last expression in the cell, its value is echoed as the cell output.
# NOTE(review): the recorded output for this cell also lists "Review HLM theory" and
# "Understand random effects", which are no longer in the string -- the saved output
# is stale and will change on re-run.
'''
ToDo:
Read up on demeaning and test with continuous feature
Understand why coefficient for cooler is sensitive to sample size by group but not to cooler assignment probability
'''

'\nToDo:\nReview HLM theory\nUnderstand random effects\nRead up on demeaning and test with continuous feature\nUnderstand why coefficient for cooler is sensitive to sample size by group but not to cooler assignment probability\n'

# Building simulated dataset
### Example from https://www.youtube.com/watch?v=HPC42U9xtQY

In [14]:
# --- Simulation parameters ---------------------------------------------------
n1 = 50000                 # rows in the nongold group
n2 = n1 // 2               # rows in the gold group (half as many -> imbalanced groups)
p1 = [2/3, 1/3]            # [P(w_cooler=0), P(w_cooler=1)] used for nongold
p2 = [1/3, 2/3]            # [P(w_cooler=0), P(w_cooler=1)] used for gold
p3 = [0.5, 0.5]            # balanced assignment; not used below, swap in to experiment
nongold_base_profit = 21
nongold_cooler_lift = 2
gold_base_profit = 30
gold_cooler_lift = 5
# Multiplies a standard-normal draw, so it acts as the noise standard deviation
# (equal to the variance only while it is 1).
noise_var = 1
random_seed = 42           # fixed seed so a fresh Restart & Run All reproduces the data


def simulate_group(n, gold_flag, base_profit, cooler_lift, assignment_probs, rng,
                   noise_scale=noise_var):
    """Simulate one group of outlets.

    Parameters
    ----------
    n : int
        Number of rows to generate.
    gold_flag : int
        Value stored in the ``X_gold`` column (1 for gold, 0 for nongold).
    base_profit : float
        Mean profit without a cooler.
    cooler_lift : float
        Amount added to profit for rows with ``w_cooler == 1``.
    assignment_probs : sequence of two floats
        ``[P(w_cooler=0), P(w_cooler=1)]``.
    rng : numpy.random.Generator
        Source of randomness.
    noise_scale : float
        Multiplier applied to the standard-normal noise.

    Returns
    -------
    pandas.DataFrame
        Columns ``y_profit``, ``X_gold``, ``w_cooler`` (in that order).
    """
    w_cooler = rng.choice([0, 1], size=n, p=assignment_probs)
    noise = noise_scale * rng.standard_normal(n)
    return pd.DataFrame({
        # Noise is subtracted, as in the original formulation; the sign is
        # immaterial for symmetric noise.
        'y_profit': base_profit - noise + cooler_lift * w_cooler,
        'X_gold': gold_flag,
        'w_cooler': w_cooler,
    })


rng = np.random.default_rng(random_seed)

# nongold = poorer area aka area expecting less lift from cooler
data_nongold = simulate_group(n1, 0, nongold_base_profit, nongold_cooler_lift, p1, rng)

# gold = richer area aka expecting greater lift from cooler
data_gold = simulate_group(n2, 1, gold_base_profit, gold_cooler_lift, p2, rng)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# ignore_index=True gives the stacked frame unique row labels instead of two
# overlapping 0..n ranges.
data = pd.concat([data_gold, data_nongold], ignore_index=True)
print(data.shape)

(75000, 3)


In [15]:
### Interaction terms ###
## TODO: repeat with a continuous moderator to find out when demeaning is needed

# Standard (non-demeaned) interaction: the plain product of the treatment dummy and
# the group dummy. The column keeps the name `demeaned_interaction` because the
# regression cells select it by that name.
# Author's observation: the coefficients on w_cooler and on this interaction do not
# change with the sample-size imbalance across groups and are the desired values
# (poor / rich).
data_reg = data.assign(
    demeaned_interaction=lambda frame: frame['w_cooler'].mul(frame['X_gold'])
)

# Demeaned variant (labelled 'CORRECT' by the author), kept for reference:
# data_reg = data.assign(
#     demeaned_interaction=lambda frame:
#     frame['w_cooler'].mul(frame['X_gold'] - frame['X_gold'].mean()))

In [40]:
# The raw difference in conditional means is misleading when treatment is assigned
# unevenly across gold/nongold: more coolers are assigned to the 'rich' group (which
# has the higher base profit), and that artificially inflates the apparent lift.
has_cooler = data_reg.w_cooler == 1
no_cooler = data_reg.w_cooler == 0
mean_profit_with = data_reg.loc[has_cooler, 'y_profit'].mean()
mean_profit_without = data_reg.loc[no_cooler, 'y_profit'].mean()

print(f'Average cooler effect (b/w gold and nongold): {(nongold_cooler_lift + gold_cooler_lift)/2}')
print(f'Difference in conditional mean (w/ cooler - w/o cooler): {mean_profit_with - mean_profit_without}')

Average cooler effect (b/w gold and nongold): 3.5
Difference in conditional mean (w/ cooler - w/o cooler): 6.214059589944174


In [41]:
# Additive OLS fit (no interaction term): y_profit on X_gold and w_cooler.
# Open question from the author: the w_cooler coefficient equals the mean of the two
# treatment effects when the groups are the same size -- why?
additive_features = ['X_gold', 'w_cooler']
lm = lr().fit(data_reg[additive_features], data_reg['y_profit'])

In [42]:
# Coefficients come back in the order of the fitted columns, then the intercept
# on its own line.
print(lm.coef_, lm.intercept_, sep='\n')

[10.66987958  2.98234688]
20.680272191017444


In [353]:
# OLS fit with the treatment x group interaction column added; rebinds `lm`.
# Author's note (same wording as on the additive fit): the w_cooler coefficient is
# the mean of the groups when samples are equal size.
# NOTE(review): with the plain w_cooler * X_gold product in the model, the w_cooler
# coefficient should be the lift for the X_gold == 0 group rather than an average
# -- confirm and update this note.
interaction_features = ['X_gold', 'w_cooler', 'demeaned_interaction']
lm = lr().fit(data_reg[interaction_features], data_reg['y_profit'])

In [354]:
# Coefficients of the most recently fitted `lm`, in the order of its feature columns.
print(lm.coef_)
# Bare last expression: the intercept is shown as the cell's output value.
lm.intercept_

[4.99605183 2.99395619 2.00861648]


19.99994424125624

In [310]:
# Store the fitted values on the frame. All three regressors are 0/1 and the
# interaction is determined by the other two, so there can be at most four distinct
# predictions -- one per (group, treatment) cell.
pred_features = ['X_gold', 'w_cooler', 'demeaned_interaction']
data_reg['pred'] = lm.predict(data_reg[pred_features])
data_reg['pred'].unique()

array([29.99061876, 25.00330994, 20.0006572 , 22.99105811])

# Fitting HLM/Mixed Effects Model with random intercepts and random slopes for cooler (one intercept and one slope per group: gold and nongold)
### https://www.statsmodels.org/stable/examples/notebooks/generated/mixed_lm_example.html

In [24]:
# Add an explicit constant column so the design matrix handed to MixedLM carries an
# intercept term next to the treatment dummy.
data_reg['intercept'] = 1
exog = data_reg.loc[:, ['intercept', 'w_cooler']]

In [25]:
# Mixed model grouped by X_gold. The same [intercept, w_cooler] matrix is used for
# the fixed effects and for the random effects, i.e. every group gets its own random
# intercept and its own random w_cooler slope around the fixed ("grand") values.
# Note that X_gold only takes two values, so there are just two groups.
md = sm.MixedLM(
    endog=data_reg['y_profit'],
    exog=exog,
    groups=data_reg['X_gold'],
    exog_re=exog,
)
mdf = md.fit()
print(mdf.summary())

               Mixed Linear Model Regression Results
Model:               MixedLM    Dependent Variable:    y_profit    
No. Observations:    75000      Method:                REML        
No. Groups:          2          Scale:                 1.0003      
Min. group size:     25000      Likelihood:            -106449.9472
Max. group size:     50000      Converged:             Yes         
Mean group size:     37500.0                                       
-------------------------------------------------------------------
                         Coef.  Std.Err.   z    P>|z| [0.025 0.975]
-------------------------------------------------------------------
intercept                25.510    2.427 10.509 0.000 20.752 30.267
w_cooler                  3.485    1.046  3.331 0.001  1.435  5.536
intercept Var            11.785    7.056                           
intercept x w_cooler Cov  2.878    2.219                           
w_cooler Var              2.189    0.689                       



# Comparing random+fixed effects with data generating params

In [35]:
# Per-group random effects from the fitted mixed model; the cells below index it first
# by group label (0 = nongold, 1 = gold) and then by effect name ("intercept", "w_cooler").
# NOTE(review): the name `re` would shadow the standard-library `re` module if that is
# ever imported in this notebook; it is kept because the following cells use it.
re = mdf.random_effects

In [31]:
# Group-specific intercept = fixed ("grand") intercept + the group's random intercept.
# The fixed effect is looked up by label: an integer key on a label-indexed Series is
# treated as a position only through a deprecated pandas fallback.
print(f'Random intercept for nongold + grand intercept: {re[0]["intercept"] + mdf.params["intercept"]}')
print(f'Nongold base profit: {nongold_base_profit}')

Random intercept for nongold + grand intercept: 21.01395598683601
Nongold base profit: 21


In [34]:
# Group-specific intercept = fixed ("grand") intercept + the group's random intercept.
# The fixed effect is looked up by label: an integer key on a label-indexed Series is
# treated as a position only through a deprecated pandas fallback.
print(f'Random intercept for gold + grand intercept: {re[1]["intercept"] + mdf.params["intercept"]}')
print(f'Gold base profit: {gold_base_profit}')

Random intercept for gold + grand intercept: 30.005595251403665
Gold base profit: 30


In [38]:
# Group-specific cooler effect = fixed ("grand") w_cooler slope + the group's random slope.
# The printed label used to say "grand intercept" although the fixed slope is what is
# added here. The fixed effect is looked up by label instead of by integer position
# (positional integer keys on a label-indexed Series are deprecated in pandas).
print(f'Random slope for nongold + grand slope: {re[0]["w_cooler"] + mdf.params["w_cooler"]}')
print(f'Nongold cooler lift: {nongold_cooler_lift}')

Random slope for nongold + grand intercept: 1.9770518839119195
Nongold cooler lift: 2


In [37]:
# Group-specific cooler effect = fixed ("grand") w_cooler slope + the group's random slope.
# The printed label used to say "grand intercept" although the fixed slope is what is
# added here. The fixed effect is looked up by label instead of by integer position
# (positional integer keys on a label-indexed Series are deprecated in pandas).
print(f'Random slope for gold + grand slope: {re[1]["w_cooler"] + mdf.params["w_cooler"]}')
print(f'Gold cooler lift: {gold_cooler_lift}')

Random slope for gold + grand intercept: 4.993932686293157
Gold cooler lift: 5
