# 2章 介入効果を測るための回帰分析

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

import warnings
warnings.filterwarnings('ignore')



In [21]:
# E-mail marketing data; per the original note, the campaign was run as an RCT
# on users of an e-commerce site.
dataset = 'http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv'

email_data = pd.read_csv(dataset)
print('dataset shape: ', email_data.shape)

# Keep every row except the "Womens E-Mail" segment.
# Take an explicit copy: a column is added below, and assigning onto a
# boolean-indexed slice of email_data is the chained-assignment pattern that
# raises SettingWithCopyWarning (silenced in this notebook by the global
# warnings filter, so the problem would otherwise go unnoticed).
male_df = email_data[email_data['segment'] != 'Womens E-Mail'].copy()
print('dataset shape: ', male_df.shape)

# Treatment indicator: 1 when segment is "Mens E-Mail", 0 otherwise.
# Vectorised comparison instead of a per-row Python lambda.
male_df['treatment'] = (male_df['segment'] == 'Mens E-Mail').astype(int)

# Fraction of the selected rows that is discarded in each group.
sample_size_rate = 0.5

# Split the data by treatment indicator.
obs_t = male_df.loc[male_df['treatment'].eq(1)]
obs_c = male_df.loc[male_df['treatment'].eq(0)]

# Group that was NOT sent the e-mail: among rows with history > 300,
# recency < 6 or channel == 'Multichannel', randomly drop `sample_size_rate`
# of them (fixed seed for reproducibility).
obs_c_biased = obs_c.drop(
    index=obs_c.loc[
        lambda df: (df['history'] > 300)
        | (df['recency'] < 6)
        | (df['channel'] == 'Multichannel')
    ].sample(frac=sample_size_rate, random_state=10).index
)

# Group that WAS sent the e-mail: apply the same random drop, but to the rows
# that do NOT satisfy that condition.
obs_t_biased = obs_t.drop(
    index=obs_t.loc[
        lambda df: ~(
            (df['history'] > 300)
            | (df['recency'] < 6)
            | (df['channel'] == 'Multichannel')
        )
    ].sample(frac=sample_size_rate, random_state=10).index
)

# Stack both groups (control first) into the selection-biased data set.
biased_data = pd.concat([obs_c_biased, obs_t_biased], axis=0)
biased_data.head()

dataset shape:  (64000, 12)
dataset shape:  (42613, 12)


Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment
14,4,3) $200 - $350,241.42,0,1,Rural,1,Multichannel,No E-Mail,0,0,0.0,0
15,3,1) $0 - $100,58.13,1,0,Urban,1,Web,No E-Mail,1,0,0.0,0
20,9,1) $0 - $100,29.99,0,1,Surburban,1,Phone,No E-Mail,0,0,0.0,0
23,2,1) $0 - $100,29.99,0,1,Urban,1,Phone,No E-Mail,0,0,0.0,0
24,4,1) $0 - $100,78.24,1,0,Surburban,0,Web,No E-Mail,0,0,0.0,0


## 2.1.5 Rによるメールマーケティングデータの分析（回帰編）

In [22]:
# OLS fit on the biased sample: spend explained by the treatment indicator
# with history as an additional regressor.
biased_reg = smf.ols('spend ~ treatment + history', data=biased_data).fit()

In [23]:
# Display the full regression summary of the model fitted above.
biased_reg.summary()

0,1,2,3
Dep. Variable:,spend,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,20.69
Date:,"Sat, 26 Jun 2021",Prob (F-statistic):,1.04e-09
Time:,22:50:11,Log-Likelihood:,-133050.0
No. Observations:,31925,AIC:,266100.0
Df Residuals:,31922,BIC:,266100.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.3611,0.146,2.466,0.014,0.074,0.648
treatment,0.8768,0.177,4.949,0.000,0.530,1.224
history,0.0012,0.000,3.366,0.001,0.000,0.002

0,1,2,3
Omnibus:,69664.885,Durbin-Watson:,1.997
Prob(Omnibus):,0.0,Jarque-Bera (JB):,310825433.403
Skew:,20.079,Prob(JB):,0.0
Kurtosis:,484.72,Cond. No.,826.0


## 2.2.1 共変量の追加による効果への作用

### RCTデータでの単回帰

In [24]:
# Simple regression of spend on the treatment indicator, fitted on male_df
# (the data before any rows are dropped to introduce bias).
rct_reg = smf.ols('spend ~ treatment', data=male_df).fit()

In [25]:
# Display only the second summary table (index 1): the coefficient estimates.
rct_reg.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6528,0.103,6.356,0.000,0.451,0.854
treatment,0.7698,0.145,5.300,0.000,0.485,1.055


### バイアスのあるデータでの単回帰
- セレクションバイアスにより，効果が過剰に推定されている

In [26]:
# Same simple regression of spend on the treatment indicator, but fitted on
# the selection-biased sample.
nonrct_reg = smf.ols('spend ~ treatment', data=biased_data).fit()

In [27]:
# Display only the second summary table (index 1): the coefficient estimates.
nonrct_reg.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.5968,0.129,4.640,0.000,0.345,0.849
treatment,0.9614,0.175,5.481,0.000,0.618,1.305


### バイアスのあるデータでの重回帰
- 共変量を追加することで，セレクションバイアスの影響がより少なくなる

In [28]:
# Multiple regression on the biased sample: the treatment indicator plus the
# covariates recency, channel and history.
nonrct_mreg = smf.ols(
    'spend ~ treatment + recency + channel + history',
    data=biased_data,
).fit()

In [29]:
# Display only the second summary table (index 1): the coefficient estimates.
nonrct_mreg.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.3047,0.383,0.795,0.427,-0.447,1.056
channel[T.Phone],0.1507,0.308,0.489,0.625,-0.453,0.755
channel[T.Web],0.4123,0.307,1.341,0.180,-0.190,1.015
treatment,0.8383,0.181,4.627,0.000,0.483,1.193
recency,-0.0318,0.026,-1.209,0.227,-0.083,0.020
history,0.0012,0.000,3.154,0.002,0.000,0.002
