# 1章 セレクションバイアスとRCT
- 1.4: Rによるメールマーケティングの効果の検証
    - 上記の節の内容をPythonで再実装します

In [1]:
import pandas as pd
import numpy as np
from scipy import stats

import warnings
# Install a global 'ignore' filter so no warning is printed in any later cell.
# NOTE(review): this hides every warning category, including deprecation
# notices from the libraries used below, if any are raised.
warnings.filterwarnings('ignore')



## 1.4.1 RCTを行ったデータの準備

In [2]:
# E-mail marketing data collected by running an RCT on the users of an
# e-commerce site.
dataset = 'http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv'

# Read the CSV straight from the URL into a DataFrame.
email_data = pd.read_csv(filepath_or_buffer=dataset)

# Print the (rows, columns) shape, then preview the first five rows.
print('dataset shape: ', email_data.shape)
email_data.head(n=5)

dataset shape:  (64000, 12)


Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0
4,2,1) $0 - $100,45.34,1,0,Urban,0,Web,Womens E-Mail,0,0,0.0


### 女性向けメールが配信されたデータを削除

In [3]:
# Drop the rows whose segment is 'Womens E-Mail', keeping every other row.
# .copy() makes male_df an independent DataFrame, so columns added to it
# later are not written onto a slice of email_data (the pattern pandas
# reports as SettingWithCopyWarning).
is_not_womens_mail = email_data['segment'] != 'Womens E-Mail'
male_df = email_data.loc[is_not_womens_mail].copy()
print('dataset shape: ', male_df.shape)
male_df.head()

dataset shape:  (42613, 12)


Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0
8,9,5) $500 - $750,675.07,1,1,Rural,1,Phone,Mens E-Mail,0,0,0.0
13,2,2) $100 - $200,101.64,0,1,Urban,0,Web,Mens E-Mail,1,0,0.0
14,4,3) $200 - $350,241.42,0,1,Rural,1,Multichannel,No E-Mail,0,0,0.0


### 介入を表すtreatment変数を追加

In [4]:
# Add the intervention indicator `treatment`: 1 when segment is
# 'Mens E-Mail', 0 otherwise.
# The comparison is vectorised instead of calling a Python lambda per row,
# and assign() returns a new DataFrame, so nothing is written in place onto
# a frame that may be a slice of email_data. The dtype is pinned to int64 to
# match what the row-wise version produced.
male_df = male_df.assign(
    treatment=(male_df['segment'] == 'Mens E-Mail').astype('int64')
)
male_df.head()

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0,0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0,1
8,9,5) $500 - $750,675.07,1,1,Rural,1,Phone,Mens E-Mail,0,0,0.0,1
13,2,2) $100 - $200,101.64,0,1,Urban,0,Web,Mens E-Mail,1,0,0.0,1
14,4,3) $200 - $350,241.42,0,1,Rural,1,Multichannel,No E-Mail,0,0,0.0,0


## 1.4.2 RCTデータの集計と有意差検定

### 集計: グループごとのconversionの平均・spendの平均・データ数

In [5]:
# Per-treatment-group summary: mean of conversion, mean of spend, and the
# number of rows in each group.
# Named aggregation binds each output column to its source column and
# reducer explicitly, so the result no longer depends on renaming columns by
# position. String reducers replace the NumPy callables (np.mean /
# np.ma.count), which recent pandas versions deprecate inside agg().
# 'size' counts every row of the group, as np.ma.count did.
summary_by_segment = male_df.groupby(['treatment']).agg(
    conversion_rate=('conversion', 'mean'),
    spend_mean=('spend', 'mean'),
    count=('visit', 'size'),
)
summary_by_segment

Unnamed: 0_level_0,conversion_rate,spend_mean,count
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.005726,0.652789,21306
1,0.012531,1.422617,21307


### 有意差検定

In [6]:
# (a) Spend values of the group that was sent the men's e-mail (treatment == 1).
treated_rows = male_df['treatment'] == 1
mens_mail = male_df.loc[treated_rows, 'spend'].to_numpy()

# (b) Spend values of the group that was sent no e-mail (treatment == 0).
control_rows = male_df['treatment'] == 0
no_mail = male_df.loc[control_rows, 'spend'].to_numpy()

# Two-sample t-test on the difference between the means of (a) and (b),
# with equal variances assumed.
stats.ttest_ind(
    mens_mail,
    no_mail,
    equal_var=True,
)

Ttest_indResult(statistic=5.300090294465472, pvalue=1.163200872605869e-07)

## 1.4.3 バイアスのあるデータによる効果の検証

### バイアスのあるデータの準備

In [7]:
# Fraction of the selected rows that is sampled and removed from each group.
sample_size_rate = 0.5

# Split the data into the treated (e-mailed) and control (not e-mailed) groups.
obs_t = male_df[male_df['treatment'] == 1]
obs_c = male_df[male_df['treatment'] == 0]

# Control group: among rows with history > 300, recency < 6 or
# channel == 'Multichannel', sample the given fraction and drop those rows.
c_matches = (
    (obs_c['history'] > 300)
    | (obs_c['recency'] < 6)
    | (obs_c['channel'] == 'Multichannel')
)
c_drop_index = obs_c[c_matches].sample(
    frac=sample_size_rate, random_state=1
).index
obs_c_biased = obs_c.drop(index=c_drop_index)

# Treated group: among rows that do NOT satisfy the same condition, sample
# the given fraction and drop those rows.
t_matches = (
    (obs_t['history'] > 300)
    | (obs_t['recency'] < 6)
    | (obs_t['channel'] == 'Multichannel')
)
t_drop_index = obs_t[~t_matches].sample(
    frac=sample_size_rate, random_state=1
).index
obs_t_biased = obs_t.drop(index=t_drop_index)

# Stack the two thinned groups, control rows first.
biased_data = pd.concat([obs_c_biased, obs_t_biased])
biased_data.head()

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0,0
15,3,1) $0 - $100,58.13,1,0,Urban,1,Web,No E-Mail,1,0,0.0,0
20,9,1) $0 - $100,29.99,0,1,Surburban,1,Phone,No E-Mail,0,0,0.0,0
23,2,1) $0 - $100,29.99,0,1,Urban,1,Phone,No E-Mail,0,0,0.0,0
28,7,4) $350 - $500,435.73,0,1,Urban,1,Web,No E-Mail,0,0,0.0,0


### バイアスのあるデータの集計と有意差の検定

In [8]:
# Per-treatment-group summary of the biased data: mean of conversion, mean
# of spend, and the number of rows in each group.
# Named aggregation binds each output column to its source column and
# reducer explicitly, so the result no longer depends on renaming columns by
# position. String reducers replace the NumPy callables (np.mean /
# np.ma.count), which recent pandas versions deprecate inside agg().
# 'size' counts every row of the group, as np.ma.count did.
summary_by_segment_biased = biased_data.groupby(['treatment']).agg(
    conversion_rate=('conversion', 'mean'),
    spend_mean=('spend', 'mean'),
    count=('visit', 'size'),
)
summary_by_segment_biased

Unnamed: 0_level_0,conversion_rate,spend_mean,count
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.00576,0.697579,14756
1,0.012989,1.501725,17169


In [9]:
# (a) Spend values of the group that was sent the men's e-mail (treatment == 1).
treated_rows_biased = biased_data['treatment'] == 1
mens_mail_biased = biased_data.loc[treated_rows_biased, 'spend'].to_numpy()

# (b) Spend values of the group that was sent no e-mail (treatment == 0).
control_rows_biased = biased_data['treatment'] == 0
no_mail_biased = biased_data.loc[control_rows_biased, 'spend'].to_numpy()

# Two-sample t-test on the difference between the means of (a) and (b),
# with equal variances assumed.
stats.ttest_ind(
    mens_mail_biased,
    no_mail_biased,
    equal_var=True,
)

Ttest_indResult(statistic=4.560664008782925, pvalue=5.118147589954738e-06)

In [10]:
# Same test with statsmodels: besides the t statistic and the p-value, it
# also returns the degrees of freedom.
from statsmodels.stats.weightstats import ttest_ind

ttest_ind(
    x1=mens_mail_biased,
    x2=no_mail_biased,
    alternative='two-sided',
    usevar='pooled',
)

(4.560664008782937, 5.118147589954427e-06, 31923.0)