<a href="https://colab.research.google.com/github/pgurazada/causal_inference/blob/master/case%20studies/lenta/tuned_Tlearner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q scikit-uplift

In [2]:
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor

from sklift.datasets import fetch_lenta

# Data

In [3]:
data = fetch_lenta()

In [4]:
Y = data['target_name']
X = data['feature_names']

In [5]:
Y

'response_att'

In [6]:
data_df = pd.concat([data['target'], data['treatment'], data['data']], axis=1)

In [7]:
data_df.sample(5)

Unnamed: 0,response_att,group,age,cheque_count_12m_g20,cheque_count_12m_g21,cheque_count_12m_g25,cheque_count_12m_g32,cheque_count_12m_g33,cheque_count_12m_g38,cheque_count_12m_g39,...,sale_sum_6m_g24,sale_sum_6m_g25,sale_sum_6m_g26,sale_sum_6m_g32,sale_sum_6m_g33,sale_sum_6m_g44,sale_sum_6m_g54,stdev_days_between_visits_15d,stdev_discount_depth_15d,stdev_discount_depth_1m
416034,0,control,28.0,2.0,1.0,2.0,4.0,9.0,14.0,1.0,...,537.94,97.48,408.13,705.64,989.43,428.89,623.84,0.0,,0.1262
308715,0,test,44.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,69.99,0.0,0.0,0.0
682369,0,test,43.0,1.0,0.0,2.0,0.0,1.0,0.0,0.0,...,151.17,83.98,352.36,0.0,104.01,176.22,23.49,0.0,0.1166,0.1034
677489,0,test,,7.0,2.0,1.0,1.0,2.0,1.0,0.0,...,217.27,7.59,97.09,188.04,88.58,464.2,274.82,0.7071,0.2925,0.2474
130555,1,control,24.0,1.0,0.0,1.0,4.0,2.0,1.0,1.0,...,109.99,39.99,154.76,495.34,275.23,89.58,122.27,1.0351,0.303,0.3196


In [8]:
data_df.group.value_counts()

test       515892
control    171137
Name: group, dtype: int64

In [9]:
group_map = {'test': 1, 'control': 0}
gender_map = {'Ж': 0, 'М': 1}

In [10]:
data_df['gender'] = data_df['gender'].map(gender_map)
data_df['treatment'] = data_df['group'].map(group_map)

# Overall Impact

In [11]:
(
    data_df.groupby('treatment')
           .agg({'response_att': 'mean'})
)

Unnamed: 0_level_0,response_att
treatment,Unnamed: 1_level_1
0,0.102579
1,0.110126


# T-Learner

Estimated CATE:

$$
\hat{\tau}(x) = E[Y|X=x, T=1]-E[Y|X=x, T=0]=\hat{\mu}_1(x, 1) - \hat{\mu}_0(x, 0)
$$

where $\hat{\mu}_0=M_0(Y^0 \sim X^0)$, $\hat{\mu}_1=M_1(Y^1 \sim X^1)$ are any machine learning algorithms that are estimated on control and treatment subsets of training data respectively.

## Base Learners

We choose gradient boosted regressors and classifiers as base learners through hyperparameter tuning over randomly chosen sets of feature combinations.

In [12]:
NUM_ITERATIONS = 5

In [13]:
train_df, test_df = train_test_split(
    data_df.drop(columns='group'), test_size=0.3, random_state=42
)

In [14]:
train_df.shape, test_df.shape

((480920, 195), (206109, 195))

In [15]:
target = 'response_att'

In [16]:
# Split data into treated and untreated
train_0_df = train_df[train_df['treatment'] == 0]
train_1_df = train_df[train_df['treatment'] == 1]

In [17]:
random_grid_params = {
    "max_depth": [2, 4, 6, 10, 12, 14, 16],
    "learning_rate": [0.001, 0.005, 0.01, 0.03, 0.1, 0.2, 0.3]
}

In [18]:
classifier_random_grid_0 = RandomizedSearchCV(
    HistGradientBoostingClassifier(),
    random_grid_params,
    scoring="accuracy",
    n_iter=NUM_ITERATIONS,
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

classifier_random_grid_0.fit(train_0_df.drop(columns=['response_att', 'treatment']), train_0_df[target])

Fitting 3 folds for each of 5 candidates, totalling 15 fits


In [19]:
tlearner_0 = classifier_random_grid_0.best_estimator_

In [20]:
tlearner_0

In [21]:
classifier_random_grid_1 = RandomizedSearchCV(
    HistGradientBoostingClassifier(),
    random_grid_params,
    scoring="accuracy",
    n_iter=NUM_ITERATIONS,
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

classifier_random_grid_1.fit(train_1_df.drop(columns=['response_att', 'treatment']), train_1_df[target])

Fitting 3 folds for each of 5 candidates, totalling 15 fits


In [22]:
tlearner_1 = classifier_random_grid_1.best_estimator_

In [23]:
tlearner_1

In [26]:
tlearner_te = (
    tlearner_1.predict_proba(test_df.drop(columns=['response_att', 'treatment']))[:, 1] -
    tlearner_0.predict_proba(test_df.drop(columns=['response_att', 'treatment']))[:, 1]
)

In [27]:
tlearner_te.mean()

0.006179403127399906