**This notebook is for:**
1. Testing CausalML (the `causalml` Python package).
   Reference walkthrough: https://antonsruberts.github.io/causalml-test/

In [7]:
# !pip install statsmodels --upgrade
# !pip install scipy==1.5.4 
# !pip install scikit-learn --upgrade
# !pip install numpy
# !pip install h5py
# !pip install typing-extensions
# !pip install wheel



In [9]:
!pip install --upgrade matplotlib



In [10]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from xgboost import XGBRegressor
import warnings

from causalml.inference.meta import LRSRegressor
from causalml.inference.meta import XGBTRegressor, MLPTRegressor
from causalml.inference.meta import BaseXRegressor, BaseRRegressor, BaseSRegressor, BaseTRegressor
from causalml.match import NearestNeighborMatch, MatchOptimizer, create_table_one
from causalml.propensity import ElasticNetPropensityModel
from causalml.dataset import *
from causalml.metrics import *

In [11]:
# import causalml.dataset as data
# dir(data)

In [None]:
# dataset = make_uplift_classification(n_samples=1000,
#                            treatment_name=['control', 'treatment1'],
#                            y_name='conversion',
#                            n_classification_features=10,
#                            n_classification_informative=5,
#                            n_uplift_increase_dict={'treatment1': 4},
#                            n_uplift_decrease_dict={'treatment1': 3},
#                            delta_uplift_increase_dict={'treatment1': 0.1},
#                            positive_class_proportion=0.1)

In [None]:
# Generate synthetic data using mode 1.
# Seed NumPy's global RNG first so that Restart & Run All reproduces the same sample.
# NOTE(review): synthetic_data is called without any seed argument and is presumed to
# draw from NumPy's global RNG - confirm against the installed causalml version.
np.random.seed(42)
y, X, treatment, tau, b, e = synthetic_data(mode=1, n=10000, p=8, sigma=1.0)
# y: outcome variable
# X: covariates
# treatment: treatment assignment passed to the learners below
# tau: individual treatment effect
# b: expected outcome
# e: propensity of receiving treatment

In [None]:
# Average treatment effect from the ready-made linear-regression S-learner.
learner_s = LRSRegressor()
ate_s = learner_s.estimate_ate(X=X, treatment=treatment, y=y)
print(ate_s)

# ate_s is indexed as (estimate, lower bound, upper bound); report the first
# entry of each with three decimals.
ate_captions = ('ATE estimate', 'ATE lower bound', 'ATE upper bound')
for caption, component in zip(ate_captions, ate_s):
    print(f'{caption}: {component[0]:.03f}')

In [None]:
# XGBTRegressor() vs. BaseTRegressor(learner=XGBRegressor()):
# XGBTRegressor() is equivalent to BaseTRegressor(learner=XGBRegressor()).
# With BaseTRegressor you can choose which model is used in the two-model (T-learner) approach.

In [None]:
# Estimate the ATE with three T-learner variants, in this order:
#   1. the ready-to-use XGB T-learner,
#   2. the generic base T-learner fed an XGBRegressor,
#   3. the generic base T-learner fed a LinearRegression.
t_learner_variants = [
    ('Using the ready-to-use XGBTRegressor class',
     XGBTRegressor()),
    ('\nUsing the BaseTRegressor class and using XGB (same result):',
     BaseTRegressor(learner=XGBRegressor())),
    ('\nUsing the BaseTRegressor class and using Linear Regression (different result):',
     BaseTRegressor(learner=LinearRegression())),
]

# `learner_t` / `ate_t` are rebound on each pass, so after the loop they hold
# the LinearRegression variant, as in the unrolled version of this cell.
for heading, learner_t in t_learner_variants:
    ate_t = learner_t.estimate_ate(X=X, treatment=treatment, y=y)
    print(heading)
    print(ate_t)

In [None]:
# Build the four learners up front, then fit each one and collect its
# per-row treatment-effect (CATE) predictions.
learner_s = LRSRegressor()                                # S-learner
learner_t = BaseTRegressor(learner=XGBRegressor())        # T-learner
learner_x = BaseXRegressor(learner=XGBRegressor())        # X-learner, propensity supplied
learner_x_no_p = BaseXRegressor(learner=XGBRegressor())   # X-learner, no propensity supplied

cate_s = learner_s.fit_predict(X=X, treatment=treatment, y=y)
cate_t = learner_t.fit_predict(X=X, treatment=treatment, y=y)

# `e` is the propensity returned by synthetic_data; it is passed to the first
# X-learner and omitted for the second.
cate_x = learner_x.fit_predict(X=X, treatment=treatment, y=y, p=e)
cate_x_no_p = learner_x_no_p.fit_predict(X=X, treatment=treatment, y=y)

In [None]:
# Preview the per-row error of the S-learner: true effect minus estimated CATE.
# Built from `cate_s` (defined in the previous cell) instead of `results[:, x]`:
# `results` and `x` are only created by later cells, so the original expression
# raised NameError on a fresh kernel. Only the first rows are shown to avoid
# dumping one value per sample.
np.subtract(tau, cate_s[:, 0])[:10].tolist()

In [None]:
# Stack the CATE estimates column-wise: one column per learner, in the same
# order as the keys of `labels`.
results = np.hstack([cate_s, cate_t, cate_x, cate_x_no_p])
labels = {0: 'single model lr',1:'two model XGBoost-reg',2: 'x model XGBoost-re w/ e', 3:'x model XGBoost-re wo/ e'}

# Per-learner error vectors (true effect minus estimate). The column count is
# taken from `labels` rather than hard-coded, and the unused list `A` is gone.
diff = [np.subtract(tau, results[:, col]) for col in range(len(labels))]
    

In [None]:
# Inspect column 3 of `results` - per the hstack order and `labels`, the
# X-learner fitted without a propensity input.
results[:,3]

In [None]:
# Histogram of the CATE estimates in columns 1-3 of `results`, one panel per
# learner, with the column-0 (S-learner) estimates overlaid as vertical lines.
fig, axs = plt.subplots(1,3, figsize = (16,6))
fig.subplots_adjust(hspace = 5)
axs = axs.ravel()

# Draw each distinct S-learner value once instead of one line per row;
# coincident lines look identical but are much slower to render.
s_learner_values = np.unique(results[:,0])

# The loop uses its own names (`col`, `cate_estimates`) so it no longer
# overwrites the outcome vector `y` that the learner cells above depend on.
for col, ax in enumerate(axs, start=1):
    cate_estimates = results[:,col]
    ax.hist(x = cate_estimates,bins=100,alpha=0.4)
    ax.vlines(x = s_learner_values, ymin = 0 , ymax = 1000)
    ax.set_title(f'{labels[col]} \n mean: {np.round(np.mean(cate_estimates),2)}')

In [None]:
# Histogram of the per-row error (true effect minus estimate) for each learner,
# titled with the mean absolute error.
fig, axs = plt.subplots(1,4, figsize = (16,6))
fig.subplots_adjust(hspace = 5)
axs = axs.ravel()

# The loop uses its own names (`col`, `errors`) so it no longer overwrites the
# outcome vector `y` that the learner cells above depend on.
for col, ax in enumerate(axs):
    errors = diff[col]
    ax.hist(x = errors,bins=100,alpha=0.4)
    ax.set_title(f'{labels[col]} - \n MAE: {np.round(np.mean(np.abs(errors)),2)}')


In [None]:
# https://github.com/uber/causalml/blob/master/examples/uplift_trees_with_synthetic_data.ipynb
from causalml.inference.tree import UpliftTreeClassifier

In [None]:
# Simulate an uplift dataset with a 'control' arm and a 'treatment1' arm and an
# outcome column named 'conversion'. The call returns the data plus the list
# of feature column names.
dataset, x_names = make_uplift_classification(
    n_samples=1000,
    treatment_name=['control', 'treatment1'],
    y_name='conversion',
    n_classification_features=10,
    n_classification_informative=5,
    n_uplift_increase_dict={'treatment1': 4},
    n_uplift_decrease_dict={'treatment1': 3},
    delta_uplift_increase_dict={'treatment1': 0.3},  # setting flagged by the author
    delta_uplift_decrease_dict={'treatment1': 0.1},
    positive_class_proportion=0.1,
)

In [None]:
# Hold out 20% of the rows for validation; random_state is fixed so the split
# is the same on every run.
df_train, df_test = train_test_split(
    dataset,
    test_size=0.2,
    random_state=111,
)

In [None]:
# Fit a single uplift tree on the training split, with 'control' named as the
# control group.
clf = UpliftTreeClassifier(control_name='control')
clf.fit(
    df_train[x_names].values,
    treatment=df_train['treatment_group_key'].values,
    y=df_train['conversion'].values,
)

# Score the held-out rows using the same feature columns.
p = clf.predict(df_test[x_names].values)

In [None]:
# Wrap the raw predictions in a frame whose columns are named after the
# classifier's classes, then show the first rows.
df_res = pd.DataFrame(data=p, columns=clf.classes_)
df_res.head()

In [None]:
# Mean difference between the 'treatment1' and 'control' prediction columns
# over the test rows (the quantity flagged by the author).
df_res['treatment1'].sub(df_res['control']).mean()

In [None]:
# For every test row, the name of the column holding the largest prediction.
best_treatment = df_res.idxmax(axis='columns')

In [None]:
# Display the per-row column label selected by idxmax in the previous cell.
best_treatment