# BPIC 2017 numeric data tryout with the treatments recommended in the study by Bozorgi et al.: https://arxiv.org/abs/2009.01561

## Dependencies

In [None]:
# Skip this cell on the first run. Execute it only if importing
# CI_Experiments in the next cell raises ModuleNotFoundError: it moves the
# working directory two levels up.
import os
os.chdir('../..')
os.getcwd()  # evaluates to the new working directory so it can be checked

In [None]:
import pandas as pd
import numpy as np
from ylearn.estimator_model.double_ml import DoubleML
from xgboost.sklearn import XGBRegressor
from ylearn.estimator_model.iv import NP2SLS
from CI_Experiments.config import PROJECT_DIR # if this line results ModuleNotFoundError, then execute the cell above

## Constants


In [None]:
# Input CSV files (prepared BPIC2017 numeric logs) and the directory the
# estimation results are written to, all resolved against PROJECT_DIR.
TRAIN_PATH = PROJECT_DIR / 'data/prepared_process_logs/BPIC2017/numeric/train.csv'
TEST_PATH = PROJECT_DIR / 'data/prepared_process_logs/BPIC2017/numeric/test.csv'
RESULT_PATH = PROJECT_DIR / 'experiments/results/BPIC2017/numeric'

## Data read

In [None]:
# Load the prepared train and test logs into DataFrames.
train_data = pd.read_csv(TRAIN_PATH)
test_data = pd.read_csv(TEST_PATH)

## Treat and control values preparation

In [None]:
def split_into_data_and_treat_and_control(data: pd.DataFrame):
    """Partition the columns of ``data`` by their name suffix.

    Columns whose name ends with ``'treatment'`` go to the ``'treat'`` frame,
    columns whose name ends with ``'control'`` go to the ``'control'`` frame,
    and every other column goes to the ``'data'`` frame. Column order within
    each frame follows the order in ``data``.

    Returns a dict with the keys ``'data'``, ``'treat'`` and ``'control'``,
    each mapping to the corresponding column subset of ``data``.
    """
    def bucket_of(name):
        # The 'treatment' suffix is checked first, then 'control'.
        if name.endswith('treatment'):
            return 'treat'
        if name.endswith('control'):
            return 'control'
        return 'data'

    buckets = {'data': [], 'treat': [], 'control': []}
    for name in data.columns:
        buckets[bucket_of(name)].append(name)

    return {key: data[names] for key, names in buckets.items()}

In [None]:
# Separate the feature columns from the *_treatment / *_control columns.
splitted_train_data = split_into_data_and_treat_and_control(train_data)
splitted_test_data = split_into_data_and_treat_and_control(test_data)

# From here on train_data / test_data hold only the feature columns; the
# treatment and control values are taken from the test split only.
train_data, test_data = splitted_train_data['data'], splitted_test_data['data']
treat_data, control_data = splitted_test_data['treat'], splitted_test_data['control']

In [None]:
# Preview the first rows of the training features.
train_data.head()

In [None]:
# Preview the first rows of the test features.
test_data.head()

In [None]:
# Preview the first rows of the *_treatment columns of the test split.
treat_data.head()

In [None]:
# Preview the first rows of the *_control columns of the test split.
control_data.head()

## Causal estimation

In [None]:
def estimate(train_data_, test_data_, treatment_: str, covariate_: list, instrument_: list, treat_, control_):
    """Estimate the average treatment effect with DoubleML and with NP2SLS.

    Both estimators are fitted on ``train_data_`` with ``'Outcome'`` as the
    outcome column and ``treatment_`` as a non-discrete treatment, and are then
    evaluated on ``test_data_`` for the given treat/control values.

    Parameters
    ----------
    train_data_
        Data the estimators are fitted on.
    test_data_
        Data the effects are estimated on.
    treatment_ : str
        Name of the treatment column.
    covariate_ : list
        Names of the covariate columns.
    instrument_ : list
        Names of the instrument columns, or ``None``. When given, they are
        passed to DoubleML as additional covariates and to NP2SLS as
        instruments. NOTE(review): whether ``NP2SLS.fit`` accepts
        ``instrument=None`` is not checked here.
    treat_, control_
        Treatment and control values forwarded to ``estimate`` of both models.

    Returns
    -------
    dict
        ``'dml_ate'`` holds the result of ``DoubleML.estimate`` with
        ``quantity='ATE'``; ``'iv_ate'`` holds the mean of the effects
        returned by ``NP2SLS.estimate``.
    """
    # One hyper-parameter set shared by every nuisance model; each model still
    # gets its own fresh XGBRegressor instance.
    xgb_params = dict(
        n_estimators=100,
        max_depth=5,
        min_child_weight=5,
        learning_rate=0.1,
        gamma=1,
        reg_alpha=1,
        reg_lambda=1,
    )

    # --- Double machine learning -------------------------------------------
    dml_covariate = covariate_
    # Use the parameter, not the notebook-level ``instrument`` global.
    if instrument_ is not None:
        # ``+`` builds a new list, so the caller's covariate list is untouched.
        dml_covariate = dml_covariate + instrument_
    dml = DoubleML(
        x_model=XGBRegressor(**xgb_params),
        y_model=XGBRegressor(**xgb_params),
        random_state=23,
        is_discrete_treatment=False,
    )
    dml.fit(train_data_, 'Outcome', treatment=treatment_, covariate=dml_covariate)
    dml_ate = dml.estimate(
        data=test_data_,
        treat=treat_,
        control=control_,
        quantity='ATE'
    )

    # --- Nonparametric two-stage least squares (instrumental variables) ----
    npls = NP2SLS(
        x_model=XGBRegressor(**xgb_params),
        y_model=XGBRegressor(**xgb_params),
        is_discrete_treatment=False,
        is_discrete_outcome=True,
    )
    npls.fit(train_data_, 'Outcome', treatment=treatment_, covariate=covariate_, instrument=instrument_)
    ite = npls.estimate(
        data=test_data_,
        treat=treat_,
        control=control_,
    )
    # Average the estimated effects into a single value.
    iv_ate = np.mean(ite)

    return {'dml_ate': dml_ate, 'iv_ate': iv_ate}


### First withdrawal amount treatment estimation

In [None]:
# Treatment column under study.
treatment = 'FirstWithdrawalAmount'
# Columns used as covariates for this treatment.
covariate = ['LoanGoal', 'ApplicationType', 'OfferedAmount', 'NumberOfTerms', 'MonthlyCost']
# Column used as the instrument.
instrument = ['CreditScore']

# Treatment and control values for this treatment, taken from the test split.
treat = treat_data['FirstWithdrawalAmount_treatment']
control = control_data['FirstWithdrawalAmount_control']

# Fit on the training data and estimate the effect on the test data.
fw_amount_results = estimate(train_data, test_data, treatment, covariate, instrument, treat, control)
fw_amount_results

### Number of terms treatment estimation

In [None]:
# Treatment column under study.
treatment = 'NumberOfTerms'
# Columns used as covariates for this treatment.
covariate = ['LoanGoal', 'ApplicationType', 'OfferedAmount', 'FirstWithdrawalAmount', 'MonthlyCost']
# Column used as the instrument.
instrument = ['CreditScore']

# Treatment and control values for this treatment, taken from the test split.
treat = treat_data['NumberOfTerms_treatment']
control = control_data['NumberOfTerms_control']

# Fit on the training data and estimate the effect on the test data.
nr_of_terms_results = estimate(train_data, test_data, treatment, covariate, instrument, treat, control)
nr_of_terms_results

### Results formatting and saving to file

In [None]:
# Turn both estimates into strings so the dict can be serialized as JSON.
# The DML estimate is unwrapped with [0][0] only while it is not yet a string:
# re-running this cell would otherwise index into the string and keep just its
# first character.
if not isinstance(nr_of_terms_results['dml_ate'], str):
    nr_of_terms_results['dml_ate'] = str(nr_of_terms_results['dml_ate'][0][0])
nr_of_terms_results['iv_ate'] = str(nr_of_terms_results['iv_ate'])
nr_of_terms_results

In [None]:
# Turn both estimates into strings so the dict can be serialized as JSON.
# The DML estimate is unwrapped with [0][0] only while it is not yet a string:
# re-running this cell would otherwise index into the string and keep just its
# first character.
if not isinstance(fw_amount_results['dml_ate'], str):
    fw_amount_results['dml_ate'] = str(fw_amount_results['dml_ate'][0][0])
fw_amount_results['iv_ate'] = str(fw_amount_results['iv_ate'])
fw_amount_results

In [None]:
# Collect the estimates of both treatments under one dict, keyed by treatment.
result = {'FirstWithdrawalAmountTreatment': fw_amount_results, 'NumberOfTermsTreatment': nr_of_terms_results}
result

In [None]:
import json
from pathlib import Path

# Write the collected estimates as JSON to RESULT_PATH/estimation_result.txt.
# The result directory is created first so a fresh checkout does not fail
# with FileNotFoundError.
result_dir = Path(RESULT_PATH)
result_dir.mkdir(parents=True, exist_ok=True)
with open(result_dir / 'estimation_result.txt', 'w', encoding='utf-8') as file:
    json.dump(result, file)

