In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all loaded modules
# before each cell runs, so edits to imported source files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load data
# Read the raw CSV into a DataFrame. The path is relative, so the file is
# looked up in the kernel's current working directory.
bart_df = pd.read_csv("welfarenolabel3.csv")

In [4]:
# Zero imputation
# Every cell equal to -999 (presumably the missing-value code -- confirm
# against the data dictionary) is overwritten with 0; all other cells are kept.
bart_df = bart_df.mask(bart_df == -999, 0)
# Remove the two merge bookkeeping columns so they are not used as covariates.
bart_df = bart_df.drop(columns=["_merge", "_mergescore"])

In [5]:
# Extract outcomes
y = bart_df["y"].to_numpy()
# Extract treatment
# The stored `w` column is flipped (1 - w) to obtain the treatment vector;
# presumably `w` is coded 0/1 -- confirm against the data dictionary.
W = 1 - bart_df["w"].to_numpy()
# Calculate propensities
# Every unit is assigned the same value: the sample mean of W.
pis = np.full(len(W), W.mean())

In [6]:
# Munge pre-treatment covariates
# Every column of `bart_df` except the outcome `y` and the treatment column `w`
# is a candidate covariate. Object-dtype columns are the "discrete" covariates;
# all remaining columns are the "continuous" ones.
is_discrete = bart_df.dtypes == 'object'
discrete_covs = bart_df.columns[is_discrete].tolist()
# Filter in the frame's own column order instead of taking a set difference:
# string hashing is randomized per interpreter process, so iterating a set of
# column names yields a different order after every kernel restart, which would
# in turn reshuffle the columns of the design matrix and of the saved dataset.
cts_covs = [col for col in bart_df.columns[~is_discrete] if col not in ('y', 'w')]
# Construct design matrix
# Dummy-encode the discrete covariates (missing values get their own indicator,
# the first level of each variable is dropped), cast everything to float and
# zero-fill whatever is still missing. The frame is then standardized
# column-wise.
Xdf = (
    pd.get_dummies(
        bart_df[discrete_covs + cts_covs],
        columns=discrete_covs,
        dummy_na=True,
        drop_first=True,
    )
    .astype(float)
    .fillna(0)
    # Centre each column on its mean ...
    .pipe(lambda frame: frame - frame.mean())
    # ... keep only columns whose standard deviation is strictly positive ...
    .pipe(lambda frame: frame.loc[:, frame.std() > 0])
    # ... and scale the survivors to unit standard deviation.
    .pipe(lambda frame: frame / frame.std())
)
# Plain ndarray view of the standardized covariates.
X = Xdf.to_numpy()

In [7]:
# Recombine
# Column layout: outcome first, treatment second, then the standardized
# covariates under their design-matrix column names.
cols = ["outcome", "treatment", *Xdf.columns]
# column_stack promotes the 1-D outcome and treatment vectors to single columns
# before joining them with the 2-D covariate matrix.
bart_proc_df = pd.DataFrame(np.column_stack((y, W, X)), columns=cols)

In [8]:
# Save
# Write the processed frame as a gzip-compressed, tab-separated file in the
# working directory, without the row index.
bart_proc_df.to_csv(
    "bart_dataset_processed.tsv.gz",
    sep="\t",
    index=False,
    compression="gzip",
)