In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures

In [3]:
# Load the welfare data from a CSV expected in the current working directory.
# low_memory=False asks pandas to parse the file in one pass instead of in
# chunks, which avoids mixed-dtype inference on columns.
welfare = pd.read_csv("welfarelabel.csv", low_memory=False)
# Bare last expression: the notebook renders the frame as this cell's output.
welfare

Unnamed: 0,year,id,wrkstat,hrs1,hrs2,evwork,occ,prestige,wrkslf,wrkgovt,...,preteen_miss,teens_miss,adults_miss,unrelat_miss,earnrs_miss,income_miss,rincome_miss,income86_miss,partyid_miss,polviews_miss
0,1986,1,working fulltime,40.000000,38.613701,1.1395408,270.00000,44.000000,someone else,private,...,0,0,0,0,0,0,0,0,0,0
1,1986,2,keeping house,41.733318,38.613701,1,195.00000,51.000000,someone else,private,...,0,0,0,1,0,0,1,0,0,0
2,1986,3,working fulltime,40.000000,38.613701,1.1395408,184.00000,51.000000,someone else,private,...,0,0,0,1,0,0,0,0,0,0
3,1986,4,retired,41.733318,38.613701,1,311.00000,36.000000,someone else,1,...,0,0,0,0,0,0,1,0,0,0
4,1986,5,working parttime,41.733318,38.613701,1.1395408,449.41599,40.335918,someone else,1.8203658,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36496,2010,2040,retired,41.733318,38.613701,1,449.41599,40.335918,someone else,private,...,0,0,0,0,0,0,1,1,0,0
36497,2010,2041,retired,41.733318,38.613701,1,449.41599,40.335918,someone else,private,...,0,0,0,0,0,0,1,1,0,0
36498,2010,2042,working fulltime,40.000000,38.613701,1.1395408,449.41599,40.335918,someone else,private,...,0,0,0,1,0,0,0,1,0,0
36499,2010,2043,working fulltime,49.000000,38.613701,1.1395408,449.41599,40.335918,someone else,private,...,0,0,0,1,0,0,1,1,0,0


As a reference point, we count how many actual covariates the welfare dataset contains, ignoring columns whose names contain an underscore (such as the `*_miss` indicator columns).

In [4]:
def dropDummy(cols):
    """Return the names in ``cols`` that contain no underscore.

    Order is preserved and a new list is always returned. In this notebook
    the underscore is used as a marker for derived indicator columns
    (e.g. ``income_miss``), so what remains approximates the raw covariates.

    Parameters
    ----------
    cols : iterable of str
        Column names to filter.

    Returns
    -------
    list of str
        Every name from ``cols`` for which ``"_" not in name`` holds.
    """
    return [name for name in cols if "_" not in name]

# Keep only the column names without an underscore; these stand in for the
# "real" covariates of the welfare data.
realCols = dropDummy(welfare.columns.tolist())

# Approximate number of covariates: one column is subtracted because it is
# the label rather than a covariate.
print(
    "Number of covariates for the welfare data set is",
    len(realCols) - 1,
)

Number of covariates for the welfare data set is 61


Generating data (per page 17 of BART and BCF paper)

In [21]:
def heterogenousTreatment(X, treatments, betas, effect):
    """Compute the outcome mean with a covariate-dependent treatment effect.

    The baseline mean is ``X @ betas``. Every treated row additionally
    receives ``1 + effect * X[:, 1] * X[:, 4]`` (i.e. 1 + effect * x_2 * x_5
    in one-based notation), so the treatment effect varies with the second
    and fifth columns of ``X``.

    Parameters
    ----------
    X : ndarray of shape (n, p), p >= 5
        Design matrix *without* the treatment column.
    treatments : ndarray of shape (n, 1)
        Treatment indicator per row. A row counts as treated when its value,
        truncated to an integer, is greater than zero.
    betas : ndarray of shape (p, 1) or (p,)
        Coefficients for the columns of ``X``.
    effect : float
        Multiplier on the ``x_2 * x_5`` interaction in the treatment effect.

    Returns
    -------
    X : ndarray of shape (n, p + 1)
        Copy of the input with ``treatments`` appended as the last column.
    xb : ndarray
        Outcome mean including the treatment effect; same shape as
        ``X @ betas``.
    """
    X = np.asarray(X)
    xb = X @ betas

    # Vectorised replacement for the per-row ``int(treatments[i]) > 0`` test.
    # Calling int() on a 1-element array is deprecated since NumPy 1.25;
    # astype(int) keeps the same truncation semantics without the warning.
    treated = np.asarray(treatments).reshape(-1).astype(int) > 0

    # Row-wise treatment effect, evaluated in the same operand order as the
    # original scalar expression so results match bit for bit.
    tau = 1 + effect * X[:, 1] * X[:, 4]

    # Add the effect to treated rows only; reshape so the addition lines up
    # with xb whether betas was a column vector or 1-D.
    xb = xb + np.where(treated, tau, 0.0).reshape(xb.shape)

    X = np.append(X, treatments, axis=1)
    return X, xb

In [20]:
def dgp(effect_type="heterogenous", effect_homogenous=3, effect_heterogenous=2, treatment_probability=0.5, order=3, linear=False, cc=4, N=1000, rho=0.2, seed=None):
    """Generate a synthetic data set with a randomly assigned binary treatment.

    Lets you configure treatment-effect heterogeneity, whether the regression
    is linear, and the sample size.

    Parameters
    ----------
    effect_type : str
        ``"homogenous"`` adds a constant treatment effect. Any other value
        selects the heterogeneous effect computed by
        ``heterogenousTreatment``.
    effect_homogenous : float
        Treatment coefficient used when ``effect_type == "homogenous"``.
    effect_heterogenous : float
        ``effect`` argument forwarded to ``heterogenousTreatment`` otherwise.
    treatment_probability : float
        Probability that a unit is assigned treatment 1.
    order : int
        Degree passed to ``PolynomialFeatures``.
    linear : bool
        If True, ``cc + 2`` continuous covariates are drawn and the expansion
        uses ``interaction_only=True`` (products of distinct features, no
        pure powers). If False, ``cc`` continuous covariates are drawn and
        the full polynomial expansion up to ``order`` is used. The two extra
        covariates keep the column counts of both settings roughly similar.
    cc : int
        Number of continuous covariates before the polynomial expansion.
    N : int
        Number of observations.
    rho : float
        Common pairwise covariance of the continuous covariates (unit
        variances).
    seed : int or None
        If given, all draws come from ``np.random.RandomState(seed)`` so the
        output is reproducible. If None (default), the global ``np.random``
        stream is used, so an earlier ``np.random.seed(...)`` still applies.

    Returns
    -------
    y : ndarray of shape (N, 1)
        Outcome: mean plus standard-normal noise.
    X : ndarray of shape (N, p + 1)
        Expanded covariates (constant dropped) with the treatment indicator
        appended as the last column.
    betas : ndarray
        Shape ``(p + 1, 1)`` in the homogeneous case (last entry is
        ``effect_homogenous``); shape ``(p, 1)`` in the heterogeneous case,
        i.e. it then has no entry for the treatment column of ``X``.
    features : list of str
        Names of the columns of ``X``; the treatment column is called ``"w"``.
    """
    # Use the legacy global stream unless a seed was requested. The order of
    # the draws below is unchanged, so seeded-global workflows still match.
    rng = np.random if seed is None else np.random.RandomState(seed)

    error = rng.normal(size=(N, 1))

    # Equicorrelated continuous covariates: unit variance, covariance rho.
    n_continuous = cc + 2 if linear else cc
    cov = (np.eye(n_continuous) * (1 - rho)) + (np.ones((n_continuous, n_continuous)) * rho)
    X = rng.multivariate_normal(np.zeros(n_continuous), cov, size=N, check_valid='warn', tol=1e-8)

    # Add a binary {0, 1} covariate.
    X = np.append(X, rng.randint(2, size=N).reshape((-1, 1)), axis=1)

    # Add a categorical covariate coded 1/2/3 (probabilities are arbitrary).
    # It enters the polynomial expansion as a plain number.
    cat_elements = [1, 2, 3]
    cat_probabilities = [0.2, 0.5, 0.3]
    X = np.append(X, rng.choice(cat_elements, size=N, p=cat_probabilities).reshape((-1, 1)), axis=1)

    # Expand the covariates to obtain a high-dimensional design matrix.
    poly = PolynomialFeatures(degree=order, interaction_only=linear)
    X = poly.fit_transform(X)
    X = X[:, 1:]  # drop the constant term

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on old installations.
    if hasattr(poly, "get_feature_names_out"):
        names = poly.get_feature_names_out()
    else:
        names = poly.get_feature_names()
    features = list(names[1:])  # skip the constant term's name
    features.append("w")

    # Randomly assigned treatment with propensity treatment_probability.
    treat_elements = [0, 1]
    treat_probabilities = [1 - treatment_probability, treatment_probability]
    treatments = rng.choice(treat_elements, size=N, p=treat_probabilities).reshape((-1, 1))

    # Configure between heterogeneous and homogeneous treatment effects.
    if effect_type == "homogenous":
        # Treatment is one more linear term with a fixed coefficient.
        betas = np.append(rng.normal(size=X.shape[1]), [effect_homogenous]).reshape(-1, 1)
        X = np.append(X, treatments, axis=1)
        xb = X @ betas
    else:
        betas = rng.normal(size=X.shape[1]).reshape(-1, 1)
        X, xb = heterogenousTreatment(X, treatments, betas, effect_heterogenous)

    y = xb + error
    return y, X, betas, features

In [22]:
# Draw one data set with a heterogeneous treatment effect in the "linear"
# configuration (interaction-only expansion of degree 3).
# NOTE: treatment_probability=1 gives assignment probabilities [0, 1], so every
# unit is treated and there is no control group in this sample.
y, X, trueBetas, featureNames = dgp(effect_type="heterogenous", order=3, treatment_probability=1, linear=True)