In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split

from copy import deepcopy
import warnings
warnings.filterwarnings("ignore")

def fullDisplay():
    """Remove pandas' row and column display limits so frames render in full."""
    for option_name in ("display.max_rows", "display.max_columns"):
        pd.set_option(option_name, None)

def defaultDisplay():
    """Reset every pandas option matching ``^display.`` to its default value."""
    display_options_pattern = '^display.'
    # silent=True suppresses warnings pandas would emit while resetting.
    pd.reset_option(display_options_pattern, silent=True)

In [8]:
# Read the cleaned welfare data, then move the `w` column into `treatments`
# and the `y` column into `labels`, leaving the remaining columns in `welfare`.
welfare = pd.read_csv("Data/welfare_clean.csv", low_memory=False)
treatments = welfare.pop('w')
labels = welfare.pop('y')
welfare

Unnamed: 0,year,id,wrkstat,hrs1,hrs2,evwork,occ,prestige,wrkslf,wrkgovt,...,adults_miss,unrelat_miss,earnrs_miss,income_miss,rincome_miss,income86_miss,partyid_miss,polviews_miss,attblack,attblack_miss
0,0,1,7,0.004845,0.005228,1,135,0.005641,2,2,...,0,0,0,0,0,0,0,0,0.005440,0
1,0,2,1,0.005055,0.005228,0,106,0.006538,2,2,...,0,1,0,0,1,0,0,0,0.004080,0
2,0,3,7,0.004845,0.005228,1,99,0.006538,2,2,...,0,1,0,0,0,0,0,0,0.002040,0
3,0,4,3,0.005055,0.005228,0,142,0.004615,2,0,...,0,0,0,0,1,0,0,0,0.004080,0
4,0,5,8,0.005055,0.005228,1,211,0.005171,2,1,...,0,0,0,0,0,0,0,0,0.004080,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36496,15,2040,3,0.005055,0.005228,0,211,0.005171,2,2,...,0,0,0,0,1,1,0,0,0.004080,0
36497,15,2041,3,0.005055,0.005228,0,211,0.005171,2,2,...,0,0,0,0,1,1,0,0,0.006120,0
36498,15,2042,7,0.004845,0.005228,1,211,0.005171,2,2,...,0,1,0,0,0,1,0,0,0.004080,0
36499,15,2043,7,0.005935,0.005228,1,211,0.005171,2,2,...,0,1,0,0,1,1,0,0,0.005021,1


In [59]:
N = 36501  # number of treatment draws; presumably the row count of `welfare` -- TODO confirm

# Feature lookup used by the polynomial expansion below. It is defined here so
# the cell runs on a fresh kernel; the list mirrors the one inside dgp().
importantFeatureNames = ['wrkstat', 'race', 'year', 'hrs1', 'income', 'occ80', 'id', 'educ']
welfareColumns = list(welfare.columns)
importantFeatureIndices = [welfareColumns.index(name) for name in importantFeatureNames]

# Draw synthetic covariates from a multivariate normal fitted to the data.
cov = welfare.cov()
means = welfare.mean(axis=0)

# NOTE(review): X has 1000 rows while `treatments` below has N rows -- confirm
# this mismatch is intended before combining them.
X = np.random.multivariate_normal(means.values, cov, size=1000, check_valid='warn', tol=1e-8)

# Degree-3 polynomial/interaction terms of the first three important features.
nPolyFeatures = 3
poly = PolynomialFeatures(degree=3, interaction_only=False, include_bias=False)
poly.fit(X[:, importantFeatureIndices[:nPolyFeatures]])
# `get_feature_names` was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old method on older installations.
if hasattr(poly, "get_feature_names_out"):
    higherNames = list(poly.get_feature_names_out(importantFeatureNames[:nPolyFeatures]))
else:
    higherNames = poly.get_feature_names(input_features=importantFeatureNames[:nPolyFeatures])
full = poly.transform(X[:, importantFeatureIndices[:nPolyFeatures]])
# With include_bias=False the first columns are the raw inputs; keep the rest.
higher = full[:, nPolyFeatures:] # select only higher order

treatment_probability = 0.5
treat_elements = [0, 1]
treat_probabilities = [1 - treatment_probability, treatment_probability]

# randomly assigned treatments with propensity treatment_probability
treatments = np.random.choice(treat_elements, size=N, p=treat_probabilities).reshape((-1, 1))

In [65]:
# Shape check on the treatment array (the recorded output below is (36501, 1)).
treatments.shape

(36501, 1)

In [67]:
# Shape check for a uniform draw of N rows in a single column.
np.random.uniform(size=(N, 1)).shape

(36501, 1)

In [76]:
def heterogeneousTreatment(X, treatments, betas, effect):
    """Build the linear predictor with a covariate-dependent treatment effect.

    Every row gets the baseline ``X @ betas``. Rows whose treatment value is
    greater than zero additionally receive ``1 + effect * x_2 * x_5``, where
    ``x_2`` and ``x_5`` are columns 1 and 4 of ``X`` (0-based).

    Parameters
    ----------
    X : ndarray, shape (n, k) with k >= 5
        Covariate matrix.
    treatments : array-like, n entries
        Treatment values; anything > 0 counts as treated. May be a column
        vector ``(n, 1)`` or flat ``(n,)``.
    betas : ndarray, shape (k, 1) or (k,)
        Baseline coefficients.
    effect : float
        Multiplier on the ``x_2 * x_5`` interaction for treated rows.

    Returns
    -------
    X : ndarray, shape (n, k + 1)
        Covariates with the treatment column appended last.
    xb : ndarray
        Linear predictor including the treatment effect.
    """
    xb = X @ betas
    treatment_column = np.reshape(treatments, (-1, 1))
    treated_rows = (treatment_column > 0).ravel()

    # heterogeneous treatment is 1 + effect*x_2*x_5
    uplift = 1 + effect * X[treated_rows, 1] * X[treated_rows, 4]
    # Give the uplift trailing singleton axes so it lines up with xb whether
    # betas was a column vector or a flat vector.
    xb[treated_rows] += uplift.reshape((-1,) + (1,) * (xb.ndim - 1))

    X = np.append(X, treatment_column, axis=1)
    return X, xb


def dgp(welfare, effect_type="heterogeneous", effect_homogeneous=10, effect_heterogeneous=2,
        treatment_type="binary", treatment_probability=0.5, heterogenous_select=4, order=3, linearity="med", N=1000,
        random_state=None):
    """Simulate covariates, treatments and outcomes from the welfare data.

    Covariates are drawn from a multivariate normal whose mean and covariance
    are estimated from ``welfare``. Optionally, polynomial/interaction terms of
    the most important features are appended. The outcome is
    ``y = X @ betas + treatment effect + standard-normal noise``.

    Parameters
    ----------
    welfare : pandas.DataFrame
        Covariate frame; must contain the columns listed in
        ``importantFeatureNames`` below.
    effect_type : str
        ``"homogeneous"`` adds ``effect_homogeneous * treatment`` to every row;
        any other value applies the heterogeneous effect.
    effect_homogeneous : float
        Treatment coefficient for the homogeneous case.
    effect_heterogeneous : float
        Multiplier on the covariate sum in the heterogeneous case.
    treatment_type : str
        ``"binary"`` draws 0/1 with probability ``treatment_probability``;
        any other value draws uniform values in [0, 1).
    treatment_probability : float
        Probability of treatment for binary assignment.
    heterogenous_select : int
        Number of leading important features summed in the heterogeneous effect.
    order : int
        Polynomial degree for the non-linear feature expansion.
    linearity : str
        ``"full"`` adds no higher-order terms; ``"high"``, ``"med"``, ``"low"``
        expand the first 2, 4 or 8 important features. Unknown values fall
        back to ``"med"``.
    N : int
        Number of simulated rows.
    random_state : int or None
        Seed for a private ``numpy.random.RandomState``. ``None`` (default)
        uses numpy's global random state, as before.

    Returns
    -------
    y : ndarray, shape (N, 1)
    X : ndarray, shape (N, n_features)
        Simulated covariates (without the treatment column).
    betas : ndarray
        Coefficients; in the homogeneous case the last entry is the treatment
        coefficient, so it has one more row than ``X`` has columns.
    featureNames : list of str
        Column names matching ``X``.
    treatments : ndarray, shape (N, 1)
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    featureNames = list(welfare.columns)

    # top 8 important features (per the original note, based on a Shapley visualization)
    importantFeatureNames = ['wrkstat', 'race', 'year', 'hrs1', 'income', 'occ80', 'id', 'educ']
    importantFeatureIndices = [featureNames.index(name) for name in importantFeatureNames]

    error = rng.normal(size=(N, 1))
    cov = welfare.cov()
    means = welfare.mean(axis=0)

    # Draw N rows so covariates, treatments and noise all line up.
    X = rng.multivariate_normal(means.values, cov.values, size=N, check_valid='warn', tol=1e-8)

    if linearity != 'full':
        # Number of most-important features used for interactions and
        # polynomials; unknown settings fall back to the "med" baseline.
        select = {"high": 2, "med": 4, "low": 8}.get(linearity, 4)
        poly = PolynomialFeatures(degree=order, interaction_only=False, include_bias=False)

        fullData = poly.fit_transform(X[:, importantFeatureIndices[:select]])
        # `get_feature_names` was removed in scikit-learn 1.2; prefer the
        # replacement and fall back to the old method on older installations.
        if hasattr(poly, "get_feature_names_out"):
            fullNames = list(poly.get_feature_names_out(importantFeatureNames[:select]))
        else:
            fullNames = poly.get_feature_names(input_features=importantFeatureNames[:select])

        # With include_bias=False the first `select` outputs are the raw
        # inputs; slicing from `select` keeps only higher-order terms and
        # yields nothing (rather than everything) when order == 1.
        X = np.append(X, fullData[:, select:], axis=1)
        featureNames.extend(fullNames[select:])

    if treatment_type == "binary":
        # randomly assigned treatments with propensity treatment_probability
        treatments = rng.choice([0, 1], size=N, p=[1 - treatment_probability, treatment_probability]).reshape((-1, 1))
    else:
        treatments = rng.uniform(size=(N, 1))

    # heterogeneous vs. homogeneous treatments
    if effect_type == "homogeneous":
        betas = np.append(rng.normal(size=X.shape[1]), [effect_homogeneous]).reshape(-1, 1)
        # The treatment column is only needed for the product; X itself is
        # returned without it.
        xb = np.append(X, treatments, axis=1) @ betas
    else:
        betas = rng.normal(size=X.shape[1]).reshape(-1, 1)
        xb = X @ betas
        treated_rows = (treatments > 0).ravel()
        # heterogeneous treatment is 1 + effect * (sum of first heterogenous_select important variables)
        heterogenousIndices = np.asarray(importantFeatureIndices[:heterogenous_select], dtype=int)
        heterogenousSum = X[treated_rows][:, heterogenousIndices].sum(axis=1, keepdims=True)
        xb[treated_rows] += 1 + effect_heterogeneous * heterogenousSum

    y = xb + error

    return y, X, betas, featureNames, treatments

In [77]:
# Simulate 1000 rows with a homogeneous treatment effect and 50% treatment propensity.
y, X, trueBetas, featureNames, treatments = dgp(
    welfare,
    effect_type='homogeneous',
    treatment_probability=0.5,
    order=3,
    N=1000,
)

In [83]:
# Display the treatment array (presumably the one returned by the dgp call above -- confirm).
treatments

array([[0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
    