In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from copy import deepcopy

from sklearn.preprocessing import LabelEncoder, normalize
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from econml.grf import CausalForest
from sklearn.linear_model import LassoCV

import warnings
warnings.filterwarnings("ignore")
import logging

import dowhy
from dowhy import CausalModel
import dowhy.datasets

import econml

In [3]:
# Raw coding: w = 0 -> the question was worded with "assistance", w = 1 -> with "welfare".
# Outcome coding: y = 0 -> the respondent answered no, y = 1 -> the respondent answered yes.
welfare_raw = pd.read_csv("Data/welfarelabel.csv", low_memory=False)
labels = welfare_raw['y'].values

# Swap the treatment coding so that 1 = "assistance" and 0 = "welfare"; a positive
# treatment effect then means people responded more favorably to "assistance".
treatments = welfare_raw['w'].replace({0: 1, 1: 0})
welfare_raw['w'] = treatments
welfare_raw

Unnamed: 0,year,id,wrkstat,hrs1,hrs2,evwork,occ,prestige,wrkslf,wrkgovt,...,adults_miss,unrelat_miss,earnrs_miss,income_miss,rincome_miss,income86_miss,partyid_miss,polviews_miss,attblack,attblack_miss
0,1986,1,working fulltime,40.000000,38.613701,1.1395408,270.00000,44.000000,someone else,private,...,0,0,0,0,0,0,0,0,0.666667,0
1,1986,2,keeping house,41.733318,38.613701,1,195.00000,51.000000,someone else,private,...,0,1,0,0,1,0,0,0,0.500000,0
2,1986,3,working fulltime,40.000000,38.613701,1.1395408,184.00000,51.000000,someone else,private,...,0,1,0,0,0,0,0,0,0.250000,0
3,1986,4,retired,41.733318,38.613701,1,311.00000,36.000000,someone else,1,...,0,0,0,0,1,0,0,0,0.500000,0
4,1986,5,working parttime,41.733318,38.613701,1.1395408,449.41599,40.335918,someone else,1.8203658,...,0,0,0,0,0,0,0,0,0.500000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36496,2010,2040,retired,41.733318,38.613701,1,449.41599,40.335918,someone else,private,...,0,0,0,0,1,1,0,0,0.500000,0
36497,2010,2041,retired,41.733318,38.613701,1,449.41599,40.335918,someone else,private,...,0,0,0,0,1,1,0,0,0.750000,0
36498,2010,2042,working fulltime,40.000000,38.613701,1.1395408,449.41599,40.335918,someone else,private,...,0,1,0,0,0,1,0,0,0.500000,0
36499,2010,2043,working fulltime,49.000000,38.613701,1.1395408,449.41599,40.335918,someone else,private,...,0,1,0,0,1,1,0,0,0.615292,1


In [4]:
def _recode_top_category(series, top_label, top_value):
    """Swap the open-ended top category label for its numeric value, then coerce to numbers.

    Entries that still cannot be parsed as numbers become NaN (``errors='coerce'``).
    """
    recoded = series.apply(lambda entry: top_value if entry == top_label else entry)
    return pd.to_numeric(recoded, errors='coerce')


def _unit_l2(series):
    """Scale a numeric column by the L2 norm of the whole column.

    The column is reshaped into a single row so ``normalize`` treats it as one vector.
    """
    return normalize(series.values.reshape(1, -1))[0]


def _label_encode(series):
    """Fit a fresh ``LabelEncoder`` on the column; return ``(encoded_values, encoder)``."""
    le = LabelEncoder()
    le.fit(series)
    return le.transform(series), le


def cleanWelfare(welfare_raw):
    """Turn the raw welfare survey frame into an all-numeric copy.

    Columns whose name contains an underscore are left untouched. Every other
    column is handled by the first rule that matches:

    1. ``year`` and ``occ`` are label-encoded.
    2. Top-coded count columns (``commute``, ``childs``, ``earnrs``, ``age``,
       ``preteen``, ``adults``, ``unrelat``) have their text top category replaced
       by a number, are coerced to numeric, and are then L2-normalized.
    3. Remaining float columns are L2-normalized.
    4. Remaining object/string columns are label-encoded.
    5. Anything else (e.g. integer columns) is left as is.

    Parameters
    ----------
    welfare_raw : pandas.DataFrame
        Input frame; it is copied, never modified.

    Returns
    -------
    welfare : pandas.DataFrame
        The cleaned copy.
    encoders : dict
        Maps each label-encoded column name to its fitted ``LabelEncoder`` so the
        encoding can be reversed later.

    Notes
    -----
    NOTE(review): values that fail numeric coercion in rule 2 become NaN, and
    ``normalize`` presumably rejects NaN input — confirm the data has none.
    """
    welfare = welfare_raw.copy()

    # column -> (text label of the open-ended top category, numeric value to use instead)
    top_codes = {
        'commute': ('97+ minutes', 97),
        'childs': ('eight or more', 8),
        'earnrs': ('eight or more', 8),
        'age': ('89 or older', 89),
        'preteen': ('8 or more', 8),
        'adults': ('8 or more', 8),
        'unrelat': ('8 or more', 8),
    }
    toEncode = {'year', 'occ'}

    encoders = {}  # column name -> fitted encoder, kept so values can be decoded when done

    for column in welfare:
        if '_' in column:
            continue  # columns with an underscore in the name are passed through unchanged

        values = welfare[column]
        if column in toEncode:
            encoded, le = _label_encode(values)
            welfare[column] = encoded
            encoders[column] = le
        elif column in top_codes:
            top_label, top_value = top_codes[column]
            # once the column is numeric, normalize it
            welfare[column] = _unit_l2(_recode_top_category(values, top_label, top_value))
        # Classify by the column's OWN dtype. Comparing against reference columns
        # ('teens' / 'polviews') breaks once the loop has rewritten them: after
        # 'polviews' is encoded its dtype is integer, so later object columns were
        # skipped and later integer columns were label-encoded by mistake.
        elif pd.api.types.is_float_dtype(values):
            welfare[column] = _unit_l2(values)
        elif pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
            encoded, le = _label_encode(values)
            welfare[column] = encoded
            encoders[column] = le
        # any other dtype (e.g. integer columns) is intentionally left untouched

    return welfare, encoders


# Clean everything except the '_merge' column, the outcome (y) and the treatment (w).
welfare, encoders = cleanWelfare(
    welfare_raw.drop(['_merge', 'y', 'w'], axis=1)
)
welfare

Unnamed: 0,year,id,wrkstat,hrs1,hrs2,evwork,occ,prestige,wrkslf,wrkgovt,...,adults_miss,unrelat_miss,earnrs_miss,income_miss,rincome_miss,income86_miss,partyid_miss,polviews_miss,attblack,attblack_miss
0,0,1,7,0.004845,0.005228,1,135,0.005641,2,2,...,0,0,0,0,0,0,0,0,0.005440,0
1,0,2,1,0.005055,0.005228,0,106,0.006538,2,2,...,0,1,0,0,1,0,0,0,0.004080,0
2,0,3,7,0.004845,0.005228,1,99,0.006538,2,2,...,0,1,0,0,0,0,0,0,0.002040,0
3,0,4,3,0.005055,0.005228,0,142,0.004615,2,0,...,0,0,0,0,1,0,0,0,0.004080,0
4,0,5,8,0.005055,0.005228,1,211,0.005171,2,1,...,0,0,0,0,0,0,0,0,0.004080,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36496,15,2040,3,0.005055,0.005228,0,211,0.005171,2,2,...,0,0,0,0,1,1,0,0,0.004080,0
36497,15,2041,3,0.005055,0.005228,0,211,0.005171,2,2,...,0,0,0,0,1,1,0,0,0.006120,0
36498,15,2042,7,0.004845,0.005228,1,211,0.005171,2,2,...,0,1,0,0,0,1,0,0,0.004080,0
36499,15,2043,7,0.005935,0.005228,1,211,0.005171,2,2,...,0,1,0,0,1,1,0,0,0.005021,1


In [None]:
# Save the cleaned frame to disk. No `index` argument is passed, so pandas'
# default applies — NOTE(review): confirm writing the row index is intended.
welfare.to_csv(path_or_buf='Data/welfare_clean.csv')

In [None]:
# NOTE(review): this import lives mid-notebook rather than in the import cell at the top.
from sdv import SDV

# Create an SDV instance and fit it on `metadata` / `tables`.
# NOTE(review): neither `metadata` nor `tables` is defined in any cell above this one,
# so this cell presumably raises NameError on a fresh kernel — confirm where they
# should come from (possibly the cleaned `welfare` frame) or remove the cell.
sdv = SDV()
sdv.fit(metadata, tables)

In [None]:
# Passed as the first positional argument of dowhy's synthetic linear data generator
# (presumably the true effect size — confirm against the dowhy docs).
BETA = 10

data = dowhy.datasets.linear_dataset(
    BETA,
    num_samples=10000,
    num_common_causes=10,
    num_discrete_common_causes=2,
    num_instruments=2,
    num_effect_modifiers=2,
    num_discrete_effect_modifiers=0,
    num_treatments=1,
    treatment_is_binary=True,
    one_hot_encode=False,
)
df = data['df']
print(df.head())
# The generator's result also carries an "ate" entry, reported here as the true causal estimate.
print("True causal estimate is", data["ate"])