In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, normalize
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from tqdm.auto import tqdm
from copy import deepcopy
import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# including deprecation notices from pandas / scikit-learn; narrow the filter
# (category= / module=) if only specific warnings are meant to be hidden.
warnings.filterwarnings("ignore")

In [2]:
# load data: pull the label column ('y') and the treatment column ('w') out of
# the frame, then discard the bookkeeping columns so only covariates remain
welfare_raw = pd.read_csv("welfarelabel.csv", low_memory=False)
labels = welfare_raw.pop('y').values   # plain array of labels
treatments = welfare_raw.pop('w')      # kept as a Series
welfare_raw = welfare_raw.drop(columns=["_merge", 'id'])
welfare_raw

Unnamed: 0,year,wrkstat,hrs1,hrs2,evwork,occ,prestige,wrkslf,wrkgovt,commute,...,preteen_miss,teens_miss,adults_miss,unrelat_miss,earnrs_miss,income_miss,rincome_miss,income86_miss,partyid_miss,polviews_miss
0,1986,working fulltime,40.000000,38.613701,1.1395408,270.00000,44.000000,someone else,private,60,...,0,0,0,0,0,0,0,0,0,0
1,1986,keeping house,41.733318,38.613701,1,195.00000,51.000000,someone else,private,10,...,0,0,0,1,0,0,1,0,0,0
2,1986,working fulltime,40.000000,38.613701,1.1395408,184.00000,51.000000,someone else,private,35,...,0,0,0,1,0,0,0,0,0,0
3,1986,retired,41.733318,38.613701,1,311.00000,36.000000,someone else,1,25,...,0,0,0,0,0,0,1,0,0,0
4,1986,working parttime,41.733318,38.613701,1.1395408,449.41599,40.335918,someone else,1.8203658,25,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36496,2010,retired,41.733318,38.613701,1,449.41599,40.335918,someone else,private,20.245865,...,0,0,0,0,0,0,1,1,0,0
36497,2010,retired,41.733318,38.613701,1,449.41599,40.335918,someone else,private,20.245865,...,0,0,0,0,0,0,1,1,0,0
36498,2010,working fulltime,40.000000,38.613701,1.1395408,449.41599,40.335918,someone else,private,20.245865,...,0,0,0,1,0,0,0,1,0,0
36499,2010,working fulltime,49.000000,38.613701,1.1395408,449.41599,40.335918,someone else,private,20.245865,...,0,0,0,1,0,0,1,1,0,0


In [3]:
def cleanWelfare(welfare_raw):
    """Return an all-numeric copy of the welfare covariates plus the fitted encoders.

    The input frame is not modified.  Columns whose name contains an
    underscore are passed through untouched.  Every other column is handled
    by the first rule that matches:

    1. ``year`` and ``occ`` are always label-encoded.
    2. The capped columns (``commute``, ``childs``, ``earnrs``, ``age``,
       ``preteen``, ``adults``, ``unrelat``) have the text of their top
       category (e.g. ``'97+ minutes'``) replaced by its number, are converted
       with ``pd.to_numeric(errors='coerce')`` and are then scaled.
    3. Float columns are scaled.
    4. Object / string columns are label-encoded.
    5. Anything else (e.g. integer columns) is left unchanged.

    "Scaled" means the whole column is handed to sklearn's ``normalize`` as a
    single row vector, i.e. the column is divided by its own norm (L2 by
    default), so the resulting values depend on the number of rows.

    The dtype rules look at each column's own dtype.  They do not rely on any
    particular reference column being present, nor on column order.

    NOTE(review): ``errors='coerce'`` turns unparseable entries into NaN, and
    ``normalize`` is expected to reject NaN input -- confirm the capped
    columns contain no other text values before relying on rule 2.

    Parameters
    ----------
    welfare_raw : pandas.DataFrame
        Raw covariates with string column names.

    Returns
    -------
    welfare : pandas.DataFrame
        Cleaned copy with the same columns in the same order.
    encoders : dict
        Maps each label-encoded column name to its fitted ``LabelEncoder`` so
        the integer codes can be decoded later.
    """
    welfare = welfare_raw.copy()

    # Capped columns: (text used for the top category, number it stands for).
    top_codes = {
        'commute': ('97+ minutes', 97),
        'childs': ('eight or more', 8),
        'earnrs': ('eight or more', 8),
        'age': ('89 or older', 89),
        'preteen': ('8 or more', 8),
        'adults': ('8 or more', 8),
        'unrelat': ('8 or more', 8),
    }
    # Numeric-looking columns that are nevertheless treated as categories.
    always_encode = {'year', 'occ'}

    encoders = {}  # column name -> fitted LabelEncoder, kept for decoding

    def encode(name):
        # Replace the column by integer codes and remember the encoder.
        le = LabelEncoder()
        welfare[name] = le.fit_transform(welfare[name])
        encoders[name] = le

    def scale(series):
        # Treat the whole column as one sample so it is divided by its own norm.
        return normalize(series.values.reshape(1, -1))[0]

    for column in welfare.columns:
        if '_' in column:
            continue  # underscore-named columns are passed through untouched

        dtype = welfare[column].dtype
        if column in always_encode:
            encode(column)
        elif column in top_codes:
            text, cap = top_codes[column]
            numeric = pd.to_numeric(
                welfare[column].apply(lambda v: cap if v == text else v),
                errors='coerce',
            )
            welfare[column] = scale(numeric)
        elif pd.api.types.is_float_dtype(dtype):
            welfare[column] = scale(welfare[column])
        elif pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            encode(column)
        # Any other dtype (e.g. integers) is deliberately left as is.

    return welfare, encoders

In [4]:
# Build the cleaned frame; `encoders` maps each label-encoded column name to
# its fitted LabelEncoder so the integer codes can be decoded later.
welfare, encoders = cleanWelfare(welfare_raw)

In [5]:
# Display the cleaned frame returned by cleanWelfare.
welfare

Unnamed: 0,year,wrkstat,hrs1,hrs2,evwork,occ,prestige,wrkslf,wrkgovt,commute,...,preteen_miss,teens_miss,adults_miss,unrelat_miss,earnrs_miss,income_miss,rincome_miss,income86_miss,partyid_miss,polviews_miss
0,0,7,0.004845,0.005228,1,135,0.005641,2,2,0.015315,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0.005055,0.005228,0,106,0.006538,2,2,0.002552,...,0,0,0,1,0,0,1,0,0,0
2,0,7,0.004845,0.005228,1,99,0.006538,2,2,0.008934,...,0,0,0,1,0,0,0,0,0,0
3,0,3,0.005055,0.005228,0,142,0.004615,2,0,0.006381,...,0,0,0,0,0,0,1,0,0,0
4,0,8,0.005055,0.005228,1,211,0.005171,2,1,0.006381,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36496,15,3,0.005055,0.005228,0,211,0.005171,2,2,0.005168,...,0,0,0,0,0,0,1,1,0,0
36497,15,3,0.005055,0.005228,0,211,0.005171,2,2,0.005168,...,0,0,0,0,0,0,1,1,0,0
36498,15,7,0.004845,0.005228,1,211,0.005171,2,2,0.005168,...,0,0,0,1,0,0,0,1,0,0
36499,15,7,0.005935,0.005228,1,211,0.005171,2,2,0.005168,...,0,0,0,1,0,0,1,1,0,0


In [6]:
# Display the column -> LabelEncoder mapping (one entry per label-encoded column).
encoders

{'year': LabelEncoder(),
 'wrkstat': LabelEncoder(),
 'evwork': LabelEncoder(),
 'occ': LabelEncoder(),
 'wrkslf': LabelEncoder(),
 'wrkgovt': LabelEncoder(),
 'occ80': LabelEncoder(),
 'indus80': LabelEncoder(),
 'marital': LabelEncoder(),
 'divorce': LabelEncoder(),
 'widowed': LabelEncoder(),
 'spwrksta': LabelEncoder(),
 'spevwork': LabelEncoder(),
 'spocc80': LabelEncoder(),
 'spind80': LabelEncoder(),
 'degree': LabelEncoder(),
 'padeg': LabelEncoder(),
 'madeg': LabelEncoder(),
 'spdeg': LabelEncoder(),
 'sex': LabelEncoder(),
 'race': LabelEncoder(),
 'res16': LabelEncoder(),
 'reg16': LabelEncoder(),
 'mobile16': LabelEncoder(),
 'family16': LabelEncoder(),
 'mawork': LabelEncoder(),
 'mawkborn': LabelEncoder(),
 'born': LabelEncoder(),
 'parborn': LabelEncoder(),
 'granborn': LabelEncoder(),
 'income': LabelEncoder(),
 'rincome': LabelEncoder(),
 'income86': LabelEncoder(),
 'partyid': LabelEncoder(),
 'polviews': LabelEncoder()}