In [135]:
import sys
sys.path.insert(0, '../scripts')
import pandas as pd
import numpy as np
import feature_gen as fg

In [136]:
# Load data that has already been augmented with ACS info
df = pd.read_csv("../data/work/block-groups_2012-2016_with-acs.csv")

In [137]:
# Dimensions of the loaded data as (rows, columns)
df.shape

(15972, 43)

In [138]:
# Years of interest: every year from 2013 through 2016 inclusive
years_list = list(range(2013, 2017))

In [127]:
# Add outcome label
# NOTE(review): both statements below are commented out, so this cell
# currently does not create an 'evictions-effectiveness' column.
#df['evictions-effectiveness'] = df['evictions'] / df['eviction-filings']
#df['evictions-effectiveness'].fillna(value=0, inplace=True)

In [139]:
# Keep only the rows for the years of interest that belong to Cook County
in_selected_years = df['year'].isin(years_list)
in_cook_county = df["parent-location"] == "Cook County, Illinois"
evic_df = df.loc[in_selected_years & in_cook_county]

In [140]:
# ACS-augmented data should already have been filtered for Cook County and the
# selected years, so the number of rows should be the same as in df
evic_df.shape

(15972, 43)

In [141]:
# Start from every column name in the filtered dataframe as a candidate feature
features = evic_df.columns.tolist()
print(len(features))
features

43


['GEOID',
 'year',
 'name',
 'parent-location',
 'population',
 'poverty-rate',
 'renter-occupied-households',
 'pct-renter-occupied',
 'median-gross-rent',
 'median-household-income',
 'median-property-value',
 'rent-burden',
 'pct-white',
 'pct-af-am',
 'pct-hispanic',
 'pct-am-ind',
 'pct-asian',
 'pct-nh-pi',
 'pct-multiple',
 'pct-other',
 'eviction-filings',
 'evictions',
 'eviction-rate',
 'eviction-filing-rate',
 'low-flag',
 'imputed',
 'subbed',
 'total_for_public_assistance_income',
 'with_public_assistance_income',
 'estimate_total_in_labor_force',
 'estimate_civilian_unemployed',
 'total_for_householder_tenure',
 'renter_occupied',
 'renter_moved_2015/2010_later',
 'renter_moved_2010-2014/2000-2009',
 'renter_moved_2000-2009/1990-1999',
 'renter_moved_1990-1999/1980-1989',
 'renter_moved_1980-1989/1970-1979',
 'renter_moved_1979/1969_earlier',
 'state',
 'county',
 'tract',
 'block group']

In [150]:
# Explore attribute datatypes (one dtype per column)
evic_df.dtypes

GEOID                                   int64
year                                    int64
name                                   object
parent-location                        object
population                            float64
poverty-rate                          float64
renter-occupied-households            float64
pct-renter-occupied                   float64
median-gross-rent                     float64
median-household-income               float64
median-property-value                 float64
rent-burden                           float64
pct-white                             float64
pct-af-am                             float64
pct-hispanic                          float64
pct-am-ind                            float64
pct-asian                             float64
pct-nh-pi                             float64
pct-multiple                          float64
pct-other                             float64
eviction-filings                      float64
evictions                         

In [156]:
# Explore value counts for different attributes to see if they make good candidates for discretization
# Here: the number of rows for each distinct 'tract' value
evic_df['tract'].value_counts()

80100     28
250500    28
480400    28
820101    28
150300    28
100200    28
815500    28
460500    28
812000    24
191100    24
821800    24
71500     24
800200    24
81000     24
561100    24
640300    24
700501    24
63200     24
806003    24
81300     24
831600    24
230600    24
805902    24
842100    24
805111    24
813400    24
650200    24
710400    24
100300    24
826100    24
          ..
190100     4
420300     4
130200     4
310500     4
240500     4
60500      4
240700     4
671100     4
170900     4
670400     4
310700     4
630200     4
490400     4
760802     4
802300     4
400300     4
843400     4
62400      4
580300     4
62600      4
31800      4
242800     4
310400     4
240300     4
81900      4
30701      4
630100     4
835700     4
671900     4
71600      4
Name: tract, Length: 1319, dtype: int64

In [149]:
# List the object-dtype columns, which are the possible candidates for discretization
str_columns = [name for name, dtype in evic_df.dtypes.items() if dtype == 'O']
print(str_columns)

['name', 'parent-location']


The 'tract' attribute has too many unique values (1,319) to discretize. The block group 'name' is an identifier rather than a measurement. The 'parent-location' attribute is the same for all rows (i.e., Cook County). None of these attributes is a good candidate for discretization.

In [142]:
# Select attributes not to generate new features from.
# Note that the 'block group' attribute from ACS data is a unique ID of a block group within one tract.
# GEOID of a block group = ACS's state + county + tract + block group ID numbers
to_remove = ['GEOID', 'year', 'name', 'parent-location', 'state', 'county', 'tract', 'block group']
# Rebuild the list with a filter instead of calling features.remove() in a loop:
# list.remove() raises ValueError when the item is already gone, so the loop
# version failed whenever this cell was run a second time. Filtering the
# dataframe's columns gives the same result and is safe to re-run.
features = [attribute for attribute in evic_df.columns if attribute not in to_remove]

In [143]:
# Check updated list of features: print how many remain, then display their names
print(len(features))
features

37


['population',
 'poverty-rate',
 'renter-occupied-households',
 'pct-renter-occupied',
 'median-gross-rent',
 'median-household-income',
 'median-property-value',
 'rent-burden',
 'pct-white',
 'pct-af-am',
 'pct-hispanic',
 'pct-am-ind',
 'pct-asian',
 'pct-nh-pi',
 'pct-multiple',
 'pct-other',
 'eviction-filings',
 'evictions',
 'eviction-rate',
 'eviction-filing-rate',
 'low-flag',
 'imputed',
 'subbed',
 'total_for_public_assistance_income',
 'with_public_assistance_income',
 'estimate_total_in_labor_force',
 'estimate_civilian_unemployed',
 'total_for_householder_tenure',
 'renter_occupied',
 'renter_moved_2015/2010_later',
 'renter_moved_2010-2014/2000-2009',
 'renter_moved_2000-2009/1990-1999',
 'renter_moved_1990-1999/1980-1989',
 'renter_moved_1980-1989/1970-1979',
 'renter_moved_1979/1969_earlier',
 'tract',
 'block group']

In [144]:
# Generate quantile features from the attributes listed in `features`
# (the whole selected list is passed to the helper, not a single outcome column).
# NOTE(review): the positional 4 is presumably the number of quantiles, since the
# resulting column names contain "4quantiles" — confirm in feature_gen.
features_df = fg.generate_quantile_features(evic_df, features, 4, years_list=years_list)

In [145]:
# Compare dimensions before and after feature generation
for label, frame in (('evic_df:', evic_df), ('features_df:', features_df)):
    print(label, frame.shape)

evic_df: (15972, 43)
features_df: (15972, 1153)


In [146]:
# Spot check new feature names: print the first 101 column names.
# Slicing the column index once avoids re-reading features_df.columns on every
# iteration and does not raise IndexError if there are fewer than 101 columns.
for column_name in features_df.columns[:101]:
    print(column_name)

GEOID
year
name
parent-location
population
poverty-rate
renter-occupied-households
pct-renter-occupied
median-gross-rent
median-household-income
median-property-value
rent-burden
pct-white
pct-af-am
pct-hispanic
pct-am-ind
pct-asian
pct-nh-pi
pct-multiple
pct-other
eviction-filings
evictions
eviction-rate
eviction-filing-rate
low-flag
imputed
subbed
total_for_public_assistance_income
with_public_assistance_income
estimate_total_in_labor_force
estimate_civilian_unemployed
total_for_householder_tenure
renter_occupied
renter_moved_2015/2010_later
renter_moved_2010-2014/2000-2009
renter_moved_2000-2009/1990-1999
renter_moved_1990-1999/1980-1989
renter_moved_1980-1989/1970-1979
renter_moved_1979/1969_earlier
state
county
tract
block group
population_2013_4quantiles_categorical
poverty-rate_2013_4quantiles_categorical
renter-occupied-households_2013_4quantiles_categorical
pct-renter-occupied_2013_4quantiles_categorical
median-gross-rent_2013_4quantiles_categorical
median-household-income_201

In [147]:
# Preview the first rows of the feature-augmented dataframe
features_df.head()

Unnamed: 0,GEOID,year,name,parent-location,population,poverty-rate,renter-occupied-households,pct-renter-occupied,median-gross-rent,median-household-income,...,tract_4quantiles_1.0,tract_4quantiles_2.0,tract_4quantiles_3.0,tract_4quantiles_4.0,tract_4quantiles_nan,block group_4quantiles_1.0,block group_4quantiles_2.0,block group_4quantiles_3.0,block group_4quantiles_4.0,block group_4quantiles_nan
0,170310101001,2013,101.1,"Cook County, Illinois",435.0,18.92,159.0,67.23,785.0,,...,1,0,0,0,0,1,0,0,0,0
1,170310101001,2014,101.1,"Cook County, Illinois",435.0,18.92,161.0,67.23,785.0,,...,1,0,0,0,0,1,0,0,0,0
2,170310101001,2015,101.1,"Cook County, Illinois",435.0,18.92,164.0,67.23,785.0,,...,1,0,0,0,0,1,0,0,0,0
3,170310101001,2016,101.1,"Cook County, Illinois",435.0,18.92,167.0,67.23,785.0,,...,1,0,0,0,0,1,0,0,0,0
4,170310101002,2013,101.2,"Cook County, Illinois",1496.0,28.28,860.0,92.78,762.0,22537.0,...,1,0,0,0,0,0,1,0,0,0


In [None]:
# Exporting the result.
# Write to a separate file instead of the "..._with-acs.csv" input that this
# notebook loads at the top: overwriting the input would make the next run read
# the already feature-augmented table and generate features from features.
# NOTE(review): the output filename is new — update any downstream reader.
features_df.to_csv('../data/work/block-groups_2012-2016_with-acs_features.csv', index=False)