In [1]:
import sys
sys.path.insert(0, '../scripts')
import pandas as pd
import numpy as np
import feature_gen as fg

In [28]:
# Load data that has already been augemented with ACS info
df = pd.read_csv("../data/work/block-groups_2012-2016_with-acs.csv")

In [29]:
df.shape

(19965, 43)

In [30]:
# Determine which years we care about
years_list = [2012, 2013, 2014, 2015, 2016]

In [31]:
# Add eviction effectiveness as a new feature
df['evictions-effectiveness'] = df['evictions'] / df['eviction-filings']
df['evictions-effectiveness'].fillna(value=0, inplace=True)

In [32]:
# Add outcome label of 'upper 10th by year'
# Based on target variable: evictions
col = 'evictions'
df_a = pd.DataFrame(columns=('GEOID', 'year', col, 'decile_by_year', 'upper10th_by_year'))
for year in years_list:
    temp_df = df[df['year']==year][['year', 'GEOID', col]]
    temp_df['decile_by_year'] = pd.qcut(temp_df[col].rank(method='first'), 10, labels=False)
    temp_df.loc[(temp_df['decile_by_year'] == 9), 'upper10th_by_year'] = 1
    temp_df.loc[(temp_df['decile_by_year'] < 9), 'upper10th_by_year'] = 0
    df_a = df_a.append(temp_df, ignore_index=True)
df_a = df_a.astype(dtype={'GEOID':'int64', 'year':'int64', 'decile_by_year': 'int64'})
df = df.merge(df_a)

In [33]:
# Filter df so that only years and areas we care about are included
evic_df = df[df['year'].isin(years_list)]
evic_df = evic_df.loc[evic_df["parent-location"] == "Cook County, Illinois"]

In [34]:
evic_df.shape
# ACS augmented data should have already been filtered for Cook County and years, so number of rows should be the same

(19965, 46)

In [35]:
# Grab all attributes from the dataframe
features = list(evic_df.columns)
print(len(features))

46


In [36]:
# Explore attribute datatypes
evic_df.dtypes

GEOID                                   int64
year                                    int64
name                                   object
parent-location                        object
population                            float64
poverty-rate                          float64
renter-occupied-households            float64
pct-renter-occupied                   float64
median-gross-rent                     float64
median-household-income               float64
median-property-value                 float64
rent-burden                           float64
pct-white                             float64
pct-af-am                             float64
pct-hispanic                          float64
pct-am-ind                            float64
pct-asian                             float64
pct-nh-pi                             float64
pct-multiple                          float64
pct-other                             float64
eviction-filings                      float64
evictions                         

In [37]:
# Explore value counts for different attributes to see if they make good candidates for discretizations
print('value count of tract:', len(evic_df['tract'].value_counts()))

value count of tract: 1319


In [38]:
# Explore object datatypes for possible discretizations
str_columns = [column for column in evic_df.columns if (evic_df[column].dtype=='O')]
print(str_columns)

['name', 'parent-location']


Attribute 'tract' has too many unique values to discretize. Block group 'name' is a unique identifier. 'Parent-location' attribute will be the same for all rows (i.e. Cook County), and so are 'state' and 'county'. None of these attributes make good candidates for discretizations.

Attributes 'imputed', 'low-flag', and 'subbed' are boolean variables. Won't make sense to cut into quantiles / to discretize.

In [39]:
# Select attributes not to generate new features from
# Note that'block group' attribute from ACS data is a unique ID of a block group within one tract
# GEOID of a block group = ACS's state + county + tract + block group ID numbers
to_remove = ['GEOID', 'year', 'name', 'parent-location', 'state', 'county', 'tract', 'block group', 'imputed', 'low-flag', 'subbed']
for attribute in to_remove:
    features.remove(attribute)

In [40]:
# Check updated list of features
print(len(features))
features

35


['population',
 'poverty-rate',
 'renter-occupied-households',
 'pct-renter-occupied',
 'median-gross-rent',
 'median-household-income',
 'median-property-value',
 'rent-burden',
 'pct-white',
 'pct-af-am',
 'pct-hispanic',
 'pct-am-ind',
 'pct-asian',
 'pct-nh-pi',
 'pct-multiple',
 'pct-other',
 'eviction-filings',
 'evictions',
 'eviction-rate',
 'eviction-filing-rate',
 'total_for_public_assistance_income',
 'with_public_assistance_income',
 'estimate_total_in_labor_force',
 'estimate_civilian_unemployed',
 'total_for_householder_tenure',
 'renter_occupied',
 'renter_moved_2015/2010_later',
 'renter_moved_2010-2014/2000-2009',
 'renter_moved_2000-2009/1990-1999',
 'renter_moved_1990-1999/1980-1989',
 'renter_moved_1980-1989/1970-1979',
 'renter_moved_1979/1969_earlier',
 'evictions-effectiveness',
 'decile_by_year',
 'upper10th_by_year']

In [41]:
# We want to add new features that grab the previous year's value for a given attribute, for a block

# For this task, we want to include all features above except for following ACS features:
no_prev = ['total_for_public_assistance_income','with_public_assistance_income',
'estimate_total_in_labor_force', 'estimate_civilian_unemployed', 
'total_for_householder_tenure', 'renter_occupied', 
'renter_moved_2015/2010_later', 'renter_moved_2010-2014/2000-2009', 
'renter_moved_2000-2009/1990-1999', 'renter_moved_1990-1999/1980-1989', 
'renter_moved_1980-1989/1970-1979', 'renter_moved_1979/1969_earlier']
prev_yr_features = list(set(features) - set(no_prev))

In [42]:
print(len(prev_yr_features))
prev_yr_features

23


['eviction-filing-rate',
 'rent-burden',
 'poverty-rate',
 'pct-hispanic',
 'median-household-income',
 'renter-occupied-households',
 'pct-multiple',
 'population',
 'pct-asian',
 'pct-nh-pi',
 'pct-am-ind',
 'median-gross-rent',
 'pct-af-am',
 'pct-other',
 'pct-renter-occupied',
 'upper10th_by_year',
 'evictions',
 'evictions-effectiveness',
 'decile_by_year',
 'eviction-filings',
 'eviction-rate',
 'median-property-value',
 'pct-white']

In [43]:
# Generate new features
generated_prev_yr_features = []
for feature in prev_yr_features:
    evic_df, new_feature = fg.create_prev_yr_feature(evic_df, feature)
    generated_prev_yr_features.append(new_feature)
generated_prev_yr_features

['prev-yr_eviction-filing-rate',
 'prev-yr_rent-burden',
 'prev-yr_poverty-rate',
 'prev-yr_pct-hispanic',
 'prev-yr_median-household-income',
 'prev-yr_renter-occupied-households',
 'prev-yr_pct-multiple',
 'prev-yr_population',
 'prev-yr_pct-asian',
 'prev-yr_pct-nh-pi',
 'prev-yr_pct-am-ind',
 'prev-yr_median-gross-rent',
 'prev-yr_pct-af-am',
 'prev-yr_pct-other',
 'prev-yr_pct-renter-occupied',
 'prev-yr_upper10th_by_year',
 'prev-yr_evictions',
 'prev-yr_evictions-effectiveness',
 'prev-yr_decile_by_year',
 'prev-yr_eviction-filings',
 'prev-yr_eviction-rate',
 'prev-yr_median-property-value',
 'prev-yr_pct-white']

In [44]:
# Add generated features to features list
features.extend(generated_prev_yr_features)
print(len(features))
features

58


['population',
 'poverty-rate',
 'renter-occupied-households',
 'pct-renter-occupied',
 'median-gross-rent',
 'median-household-income',
 'median-property-value',
 'rent-burden',
 'pct-white',
 'pct-af-am',
 'pct-hispanic',
 'pct-am-ind',
 'pct-asian',
 'pct-nh-pi',
 'pct-multiple',
 'pct-other',
 'eviction-filings',
 'evictions',
 'eviction-rate',
 'eviction-filing-rate',
 'total_for_public_assistance_income',
 'with_public_assistance_income',
 'estimate_total_in_labor_force',
 'estimate_civilian_unemployed',
 'total_for_householder_tenure',
 'renter_occupied',
 'renter_moved_2015/2010_later',
 'renter_moved_2010-2014/2000-2009',
 'renter_moved_2000-2009/1990-1999',
 'renter_moved_1990-1999/1980-1989',
 'renter_moved_1980-1989/1970-1979',
 'renter_moved_1979/1969_earlier',
 'evictions-effectiveness',
 'decile_by_year',
 'upper10th_by_year',
 'prev-yr_eviction-filing-rate',
 'prev-yr_rent-burden',
 'prev-yr_poverty-rate',
 'prev-yr_pct-hispanic',
 'prev-yr_median-household-income',
 'p

In [45]:
print(evic_df.shape)
evic_df.head()

(19965, 69)


Unnamed: 0,GEOID,year,name,parent-location,population,poverty-rate,renter-occupied-households,pct-renter-occupied,median-gross-rent,median-household-income,...,prev-yr_pct-other,prev-yr_pct-renter-occupied,prev-yr_upper10th_by_year,prev-yr_evictions,prev-yr_evictions-effectiveness,prev-yr_decile_by_year,prev-yr_eviction-filings,prev-yr_eviction-rate,prev-yr_median-property-value,prev-yr_pct-white
0,170310101001,2012,101.1,"Cook County, Illinois",435.0,18.92,156.0,67.23,785.0,,...,,,,,,,,,,
1,170310101001,2013,101.1,"Cook County, Illinois",435.0,18.92,159.0,67.23,785.0,,...,0.0,67.23,0.0,7.0,0.388889,7.0,18.0,4.5,255000.0,49.89
2,170310101001,2014,101.1,"Cook County, Illinois",435.0,18.92,161.0,67.23,785.0,,...,0.0,67.23,0.0,4.0,0.285714,6.0,14.0,2.52,255000.0,49.89
3,170310101001,2015,101.1,"Cook County, Illinois",435.0,18.92,164.0,67.23,785.0,,...,0.0,67.23,0.0,4.0,0.210526,6.0,19.0,2.48,255000.0,49.89
4,170310101001,2016,101.1,"Cook County, Illinois",435.0,18.92,167.0,67.23,785.0,,...,0.0,67.23,0.0,3.0,0.230769,6.0,13.0,1.83,255000.0,49.89


In [46]:
# For now, remove 2012 since ACS info has no info on year 2012
no_2012 = years_list.remove(2012)
years_list

[2013, 2014, 2015, 2016]

In [47]:
# Generate quantile features
features_df = fg.generate_quantile_features(evic_df, features, 4, years_list=no_2012)
# Discretize variables
'No variables to discretize'

'No variables to discretize'

In [48]:
# Compare original df with new df with added features
print('evic_df:', evic_df.shape)
print('features_df:', features_df.shape)

evic_df: (19965, 185)
features_df: (19965, 417)


(Code added quantile categories

In [49]:
# Spot check new feature names
for i in range(0, 277):
    cols = features_df.columns
    print(cols[i])

GEOID
year
name
parent-location
population
poverty-rate
renter-occupied-households
pct-renter-occupied
median-gross-rent
median-household-income
median-property-value
rent-burden
pct-white
pct-af-am
pct-hispanic
pct-am-ind
pct-asian
pct-nh-pi
pct-multiple
pct-other
eviction-filings
evictions
eviction-rate
eviction-filing-rate
low-flag
imputed
subbed
total_for_public_assistance_income
with_public_assistance_income
estimate_total_in_labor_force
estimate_civilian_unemployed
total_for_householder_tenure
renter_occupied
renter_moved_2015/2010_later
renter_moved_2010-2014/2000-2009
renter_moved_2000-2009/1990-1999
renter_moved_1990-1999/1980-1989
renter_moved_1980-1989/1970-1979
renter_moved_1979/1969_earlier
state
county
tract
block group
evictions-effectiveness
decile_by_year
upper10th_by_year
prev-yr_eviction-filing-rate
prev-yr_rent-burden
prev-yr_poverty-rate
prev-yr_pct-hispanic
prev-yr_median-household-income
prev-yr_renter-occupied-households
prev-yr_pct-multiple
prev-yr_population
p

In [50]:
features_df.head()

Unnamed: 0,GEOID,year,name,parent-location,population,poverty-rate,renter-occupied-households,pct-renter-occupied,median-gross-rent,median-household-income,...,prev-yr_median-property-value_4quantiles_1.0,prev-yr_median-property-value_4quantiles_2.0,prev-yr_median-property-value_4quantiles_3.0,prev-yr_median-property-value_4quantiles_4.0,prev-yr_median-property-value_4quantiles_nan,prev-yr_pct-white_4quantiles_1.0,prev-yr_pct-white_4quantiles_2.0,prev-yr_pct-white_4quantiles_3.0,prev-yr_pct-white_4quantiles_4.0,prev-yr_pct-white_4quantiles_nan
0,170310101001,2012,101.1,"Cook County, Illinois",435.0,18.92,156.0,67.23,785.0,,...,0,0,0,0,1,0,0,0,0,1
1,170310101001,2013,101.1,"Cook County, Illinois",435.0,18.92,159.0,67.23,785.0,,...,0,0,1,0,0,0,0,1,0,0
2,170310101001,2014,101.1,"Cook County, Illinois",435.0,18.92,161.0,67.23,785.0,,...,0,0,1,0,0,0,0,1,0,0
3,170310101001,2015,101.1,"Cook County, Illinois",435.0,18.92,164.0,67.23,785.0,,...,0,0,1,0,0,0,0,1,0,0
4,170310101001,2016,101.1,"Cook County, Illinois",435.0,18.92,167.0,67.23,785.0,,...,0,0,1,0,0,0,0,1,0,0


In [25]:
# Exporting the result:
features_df.to_csv('../data/work/block-groups_2012-2016_with-acs_with-gen-features.csv', index=False)