In [1]:
import sys
sys.path.insert(0, '../scripts')
import pandas as pd
import numpy as np
import feature_gen as fg

In [2]:
# Load data that has already been augemented with ACS info
df = pd.read_csv("../data/work/block-groups_2012-2016_with-acs.csv")

In [3]:
df.shape

(19965, 43)

In [4]:
# Determine which years we care about
years_list = [2012, 2013, 2014, 2015, 2016]

In [5]:
# Add eviction effectiveness as a new feature
df['evictions-effectiveness'] = df['evictions'] / df['eviction-filings']
df['evictions-effectiveness'].fillna(value=0, inplace=True)

In [6]:
# Add outcome label of 'upper 10th by year'
# Based on target variable: evictions
col = 'evictions'
df_a = pd.DataFrame(columns=('GEOID', 'year', col, 'decile_by_year', 'upper10th_by_year'))
for year in years_list:
    temp_df = df[df['year']==year][['year', 'GEOID', col]]
    temp_df['decile_by_year'] = pd.qcut(temp_df[col].rank(method='first'), 10, labels=False)
    temp_df.loc[(temp_df['decile_by_year'] == 9), 'upper10th_by_year'] = 1
    temp_df.loc[(temp_df['decile_by_year'] < 9), 'upper10th_by_year'] = 0
    df_a = df_a.append(temp_df, ignore_index=True)
df_a = df_a.astype(dtype={'GEOID':'int64', 'year':'int64', 'decile_by_year': 'int64'})
df = df.merge(df_a)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [7]:
# Filter df so that only years and areas we care about are included
evic_df = df[df['year'].isin(years_list)]
evic_df = evic_df.loc[evic_df["parent-location"] == "Cook County, Illinois"]

In [8]:
evic_df.shape
# ACS augmented data should have already been filtered for Cook County and years, so number of rows should be the same

(19965, 46)

In [9]:
# Grab all attributes from the dataframe
features = list(evic_df.columns)
print(len(features))

46


In [10]:
# Explore attribute datatypes
evic_df.dtypes

GEOID                                   int64
year                                    int64
name                                   object
parent-location                        object
population                            float64
poverty-rate                          float64
renter-occupied-households            float64
pct-renter-occupied                   float64
median-gross-rent                     float64
median-household-income               float64
median-property-value                 float64
rent-burden                           float64
pct-white                             float64
pct-af-am                             float64
pct-hispanic                          float64
pct-am-ind                            float64
pct-asian                             float64
pct-nh-pi                             float64
pct-multiple                          float64
pct-other                             float64
eviction-filings                      float64
evictions                         

In [11]:
# Explore value counts for different attributes to see if they make good candidates for discretizations
evic_df['tract'].value_counts()

250500.0    28
460500.0    28
150300.0    28
100200.0    28
820101.0    28
480400.0    28
815500.0    28
80100.0     28
63200.0     24
700501.0    24
120300.0    24
650200.0    24
191100.0    24
814200.0    24
81300.0     24
831600.0    24
231500.0    24
800200.0    24
171000.0    24
826100.0    24
816100.0    24
230600.0    24
620300.0    24
823304.0    24
561100.0    24
813400.0    24
550100.0    24
71500.0     24
834300.0    24
842800.0    24
            ..
251000.0     4
390100.0     4
822900.0     4
490400.0     4
570200.0     4
360200.0     4
140601.0     4
240500.0     4
390500.0     4
980100.0     4
61100.0      4
230800.0     4
222600.0     4
240200.0     4
60500.0      4
420200.0     4
831300.0     4
62600.0      4
570100.0     4
222500.0     4
824400.0     4
260100.0     4
831700.0     4
240700.0     4
842200.0     4
660301.0     4
350400.0     4
832000.0     4
640100.0     4
835700.0     4
Name: tract, Length: 1319, dtype: int64

In [12]:
# Explore object datatypes for possible discretizations
str_columns = [column for column in evic_df.columns if (evic_df[column].dtype=='O')]
print(str_columns)

['name', 'parent-location']


Attribute 'tract' has too many unique values to discretize. Block group 'name' is a unique identifier. 'Parent-location' attribute will be the same for all rows (i.e. Cook County), and so are 'state' and 'county'. None of these attributes make good candidates for discretizations.

Attributes 'imputed', 'low-flag', and 'subbed' are boolean variables. Won't make sense to cut into quantiles / to discretize.

In [13]:
# Select attributes not to generate new features from
# Note that'block group' attribute from ACS data is a unique ID of a block group within one tract
# GEOID of a block group = ACS's state + county + tract + block group ID numbers
to_remove = ['GEOID', 'year', 'name', 'parent-location', 'state', 'county', 'tract', 'block group', 'imputed', 'low-flag', 'subbed']
for attribute in to_remove:
    features.remove(attribute)

In [14]:
# Check updated list of features
print(len(features))
features

35


['population',
 'poverty-rate',
 'renter-occupied-households',
 'pct-renter-occupied',
 'median-gross-rent',
 'median-household-income',
 'median-property-value',
 'rent-burden',
 'pct-white',
 'pct-af-am',
 'pct-hispanic',
 'pct-am-ind',
 'pct-asian',
 'pct-nh-pi',
 'pct-multiple',
 'pct-other',
 'eviction-filings',
 'evictions',
 'eviction-rate',
 'eviction-filing-rate',
 'total_for_public_assistance_income',
 'with_public_assistance_income',
 'estimate_total_in_labor_force',
 'estimate_civilian_unemployed',
 'total_for_householder_tenure',
 'renter_occupied',
 'renter_moved_2015/2010_later',
 'renter_moved_2010-2014/2000-2009',
 'renter_moved_2000-2009/1990-1999',
 'renter_moved_1990-1999/1980-1989',
 'renter_moved_1980-1989/1970-1979',
 'renter_moved_1979/1969_earlier',
 'evictions-effectiveness',
 'decile_by_year',
 'upper10th_by_year']

In [15]:
# We want to add new features that grab the previous year's value for a given attribute, for a block

# For this task, we want to include all features above except for following:
no_prev = ['total_for_public_assistance_income','with_public_assistance_income',
'estimate_total_in_labor_force', 'estimate_civilian_unemployed', 
'total_for_householder_tenure', 'renter_occupied', 
'renter_moved_2015/2010_later', 'renter_moved_2010-2014/2000-2009', 
'renter_moved_2000-2009/1990-1999', 'renter_moved_1990-1999/1980-1989', 
'renter_moved_1980-1989/1970-1979', 'renter_moved_1979/1969_earlier', 
'decile_by_year', 'upper10th_by_year']
prev_yr_features = list(set(features) - set(no_prev))

In [17]:
print(len(prev_yr_features))
prev_yr_features

21


['pct-multiple',
 'evictions',
 'pct-am-ind',
 'median-property-value',
 'pct-af-am',
 'pct-hispanic',
 'renter-occupied-households',
 'median-household-income',
 'eviction-filings',
 'pct-asian',
 'pct-nh-pi',
 'pct-white',
 'median-gross-rent',
 'population',
 'eviction-rate',
 'evictions-effectiveness',
 'pct-other',
 'pct-renter-occupied',
 'eviction-filing-rate',
 'poverty-rate',
 'rent-burden']

In [18]:
# Generate new features
generated_prev_yr_features = []
for feature in prev_yr_features:
    evic_df, new_feature = fg.create_prev_yr_feature(evic_df, feature)
    generated_prev_yr_features.append(new_feature)
generated_prev_yr_features

['prev-yr_pct-multiple',
 'prev-yr_evictions',
 'prev-yr_pct-am-ind',
 'prev-yr_median-property-value',
 'prev-yr_pct-af-am',
 'prev-yr_pct-hispanic',
 'prev-yr_renter-occupied-households',
 'prev-yr_median-household-income',
 'prev-yr_eviction-filings',
 'prev-yr_pct-asian',
 'prev-yr_pct-nh-pi',
 'prev-yr_pct-white',
 'prev-yr_median-gross-rent',
 'prev-yr_population',
 'prev-yr_eviction-rate',
 'prev-yr_evictions-effectiveness',
 'prev-yr_pct-other',
 'prev-yr_pct-renter-occupied',
 'prev-yr_eviction-filing-rate',
 'prev-yr_poverty-rate',
 'prev-yr_rent-burden']

In [19]:
# Add generated features to features list
features.extend(generated_prev_yr_features)
print(len(features))
features

56


['population',
 'poverty-rate',
 'renter-occupied-households',
 'pct-renter-occupied',
 'median-gross-rent',
 'median-household-income',
 'median-property-value',
 'rent-burden',
 'pct-white',
 'pct-af-am',
 'pct-hispanic',
 'pct-am-ind',
 'pct-asian',
 'pct-nh-pi',
 'pct-multiple',
 'pct-other',
 'eviction-filings',
 'evictions',
 'eviction-rate',
 'eviction-filing-rate',
 'total_for_public_assistance_income',
 'with_public_assistance_income',
 'estimate_total_in_labor_force',
 'estimate_civilian_unemployed',
 'total_for_householder_tenure',
 'renter_occupied',
 'renter_moved_2015/2010_later',
 'renter_moved_2010-2014/2000-2009',
 'renter_moved_2000-2009/1990-1999',
 'renter_moved_1990-1999/1980-1989',
 'renter_moved_1980-1989/1970-1979',
 'renter_moved_1979/1969_earlier',
 'evictions-effectiveness',
 'decile_by_year',
 'upper10th_by_year',
 'prev-yr_pct-multiple',
 'prev-yr_evictions',
 'prev-yr_pct-am-ind',
 'prev-yr_median-property-value',
 'prev-yr_pct-af-am',
 'prev-yr_pct-hispan

In [20]:
print(evic_df.shape)
evic_df.head()

(19965, 67)


Unnamed: 0,GEOID,year,name,parent-location,population,poverty-rate,renter-occupied-households,pct-renter-occupied,median-gross-rent,median-household-income,...,prev-yr_pct-white,prev-yr_median-gross-rent,prev-yr_population,prev-yr_eviction-rate,prev-yr_evictions-effectiveness,prev-yr_pct-other,prev-yr_pct-renter-occupied,prev-yr_eviction-filing-rate,prev-yr_poverty-rate,prev-yr_rent-burden
0,170310101001,2012,101.1,"Cook County, Illinois",435.0,18.92,156.0,67.23,785.0,,...,,,,,,,,,,
1,170310101001,2013,101.1,"Cook County, Illinois",435.0,18.92,159.0,67.23,785.0,,...,49.89,785.0,435.0,4.5,0.388889,0.0,67.23,11.56,18.92,50.0
2,170310101001,2014,101.1,"Cook County, Illinois",435.0,18.92,161.0,67.23,785.0,,...,49.89,785.0,435.0,2.52,0.285714,0.0,67.23,8.83,18.92,50.0
3,170310101001,2015,101.1,"Cook County, Illinois",435.0,18.92,164.0,67.23,785.0,,...,49.89,785.0,435.0,2.48,0.210526,0.0,67.23,11.78,18.92,50.0
4,170310101001,2016,101.1,"Cook County, Illinois",435.0,18.92,167.0,67.23,785.0,,...,49.89,785.0,435.0,1.83,0.230769,0.0,67.23,7.92,18.92,50.0


In [21]:
# For now, remove 2012 since ACS info has no info on year 2012
no_2012 = years_list.remove(2012)
years_list

[2013, 2014, 2015, 2016]

In [22]:
# Generate quantile features
features_df = fg.generate_quantile_features(evic_df, features, 4, years_list=no_2012)
# Discretize variables
'No variables to discretize'

'No variables to discretize'

In [23]:
# Compare original df with new df with added features
print('evic_df:', evic_df.shape)
print('features_df:', features_df.shape)

evic_df: (19965, 179)
features_df: (19965, 403)


In [71]:
diff = set(evic_df.columns) - set(features_df.columns)
diff

{'decile_by_year_4quantiles',
 'estimate_civilian_unemployed_4quantiles',
 'estimate_total_in_labor_force_4quantiles',
 'eviction-filing-rate_4quantiles',
 'eviction-filings_4quantiles',
 'eviction-rate_4quantiles',
 'evictions-effectiveness_4quantiles',
 'evictions_4quantiles',
 'median-gross-rent_4quantiles',
 'median-household-income_4quantiles',
 'median-property-value_4quantiles',
 'pct-af-am_4quantiles',
 'pct-am-ind_4quantiles',
 'pct-asian_4quantiles',
 'pct-hispanic_4quantiles',
 'pct-multiple_4quantiles',
 'pct-nh-pi_4quantiles',
 'pct-other_4quantiles',
 'pct-renter-occupied_4quantiles',
 'pct-white_4quantiles',
 'population_4quantiles',
 'poverty-rate_4quantiles',
 'rent-burden_4quantiles',
 'renter-occupied-households_4quantiles',
 'renter_moved_1979/1969_earlier_4quantiles',
 'renter_moved_1980-1989/1970-1979_4quantiles',
 'renter_moved_1990-1999/1980-1989_4quantiles',
 'renter_moved_2000-2009/1990-1999_4quantiles',
 'renter_moved_2010-2014/2000-2009_4quantiles',
 'renter

In [69]:
# Spot check new feature names
for i in range(0, 137):
    cols = evic_df.columns
    print(cols[i])

GEOID
year
name
parent-location
population
poverty-rate
renter-occupied-households
pct-renter-occupied
median-gross-rent
median-household-income
median-property-value
rent-burden
pct-white
pct-af-am
pct-hispanic
pct-am-ind
pct-asian
pct-nh-pi
pct-multiple
pct-other
eviction-filings
evictions
eviction-rate
eviction-filing-rate
low-flag
imputed
subbed
total_for_public_assistance_income
with_public_assistance_income
estimate_total_in_labor_force
estimate_civilian_unemployed
total_for_householder_tenure
renter_occupied
renter_moved_2015/2010_later
renter_moved_2010-2014/2000-2009
renter_moved_2000-2009/1990-1999
renter_moved_1990-1999/1980-1989
renter_moved_1980-1989/1970-1979
renter_moved_1979/1969_earlier
state
county
tract
block group
evictions-effectiveness
decile_by_year
upper10th_by_year
prev-yr_eviction-rate
prev-yr_renter-occupied-households
prev-yr_pct-af-am
prev-yr_median-household-income
prev-yr_pct-other
prev-yr_evictions-effectiveness
prev-yr_evictions
prev-yr_pct-asian
prev-y

In [70]:
evic_df[['poverty-rate', 'poverty-rate_4quantiles', 'poverty-rate_4quantiles_categorical']]

Unnamed: 0,poverty-rate,poverty-rate_4quantiles,poverty-rate_4quantiles_categorical
0,18.92,3,3
1,18.92,3,3
2,18.92,3,3
3,18.92,3,3
4,18.92,3,3
5,28.28,4,4
6,28.28,4,4
7,28.28,4,4
8,28.28,4,4
9,28.28,4,4


In [26]:
# Spot check new feature names
for i in range(0, 277):
    cols = features_df.columns
    print(cols[i])

GEOID
year
name
parent-location
population
poverty-rate
renter-occupied-households
pct-renter-occupied
median-gross-rent
median-household-income
median-property-value
rent-burden
pct-white
pct-af-am
pct-hispanic
pct-am-ind
pct-asian
pct-nh-pi
pct-multiple
pct-other
eviction-filings
evictions
eviction-rate
eviction-filing-rate
low-flag
imputed
subbed
total_for_public_assistance_income
with_public_assistance_income
estimate_total_in_labor_force
estimate_civilian_unemployed
total_for_householder_tenure
renter_occupied
renter_moved_2015/2010_later
renter_moved_2010-2014/2000-2009
renter_moved_2000-2009/1990-1999
renter_moved_1990-1999/1980-1989
renter_moved_1980-1989/1970-1979
renter_moved_1979/1969_earlier
state
county
tract
block group
evictions-effectiveness
decile_by_year
upper10th_by_year
population_4quantiles_categorical
poverty-rate_4quantiles_categorical
renter-occupied-households_4quantiles_categorical
pct-renter-occupied_4quantiles_categorical
median-gross-rent_4quantiles_categor

In [24]:
features_df.head()

Unnamed: 0,GEOID,year,name,parent-location,population,poverty-rate,renter-occupied-households,pct-renter-occupied,median-gross-rent,median-household-income,...,prev-yr_poverty-rate_4quantiles_1.0,prev-yr_poverty-rate_4quantiles_2.0,prev-yr_poverty-rate_4quantiles_3.0,prev-yr_poverty-rate_4quantiles_4.0,prev-yr_poverty-rate_4quantiles_nan,prev-yr_rent-burden_4quantiles_1.0,prev-yr_rent-burden_4quantiles_2.0,prev-yr_rent-burden_4quantiles_3.0,prev-yr_rent-burden_4quantiles_4.0,prev-yr_rent-burden_4quantiles_nan
0,170310101001,2012,101.1,"Cook County, Illinois",435.0,18.92,156.0,67.23,785.0,,...,0,0,0,0,1,0,0,0,0,1
1,170310101001,2013,101.1,"Cook County, Illinois",435.0,18.92,159.0,67.23,785.0,,...,0,0,1,0,0,0,0,0,1,0
2,170310101001,2014,101.1,"Cook County, Illinois",435.0,18.92,161.0,67.23,785.0,,...,0,0,1,0,0,0,0,0,1,0
3,170310101001,2015,101.1,"Cook County, Illinois",435.0,18.92,164.0,67.23,785.0,,...,0,0,1,0,0,0,0,0,1,0
4,170310101001,2016,101.1,"Cook County, Illinois",435.0,18.92,167.0,67.23,785.0,,...,0,0,1,0,0,0,0,0,1,0


In [65]:
# Exporting the result:
#features_df.to_csv('../data/work/block-groups_2012-2016_with-acs_with-q-features.csv', index=False)