In [47]:
import sys
sys.path.insert(0, '../scripts')
import pandas as pd
import numpy as np
import feature_gen as fg

In [48]:
# Load data that has already been augemented with ACS info
df = pd.read_csv("../data/work/block-groups_2012-2016_with-acs.csv")

In [49]:
df.shape

(19965, 43)

In [50]:
# Determine which years we care about
years_list = [2012, 2013, 2014, 2015, 2016]

In [51]:
# Add outcome label
df['evictions-effectiveness'] = df['evictions'] / df['eviction-filings']
df['evictions-effectiveness'].fillna(value=0, inplace=True)

col = 'evictions-effectiveness'
df_a = pd.DataFrame(columns=('GEOID', 'year', col, 'decile_by_year', 'upper10th_by_year'))
for year in years_list:
    temp_df = df[df['year']==year][['year', 'GEOID', col]]
    temp_df['decile_by_year'] = pd.qcut(temp_df[col].rank(method='first'), 10, labels=False)
    temp_df.loc[(temp_df['decile_by_year'] == 9), 'upper10th_by_year'] = 1
    temp_df.loc[(temp_df['decile_by_year'] < 9), 'upper10th_by_year'] = 0
    df_a = df_a.append(temp_df, ignore_index=True)
df = df.merge(df_a)

In [52]:
# Filter df so that only years and areas we care about are included
evic_df = df[df['year'].isin(years_list)]
evic_df = evic_df.loc[evic_df["parent-location"] == "Cook County, Illinois"]

In [53]:
evic_df.shape
# ACS augmented data should have already been filtered for Cook County and years, so number of rows should be the same

(19965, 46)

In [54]:
# Grab all attributes from the dataframe
features = list(evic_df.columns)
print(len(features))
features

46


['GEOID',
 'year',
 'name',
 'parent-location',
 'population',
 'poverty-rate',
 'renter-occupied-households',
 'pct-renter-occupied',
 'median-gross-rent',
 'median-household-income',
 'median-property-value',
 'rent-burden',
 'pct-white',
 'pct-af-am',
 'pct-hispanic',
 'pct-am-ind',
 'pct-asian',
 'pct-nh-pi',
 'pct-multiple',
 'pct-other',
 'eviction-filings',
 'evictions',
 'eviction-rate',
 'eviction-filing-rate',
 'low-flag',
 'imputed',
 'subbed',
 'total_for_public_assistance_income',
 'with_public_assistance_income',
 'estimate_total_in_labor_force',
 'estimate_civilian_unemployed',
 'total_for_householder_tenure',
 'renter_occupied',
 'renter_moved_2015/2010_later',
 'renter_moved_2010-2014/2000-2009',
 'renter_moved_2000-2009/1990-1999',
 'renter_moved_1990-1999/1980-1989',
 'renter_moved_1980-1989/1970-1979',
 'renter_moved_1979/1969_earlier',
 'state',
 'county',
 'tract',
 'block group',
 'evictions-effectiveness',
 'decile_by_year',
 'upper10th_by_year']

In [55]:
# Explore attribute datatypes
evic_df.dtypes

GEOID                                  object
year                                   object
name                                   object
parent-location                        object
population                            float64
poverty-rate                          float64
renter-occupied-households            float64
pct-renter-occupied                   float64
median-gross-rent                     float64
median-household-income               float64
median-property-value                 float64
rent-burden                           float64
pct-white                             float64
pct-af-am                             float64
pct-hispanic                          float64
pct-am-ind                            float64
pct-asian                             float64
pct-nh-pi                             float64
pct-multiple                          float64
pct-other                             float64
eviction-filings                      float64
evictions                         

In [56]:
# Explore value counts for different attributes to see if they make good candidates for discretizations
evic_df['tract'].value_counts()

250500.0    28
460500.0    28
150300.0    28
100200.0    28
820101.0    28
480400.0    28
815500.0    28
80100.0     28
63200.0     24
700501.0    24
120300.0    24
650200.0    24
191100.0    24
814200.0    24
81300.0     24
831600.0    24
231500.0    24
800200.0    24
171000.0    24
826100.0    24
816100.0    24
230600.0    24
620300.0    24
823304.0    24
561100.0    24
813400.0    24
550100.0    24
71500.0     24
834300.0    24
842800.0    24
            ..
251000.0     4
390100.0     4
822900.0     4
490400.0     4
570200.0     4
360200.0     4
140601.0     4
240500.0     4
390500.0     4
980100.0     4
61100.0      4
230800.0     4
222600.0     4
240200.0     4
60500.0      4
420200.0     4
831300.0     4
62600.0      4
570100.0     4
222500.0     4
824400.0     4
260100.0     4
831700.0     4
240700.0     4
842200.0     4
660301.0     4
350400.0     4
832000.0     4
640100.0     4
835700.0     4
Name: tract, Length: 1319, dtype: int64

In [57]:
# Explore object datatypes for possible discretizations
str_columns = [column for column in evic_df.columns if (evic_df[column].dtype=='O')]
print(str_columns)

['GEOID', 'year', 'name', 'parent-location', 'decile_by_year']


Attribute 'tract' has too many unique values to discretize. Block group 'name' is a unique identifier. 'Parent-location' attribute will be the same for all rows (i.e. Cook County), and so are 'state' and 'county'. None of these attributes make good candidates for discretizations.

Attributes 'imputed', 'low-flag', and 'subbed' are boolean variables. Won't make sense to cut into quantiles / to discretize.

In [58]:
# First convert decile_by_year to int
df['decile_by_year'].astype('int64')

0        4
1        4
2        3
3        4
4        3
5        5
6        8
7        5
8        6
9        3
10       5
11       8
12       7
13       7
14       5
15       6
16       8
17       6
18       6
19       6
20       4
21       5
22       4
23       4
24       6
25       4
26       8
27       7
28       3
29       9
        ..
19935    4
19936    4
19937    5
19938    4
19939    5
19940    4
19941    7
19942    4
19943    5
19944    5
19945    6
19946    6
19947    7
19948    6
19949    7
19950    2
19951    2
19952    2
19953    9
19954    9
19955    2
19956    2
19957    2
19958    3
19959    3
19960    2
19961    2
19962    2
19963    3
19964    3
Name: decile_by_year, Length: 19965, dtype: int64

In [59]:
# Select attributes not to generate new features from
# Note that'block group' attribute from ACS data is a unique ID of a block group within one tract
# GEOID of a block group = ACS's state + county + tract + block group ID numbers
to_remove = ['GEOID', 'year', 'name', 'parent-location', 'state', 'county', 'tract', 'block group', 'imputed', 'low-flag', 'subbed']
for attribute in to_remove:
    features.remove(attribute)

In [60]:
# Check updated list of features
print(len(features))
features

35


['population',
 'poverty-rate',
 'renter-occupied-households',
 'pct-renter-occupied',
 'median-gross-rent',
 'median-household-income',
 'median-property-value',
 'rent-burden',
 'pct-white',
 'pct-af-am',
 'pct-hispanic',
 'pct-am-ind',
 'pct-asian',
 'pct-nh-pi',
 'pct-multiple',
 'pct-other',
 'eviction-filings',
 'evictions',
 'eviction-rate',
 'eviction-filing-rate',
 'total_for_public_assistance_income',
 'with_public_assistance_income',
 'estimate_total_in_labor_force',
 'estimate_civilian_unemployed',
 'total_for_householder_tenure',
 'renter_occupied',
 'renter_moved_2015/2010_later',
 'renter_moved_2010-2014/2000-2009',
 'renter_moved_2000-2009/1990-1999',
 'renter_moved_1990-1999/1980-1989',
 'renter_moved_1980-1989/1970-1979',
 'renter_moved_1979/1969_earlier',
 'evictions-effectiveness',
 'decile_by_year',
 'upper10th_by_year']

In [61]:
# for now, remove 2012 since ACS info has no info on year 2012
no_2012 = years_list.remove(2012)
years_list

[2013, 2014, 2015, 2016]

In [62]:
# Generate quantile features
features_df = fg.generate_quantile_features(evic_df, features, 4, years_list=no_2012)
# Discretize variables

In [63]:
# Compare original df with new df with added features
print('evic_df:', evic_df.shape)
print('features_df:', features_df.shape)

evic_df: (19965, 116)
features_df: (19965, 256)


In [45]:
# Spot check new feature names (change range numbers around)
for i in range(100, 256):
    cols = features_df.columns
    print(cols[i])

pct-renter-occupied_4quantiles_nan
median-gross-rent_4quantiles_1.0
median-gross-rent_4quantiles_2.0
median-gross-rent_4quantiles_3.0
median-gross-rent_4quantiles_4.0
median-gross-rent_4quantiles_nan
median-household-income_4quantiles_1.0
median-household-income_4quantiles_2.0
median-household-income_4quantiles_3.0
median-household-income_4quantiles_4.0
median-household-income_4quantiles_nan
median-property-value_4quantiles_1.0
median-property-value_4quantiles_2.0
median-property-value_4quantiles_3.0
median-property-value_4quantiles_4.0
median-property-value_4quantiles_nan
rent-burden_4quantiles_1.0
rent-burden_4quantiles_2.0
rent-burden_4quantiles_3.0
rent-burden_4quantiles_4.0
rent-burden_4quantiles_nan
pct-white_4quantiles_1.0
pct-white_4quantiles_2.0
pct-white_4quantiles_3.0
pct-white_4quantiles_4.0
pct-white_4quantiles_nan
pct-af-am_4quantiles_1.0
pct-af-am_4quantiles_2.0
pct-af-am_4quantiles_3.0
pct-af-am_4quantiles_4.0
pct-af-am_4quantiles_nan
pct-hispanic_4quantiles_1.0
pct-his

In [64]:
features_df.head()

Unnamed: 0,GEOID,year,name,parent-location,population,poverty-rate,renter-occupied-households,pct-renter-occupied,median-gross-rent,median-household-income,...,decile_by_year_4quantiles_1.0,decile_by_year_4quantiles_2.0,decile_by_year_4quantiles_3.0,decile_by_year_4quantiles_4.0,decile_by_year_4quantiles_nan,upper10th_by_year_4quantiles_1.0,upper10th_by_year_4quantiles_2.0,upper10th_by_year_4quantiles_3.0,upper10th_by_year_4quantiles_4.0,upper10th_by_year_4quantiles_nan
0,170310101001,2012,101.1,"Cook County, Illinois",435.0,18.92,156.0,67.23,785.0,,...,0,1,0,0,0,1,0,0,0,0
1,170310101001,2013,101.1,"Cook County, Illinois",435.0,18.92,159.0,67.23,785.0,,...,0,1,0,0,0,1,0,0,0,0
2,170310101001,2014,101.1,"Cook County, Illinois",435.0,18.92,161.0,67.23,785.0,,...,0,1,0,0,0,1,0,0,0,0
3,170310101001,2015,101.1,"Cook County, Illinois",435.0,18.92,164.0,67.23,785.0,,...,0,1,0,0,0,1,0,0,0,0
4,170310101001,2016,101.1,"Cook County, Illinois",435.0,18.92,167.0,67.23,785.0,,...,0,1,0,0,0,1,0,0,0,0


In [65]:
# Exporting the result:
features_df.to_csv('../data/work/block-groups_2012-2016_with-acs_with-q-features.csv', index=False)