In [1]:
import sys
sys.path.insert(0, '../scripts')
import pandas as pd
import numpy as np
import feature_gen as fg

In [2]:
# Load data that has already been augemented with ACS info
df = pd.read_csv("../data/work/block-groups_2012-2016_with-acs.csv")

In [3]:
df.shape

(15972, 43)

In [4]:
# Determine which years we care about
years_list = [2012, 2013, 2014, 2015, 2016]

In [7]:
# Add outcome label
df['evictions-effectiveness'] = df['evictions'] / df['eviction-filings']
df['evictions-effectiveness'].fillna(value=0, inplace=True)

col = 'evictions-effectiveness'
df_a = pd.DataFrame(columns=('GEOID', 'year', col, 'decile_by_year', 'upper10th_by_year'))
for year in years_list:
    temp_df = df[df['year']==year][['year', 'GEOID', col]]
    temp_df['decile_by_year'] = pd.qcut(temp_df[col].rank(method='first'), 10, labels=False)
    temp_df.loc[(temp_df['decile_by_year'] == 9), 'upper10th_by_year'] = 1
    temp_df.loc[(temp_df['decile_by_year'] < 9), 'upper10th_by_year'] = 0
    df_a = df_a.append(temp_df, ignore_index=True)
df = df.merge(df_a)

ValueError: Bin edges must be unique: array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]).
You can drop duplicate edges by setting the 'duplicates' kwarg

In [None]:
# Filter df so that only years and areas we care about are included
evic_df = df[df['year'].isin(years_list)]
evic_df = evic_df.loc[evic_df["parent-location"] == "Cook County, Illinois"]

In [None]:
evic_df.shape
# ACS augmented data should have already been filtered for Cook County and years, so number of rows should be the same

In [None]:
# Grab all attributes from the dataframe
features = list(evic_df.columns)
print(len(features))
features

In [None]:
# Explore attribute datatypes
evic_df.dtypes

In [None]:
# Explore value counts for different attributes to see if they make good candidates for discretizations
evic_df['tract'].value_counts()

In [None]:
# Explore object datatypes for possible discretizations
str_columns = [column for column in evic_df.columns if (evic_df[column].dtype=='O')]
print(str_columns)

Attribute 'tract' has too many unique values to discretize. Block group 'name' is a unique identifier. 'Parent-location' attribute will be the same for all rows (i.e. Cook County), and so are 'state' and 'county'. None of these attributes make good candidates for discretizations.

Attributes 'imputed', 'low-flag', and 'subbed' are boolean variables. Won't make sense to cut into quantiles / to discretize.

In [None]:
# Select attributes not to generate new features from
# Note that'block group' attribute from ACS data is a unique ID of a block group within one tract
# GEOID of a block group = ACS's state + county + tract + block group ID numbers
to_remove = ['GEOID', 'year', 'name', 'parent-location', 'state', 'county', 'tract', 'block group', 'imputed', 'low-flag', 'subbed']
for attribute in to_remove:
    features.remove(attribute)

In [None]:
# Check updated list of features
print(len(features))
features

In [None]:
# Generate quantile features
features_df = fg.generate_quantile_features(evic_df, features, 4, years_list=years_list)
# Discretize variables

In [None]:
# Compare original df with new df with added features
print('evic_df:', evic_df.shape)
print('features_df:', features_df.shape)

In [None]:
# Spot check new feature names (change range numbers around)
for i in range(900, 1003):
    cols = features_df.columns
    print(cols[i])

In [None]:
features_df.head()

In [None]:
# Exporting the result:
features_df.to_csv('../data/work/block-groups_2012-2016_with-acs_with-q-features.csv', index=False)