Note: fancyimpute requires tensorflow, run in docker container!

In [1]:
import pandas as pd
import numpy as np
import fancyimpute

Using TensorFlow backend.


In [2]:
import os
this_directory = os.path.realpath(".")
home_directory = os.path.split(this_directory)[0]
data_directory = os.path.join(home_directory, "data")

In [4]:
features = pd.read_pickle("{}/qozs_features.pkl".format(data_directory))
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8685 entries, 0 to 8763
Data columns (total 16 columns):
population_total           8685 non-null float64
age_median                 8682 non-null float64
p_never_married            8685 non-null float64
p_white                    8685 non-null float64
p_black                    8685 non-null float64
p_poverty                  8685 non-null float64
household_income_median    8657 non-null float64
home_value_median          8382 non-null float64
structure_year_median      8661 non-null float64
p_outofcountyflux          7822 non-null float64
p_pop_renting              8679 non-null float64
p_units_vacant             8679 non-null float64
p_mobilehomes              8679 non-null float64
p_multiple_unit_strucs     8679 non-null float64
state                      8685 non-null object
tract                      8685 non-null object
dtypes: float64(14), object(2)
memory usage: 1.1+ MB


In [5]:
non_nans = features[features.notna().all(axis=1)] ## 7702 w/o nans, 985 w/nans
nan_lines = features[features.isna().any(axis=1)]
print(len(features), 'should equal', len(non_nans) + len(nan_lines))

8685 should equal 8685


In [6]:
state_number = [n[:2] for n in features['tract'].values]
PR_indexes = np.argwhere(np.array(state_number) == '72').reshape(-1)
not_PR = features.drop(PR_indexes)
not_PR.reset_index(inplace=True, drop=True)
not_PR.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7822 entries, 0 to 7821
Data columns (total 16 columns):
population_total           7822 non-null float64
age_median                 7820 non-null float64
p_never_married            7822 non-null float64
p_white                    7822 non-null float64
p_black                    7822 non-null float64
p_poverty                  7822 non-null float64
household_income_median    7798 non-null float64
home_value_median          7528 non-null float64
structure_year_median      7800 non-null float64
p_outofcountyflux          7758 non-null float64
p_pop_renting              7816 non-null float64
p_units_vacant             7816 non-null float64
p_mobilehomes              7816 non-null float64
p_multiple_unit_strucs     7816 non-null float64
state                      7822 non-null object
tract                      7822 non-null object
dtypes: float64(14), object(2)
memory usage: 977.8+ KB


## imputation

In [7]:
from fancyimpute import KNN

In [8]:
tract = not_PR['tract']
incomplete = not_PR.drop(columns=['state','tract'])

In [9]:
## make this better
missing_indexes = {}
for c in incomplete.columns:
    if incomplete[c].isna().any():
        missing_indexes[c] = np.argwhere(incomplete[c].isna()).reshape(-1)

  return getattr(obj, method)(*args, **kwds)


In [10]:
for c in missing_indexes.keys():
    print('{} missing {} values.'.format(len(missing_indexes[c]), c))

22 missing structure_year_median values.
294 missing home_value_median values.
6 missing p_multiple_unit_strucs values.
64 missing p_outofcountyflux values.
24 missing household_income_median values.
2 missing age_median values.
6 missing p_units_vacant values.
6 missing p_mobilehomes values.
6 missing p_pop_renting values.


In [11]:
fill = KNN(k=3).fit_transform(incomplete.values)

Imputing row 1/7822 with 0 missing, elapsed time: 12.629
Imputing row 101/7822 with 0 missing, elapsed time: 12.630
Imputing row 201/7822 with 0 missing, elapsed time: 12.631
Imputing row 301/7822 with 0 missing, elapsed time: 12.633
Imputing row 401/7822 with 0 missing, elapsed time: 12.634
Imputing row 501/7822 with 0 missing, elapsed time: 12.635
Imputing row 601/7822 with 0 missing, elapsed time: 12.636
Imputing row 701/7822 with 1 missing, elapsed time: 12.637
Imputing row 801/7822 with 0 missing, elapsed time: 12.639
Imputing row 901/7822 with 0 missing, elapsed time: 12.641
Imputing row 1001/7822 with 0 missing, elapsed time: 12.643
Imputing row 1101/7822 with 0 missing, elapsed time: 12.644
Imputing row 1201/7822 with 0 missing, elapsed time: 12.645
Imputing row 1301/7822 with 1 missing, elapsed time: 12.646
Imputing row 1401/7822 with 0 missing, elapsed time: 12.647
Imputing row 1501/7822 with 0 missing, elapsed time: 12.648
Imputing row 1601/7822 with 0 missing, elapsed time:

In [20]:
filled = pd.DataFrame(fill, columns = incomplete.columns)
filled.head()
## note: still need to round values

Unnamed: 0,population_total,age_median,p_never_married,p_white,p_black,p_poverty,household_income_median,home_value_median,structure_year_median,p_outofcountyflux,p_pop_renting,p_units_vacant,p_mobilehomes,p_multiple_unit_strucs
0,2761.0,36.3,0.252,0.775,0.204,0.289,34821.0,94100.0,1980.0,0.891,0.35,0.117,0.468,0.515
1,2869.0,40.1,0.199,0.865,0.104,0.288,31390.0,127600.0,1988.0,0.969,0.201,0.134,0.245,0.332
2,4537.0,43.0,0.235,0.915,0.044,0.149,44985.0,130300.0,1991.0,0.946,0.154,0.173,0.247,0.247
3,5321.0,41.5,0.255,0.842,0.129,0.179,41944.0,131100.0,1975.0,0.929,0.321,0.147,0.01,0.172
4,3398.0,29.3,0.431,0.375,0.625,0.22,27587.0,92300.0,1978.0,0.981,0.434,0.185,0.199,0.41


In [21]:
filled.iloc[missing_indexes['age_median']]['age_median']

901     47.134154
3818    42.414513
Name: age_median, dtype: float64

In [22]:
imputed_values = {}
delta_imputed_mean = {}
for c in missing_indexes.keys():
    imputed_values[c] = filled[c].iloc[missing_indexes[c]].values
    delta_imputed_mean[c] = abs(np.mean(imputed_values[c]) - np.mean(incomplete[c]))
delta_imputed_mean

{'age_median': 9.558029245823242,
 'home_value_median': 1708.5779606915312,
 'household_income_median': 2422.0081322318583,
 'p_mobilehomes': 0.034053285915867856,
 'p_multiple_unit_strucs': 0.06120604432144533,
 'p_outofcountyflux': 0.17213259507362,
 'p_pop_renting': 0.14611761658593347,
 'p_units_vacant': 0.019777412209502326,
 'structure_year_median': 1.4639213670716344}

In [23]:
# replace the tract numbers
filled['tract'] = tract
filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7822 entries, 0 to 7821
Data columns (total 15 columns):
population_total           7822 non-null float64
age_median                 7822 non-null float64
p_never_married            7822 non-null float64
p_white                    7822 non-null float64
p_black                    7822 non-null float64
p_poverty                  7822 non-null float64
household_income_median    7822 non-null float64
home_value_median          7822 non-null float64
structure_year_median      7822 non-null float64
p_outofcountyflux          7822 non-null float64
p_pop_renting              7822 non-null float64
p_units_vacant             7822 non-null float64
p_mobilehomes              7822 non-null float64
p_multiple_unit_strucs     7822 non-null float64
tract                      7822 non-null object
dtypes: float64(14), object(1)
memory usage: 916.7+ KB


In [24]:
# add LIC column back
LIC_one = pd.read_pickle("{}/qozs_1.pkl".format(data_directory))[['census_tract_number', 'Non-LIC']]
LIC_two = LIC_one.rename(columns={'census_tract_number':'tract'})
filled = filled.merge(LIC_two, how='left', on='tract')
filled.head()

Unnamed: 0,population_total,age_median,p_never_married,p_white,p_black,p_poverty,household_income_median,home_value_median,structure_year_median,p_outofcountyflux,p_pop_renting,p_units_vacant,p_mobilehomes,p_multiple_unit_strucs,tract,Non-LIC
0,2761.0,36.3,0.252,0.775,0.204,0.289,34821.0,94100.0,1980.0,0.891,0.35,0.117,0.468,0.515,1001020700,0
1,2869.0,40.1,0.199,0.865,0.104,0.288,31390.0,127600.0,1988.0,0.969,0.201,0.134,0.245,0.332,1003010200,0
2,4537.0,43.0,0.235,0.915,0.044,0.149,44985.0,130300.0,1991.0,0.946,0.154,0.173,0.247,0.247,1003010400,1
3,5321.0,41.5,0.255,0.842,0.129,0.179,41944.0,131100.0,1975.0,0.929,0.321,0.147,0.01,0.172,1003010500,0
4,3398.0,29.3,0.431,0.375,0.625,0.22,27587.0,92300.0,1978.0,0.981,0.434,0.185,0.199,0.41,1003010600,0


In [18]:
filled.to_pickle("{}/qoz_model.pkl".format(data_directory))