Note: fancyimpute requires tensorflow, run in docker container!

In [2]:
import pandas as pd
import numpy as np
import fancyimpute

Using TensorFlow backend.


In [3]:
import os
this_directory = os.path.realpath(".")
home_directory = os.path.split(this_directory)[0]
data_directory = os.path.join(home_directory, "data")

In [4]:
features = pd.read_pickle("{}/qozs_features.pkl".format(data_directory))
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8685 entries, 0 to 8763
Data columns (total 16 columns):
population_total           8685 non-null float64
age_median                 8682 non-null float64
p_never_married            8685 non-null float64
p_white                    8685 non-null float64
p_black                    8685 non-null float64
p_poverty                  8685 non-null float64
household_income_median    8657 non-null float64
home_value_median          8382 non-null float64
structure_year_median      8661 non-null float64
p_outofcountyflux          7822 non-null float64
p_pop_renting              8679 non-null float64
p_units_vacant             8679 non-null float64
p_mobilehomes              8679 non-null float64
p_multiple_unit_strucs     8679 non-null float64
state                      8685 non-null object
tract                      8685 non-null object
dtypes: float64(14), object(2)
memory usage: 1.1+ MB


In [5]:
non_nans = features[features.notna().all(axis=1)] ## 7702 w/o nans, 985 w/nans
nan_lines = features[features.isna().any(axis=1)]
print(len(features), 'should equal', len(non_nans) + len(nan_lines))

8685 should equal 8685


## imputation

In [6]:
from fancyimpute import KNN

In [19]:
tract_ns = features['tract'].reset_index(drop=True)
incomplete = features.drop(columns=['state','tract', 'p_outofcountyflux'])
tract_ns

0       01001020700
1       01003010200
2       01003010400
3       01003010500
4       01003010600
5       01003011501
6       01003011502
7       01005950100
8       01007010002
9       01009050500
10      01011952200
11      01013952800
12      01015000700
13      01015000800
14      01015002101
15      01017954300
16      01019955900
17      01021060102
18      01023956800
19      01025957902
20      01027959000
21      01029959600
22      01031010900
23      01033020200
24      01035960400
25      01037961000
26      01039962000
27      01039962700
28      01041963700
29      01043964800
           ...     
8655    55139002900
8656    55139003300
8657    55141010400
8658    55141011200
8659    55141011700
8660    56001962800
8661    56001963100
8662    56001963400
8663    56001963500
8664    56001963700
8665    56007967700
8666    56009956600
8667    56013000100
8668    56013940100
8669    56013940300
8670    56015957800
8671    56015958000
8672    56017967800
8673    56021000402


In [8]:
## make this better
missing_indexes = {}
for c in incomplete.columns:
    if incomplete[c].isna().any():
        missing_indexes[c] = np.argwhere(incomplete[c].isna()).reshape(-1)

  return getattr(obj, method)(*args, **kwds)


In [9]:
for c in missing_indexes.keys():
    print('{} missing {} values.'.format(len(missing_indexes[c]), c))

28 missing household_income_median values.
6 missing p_mobilehomes values.
3 missing age_median values.
6 missing p_multiple_unit_strucs values.
303 missing home_value_median values.
6 missing p_pop_renting values.
6 missing p_units_vacant values.
24 missing structure_year_median values.


In [10]:
fill = KNN(k=3).fit_transform(incomplete.values)

Imputing row 1/8685 with 0 missing, elapsed time: 15.230
Imputing row 101/8685 with 0 missing, elapsed time: 15.231
Imputing row 201/8685 with 0 missing, elapsed time: 15.233
Imputing row 301/8685 with 0 missing, elapsed time: 15.234
Imputing row 401/8685 with 0 missing, elapsed time: 15.235
Imputing row 501/8685 with 0 missing, elapsed time: 15.235
Imputing row 601/8685 with 0 missing, elapsed time: 15.236
Imputing row 701/8685 with 1 missing, elapsed time: 15.239
Imputing row 801/8685 with 0 missing, elapsed time: 15.241
Imputing row 901/8685 with 0 missing, elapsed time: 15.242
Imputing row 1001/8685 with 0 missing, elapsed time: 15.244
Imputing row 1101/8685 with 0 missing, elapsed time: 15.245
Imputing row 1201/8685 with 0 missing, elapsed time: 15.247
Imputing row 1301/8685 with 1 missing, elapsed time: 15.247
Imputing row 1401/8685 with 0 missing, elapsed time: 15.248
Imputing row 1501/8685 with 0 missing, elapsed time: 15.249
Imputing row 1601/8685 with 0 missing, elapsed time:

In [11]:
filled = pd.DataFrame(fill, columns = incomplete.columns)
filled.info()
## note: might round values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8685 entries, 0 to 8684
Data columns (total 13 columns):
population_total           8685 non-null float64
age_median                 8685 non-null float64
p_never_married            8685 non-null float64
p_white                    8685 non-null float64
p_black                    8685 non-null float64
p_poverty                  8685 non-null float64
household_income_median    8685 non-null float64
home_value_median          8685 non-null float64
structure_year_median      8685 non-null float64
p_pop_renting              8685 non-null float64
p_units_vacant             8685 non-null float64
p_mobilehomes              8685 non-null float64
p_multiple_unit_strucs     8685 non-null float64
dtypes: float64(13)
memory usage: 882.1 KB


In [12]:
filled.iloc[missing_indexes['age_median']]['age_median']

901     57.705398
3818    43.074146
6945    31.142917
Name: age_median, dtype: float64

In [13]:
imputed_values = {}
delta_imputed_mean = {}
for c in missing_indexes.keys():
    imputed_values[c] = filled[c].iloc[missing_indexes[c]].values
    delta_imputed_mean[c] = abs(np.mean(imputed_values[c]) - np.mean(incomplete[c]))
delta_imputed_mean

{'age_median': 8.371331625644459,
 'home_value_median': 64.63047630901565,
 'household_income_median': 5513.453182592002,
 'p_mobilehomes': 0.027061111445119926,
 'p_multiple_unit_strucs': 0.06738330287365307,
 'p_pop_renting': 0.1818740243968352,
 'p_units_vacant': 0.08684446363297696,
 'structure_year_median': 4.929980667336395}

In [20]:
# replace the tract numbers
filled['tract'] = tract_ns
filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8685 entries, 0 to 8684
Data columns (total 14 columns):
population_total           8685 non-null float64
age_median                 8685 non-null float64
p_never_married            8685 non-null float64
p_white                    8685 non-null float64
p_black                    8685 non-null float64
p_poverty                  8685 non-null float64
household_income_median    8685 non-null float64
home_value_median          8685 non-null float64
structure_year_median      8685 non-null float64
p_pop_renting              8685 non-null float64
p_units_vacant             8685 non-null float64
p_mobilehomes              8685 non-null float64
p_multiple_unit_strucs     8685 non-null float64
tract                      8685 non-null object
dtypes: float64(13), object(1)
memory usage: 950.0+ KB


In [22]:
# add LIC column back
LIC_one = pd.read_pickle("{}/qozs_1.pkl".format(data_directory))[['census_tract_number', 'Non-LIC']]
LIC_two = LIC_one.rename(columns={'census_tract_number':'tract'})
filled = filled.merge(LIC_two, how='left', on='tract')
filled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8685 entries, 0 to 8684
Data columns (total 15 columns):
population_total           8685 non-null float64
age_median                 8685 non-null float64
p_never_married            8685 non-null float64
p_white                    8685 non-null float64
p_black                    8685 non-null float64
p_poverty                  8685 non-null float64
household_income_median    8685 non-null float64
home_value_median          8685 non-null float64
structure_year_median      8685 non-null float64
p_pop_renting              8685 non-null float64
p_units_vacant             8685 non-null float64
p_mobilehomes              8685 non-null float64
p_multiple_unit_strucs     8685 non-null float64
tract                      8685 non-null object
Non-LIC                    8685 non-null int64
dtypes: float64(13), int64(1), object(1)
memory usage: 1.1+ MB


In [24]:
filled.to_pickle("{}/qoz_model_plusPR.pkl".format(data_directory))