# Data Cleaning and Preprocessing

In [1]:
import pandas as pd

%matplotlib inline

# Processing the Geocoded Flint Addresses 
Used the [Census geocoding service](https://geocoding.geo.census.gov/geocoder/geographies/addressbatch?form) from the US Census Bureau to get coordinates for each address. The batch job had to be split in two because of limits on the service.

In [2]:
# Column names supplied explicitly for the two geocoder result files; the first column
# ('Index') becomes the DataFrame index.
names = ['Index','Address', 'Match', 'Exact', 'Address_2', 'Coords', 'Coords_2', 'Coords_3', 'Coords_4', 'Coords_5', 'Coords_6', 'Coords_7']

# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` is its replacement: malformed rows are skipped with a warning.
geocode_read_opts = dict(encoding='unicode_escape', on_bad_lines='warn', names=names, index_col=0)

geocoded_lead_addr_1 = pd.read_csv('./data/GeocodeResults (1).csv', **geocode_read_opts)
geocoded_lead_addr_2 = pd.read_csv('./data/GeocodeResults (2).csv', **geocode_read_opts)


Dropping addresses that were not successfully geocoded

In [3]:
# Remove, in place, every row of either batch that has no value in 'Coords'.
for geocoded_batch in (geocoded_lead_addr_1, geocoded_lead_addr_2):
    geocoded_batch.dropna(subset=['Coords'], inplace=True)

Concatenating the two dataframes so that all of the addresses are in one dataframe.
The Coords column also needed attention: it contains a single "longitude,latitude" string, and it is more convenient to have longitude and latitude in two separate columns.

In [4]:
# Stack the two geocoder batches into a single frame.
geocoded_batches = [geocoded_lead_addr_1, geocoded_lead_addr_2]
geocoded_lead_addr = pd.concat(geocoded_batches)

# 'Coords' is one "lon,lat" string per row; break it into two labelled columns.
coord_parts = geocoded_lead_addr["Coords"].str.split(",", expand=True)
coord_parts.columns = ['Lon', 'Lat']

# Attach the new columns to the address rows (index-aligned join).
geocoded_lead_addr = geocoded_lead_addr.join(coord_parts)

geocoded_lead_addr.head()

Unnamed: 0_level_0,Address,Match,Exact,Address_2,Coords,Coords_2,Coords_3,Coords_4,Coords_5,Coords_6,Coords_7,Lon,Lat
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
4970,"1608 CROMWELL AVE, FLINT, MI, 48503",Match,Exact,"1608 CROMWELL AVE, FLINT, MI, 48503","-83.66882,43.02084",647468711.0,R,26.0,49.0,3000.0,1022.0,-83.66882,43.02084
2306,"2732 BROWNELL BLVD, FLINT, MI, 48504",Match,Exact,"2732 BROWNELL BLVD, FLINT, MI, 48504","-83.727516,43.041607",69564568.0,R,26.0,49.0,900.0,2007.0,-83.727516,43.041607
2305,"2714 BROWNELL BLVD, FLINT, MI, 48504",Match,Exact,"2714 BROWNELL BLVD, FLINT, MI, 48504","-83.72751,43.0412",69564571.0,R,26.0,49.0,900.0,2015.0,-83.72751,43.0412
2304,"2707 BROWNELL BLVD, FLINT, MI, 48504",Match,Exact,"2707 BROWNELL BLVD, FLINT, MI, 48504","-83.72752,43.04109",69564571.0,L,26.0,49.0,900.0,1013.0,-83.72752,43.04109
2303,"2702 BROWNELL BLVD, FLINT, MI, 48504",Match,Exact,"2702 BROWNELL BLVD, FLINT, MI, 48504","-83.72751,43.041065",69564571.0,R,26.0,49.0,900.0,2015.0,-83.72751,43.041065


Reading in the deduplicated flint water data and joining with the geocoded addresses (on id)

In [5]:
# Load the deduplicated Flint water-test results. The first CSV column becomes the index,
# which is the key used for the join with the geocoded addresses.
flint_lead_data = pd.read_csv('./data/flint_water_merge_dedup.csv', index_col=0)
flint_lead_data.head()

Unnamed: 0_level_0,Analysis (Copper),Analysis (Lead),City,Copper (ppb),Date Submitted,Lead (ppb),Sample Number,Street #,Street Name,Zip Code
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Copper,Lead,FLINT,0.0,2016-04-22 13:27:45,0.0,LG38207,525,12TH ST,48503
1,Copper,Lead,FLINT,0.0,2016-02-26 13:51:00,0.0,LG18552,1907,2ND AVE,48503
2,Copper,Lead,FLINT,260.0,2016-03-06 12:14:59,0.0,LG20757,308,4TH AVE,48503
3,Copper,Lead,FLINT,0.0,2016-02-02 13:28:23,7.0,LG07760,2417,ADAIR ST,48506
4,Copper,Lead,,70.0,2016-02-08 13:05:21,8.0,LG10679,2421,ADAIR ST,48506


In [6]:
# Left join on the shared index: every geocoded address keeps its row and picks up the
# matching water-test columns.
lead_data_geo = geocoded_lead_addr.join(flint_lead_data)

# Hand pandas the path rather than an already-open text handle. pandas documents that a
# file object passed to to_csv should be opened with newline=''; a plain open(..., 'w+')
# can produce doubled line endings on Windows. Writing by path also uses UTF-8 instead of
# the platform's locale encoding.
lead_data_geo.to_csv('lead_data_geo.csv')

# Acquiring the Flint 2014 Housing Data

This was significantly more difficult than the other

In [7]:
import json
import os

In [8]:
# SEV is State Equalized Value

# The list file holds whitespace-separated names of the downloaded HUD JSON files.
with open(r'C:\Users\gushi\Projects\buffalo_civic_innovation\hud_arcgis_download_script\hud_file_list') as file_list:
    hudfiles = file_list.read().split()

# Scan every HUD file and print address + lowmod_pct for each property whose address
# contains the probe address.
for hudfile in hudfiles:
    with open(os.path.join(r'C:\Users\gushi\Projects\buffalo_civic_innovation\hud_arcgis_download_script', hudfile)) as hud_json:
        hud_dict = json.load(hud_json)
    for attrs in (feature['attributes'] for feature in hud_dict["features"]):
        if "2714 BROWNELL BLVD" in attrs['Prop_Addr']:
            print(attrs['Prop_Addr'], attrs['lowmod_pct'])

2714 BROWNELL BLVD 0.6731


Use the Zillow API to get the property age and value!
Also looked at fire insurance maps, the county clerk, etc. No dice.

# Using Zillow API to access home age and home value

In [9]:
import requests


# The Zillow API key (ZWSID) is read from the environment so it never appears in the notebook.
key = os.environ['ZWSID']
base_url = 'http://www.zillow.com/webservice/GetDeepSearchResults.htm'
params = {'zws-id':key, 'address':'2714 BROWNELL BLVD', 'citystatezip':'FLINT MI 48504'}

# requests applies no timeout by default; without one a stalled connection would block
# the kernel indefinitely.
r = requests.get(base_url, params=params, timeout=30)


In [10]:
from xml.etree import ElementTree
import xmltodict

# Parse the XML body of the Zillow response into nested dicts, then display it.
resp_dict = xmltodict.parse(r.text)
resp_dict

OrderedDict([('SearchResults:searchresults',
              OrderedDict([('@xsi:schemaLocation',
                            'http://www.zillow.com/static/xsd/SearchResults.xsd https://www.zillowstatic.com/vstatic/7de9b24/static/xsd/SearchResults.xsd'),
                           ('@xmlns:xsi',
                            'http://www.w3.org/2001/XMLSchema-instance'),
                           ('@xmlns:SearchResults',
                            'http://www.zillow.com/static/xsd/SearchResults.xsd'),
                           ('request',
                            OrderedDict([('address', '2714 BROWNELL BLVD'),
                                         ('citystatezip', 'FLINT MI 48504')])),
                           ('message',
                            OrderedDict([('text',
                                          'Request successfully processed'),
                                         ('code', '0')])),
                           ('response',
                            OrderedD

In [12]:
# Drill down once to the matched property record and its address sub-record.
zillow_result = resp_dict['SearchResults:searchresults']['response']['results']['result']
zillow_address = zillow_result['address']

# Touch every field of interest; a missing key raises KeyError right here.
for address_field in ('street', 'zipcode', 'city', 'state', 'latitude', 'longitude'):
    zillow_address[address_field]
for result_field in ('useCode', 'taxAssessmentYear', 'taxAssessment'):
    zillow_result[result_field]

# Only this last expression is shown as the cell output.
zillow_result['yearBuilt']

1942

In [100]:
#lead_data_geo['z_street'],lead_data_geo['z_city'], lead_data_geo['z_state'], lead_data_geo['z_usecode'] = '','','',''
#lead_data_geo['z_taxass'],lead_data_geo['z_taxyear'], lead_data_geo['z_yearbuilt'], lead_data_geo['z_zip'] = -1,-1,-1,-1
#lead_data_geo['z_lat'], lead_data_geo['z_lon'] = -1.0, -1.0

# Correct way to go about this: create a new dataframe instead of adding z_* columns to
# lead_data_geo (the original note was cut off mid-sentence; presumably "with the Zillow
# fields listed below" - TODO confirm).

# Keys read from the 'address' element of a Zillow result.
addr_components = ['street', 'zipcode','city', 'state', 'latitude', 'longitude']
# Keys read directly from the Zillow result itself.
result_components = ['useCode','taxAssessmentYear','taxAssessment','yearBuilt']


#To rebuild z_data_flint dataset:
# The string literal below is disabled code (a bare string, so it never executes). It builds
# an empty z_data_flint frame indexed like geocoded_lead_addr, with one column per component
# plus an 'Accessed' flag and the 'Address', and writes it to CSV.


"""z_data_flint = pd.DataFrame(index = geocoded_lead_addr.index.copy(), columns=addr_components + result_components)
z_data_flint['Accessed'] = False

z_data_flint= z_data_flint.join(geocoded_lead_addr['Address'])

with open('../buffalo_civic_innovation/z_data_flint.csv', 'w+') as f:
    z_data_flint.to_csv(f)"""


# Zillow API key, read from the environment; get_zdata sends it as the 'zws-id' parameter.
key = os.environ['ZWSID']

def get_zdata(address, citystatezip, timeout=30):
    """Look up one property through Zillow's GetDeepSearchResults endpoint.

    Parameters
    ----------
    address : str
        Street address, e.g. '2714 BROWNELL BLVD'.
    citystatezip : str
        City, state and/or zip, e.g. 'FLINT MI 48504'.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    dict
        The XML response body converted to nested dicts by ``xmltodict.parse``.

    Raises
    ------
    requests.RequestException
        On connection failure, timeout, or a 4xx/5xx HTTP status.
    """
    base_url = 'http://www.zillow.com/webservice/GetDeepSearchResults.htm'
    params = {'zws-id':key, 'address':address, 'citystatezip':citystatezip}
    # requests has no default timeout; a hung connection would otherwise stall the caller.
    r = requests.get(base_url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page as a result.
    r.raise_for_status()
    resp_dict = xmltodict.parse(r.text)
    return resp_dict

def get_addr_parts(item):
    """Split a comma-separated address into the two fields a Zillow query needs.

    Parameters
    ----------
    item : str
        Full address such as '1608 CROMWELL AVE, FLINT, MI, 48503'.

    Returns
    -------
    tuple of (str, str)
        ``(addr, citystatezip)``: the text before the first comma, and the remaining
        pieces joined by single spaces (e.g. 'FLINT MI 48503'). Runs of whitespace are
        collapsed and empty pieces (such as a missing zip code) are dropped, so the
        result has no leading, trailing or doubled spaces.
    """
    # Normalise whitespace inside every comma-separated piece.
    pieces = [' '.join(piece.split()) for piece in item.split(',')]
    addr = pieces[0]
    # Join with an explicit space so the result does not depend on the input having a
    # space after each comma.
    citystatezip = ' '.join(piece for piece in pieces[1:] if piece)
    return (addr, citystatezip)

#cromwell = get_zdata(addr, citystatezip)
#print(cromwell)

#z_data_flint.head()

In [34]:
# NOTE(review): within the visible notebook `cromwell` is only assigned in a commented-out
# line (`#cromwell = get_zdata(addr, citystatezip)`), so this cell raises NameError on a
# fresh kernel - TODO restore that assignment (with a concrete address) before running.
cromwell['SearchResults:searchresults']['response']['results']['result']['yearBuilt']

'1949'

In [74]:
# Eyeball the raw address strings (some entries have no zip code after the last comma).
geocoded_lead_addr['Address']

Index
4970            1608  CROMWELL AVE, FLINT, MI, 48503
2306           2732  BROWNELL BLVD, FLINT, MI, 48504
2305           2714  BROWNELL BLVD, FLINT, MI, 48504
2304           2707  BROWNELL BLVD, FLINT, MI, 48504
2303            2702 BROWNELL BLVD, FLINT, MI, 48504
2302                2702  BROWNELL BLVD, FLINT, MI, 
2301           2644  BROWNELL BLVD, FLINT, MI, 48504
3630            3422  CLAIRMONT ST, FLINT, MI, 48503
4961            1521  CROMWELL AVE, FLINT, MI, 48503
3631            3502  CLAIRMONT ST, FLINT, MI, 48503
4964             1521 CROMWELL AVE, FLINT, MI, 48503
3634             3502 CLAIRMONT ST, FLINT, MI, 48503
2309           2909  BROWNELL BLVD, FLINT, MI, 48504
2308           2807  BROWNELL BLVD, FLINT, MI, 48504
4967                 1601  CROMWELL AVE, FLINT, MI, 
2307           2801  BROWNELL BLVD, FLINT, MI, 48504
3637            3506  CLAIRMONT ST, FLINT, MI, 48503
4968             1607 CROMWELL AVE, FLINT, MI, 48503
4980               1526  CRONK AVE, FLIN

In [96]:
# Show the geocoded record with index label 8482, for comparison with the row of the same
# label in z_data_flint.
geocoded_lead_addr.loc[8482]

Address      743 FLORAL PARK, FLINT, MI, 48503
Match                                    Match
Exact                                    Exact
Address_2    743 FLORAL PARK, FLINT, MI, 48503
Coords                     -83.669716,43.00813
Coords_2                           6.95756e+07
Coords_3                                     L
Coords_4                                    26
Coords_5                                    49
Coords_6                                  3200
Coords_7                                  1001
Lon                                 -83.669716
Lat                                   43.00813
Name: 8482, dtype: object

In [97]:
# Row 8482 of the Zillow table: same Address as the geocoded frame, Zillow fields still empty.
# NOTE(review): within the visible notebook z_data_flint is only loaded (pd.read_csv) in a
# later cell, so on a top-to-bottom run from a fresh kernel this raises NameError.
z_data_flint.loc[8482]

street                                             NaN
zipcode                                            NaN
city                                               NaN
state                                              NaN
latitude                                           NaN
longitude                                          NaN
useCode                                            NaN
taxAssessmentYear                                  NaN
taxAssessment                                      NaN
yearBuilt                                          NaN
Accessed                                         False
Address              743 FLORAL PARK, FLINT, MI, 48503
Name: 8482, dtype: object

In [None]:
# Resumable Zillow scrape.
# - Addresses come from z_data_flint, which is read from a CSV created beforehand.
# - The 'Accessed' column flags every index/address pair that has already been looked up,
#   so a new session only samples addresses that are still unflagged.
# - The fields returned for an address are written into z_data_flint.
# - Every raw response is appended to zillow_object_list, which is pickled after each
#   batch together with an updated copy of the CSV.

import pickle
from datetime import datetime
from time import sleep

Z_DATA_PATH = '../buffalo_civic_innovation/z_data_flint.csv'
N_BATCHES = 100         # number of checkpointed batches per session
BATCH_SIZE = 100        # addresses requested per batch
REQUEST_PAUSE_S = 0.05  # pause after each request
BATCH_PAUSE_S = 10      # pause after each batch

addr_components = ['street', 'zipcode','city', 'state', 'latitude', 'longitude']
result_components = ['useCode','taxAssessmentYear','taxAssessment','yearBuilt']

z_data_flint = pd.read_csv(Z_DATA_PATH, encoding = 'unicode_escape', index_col=0)

# Columns that are still entirely empty are read back as float64. Cast the Zillow columns
# to object so the string values returned by the API can be stored without dtype-upcast
# warnings or errors in recent pandas versions.
z_data_flint = z_data_flint.astype({col: object for col in addr_components + result_components})

zillow_object_list = []


def _extract_zillow_fields(resp_dict):
    """Return {column: value} for every wanted field that is present in one response.

    Fields are looked up independently, so one missing field (e.g. 'taxAssessment') does
    not prevent later ones (e.g. 'yearBuilt') from being stored.
    """
    try:
        result = resp_dict['SearchResults:searchresults']['response']['results']['result']
    except (KeyError, TypeError):
        # The response carries no result at all.
        return {}
    if not isinstance(result, dict):
        # xmltodict yields a list when the <result> element repeats; which entry is the
        # right property is ambiguous, so nothing is stored.
        return {}
    found = {}
    address = result.get('address')
    if isinstance(address, dict):
        found.update((field, address[field]) for field in addr_components if field in address)
    found.update((field, result[field]) for field in result_components if field in result)
    return found


for batch_num in range(N_BATCHES):
    unsampled = z_data_flint.index[z_data_flint['Accessed'] == False]
    if len(unsampled) == 0:
        print("No unsampled addresses left")
        break

    # Sample the whole batch at once (without replacement) so no address is requested
    # twice within a batch. Assumes index labels are unique - TODO confirm.
    batch = z_data_flint.loc[unsampled, 'Address'].sample(n=min(BATCH_SIZE, len(unsampled)))

    for row_label, sample_addr in batch.items():
        addr, citystatezip = get_addr_parts(sample_addr)
        try:
            resp_dict = get_zdata(addr, citystatezip)
        except Exception as exc:
            # Network/parse failure: leave 'Accessed' False so the address is retried.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
            print("Request failed for {!r}: {}".format(sample_addr, exc))
            sleep(REQUEST_PAUSE_S)
            continue

        zillow_object_list.append(resp_dict)
        # NOTE(review): any parsed response marks the address as accessed, including
        # responses that carry no result - confirm that is the desired policy.
        z_data_flint.loc[row_label, 'Accessed'] = True
        for column, value in _extract_zillow_fields(resp_dict).items():
            z_data_flint.loc[row_label, column] = value

        sleep(REQUEST_PAUSE_S)
    print("Batch Finished")

    # Checkpoint: pass the path so pandas controls newline/encoding handling itself.
    z_data_flint.to_csv(Z_DATA_PATH)

    # Timestamped name: hash() of a string is salted per process, so the previous names
    # were arbitrary (possibly negative) numbers that could not be ordered.
    fname = '../buffalo_civic_innovation/{}.p'.format(datetime.now().strftime('%Y%m%d-%H%M%S-%f'))

    # The pickle is a cumulative snapshot of every response collected this session.
    with open(fname, 'wb') as f:
        pickle.dump(zillow_object_list, f)
    sleep(BATCH_PAUSE_S)




Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished
Batch Finished


In [126]:
# Spot-check the most recent raw Zillow response appended to zillow_object_list.
print(zillow_object_list[-1])

OrderedDict([('SearchResults:searchresults', OrderedDict([('@xsi:schemaLocation', 'http://www.zillow.com/static/xsd/SearchResults.xsd https://www.zillowstatic.com/vstatic/7de9b24/static/xsd/SearchResults.xsd'), ('@xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance'), ('@xmlns:SearchResults', 'http://www.zillow.com/static/xsd/SearchResults.xsd'), ('request', OrderedDict([('address', '623 E DARTMOUTH ST'), ('citystatezip', 'FLINT MI 48505')])), ('message', OrderedDict([('text', 'Request successfully processed'), ('code', '0')])), ('response', OrderedDict([('results', OrderedDict([('result', OrderedDict([('zpid', '73928124'), ('links', OrderedDict([('homedetails', 'https://www.zillow.com/homedetails/623-E-Dartmouth-St-Flint-MI-48505/73928124_zpid/'), ('graphsanddata', 'http://www.zillow.com/homedetails/623-E-Dartmouth-St-Flint-MI-48505/73928124_zpid/#charts-and-data'), ('mapthishome', 'http://www.zillow.com/homes/73928124_zpid/'), ('comparables', 'http://www.zillow.com/homes/comps/739

In [127]:
# Non-null count per column: shows how many rows have each Zillow field filled in so far,
# against the total number of addresses ('Address' / 'Accessed').
z_data_flint.count()

street                3808
zipcode               3808
city                  3808
state                 3808
latitude              3808
longitude             3808
useCode               3808
taxAssessmentYear     3780
taxAssessment         3752
yearBuilt             1800
Accessed             15491
Address              15491
dtype: int64