# GeoCode 2022 Boston Property Addresses


### https://www.youtube.com/watch?v=nIdmmEfW3R0
### https://geocoding.geo.census.gov/geocoder/geographies/addressbatch?form 

### Set up environment

In [1]:
import pandas as pd 
import urllib.request
import requests
import io
import csv

In [2]:
# Notebook-wide display and warning configuration.
import warnings

from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

# Print float columns with two decimal places
pd.options.display.float_format = '{:.2f}'.format

# Silence all warnings for the rest of the notebook.
# This uses the warnings filter instead of overwriting warnings.warn with a
# no-op: the filter is the supported mechanism and leaves the stdlib function
# intact for any library that calls or inspects it.
warnings.filterwarnings("ignore")

In [3]:
# Mount Google Drive at /gdrive (requires the google.colab package) and switch
# into the project's GeoCode data folder, so the relative CSV file names used
# in later cells are read from and written to that folder.

from google.colab import drive
# Alternative mount point, kept for reference:
#drive.mount('/content/gdrive')
drive.mount('/gdrive') 
# IPython line magic (not plain Python): change the working directory
%cd /gdrive/My\ Drive/MSDS\ 498\ Capstone/4.\ Analytics/4a.\ Data/GeoCode

Mounted at /gdrive
/gdrive/My Drive/MSDS 498 Capstone/4. Analytics/4a. Data/GeoCode


### Pull 2022 Boston Property Addresses

In [None]:

# Alternative (datastore API) endpoint, kept for reference:
# url = 'https://data.boston.gov/api/3/action/datastore_search?resource_id=4b99718b-d064-471b-9b24-517ae5effecc'
# Direct CSV download of the FY2022 property file (only 2022 is loaded here)
url2022 = 'https://data.boston.gov/dataset/e02c44d2-3c64-459c-8fe2-e1ce5f38a035/resource/4b99718b-d064-471b-9b24-517ae5effecc/download/fy2022pa-4.csv'

# Load the full file, tag every row with its source year, and store PID as text
pa2022alldf = pd.read_csv(url2022, low_memory=False).assign(
    filename='2022',
    PID=lambda frame: frame["PID"].astype(str),
)

print(pa2022alldf.shape)


(178598, 65)


In [None]:
# Keep only owner-occupied (OWN_OCC == "Y"), single-family (LU == 'R1') rows
is_single_family = pa2022alldf["LU"] == 'R1'
is_owner_occupied = pa2022alldf["OWN_OCC"] == "Y"
pa2022df = pa2022alldf[is_single_family & is_owner_occupied]

# Sanity checks: remaining row count, then the value counts of each filter
# column (each should show a single category)
print(pa2022df.shape)
for filter_col in ("OWN_OCC", "LU"):
    print(pa2022df.groupby(by=filter_col)[filter_col].count())

(25640, 65)
OWN_OCC
Y    25640
Name: OWN_OCC, dtype: int64
LU
R1    25640
Name: LU, dtype: int64


In [None]:
# Select the address columns needed for geocoding.
# .copy() gives pa_df its own data, so the column assignments below do not
# write into a slice of pa2022df (which raises pandas' SettingWithCopyWarning
# and leaves it ambiguous whether the assignment takes effect).
pa_df = pa2022df[['PID', 'ST_NUM', 'ST_NAME', 'CITY', 'ZIPCODE']].copy()

# Add STATE and set to MA
pa_df['STATE'] = 'MA'
pa_df['STATE'].value_counts()

# Build the street address column as "<ST_NUM> <ST_NAME>"
pa_df['ADDRESS'] = pa_df['ST_NUM'] + " " + pa_df['ST_NAME']

# Order the columns as required by the geocoding API input file
pa_df = pa_df[['PID', 'ADDRESS', 'CITY', 'STATE', 'ZIPCODE']]
pa_df.head()

MA    25640
Name: STATE, dtype: int64

Unnamed: 0,PID,ADDRESS,CITY,STATE,ZIPCODE
27,100021000,243 HF LEXINGTON ST,EAST BOSTON,MA,2128
36,100027000,33 PRESCOTT ST,EAST BOSTON,MA,2128
42,100033000,246 PRINCETON ST,EAST BOSTON,MA,2128
62,100052000,112 PUTNAM ST,EAST BOSTON,MA,2128
130,100118000,1 2 LAWSON PL,EAST BOSTON,MA,2128


# Break the Property File into Chunks and Output to CSV

In [None]:
# Geocode periodically failed when rows 5000-10000 were submitted as one batch.
# Splitting that range into 1000-row chunks made the error go away; it only
# ever occurred with this subset (5000-10000).

# The slices below cover rows 0-29999 only. Fail loudly instead of silently
# leaving rows un-geocoded if the property file ever grows past that.
# NOTE(review): the Census batch endpoint documents a per-file record limit
# (10,000) -- confirm before widening any slice instead of adding a new one.
MAX_ROWS_COVERED = 30000
if len(pa_df) > MAX_ROWS_COVERED:
    raise ValueError(
        f"pa_df has {len(pa_df)} rows but the chunks only cover the first "
        f"{MAX_ROWS_COVERED}; add another chunk before geocoding."
    )

cut1_df = pa_df.iloc[0:5000, :]
# cut2_df = pa_df.iloc[5000:10000, :]  (replaced by the five 1000-row chunks below)
cut2a_df = pa_df.iloc[5000:6000, :]
cut2b_df = pa_df.iloc[6000:7000, :]
cut2c_df = pa_df.iloc[7000:8000, :]
cut2d_df = pa_df.iloc[8000:9000, :]
cut2e_df = pa_df.iloc[9000:10000, :]
cut3_df = pa_df.iloc[10000:15000, :]
cut4_df = pa_df.iloc[15000:20000, :]
cut5_df = pa_df.iloc[20000:30000, :]

# Show the shapes so the chunk sizes can be checked against the full frame
pa_df.shape
cut1_df.shape
cut2a_df.shape
cut2b_df.shape
cut2c_df.shape
cut2d_df.shape
cut2e_df.shape
cut3_df.shape
cut4_df.shape
cut5_df.shape

# Write each chunk without header or index, in the same order as before.
# The file-name suffix identifies the chunk.
chunks_by_suffix = {
    '1': cut1_df,
    '2a': cut2a_df,
    '2b': cut2b_df,
    '2c': cut2c_df,
    '2d': cut2d_df,
    '2e': cut2e_df,
    '3': cut3_df,
    '4': cut4_df,
    '5': cut5_df,
}
for suffix, chunk_df in chunks_by_suffix.items():
    chunk_df.to_csv(f'census_latlong_input_{suffix}.csv', header=False, index=False)


(25640, 5)

(5000, 5)

(1000, 5)

(1000, 5)

(1000, 5)

(1000, 5)

(1000, 5)

(5000, 5)

(5000, 5)

(5640, 5)

# Geocode the Data Using the US Census API

### Define the Geocode Process as a Function

In [None]:
def getgeo(dfobj, csvobj):
    """Geocode one CSV file of addresses with the US Census batch geocoder.

    Uploads the file at ``csvobj`` to the Census ``geographies/addressbatch``
    endpoint, parses the CSV response into a DataFrame, prints its first rows
    and the ``MATCH_TYPE`` value counts, and returns the DataFrame.

    Parameters
    ----------
    dfobj : object
        Ignored. Kept only so existing callers keep working; the result is
        always built from the geocoder response.
    csvobj : str
        Path of the address CSV file to upload.

    Returns
    -------
    pandas.DataFrame
        The parsed response with the twelve columns named below.

    Raises
    ------
    requests.HTTPError
        If the geocoder answers with an HTTP error status.
    """
    url = 'https://geocoding.geo.census.gov/geocoder/geographies/addressbatch'
    payload = {'benchmark':'Public_AR_Current','vintage':'Current_Current'}

    # Open the upload inside a context manager so the handle is always closed,
    # even when the request fails.
    with open(csvobj, 'rb') as address_file:
        files = {'addressFile': (csvobj, address_file, 'text/csv')}
        response = requests.post(url, files=files, data=payload)

    # Surface HTTP failures directly instead of trying to parse an error page
    # as CSV and failing later with a confusing column-count error.
    response.raise_for_status()

    dfobj = pd.read_csv(io.StringIO(response.text), sep=',', header=None, quoting=csv.QUOTE_ALL)
    dfobj.columns = ['ID', 'ADDRESS_IN', 'MATCH_INDICATOR', 'MATCH_TYPE', 'ADDRESS_OUT', 'LONG_LAT', 'TIGER_EDGE', 'STREET_SIDE', 'FIPS_STATE', 'FIPS_COUNTY', 'CENSUS_TRACT', 'CENSUS_BLOCK']

    # Temporarily lift pandas' display limits so the preview is not truncated.
    # None (not the deprecated -1) means "no limit" for max_colwidth.
    with pd.option_context(
        'display.width', None,
        'display.max_columns', None,
        'display.max_colwidth', None,
        'display.colheader_justify', 'left'):
        print(dfobj.head())

    print(dfobj['MATCH_TYPE'].value_counts())

    return dfobj

### Call the Geocode Function on Each of the Data Chunks

In [None]:
# Geocode chunk 1; the first argument is a throwaway empty frame
geo_df1 = getgeo(pd.DataFrame(), 'census_latlong_input_1.csv')

   ID         ADDRESS_IN                                  MATCH_INDICATOR  \
0  802870050   29   HUCKINS ST, ROXBURY, MA, 2119          Match            
1  1101083000  33   W WALNUT PK, ROXBURY, MA, 2119         Tie              
2  1102053000  52   HAVERFORD ST, JAMAICA PLAIN, MA, 2130  Match            
3  1301250010  49   MONADNOCK ST, DORCHESTER, MA, 2125     Match            
4  700950000   256   GOLD ST, SOUTH BOSTON, MA, 2127       Match            

  MATCH_TYPE ADDRESS_OUT                                 \
0  Exact      29 HUCKINS ST, ROXBURY, MA, 02119           
1  NaN        NaN                                         
2  Exact      52 HAVERFORD ST, JAMAICA PLAIN, MA, 02130   
3  Exact      49 MONADNOCK ST, DORCHESTER, MA, 02125      
4  Exact      256 GOLD ST, SOUTH BOSTON, MA, 02127        

  LONG_LAT                                TIGER_EDGE STREET_SIDE  FIPS_STATE  \
0  -71.07543237499993,42.323855010000045 85701065.00  R          25.00         
1  NaN                

In [None]:
# Geocode the five 1000-row chunks covering rows 5000-10000.
# Kept as separate statements so that, if one request fails, the results of
# the earlier ones are already assigned.
geo_df2a = getgeo(pd.DataFrame(), 'census_latlong_input_2a.csv')

geo_df2b = getgeo(pd.DataFrame(), 'census_latlong_input_2b.csv')

geo_df2c = getgeo(pd.DataFrame(), 'census_latlong_input_2c.csv')

geo_df2d = getgeo(pd.DataFrame(), 'census_latlong_input_2d.csv')

geo_df2e = getgeo(pd.DataFrame(), 'census_latlong_input_2e.csv')


   ID         ADDRESS_IN                              MATCH_INDICATOR  \
0  1400570010  24   STANWOOD ST, DORCHESTER, MA, 2121  Match            
1  1500905000  11   HOLIDAY ST, DORCHESTER, MA, 2124   No_Match         
2  1405032000  40   HILLSBORO RD, MATTAPAN, MA, 2126   Match            
3  1403675020  45 B  JONES AV, DORCHESTER, MA, 2124    Match            
4  1403779001  58   JACOB ST, DORCHESTER, MA, 2124     Match            

  MATCH_TYPE ADDRESS_OUT                             \
0  Exact      24 STANWOOD ST, DORCHESTER, MA, 02121   
1  NaN        NaN                                     
2  Exact      40 HILLSBORO RD, MATTAPAN, MA, 02126    
3  Exact      45 JONES AVE, DORCHESTER, MA, 02124     
4  Exact      58 JACOB ST, DORCHESTER, MA, 02124      

  LONG_LAT                                TIGER_EDGE STREET_SIDE  FIPS_STATE  \
0  -71.08083813299999,42.30959153900005  85701566.00  L          25.00         
1  NaN                                   NaN          NaN        NaN  

In [None]:
# Geocode chunk 3; the first argument is a throwaway empty frame
geo_df3 = getgeo(pd.DataFrame(), 'census_latlong_input_3.csv')

   ID         ADDRESS_IN                                 MATCH_INDICATOR  \
0  1811197000  29   GORDON AV, HYDE PARK, MA, 2136        Match            
1  1811873001  179   RESERVATION RD, HYDE PARK, MA, 2136  Match            
2  1808462000  149   SHERRIN ST, HYDE PARK, MA, 2136      Match            
3  1808221000  34   GWINNETT ST, HYDE PARK, MA, 2136      Match            
4  1810788000  15   FARWELL AV, HYDE PARK, MA, 2136       Match            

  MATCH_TYPE ADDRESS_OUT                                \
0  Exact      29 GORDON AVE, HYDE PARK, MA, 02136        
1  Exact      179 RESERVATION RD, HYDE PARK, MA, 02136   
2  Exact      149 SHERRIN ST, HYDE PARK, MA, 02136       
3  Exact      34 GWINNETT ST, HYDE PARK, MA, 02136       
4  Exact      15 FARWELL AVE, HYDE PARK, MA, 02136       

  LONG_LAT                                TIGER_EDGE STREET_SIDE  FIPS_STATE  \
0  -71.12637133799996,42.256660632000035 85704390.00  R          25.00         
1  -71.13047558999995,42.251621797

In [None]:
# Geocode chunk 4; the first argument is a throwaway empty frame
geo_df4 = getgeo(pd.DataFrame(), 'census_latlong_input_4.csv')

   ID         ADDRESS_IN                                 MATCH_INDICATOR  \
0  2003509000  222   LAGRANGE ST, WEST ROXBURY, MA, 2132  Match            
1  2002539000  231   LAGRANGE ST, WEST ROXBURY, MA, 2132  Match            
2  1900943000  10   ST JOHN ST, JAMAICA PLAIN, MA, 2130   Match            
3  1903418000  5   DELFORD ST, ROSLINDALE, MA, 2131       Match            
4  2000919000  15   HAVANA ST, ROSLINDALE, MA, 2131       Match            

  MATCH_TYPE ADDRESS_OUT                                \
0  Exact      222 LAGRANGE ST, WEST ROXBURY, MA, 02132   
1  Exact      231 LAGRANGE ST, WEST ROXBURY, MA, 02132   
2  Exact      10 ST JOHN ST, JAMAICA PLAIN, MA, 02130    
3  Exact      5 DELFORD ST, ROSLINDALE, MA, 02131        
4  Exact      15 HAVANA ST, ROSLINDALE, MA, 02131        

  LONG_LAT                                TIGER_EDGE STREET_SIDE  FIPS_STATE  \
0  -71.15173876899996,42.27618830100005  85702995.00  L          25.00         
1  -71.15264485699998,42.276819932

In [None]:
# Geocode chunk 5; the first argument is a throwaway empty frame
geo_df5 = getgeo(pd.DataFrame(), 'census_latlong_input_5.csv')

   ID         ADDRESS_IN                                  MATCH_INDICATOR  \
0  2007540010  352   COREY ST, WEST ROXBURY, MA, 2132      Match            
1  2005805000  23   MANTHORNE RD, WEST ROXBURY, MA, 2132   Match            
2  2011498000  2   STIMSON RD, WEST ROXBURY, MA, 2132      Match            
3  2010046000  124   NEW HAVEN ST, WEST ROXBURY, MA, 2132  Match            
4  2010287000  104   SALMAN ST, WEST ROXBURY, MA, 2132     Match            

  MATCH_TYPE ADDRESS_OUT                                 \
0  Exact      352 COREY ST, WEST ROXBURY, MA, 02132       
1  Exact      23 MANTHORNE RD, WEST ROXBURY, MA, 02132    
2  Exact      2 STIMSON RD, WEST ROXBURY, MA, 02132       
3  Exact      124 NEW HAVEN ST, WEST ROXBURY, MA, 02132   
4  Exact      104 SALMAN ST, WEST ROXBURY, MA, 02132      

  LONG_LAT                                TIGER_EDGE  STREET_SIDE  FIPS_STATE  \
0  -71.16452688299995,42.294225690000076 636133386.00  L          25.00         
1  -71.1502852769999

# Combine the Geocode Results and Finalize the Data

In [None]:
# Stack the per-chunk geocode results, in chunk order, into one DataFrame with
# a fresh 0..n-1 index. A single pd.concat replaces the chain of
# DataFrame.append calls: append was deprecated in pandas 1.4 and removed in
# 2.0, and each call copied the whole accumulated frame.
pa202_geo_df = pd.concat(
    [
        geo_df1,
        geo_df2a,
        geo_df2b,
        geo_df2c,
        geo_df2d,
        geo_df2e,
        geo_df3,
        geo_df4,
        geo_df5,
    ],
    ignore_index=True,
)

print(pa202_geo_df.shape)
print(pa202_geo_df.info())
print(pa202_geo_df['MATCH_TYPE'].value_counts())

# check for duplicates: number of rows whose ID already appeared earlier
print(len(pa202_geo_df['ID'])-len(pa202_geo_df['ID'].drop_duplicates()))

(25640, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25640 entries, 0 to 25639
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               25640 non-null  int64  
 1   ADDRESS_IN       25640 non-null  object 
 2   MATCH_INDICATOR  25640 non-null  object 
 3   MATCH_TYPE       25217 non-null  object 
 4   ADDRESS_OUT      25217 non-null  object 
 5   LONG_LAT         25217 non-null  object 
 6   TIGER_EDGE       25217 non-null  float64
 7   STREET_SIDE      25217 non-null  object 
 8   FIPS_STATE       25217 non-null  float64
 9   FIPS_COUNTY      25217 non-null  float64
 10  CENSUS_TRACT     25217 non-null  float64
 11  CENSUS_BLOCK     25217 non-null  float64
dtypes: float64(5), int64(1), object(6)
memory usage: 2.3+ MB
None
Exact        24875
Non_Exact      342
Name: MATCH_TYPE, dtype: int64
0


### Parse Lat/Long and Add PID as String

In [None]:

# String version of the numeric geocoder ID
pa202_geo_df["PID"] = pa202_geo_df["ID"].astype(str)

# LONG_LAT holds "<longitude>,<latitude>"; split once and reuse both parts.
# Rows without a match stay NaN.
long_lat_parts = pa202_geo_df['LONG_LAT'].str.split(",")
pa202_geo_df['LATITUDE'] = long_lat_parts.str[1].astype(float)
pa202_geo_df['LONGITUDE'] = long_lat_parts.str[0].astype(float)

print(pa202_geo_df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25640 entries, 0 to 25639
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               25640 non-null  int64  
 1   ADDRESS_IN       25640 non-null  object 
 2   MATCH_INDICATOR  25640 non-null  object 
 3   MATCH_TYPE       25217 non-null  object 
 4   ADDRESS_OUT      25217 non-null  object 
 5   LONG_LAT         25217 non-null  object 
 6   TIGER_EDGE       25217 non-null  float64
 7   STREET_SIDE      25217 non-null  object 
 8   FIPS_STATE       25217 non-null  float64
 9   FIPS_COUNTY      25217 non-null  float64
 10  CENSUS_TRACT     25217 non-null  float64
 11  CENSUS_BLOCK     25217 non-null  float64
 12  PID              25640 non-null  object 
 13  LATITUDE         25217 non-null  float64
 14  LONGITUDE        25217 non-null  float64
dtypes: float64(7), int64(1), object(7)
memory usage: 2.9+ MB
None


In [None]:
# Preview the first rows of the combined geocode results
pa202_geo_df.head()

Unnamed: 0,ID,ADDRESS_IN,MATCH_INDICATOR,MATCH_TYPE,ADDRESS_OUT,LONG_LAT,TIGER_EDGE,STREET_SIDE,FIPS_STATE,FIPS_COUNTY,CENSUS_TRACT,CENSUS_BLOCK,PID,LATITUDE,LONGITUDE
0,802870050,"29 HUCKINS ST, ROXBURY, MA, 2119",Match,Exact,"29 HUCKINS ST, ROXBURY, MA, 02119","-71.07543237499993,42.323855010000045",85701065.0,R,25.0,25.0,90600.0,1001.0,802870050,42.32,-71.08
1,1101083000,"33 W WALNUT PK, ROXBURY, MA, 2119",Tie,,,,,,,,,,1101083000,,
2,1102053000,"52 HAVERFORD ST, JAMAICA PLAIN, MA, 2130",Match,Exact,"52 HAVERFORD ST, JAMAICA PLAIN, MA, 02130","-71.10188635799994,42.313157886000056",85699833.0,L,25.0,25.0,120301.0,4007.0,1102053000,42.31,-71.1
3,1301250010,"49 MONADNOCK ST, DORCHESTER, MA, 2125",Match,Exact,"49 MONADNOCK ST, DORCHESTER, MA, 02125","-71.06875589899994,42.31638613300004",85701335.0,R,25.0,25.0,91400.0,1002.0,1301250010,42.32,-71.07
4,700950000,"256 GOLD ST, SOUTH BOSTON, MA, 2127",Match,Exact,"256 GOLD ST, SOUTH BOSTON, MA, 02127","-71.04849527899995,42.33551860100005",85712795.0,L,25.0,25.0,60800.0,1002.0,700950000,42.34,-71.05


# Export Final GeoCoded File

In [None]:
# Write the combined geocode results to CSV in the current working directory,
# with a header row and without the DataFrame index
pa202_geo_df.to_csv('pa_2022_geocoded.csv', header=True, index=False)

# Test Calculating Distance

In [55]:
import geopy.distance
import numpy as np

# Read in Geocoded Data with Lat/Long by PID
mydata = pd.read_csv('pa_2022_geocoded.csv')

# (latitude, longitude) tuple for the center of Boston
boston = (42.361145, -71.057083)

# Keep only the PID, Lat and Long
mydata2 = mydata[['PID','LATITUDE','LONGITUDE']].copy()
mydata2.head()

# Determine number of rows
nrecs = mydata2.shape[0]

# Geodesic distance in miles from each property to the center of Boston.
# Rows without coordinates (addresses the geocoder could not match) get NaN.
# Missing values are tested explicitly with pd.notna rather than `lat > 0`,
# and np.nan is used because the `NaN` alias no longer exists in NumPy 2.
distances = [
    geopy.distance.geodesic((lat, lon), boston).miles
    if pd.notna(lat) and pd.notna(lon)
    else np.nan
    for lat, lon in zip(mydata2['LATITUDE'], mydata2['LONGITUDE'])
]

# Append Distances to DataFrame
mydata3 = mydata2.copy()
mydata3["DISTANCE"] = distances
mydata3.head()
mydata3.info()

Unnamed: 0,PID,LATITUDE,LONGITUDE
0,802870050,42.32,-71.08
1,1101083000,,
2,1102053000,42.31,-71.1
3,1301250010,42.32,-71.07
4,700950000,42.34,-71.05


Unnamed: 0,PID,LATITUDE,LONGITUDE,DISTANCE
0,802870050,42.32,-71.08,2.74
1,1101083000,,,
2,1102053000,42.31,-71.1,4.03
3,1301250010,42.32,-71.07,3.15
4,700950000,42.34,-71.05,1.82


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25640 entries, 0 to 25639
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   PID        25640 non-null  int64  
 1   LATITUDE   25217 non-null  float64
 2   LONGITUDE  25217 non-null  float64
 3   DISTANCE   25217 non-null  float64
dtypes: float64(3), int64(1)
memory usage: 801.4 KB
