In [62]:
from geosupport import Geocode
import pandas as pd
import numpy as np
import simplejson as json

In [2]:
g = Geocode()

In [64]:
def geocode(line):
    address = line['address']
    zip = line['ZIP']
    try:
        result = g.address(address=address, zip=zip)
        return json.dumps(result)
    except:
        pass

In [66]:
result = g.address(address="123 PALMER AVE", zip=10302)

In [67]:
result

{'1990 Census Tract': '213',
 '2000 Census Block': '1004',
 '2000 Census Block Suffix': '',
 '2000 Census Tract': '213',
 '2010 Census Block': '1002',
 '2010 Census Tract': '213',
 '5-Digit Street Code of ON- Street': '42540',
 'Alley or Cross Street List Flag': '',
 'Angle to From Node - Beta Value': '',
 'Angle to To Node - Alpha Value': '',
 'Assembly District': '6',
 'Atomic Polygon': '105',
 'B10SC - First Borough and Street Code': '54254001010',
 'B10SC - Second Borough and Street Code': '',
 'B10SC - Third Borough and Street Code': '',
 'BOE LGC Pointer': '1',
 'BOE Preferred B7SC': '54254001',
 'BOE Preferred Street Name': 'PALMER AVENUE',
 'BOROUGH BLOCK LOT': {'Borough Code': '', 'Tax Block': '', 'Tax Lot': ''},
 'BOROUGH BLOCK LOT (BBL)': {'Borough Code': '5',
  'Tax Block': '01038',
  'Tax Lot': '0049'},
 'Bike Lane': '',
 'Bike Lane 2': '',
 'Bike Traffic Direction': '',
 'Blockface ID': '1622606950',
 'Borough': '5',
 'Building Identification Number (BIN)': '',
 'Building

In [68]:
sample = pd.read_csv('/data/sample.csv')

In [69]:
sample['response'] = sample.apply(geocode, axis=1)

In [70]:
sample[:10]['response']

0    {"First Borough Name": "MANHATTAN", "House Num...
1    {"First Borough Name": "STATEN IS", "House Num...
2    {"First Borough Name": "QUEENS", "House Number...
3    {"First Borough Name": "MANHATTAN", "House Num...
4    {"First Borough Name": "MANHATTAN", "House Num...
5    {"First Borough Name": "MANHATTAN", "House Num...
6    {"First Borough Name": "QUEENS", "House Number...
7    {"First Borough Name": "MANHATTAN", "House Num...
8    {"First Borough Name": "MANHATTAN", "House Num...
9    {"First Borough Name": "QUEENS", "House Number...
Name: response, dtype: object

# Batch process full dataset

In [71]:
def batchprocess(data):
    for gi, gdf in data.groupby(np.arange(len(data)) // 50000):
        print('Processing chunk {}...'.format(gi))
        gdf['response'] = gdf.apply(geocode, axis=1)
        gdf.to_csv('/data/full_{}.csv'.format(gi), index=False)

In [72]:
fulldata = pd.read_csv('/data/full.csv', encoding = "ISO-8859-1")

In [73]:
%time batchprocess(fulldata)

Processing chunk 0...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Processing chunk 1...
Processing chunk 2...
Processing chunk 3...
Processing chunk 4...
Processing chunk 5...
Processing chunk 6...
Processing chunk 7...
Processing chunk 8...
Processing chunk 9...
Processing chunk 10...
Processing chunk 11...
Processing chunk 12...
Processing chunk 13...
Processing chunk 14...
Processing chunk 15...
CPU times: user 19min 7s, sys: 57.2 s, total: 20min 4s
Wall time: 20min 14s


# Pull out particular fields

We can pull out now the fields from the json response, which can of course be json objects themselves as is the case with the BBL entry:

In [74]:
sample = pd.read_csv('/data/full_0.csv')

In [75]:
sample[:10].apply(lambda x: json.loads(x['response'])['BOROUGH BLOCK LOT (BBL)'], axis=1)

0    {'Borough Code': '', 'Tax Block': '', 'Tax Lot...
1    {'Borough Code': '5', 'Tax Block': '01038', 'T...
2    {'Borough Code': '', 'Tax Block': '', 'Tax Lot...
3    {'Borough Code': '1', 'Tax Block': '01992', 'T...
4    {'Borough Code': '1', 'Tax Block': '00286', 'T...
5    {'Borough Code': '1', 'Tax Block': '00895', 'T...
6    {'Borough Code': '4', 'Tax Block': '11331', 'T...
7    {'Borough Code': '1', 'Tax Block': '01482', 'T...
8    {'Borough Code': '1', 'Tax Block': '01770', 'T...
9    {'Borough Code': '4', 'Tax Block': '06407', 'T...
dtype: object

In [76]:
def bbl(row):
    """ Produce BBL string for row in data """
    if 'response' in row and row['response']:
        response = json.loads(row['response'])
        if response:
            bbldict = response['BOROUGH BLOCK LOT (BBL)']
            result = '{}-{}-{}'.format(bbldict['Borough Code'],
                                       bbldict['Tax Block'],
                                       bbldict['Tax Lot'])
            return result

In [81]:
sample[:10].dropna().apply(bbl, axis=1)

0              --
1    5-01038-0049
2              --
3    1-01992-0013
4    1-00286-0024
5    1-00895-0015
6    4-11331-0001
7    1-01482-0020
8    1-01770-0033
9    4-06407-0001
dtype: object

Remember to `dropna()` since some of the rows might not have resolved to a json output.

In [78]:
%time sample['bbl'] = sample.dropna().apply(bbl, axis=1)

CPU times: user 49.8 s, sys: 22.1 ms, total: 49.8 s
Wall time: 50 s


In [80]:
sample[:10]['bbl']

0              --
1    5-01038-0049
2              --
3    1-01992-0013
4    1-00286-0024
5    1-00895-0015
6    4-11331-0001
7    1-01482-0020
8    1-01770-0033
9    4-06407-0001
Name: bbl, dtype: object