In [1]:
from geosupport import Geocode
import pandas as pd
import numpy as np

In [2]:
g = Geocode()

In [3]:
def geocode(line):
    address = line['address']
    zip = line['ZIP']
    try:
        result = g.address(address=address, zip=zip)
        return result
    except:
        pass

In [4]:
result = g.address(address="123 PALMER AVE", zip=10302)

In [5]:
sample = pd.read_csv('/data/sample.csv')

In [6]:
sample['response'] = sample.apply(geocode, axis=1)

In [7]:
sample[:10]['response']

0    {'First Borough Name': 'MANHATTAN', 'House Num...
1    {'First Borough Name': 'STATEN IS', 'House Num...
2    {'First Borough Name': 'QUEENS', 'House Number...
3    {'First Borough Name': 'MANHATTAN', 'House Num...
4    {'First Borough Name': 'MANHATTAN', 'House Num...
5    {'First Borough Name': 'MANHATTAN', 'House Num...
6    {'First Borough Name': 'QUEENS', 'House Number...
7    {'First Borough Name': 'MANHATTAN', 'House Num...
8    {'First Borough Name': 'MANHATTAN', 'House Num...
9    {'First Borough Name': 'QUEENS', 'House Number...
Name: response, dtype: object

# Batch process full dataset

In [8]:
def batchprocess(data):
    for gi, gdf in data.groupby(np.arange(len(data)) // 50000):
        print('Processing chunk {}...'.format(gi))
        gdf['response'] = gdf.apply(geocode, axis=1)
        gdf.to_csv('/data/full_{}.csv'.format(gi), index=False)

In [9]:
fulldata = pd.read_csv('/data/full.csv', encoding = "ISO-8859-1")

In [10]:
%time batchprocess(fulldata)

Processing chunk 0...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Processing chunk 1...
Processing chunk 2...
Processing chunk 3...
Processing chunk 4...
Processing chunk 5...
Processing chunk 6...
Processing chunk 7...
Processing chunk 8...
Processing chunk 9...
Processing chunk 10...
Processing chunk 11...
Processing chunk 12...
Processing chunk 13...
Processing chunk 14...
Processing chunk 15...
CPU times: user 9min 9s, sys: 57.1 s, total: 10min 6s
Wall time: 10min 11s


# Pull out particular fields

It would seem that the JSON string returned by the Geosupport API is not well-formed JSON since it uses single quotes rather than double quotes for the keys and values. We can use the [demjson](https://deron.meranda.us/python/demjson/) package to do the decoding of such strings.

In [17]:
import demjson

In [48]:
sample = pd.read_csv('/data/full_0.csv')

We can pull out now the fields from the json response, which can of course be json objects themselves as is the case with the BBL entry:

In [21]:
sample[:10].apply(lambda x: demjson.decode(x['response'])['BOROUGH BLOCK LOT (BBL)'], axis=1)

0    {'Borough Code': '', 'Tax Block': '', 'Tax Lot...
1    {'Borough Code': '5', 'Tax Block': '01038', 'T...
2    {'Borough Code': '', 'Tax Block': '', 'Tax Lot...
3    {'Borough Code': '1', 'Tax Block': '01992', 'T...
4    {'Borough Code': '1', 'Tax Block': '00286', 'T...
5    {'Borough Code': '1', 'Tax Block': '00895', 'T...
6    {'Borough Code': '4', 'Tax Block': '11331', 'T...
7    {'Borough Code': '1', 'Tax Block': '01482', 'T...
8    {'Borough Code': '1', 'Tax Block': '01770', 'T...
9    {'Borough Code': '4', 'Tax Block': '06407', 'T...
dtype: object

In [53]:
def bbl(row):
    # Produce BBL string for row in data
    if 'response' in row and row['response']:
        response = demjson.decode(row['response'])
        if response:
            bbldict = response['BOROUGH BLOCK LOT (BBL)']
            result = '{}-{}-{}'.format(bbldict['Borough Code'],
                                       bbldict['Tax Block'],
                                       bbldict['Tax Lot'])
            return result

In [57]:
sample[:100].dropna().apply(bbl, axis=1)

0               --
1     5-01038-0049
2               --
3     1-01992-0013
4     1-00286-0024
5     1-00895-0015
6     4-11331-0001
7     1-01482-0020
8     1-01770-0033
9     4-06407-0001
10    3-07635-0011
11    1-02060-0018
12    1-01348-0023
13    3-01185-0001
14    5-00593-0500
15    4-06059-0001
16    1-01726-0044
17              --
18    4-02169-0025
19    2-05933-0224
20    2-05790-0001
21    4-09695-0014
22    4-02857-0036
23    4-09762-0001
24    4-06831-0022
25    4-09762-0001
26              --
27    3-01099-0059
28    1-00481-7502
29    1-01495-0032
          ...     
70    1-01457-0017
71    1-00448-0009
72    1-00831-0033
73    1-01169-0001
74    1-02180-0001
75    1-01461-7501
76    1-00078-7508
77    2-04355-0001
78    3-06094-0001
79    3-06093-0045
80    3-06118-0001
81    3-05707-0007
82    3-06142-0046
83    3-08725-0001
84    1-01876-0034
85    1-01841-0060
86    1-01861-0022
87    1-01840-0001
88    1-01841-0063
89    1-01841-0062
90    1-01113-0047
91    3-0196

Remember to `dropna()` since some of the rows might not have resolved to a json output.

In [None]:
%time sample['bbl'] = sample.dropna().apply(bbl, axis=1)