In [39]:
import csv
import datetime
import glob
import io
import os
import re
import time
import urllib.parse

import grequests  # kept ahead of `requests`, as in the original import order
import pandas as pd
from requests import Session

When the user query page loads its table, it sends XHRs to https://jpwebsite.harriscountytx.gov/PublicExtracts/GetExtractData .

I set this routine up to use `grequests` (async) so I could fetch data in parallel. But (maybe just on Windows?) that led to very high rates of connection errors (SSL handshake failures), so I ended up cutting the concurrency down to 1-2 requests at a time anyway.



*Update*: found a fix, which I'm bringing in from the other notebook.

In [57]:
# Work around some Windows TLS issues
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.poolmanager import PoolManager
import ssl

class MyAdapter(HTTPAdapter):
    """Transport adapter whose connection pool defaults to TLS 1.2.

    Mount it on a `Session` (e.g. ``session.mount('https://', MyAdapter())``)
    so HTTPS connections made through that session use this pool manager.
    """

    def init_poolmanager(self, connections, maxsize, block=False, **pool_kwargs):
        """Create the urllib3 `PoolManager` used by this adapter.

        Parameters mirror the base-class hook: `connections` is passed as
        `num_pools`, and `maxsize` / `block` are forwarded unchanged. Extra
        keyword arguments are forwarded to `PoolManager` as well, so callers
        that supply them no longer hit a `TypeError`.
        """
        # TLS 1.2 is the default; an explicit `ssl_version` from the caller wins.
        pool_kwargs.setdefault('ssl_version', ssl.PROTOCOL_TLSv1_2)
        self.poolmanager = PoolManager(num_pools=connections,
                                       maxsize=maxsize,
                                       block=block,
                                       **pool_kwargs)

The courts are enumerated thus:

In [41]:
# Court ids run from 305 to 380 in steps of 5: eight precincts with two
# places each, numbered in precinct-then-place order.
courts = {
    305 + 5 * idx: 'Precinct {}, Place {}'.format(idx // 2 + 1, idx % 2 + 1)
    for idx in range(16)
}

In [42]:
def makerequest(court, from_date, to_date, session, format):
    """Build a GET request against the extract-data service for one court and date range.

    `court` is the numeric court id, `from_date` / `to_date` are date objects
    (formatted as MM/DD/YYYY in the query), `session` is handed to `grequests`
    as-is, and `format` selects the response format (csv, xml, tab).

    Returns whatever `grequests.get` produces; elsewhere in this notebook that
    object is dispatched with `grequests.map` / `grequests.imap`.
    """
    endpoint = 'https://jpwebsite.harriscountytx.gov/PublicExtracts/GetExtractData'
    date_pattern = '%m/%d/%Y'
    query = {
        'extractCaseType': 'CV',  # Civil
        'extract': 3,  # Judgments entered
        'court': court,
        'casetype': 'EV',  # Eviction
        'format': format,  # csv, xml, tab
        'fdate': from_date.strftime(date_pattern),
        'tdate': to_date.strftime(date_pattern),
    }
    return grequests.get(endpoint, params=query, stream=False, session=session)

Retrying the failures worked fine, so this little exception handler (factory) makes it easy to keep track of them.

In [43]:
def makeexceptionhandler(eh_badreqs):
    """Return a `grequests` exception handler that logs each failure and records it.

    The returned callable takes the failed request and the exception, prints
    both, and appends the request's `params` dict to `eh_badreqs` so the caller
    can rebuild and retry those requests later.
    """
    def record_failure(failed_req, error):
        failed_params = failed_req.kwargs['params']
        print("Request failed:", str(error), failed_params)
        eh_badreqs.append(failed_params)

    return record_failure

And a helper to make adequately unique CSV file names

In [44]:
def urltocsvpath(template, url):
    """Fill `template` with the court, from-date and format found in an API URL's query string.

    The `fdate` value is rearranged from MM/DD/YYYY to YYYY-MM-DD; a value that
    does not match that shape is passed through unchanged. The template may use
    the `{court}`, `{fromdate}` and `{format}` placeholders.
    """
    query = urllib.parse.parse_qs(urllib.parse.urlsplit(url).query)
    court, from_date, fmt = (query[key][0] for key in ('court', 'fdate', 'format'))
    iso_from_date = re.sub('^(..)/(..)/(.+)$', r'\3-\1-\2', from_date)
    return template.format(court=court, fromdate=iso_from_date, format=fmt)

In [45]:
# Interactive sanity check: build a single XML request for court 305 covering
# January 2016 (no session supplied) and send it with `grequests.map`, which
# returns a list of responses.
req = makerequest(305, datetime.date(2016,1,1), datetime.date(2016,1,31), None, 'xml')
r = grequests.map([req])

In [46]:
# Interactive sanity check: HTTP status code of the single response fetched above.
r[0].status_code

200

In [47]:
# Interactive sanity check: the output file name derived from the URL that was
# actually requested (court, from-date and format come from its query string).
urltocsvpath('tmp/tmp_court_{court}-{fromdate}.{format}', r[0].request.url)

'tmp/tmp_court_305-2016-01-01.xml'

In [48]:
# Interactive sanity check: peek at the first 100 characters of the response body.
r[0].text[:100]

'<?xml version="1.0" encoding="ISO-8859-1"?><Results><Row><CaseNumber>EV11C0087006</CaseNumber><CaseT'

Most of the actual control flow lives here: this sends the requests passed in via the iterable `reqs`, writes each response to a file whose name is generated by the routine above, and retries any failures.

In [49]:
def send_requests(reqs, size, template):
    """Send every request in `reqs`, save successful responses, and retry failures.

    Parameters:
        reqs: iterable of `grequests` requests to dispatch.
        size: forwarded to `grequests.imap` as its `size` argument.
        template: file-name template handed to `urltocsvpath` together with
            each response's request URL.

    Each 200 response is written to its own file, with doubled newlines
    collapsed to single ones. Requests that raise an exception are collected
    by the handler from `makeexceptionhandler` and re-issued in another pass;
    passes repeat until one finishes with no exceptions. Returns None.

    Notes:
        * Responses with a non-200 status are reported but not retried.
        * Retried requests are rebuilt from their `params` only, so they do not
          carry the `session` or `stream` settings of the original request.
    """
    issued = 0
    while True:
        eh_badreqs = []
        exceptionhandler = makeexceptionhandler(eh_badreqs)
        for resp in grequests.imap(reqs, exception_handler=exceptionhandler, size=size):
            if resp.status_code != 200:
                # Report the response that actually failed rather than relying
                # on an unrelated global name.
                print("Request failed:", resp.status_code, resp.request.url)
                continue
            issued += 1
            if issued % 10 == 0:
                print("Finished", issued)
            with open(urltocsvpath(template, resp.request.url), 'w') as out:
                out.write(resp.text.replace('\n\n', '\n'))
        if not eh_badreqs:
            break
        print("\n\n*** Retrying", len(eh_badreqs), "bad requests")
        reqs = [
            grequests.get('https://jpwebsite.harriscountytx.gov/PublicExtracts/GetExtractData', params=params)
            for params in eh_badreqs
        ]

A few date manipulation routines to get month start/end dates for queries.

In [50]:
def incmonth(date):
    """Return a `datetime.date` for the first day of the month after the input date's month.

    The day of the input is ignored, so any day of a month (including the 29th
    through 31st) maps to day 1 of the following month; December rolls over to
    January of the next year.
    """
    if date.month < 12:
        return datetime.date(date.year, date.month + 1, 1)
    return datetime.date(date.year + 1, 1, 1)

In [51]:
def endofmonth(date):
    """Return a `datetime.date` for the final day of the month containing `date`.

    Works by stepping from the first of this month to the first of the next
    month (via `incmonth`) and backing up one day.
    """
    first_of_month = datetime.date(date.year, date.month, 1)
    first_of_next_month = incmonth(first_of_month)
    return first_of_next_month - datetime.timedelta(days=1)

In [52]:
def datebounds(start, limit=None):
    """Generate (from-date, to-date) pairs, one per month, that end before `limit`.

    The first pair begins at `start` and runs to the end of `start`'s month;
    each later pair begins at the date returned by `incmonth` and runs to the
    end of that month. Generation stops at the first pair whose from-date or
    to-date is not strictly earlier than `limit`. When `limit` is omitted (or
    falsy) today's date is used.
    """
    month_start = start
    month_end = endofmonth(month_start)
    cutoff = limit or datetime.datetime.now().date()
    while True:
        if not (month_start < cutoff and month_end < cutoff):
            return
        yield (month_start, month_end)
        month_start = incmonth(month_start)
        month_end = endofmonth(month_start)

In [53]:
# Interactive sanity check: month windows from May 2017 up to (but excluding)
# the current month; the result depends on the date the cell is run.
list(datebounds(datetime.date(2017,5,1)))

[(datetime.date(2017, 5, 1), datetime.date(2017, 5, 31)),
 (datetime.date(2017, 6, 1), datetime.date(2017, 6, 30)),
 (datetime.date(2017, 7, 1), datetime.date(2017, 7, 31))]

So putting them together, we can make a generator that yields one request per court for each month in the date range.

In [54]:
def makerequests(
    session,
    start,
    limit,
    format,
    court_ids=None,
):
    """Generate one request per court for every month window from `datebounds(start, limit)`.

    Parameters:
        session: passed through to `makerequest` for every request.
        start, limit: passed to `datebounds` to produce the month windows.
        format: response format passed through to `makerequest`.
        court_ids: optional iterable of court ids to query. Defaults to every
            id in the module-level `courts` table, in ascending order, so the
            list of courts is maintained in one place.

    Yields the objects returned by `makerequest`, month by month and, within a
    month, court by court.
    """
    # Materialise once: the ids are iterated again for every month window.
    selected = sorted(courts) if court_ids is None else list(court_ids)
    for frm, to in datebounds(start, limit):
        for court in selected:
            yield makerequest(court, frm, to, session, format)

In [55]:
%%time
def scrape(
    start,
    size,
    template,
    end=None,
    format='csv'
):
    """Download one extract per court per month and write each to a file.

    Parameters:
        start: first date to query; passed to `makerequests`.
        size: forwarded to `send_requests` (and from there to `grequests.imap`).
        template: output path template; its directory part is created if needed.
        end: optional upper bound passed to `makerequests` as its `limit`.
        format: response format requested from the service ('csv' by default).

    Raises OSError if the output directory cannot be created, rather than
    hiding the problem until the first file write.
    """
    out_dir = os.path.dirname(template)
    if out_dir:  # a bare file-name template has no directory to create
        os.makedirs(out_dir, exist_ok=True)
    # Close the session (and its pooled connections) once all requests are done.
    with Session() as session:
        session.mount('https://', MyAdapter())
        send_requests(makerequests(session, start, end, format), size, template)

Wall time: 0 ns


In [None]:
%%time
# Fetch the CSV extracts for every month from January 2007 onward into `orig/`,
# passing size=100 through to the request dispatcher.
template = 'orig/orig_court_{court}-{fromdate}.{format}'
scrape(start=datetime.date(2007,1,1), size=100, template=template, format='csv')

In [58]:
%%time
# Fetch the XML extracts for every month from January 2007 onward into
# `orig_xml/`, passing size=20 through to the request dispatcher.
template = 'orig_xml/orig_court_{court}-{fromdate}.{format}'
scrape(start=datetime.date(2007,1,1), size=20, template=template, format='xml')

Finished 10
Finished 20
Finished 30
Finished 40
Finished 50
Finished 60
Finished 70
Finished 80
Finished 90
Finished 100
Finished 110
Finished 120
Finished 130
Finished 140
Finished 150
Finished 160
Finished 170
Finished 180
Finished 190
Finished 200
Finished 210
Finished 220
Finished 230
Finished 240
Finished 250
Request failed: ("bad handshake: SysCallError(10054, 'WSAECONNRESET')",) {'format': 'xml', 'court': 320, 'extractCaseType': 'CV', 'fdate': '02/01/2007', 'tdate': '02/28/2007', 'extract': 3, 'casetype': 'EV'}
Request failed: ("bad handshake: SysCallError(10054, 'WSAECONNRESET')",) {'format': 'xml', 'court': 315, 'extractCaseType': 'CV', 'fdate': '02/01/2007', 'tdate': '02/28/2007', 'extract': 3, 'casetype': 'EV'}
Request failed: ("bad handshake: SysCallError(10054, 'WSAECONNRESET')",) {'format': 'xml', 'court': 310, 'extractCaseType': 'CV', 'fdate': '02/01/2007', 'tdate': '02/28/2007', 'extract': 3, 'casetype': 'EV'}
Request failed: ("bad handshake: SysCallError(10054, 'WSAECO

Finished 1780
Finished 1790
Finished 1800
Finished 1810
Finished 1820
Finished 1830
Finished 1840
Finished 1850
Finished 1860
Finished 1870
Finished 1880
Finished 1890
Finished 1900
Finished 1910
Finished 1920
Finished 1930
Finished 1940
Finished 1950
Finished 1960
Finished 1970
Finished 1980
Finished 1990
Request failed: ("bad handshake: SysCallError(10054, 'WSAECONNRESET')",) {'format': 'xml', 'court': 380, 'extractCaseType': 'CV', 'fdate': '07/01/2015', 'tdate': '07/31/2015', 'extract': 3, 'casetype': 'EV'}
Request failed: ("bad handshake: SysCallError(10054, 'WSAECONNRESET')",) {'format': 'xml', 'court': 370, 'extractCaseType': 'CV', 'fdate': '07/01/2015', 'tdate': '07/31/2015', 'extract': 3, 'casetype': 'EV'}
Request failed: ("bad handshake: SysCallError(10054, 'WSAECONNRESET')",) {'format': 'xml', 'court': 315, 'extractCaseType': 'CV', 'fdate': '08/01/2015', 'tdate': '08/31/2015', 'extract': 3, 'casetype': 'EV'}
Request failed: ("bad handshake: SysCallError(10054, 'WSAECONNRESET'

Finished 2010
Request failed: ("bad handshake: SysCallError(10054, 'WSAECONNRESET')",) {'format': 'xml', 'court': 305, 'extractCaseType': 'CV', 'fdate': '06/01/2008', 'tdate': '06/30/2008', 'extract': 3, 'casetype': 'EV'}
Request failed: ("bad handshake: SysCallError(10054, 'WSAECONNRESET')",) {'format': 'xml', 'court': 320, 'extractCaseType': 'CV', 'fdate': '06/01/2008', 'tdate': '06/30/2008', 'extract': 3, 'casetype': 'EV'}
Request failed: ("bad handshake: SysCallError(10054, 'WSAECONNRESET')",) {'format': 'xml', 'court': 375, 'extractCaseType': 'CV', 'fdate': '05/01/2008', 'tdate': '05/31/2008', 'extract': 3, 'casetype': 'EV'}


*** Retrying 20 bad requests
Request failed: ("bad handshake: SysCallError(10054, 'WSAECONNRESET')",) {'format': 'xml', 'court': 340, 'extractCaseType': 'CV', 'fdate': '01/01/2007', 'tdate': '01/31/2007', 'extract': 3, 'casetype': 'EV'}
Request failed: ("bad handshake: SysCallError(10054, 'WSAECONNRESET')",) {'format': 'xml', 'court': 325, 'extractCaseType':

Finished 2030
Request failed: ("bad handshake: SysCallError(10054, 'WSAECONNRESET')",) {'format': 'xml', 'court': 345, 'extractCaseType': 'CV', 'fdate': '01/01/2007', 'tdate': '01/31/2007', 'extract': 3, 'casetype': 'EV'}


*** Retrying 1 bad requests


*** Retrying 0 bad requests
Wall time: 2h 8min 59s


In [85]:
%%time
import lxml.etree as etree
from collections import defaultdict

# Collect every <Row> of every downloaded XML extract into per-column lists.
data = defaultdict(list)   # element tag -> list of text values, one per row
cols = []                  # column order, taken from the first row encountered
skipped_files = []         # files that could not be parsed as XML

# Sorted so the row order of the combined data is the same on every run.
for fn in sorted(glob.glob('orig_xml/*xml')):
    try:
        doc = etree.parse(fn)
    except etree.XMLSyntaxError as e:
        # Keep going, but leave a record instead of silently dropping the file.
        skipped_files.append(fn)
        print("Skipping unparseable file:", fn, str(e))
        continue
    for row in doc.getroot():
        if not cols:
            cols.extend(elt.tag for elt in row)
        for elt in row:
            data[elt.tag].append(elt.text)

if skipped_files:
    print("Skipped", len(skipped_files), "unparseable files")

Wall time: 38.3 s


In [86]:
# Assemble the per-column lists into a DataFrame, with columns in the order
# recorded in `cols`.
df = pd.DataFrame(data, columns=cols)

In [87]:
# Write the combined table to a date-stamped CSV in the working directory
# (pandas also writes the row index as the first column by default).
df.to_csv('evictions-20170630.csv')