[Download JSON file from a URL](https://docs.python.org/2/howto/urllib2.html)

[Write JSON data to a file](http://stackoverflow.com/questions/12309269/how-do-i-write-json-data-to-a-file-in-python)

[Converting string into datetime](http://stackoverflow.com/questions/466345/converting-string-into-datetime)

[strftime format mask](https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior)

[Converting string to boolean](http://stackoverflow.com/questions/715417/converting-from-a-string-to-boolean-in-python)

[Set application memory size from pyspark shell](http://stackoverflow.com/questions/21609173/set-application-memory-size-from-pyspark-shell)

In [55]:
from datetime import datetime
import json
import os
import numpy as np
import re
import urllib2

In [90]:
def format_record(raw_data):
    """Build a record dictionary from one raw data row.

    Every name in the broadcast ``fieldnames_br`` list becomes a key whose
    value is the element of ``raw_data`` found at the same position.

    Args:
        raw_data: Indexable row; it is read at every position for which a
            broadcast field name exists.

    Returns:
        dict mapping each field name to the row value at the same index.
    """
    field_names = fieldnames_br.value

    return {name: raw_data[position]
            for position, name in enumerate(field_names)}

def init_mailingaddrloc_vars(tdlr_all_licenses):
    """Return the cleaned sub-column names of the mailing address location.

    Scans ``tdlr_all_licenses['meta']['view']['columns']`` for the first
    column whose ``fieldName`` is ``mailing_address_location`` and removes
    every ``:`` and ``_`` character from each of its ``subColumnTypes``.

    Args:
        tdlr_all_licenses: Parsed dataset dictionary that provides the
            ``meta`` -> ``view`` -> ``columns`` list of column dictionaries.

    Returns:
        list of sub-column names with ``:`` and ``_`` stripped.

    Raises:
        IndexError: If no column carries the expected ``fieldName``.
    """
    raw_field_name = u'mailing_address_location'

    # A plain scan finds the first match without numpy and without relying
    # on map() producing a list (which is only true on Python 2).
    for column in tdlr_all_licenses['meta']['view'][u'columns']:
        if column['fieldName'] == raw_field_name:
            return [re.sub("[:_]", "", sub_column)
                    for sub_column in column['subColumnTypes']]

    # IndexError is kept as the failure type, now with a readable message.
    raise IndexError("no column with fieldName %r" % raw_field_name)

def split_mailingaddresslocation(record):
    """Flatten the nested mailing address location into top-level fields.

    The value stored under ``mailingaddresslocation`` is read by position:
    element 0 is decoded as a JSON object (the human-readable address) and
    each remaining element is stored under ``mailingaddresslocation`` plus
    the matching name from the broadcast ``mailingaddrloc_vars_br`` list.
    The decoded address is written to
    ``mailingaddresslocationhumanaddress<key>`` fields; ``address``,
    ``city``, ``state`` and ``zip`` are always present and default to None.

    A location that is None, or whose first element is None, produces None
    for the affected fields instead of raising.

    Args:
        record: Record dictionary containing ``mailingaddresslocation``.

    Returns:
        A shallow copy of ``record`` with the nested field replaced by the
        flattened fields.

    Raises:
        KeyError: If ``record`` has no ``mailingaddresslocation`` entry.
    """
    base_field_name = u'mailingaddresslocation'

    human_address = {u'address': None,
                     u'city': None,
                     u'state': None,
                     u'zip': None}

    updated_record = record.copy()
    location = updated_record.pop(base_field_name)
    subfield_names = mailingaddrloc_vars_br.value

    raw_human_address = None if location is None else location[0]

    if raw_human_address is not None:
        # "or {}" keeps a JSON null payload from breaking the update.
        human_address.update(json.loads(raw_human_address) or {})

    # base_field_name is a unicode literal, so the concatenated keys are
    # unicode without an explicit conversion.
    for key, value in human_address.items():
        updated_record[base_field_name + 'humanaddress' + key] = value

    # Position 0 is the human address handled above; copy the rest as-is.
    for idx in range(1, len(subfield_names)):
        updated_record[base_field_name + subfield_names[idx]] =\
            None if location is None else location[idx]

    return updated_record

def split_mailingaddr_citystatezip(record):
    """Split the combined city/state/zip field into three separate fields.

    The ``mailingaddresscitystatezip`` value is matched against an
    upper-case "CITY STATE ZIP" pattern. On a match the pieces are stored
    under ``mailingaddresscity``, ``mailingaddressstate`` and
    ``mailingaddresszip`` (the zip converted to int). When the value is
    None or does not match, all three fields are set to None.

    Args:
        record: Record dictionary containing ``mailingaddresscitystatezip``.

    Returns:
        A shallow copy of ``record`` with the combined field removed and
        the three split fields added.
    """
    base_field_name = 'mailingaddresscitystatezip'

    updated_record = record.copy()

    city = state = zip_code = None

    if updated_record[base_field_name] is not None:
        found = re.match("^([A-Z\\s]+)\\s([A-Z]+)\\s([0-9]+)",
                         updated_record[u'mailingaddresscitystatezip'])

        if found is not None:
            city, state, zip_code = found.groups()
            zip_code = int(zip_code)

    for part_name, part_value in zip(('city', 'state', 'zip'),
                                     (city, state, zip_code)):
        updated_record[unicode('mailingaddress' + part_name)] = part_value

    updated_record.pop(base_field_name)

    return updated_record

def format_dates(record):
    """Convert the license expiration date string into a datetime.

    The ``licenseexpirationdatemmddccyy`` value is parsed with the
    ``%m%d%Y`` format. A None value is left untouched; a value that
    ``strptime`` rejects with ValueError is replaced by None.

    Args:
        record: Record dictionary containing the date field.

    Returns:
        A shallow copy of ``record`` with the date field converted.
    """
    converted = record.copy()

    for field in (u'licenseexpirationdatemmddccyy',):
        raw_value = converted[field]

        if raw_value is None:
            continue

        try:
            parsed = datetime.strptime(raw_value, "%m%d%Y")
        except ValueError:
            parsed = None

        converted[field] = parsed

    return converted

def format_timestamps(record):
    """Convert the created/updated epoch timestamps into datetimes.

    The ``createdat`` and ``updatedat`` values are passed to
    ``datetime.fromtimestamp``, which yields naive local-time datetimes.
    A None value is left untouched; a value that is out of range for the
    platform is replaced by None.

    Args:
        record: Record dictionary containing ``createdat`` and
            ``updatedat``.

    Returns:
        A shallow copy of ``record`` with both fields converted.
    """
    timestamp_keys = (u'createdat', u'updatedat')

    updated_record = record.copy()

    for key in timestamp_keys:
        raw_value = updated_record[key]

        if raw_value is None:
            continue

        try:
            updated_record[key] = datetime.fromtimestamp(raw_value)
        except (ValueError, OverflowError, OSError):
            # Out-of-range timestamps raise ValueError on Python 2 but
            # OverflowError or OSError on Python 3; treat all as "no value".
            updated_record[key] = None

    return updated_record

def format_booleans(record):
    """Convert the continuing education flag into a boolean.

    The ``continuingeducationflag`` value becomes True when it is a member
    of ``[u"Y"]`` and False otherwise. A None value is left untouched; if
    the membership test raises ValueError the field is set to None.

    Args:
        record: Record dictionary containing the flag field.

    Returns:
        A shallow copy of ``record`` with the flag converted.
    """
    converted = record.copy()

    for field in (u'continuingeducationflag',):
        raw_value = converted[field]

        if raw_value is None:
            continue

        try:
            flag = raw_value in [u"Y"]
        except ValueError:
            flag = None

        converted[field] = flag

    return converted

def format_ints(record):
    """Convert the integer-valued fields of a record to int.

    Each listed field is passed through ``int``. A None value is left
    untouched; a value that ``int`` rejects with ValueError is replaced
    by None.

    Args:
        record: Record dictionary containing every field listed below.

    Returns:
        A shallow copy of ``record`` with the listed fields converted.
    """
    # NOTE(review): 'businesscitystatezip' looks like a free-text field by
    # its name; any non-numeric text ends up as None here -- confirm that it
    # belongs in this list.
    integer_fields = (u'businesscitystatezip',
                      u'createdmeta',
                      u'licensenumber',
                      u'mailingaddresscountycode',
                      u'mailingaddresslocationhumanaddresszip',
                      u'mailingaddresszip',
                      u'updatedmeta')

    converted = record.copy()

    for field in integer_fields:
        raw_value = converted[field]

        if raw_value is None:
            continue

        try:
            number = int(raw_value)
        except ValueError:
            number = None

        converted[field] = number

    return converted

def format_floats(record):
    """Convert the mailing address latitude and longitude to float.

    Each of the two coordinate fields is passed through ``float``. A None
    value is left untouched; a value that ``float`` rejects with
    ValueError is replaced by None.

    Args:
        record: Record dictionary containing both coordinate fields.

    Returns:
        A shallow copy of ``record`` with the coordinates converted.
    """
    converted = record.copy()

    for field in (u'mailingaddresslocationlatitude',
                  u'mailingaddresslocationlongitude'):
        raw_value = converted[field]

        if raw_value is None:
            continue

        try:
            number = float(raw_value)
        except ValueError:
            number = None

        converted[field] = number

    return converted

def format_telephone_numbers(record):
    """Format ten-digit telephone numbers and extract their area codes.

    For ``businesstelephone`` and ``ownertelephone`` a companion
    ``<field>areacode`` entry is always added. When the value consists of
    exactly ten digits it is rewritten as ``NNN-NNN-NNNN`` and the area
    code is stored as an int; otherwise the value is left unchanged and
    the area code is None.

    Args:
        record: Record dictionary containing both telephone fields.

    Returns:
        A shallow copy of ``record`` with the formatted numbers and the
        added area code fields.
    """
    result = record.copy()

    for field in (u'businesstelephone', u'ownertelephone'):
        area_field = field + 'areacode'
        result[area_field] = None

        digits = record[field]
        if digits is None:
            continue

        hit = re.match('^([0-9]{3})([0-9]{3})([0-9]{4})$', digits)
        if hit is None:
            continue

        area, exchange, line = hit.groups()
        result[area_field] = int(area)
        result[field] = '-'.join((area, exchange, line))

    return result

In [4]:
# Load the TDLR license dataset, downloading it on the first run and
# reusing the cached JSON file afterwards.
data_path = "./Data"
data_file = 'tdlrAllLicenses.json'
datafile_fullpath = os.path.join(data_path, data_file)

download_url = "https://data.texas.gov/api/views/7358-krk7/rows.json?" +\
               "accessType=DOWNLOAD"

if not os.path.exists(data_path):
    os.mkdir(data_path)

# Decide on the cached file itself, not on the directory: the directory
# may exist without the file (e.g. after an interrupted first run).
if os.path.exists(datafile_fullpath):
    with open(datafile_fullpath, "r") as infile:
        tdlr_all_licenses = json.load(infile)
else:
    url_handle = urllib2.urlopen(download_url)
    try:
        response = json.loads(url_handle.read())
    finally:
        # Release the connection even when reading or parsing fails.
        url_handle.close()

    with open(datafile_fullpath, "w") as outfile:
        json.dump(response, outfile)

    # Bind the downloaded data too, so later cells work on the first run.
    tdlr_all_licenses = response

In [94]:
# Broadcast the column metadata needed by the record formatters, then run
# the first 1000 raw rows through every formatting step and collect them.
dataset_columns = tdlr_all_licenses['meta']['view'][u'columns']

columns_br = sc.broadcast(dataset_columns)

# Field names are the column fieldName values with ':' and '_' removed.
fieldnames_br = sc.broadcast([re.sub("[:_]", "", column['fieldName'])
                              for column in dataset_columns])

mailingaddrloc_vars_br =\
    sc.broadcast(init_mailingaddrloc_vars(tdlr_all_licenses))

raw_data_rdd = sc.parallelize(tdlr_all_licenses['data'][:1000], 4)

# Applied in this order; each step maps one record dict to a new one.
record_transforms = [format_record,
                     split_mailingaddresslocation,
                     split_mailingaddr_citystatezip,
                     format_dates,
                     format_timestamps,
                     format_booleans,
                     format_ints,
                     format_floats,
                     format_telephone_numbers]

records_rdd = raw_data_rdd

for transform in record_transforms:
    records_rdd = records_rdd.map(transform)

d = records_rdd.collect()