In [3]:
import os
import logging

import requests
import json
import datetime

import pyarrow.csv as pv
import pyarrow.parquet as pq
import pyarrow as pa

Workaround hitting rows with incomplete data

In [51]:
def get_json(offset=1, limit = 100_000):
    while True:
        url = 'https://data.austintexas.gov/resource/xwdj-i9he.json'
        
        params = {
            "$limit": limit,
            "$offset": offset
        }

        # Make the GET request to the API
        response = requests.get(url, params=params)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        data_json = response.json()
        print(f'start {offset} with {len(data_json)} records')

        # if the page has no records, stop iterating
        if data_json:
            yield data_json
            if len(data_json) < limit:
                print('Exit from len(data) with offset', offset)
                # ti.xcom_push(key='offset', value=offset)
                break
            else:
                offset += limit
                #ti.xcom_push(key='offset', value=offset)
        else:
            # No more data, break the loop
            break

def preprocess_data(data_json):
    """ 
    Parameters:
        data_json: list of dictionaries from API request
    """
    # create a table
    # schema with data types throws errors
    # data types changed throgh casting below
    if len(data_json[0]) < 20:
        raise KeyError
    py_table = pa.Table.from_pylist(data_json)
    print(py_table.column_names)
    # drop not needed columns
    # cols_to_drop = [
    #         'sr_location',
    #         'sr_location_council_district',
    #         'sr_location_lat_long', 
    #         'sr_location_map_tile', 
    #         'sr_location_map_page',
    #         'sr_location_street_number',
    #         'sr_location_street_name']
    # for col in cols_to_drop:
    #     if col in py_table.column_names:
    #         py_table = py_table.drop_columns(col)
    py_table = py_table.drop_columns([
            'sr_location',
            # 'sr_location_council_district',
            'sr_location_lat_long', 
            'sr_location_map_tile', 
            'sr_location_map_page',
            # 'sr_location_street_number',
            'sr_location_street_name'])
    if 'sr_location_street_number' in py_table.column_names:
        py_table = py_table.drop_columns('sr_location_street_number')
    if 'sr_location_council_district' in py_table.column_names:
        py_table = py_table.drop_columns('sr_location_council_district')
      
    # rename columns, remove sr_ prefix
    cols = py_table.column_names
    cols = [col[3:] if col!='sr_number' else 'request_id' for col in cols]
    py_table = py_table.rename_columns(cols)

    # sort columns by type and info
    request_info = ['request_id', 'status_desc', 'type_desc', 'method_received_desc']
    date_info = ['created_date', 'status_date', 'updated_date']
    address_info = ['location_county', 'location_city', 'location_zip_code']
    gis_info = ['location_x', 'location_y', 'location_lat', 'location_long']

    # cast date 
    for col in date_info:
        arr = py_table[col].cast(pa.timestamp('ms'))
        py_table = py_table.drop_columns(col)
        py_table = py_table.append_column(col, arr)
    
    # cast gis info to float
    for col in gis_info:
        arr = py_table[col].cast(pa.float64())
        py_table = py_table.drop_columns(col)
        py_table = py_table.append_column(col, arr)
    
    # new column order
    new_order = request_info + date_info + address_info + gis_info
    return py_table.select(new_order)

def upload_to_gcs(bucket_name, object_name, pq_file):
    """
    Ref: https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python
    :param bucket: GCS bucket name
    :param object_name: target path & file-name
    :param local_file: source path & file-name
    :return:
    """
    # WORKAROUND to prevent timeout for files > 6 MB on 800 kbps upload speed.
    # (Ref: https://github.com/googleapis/python-storage/issues/74)
    storage.blob._MAX_MULTIPART_SIZE = 5 * 1024 * 1024  # 5 MB
    storage.blob._DEFAULT_CHUNKSIZE = 5 * 1024 * 1024  # 5 MB
    # End of Workaround

    client = storage.Client()
    bucket = client.bucket(bucket_name)
    # f"raw/{pq_file}"
    blob = bucket.blob(object_name)
    blob.upload_from_filename(pq_file)

def save_data(offset=1, limit = 100_000):
    # extract
    n = 1
    has_data = True
    while has_data:
        try:
            geneartor = get_json(offset, limit)
            #for data in geneartor:
            data = next(geneartor)
            print(offset)
            print("Data Length:", len(data))
            #data = [*data, *j]
            if len(data) == 0:
                has_data = False
                break
            # preprocess
            table = preprocess_data(data)
            offset = offset + limit
            print("Data Processed, offset increased by 100 000")
                
            # write to local file
            filename = f'data_{n:02d}.parquet'
            object_name = f"raw/{filename}"
            local_file = f"{AIRFLOW_HOME}/{filename}"
            # save table into a file
            pq.write_table(table, local_file)
            # upload to gcs
            upload_to_gcs(GCP_GCS_BUCKET, object_name, local_file)
            os.remove(local_file)
            n += 1
        except KeyError:
            print(KeyError)
            offset = offset + 1
            print("Offset increased by 1, offset =", offset)
            # continue
        except StopIteration:
            print(StopIteration)
            has_data = False
            break
        

In [52]:
save_data(offset=800_001, limit=100_000)

start 800001 with 100000 records
800001
Data Length: 100000
<class 'KeyError'>
Offset increased by 1, offset = 800002
start 800002 with 100000 records
800002
Data Length: 100000
['sr_location', 'sr_location_council_district', 'sr_method_received_desc', 'sr_closed_date', 'sr_location_x', 'sr_status_desc', 'sr_type_desc', 'sr_location_county', 'sr_location_map_page', 'sr_location_long', 'sr_updated_date', 'sr_number', 'sr_location_lat', 'sr_location_lat_long', 'sr_status_date', 'sr_location_street_name', 'sr_created_date', 'sr_location_street_number', 'sr_location_city', 'sr_location_y', 'sr_location_map_tile', 'sr_location_zip_code']
Data Processed, offset increased by 100 000
start 900002 with 100000 records
900002
Data Length: 100000
['sr_location', 'sr_location_council_district', 'sr_method_received_desc', 'sr_closed_date', 'sr_location_x', 'sr_status_desc', 'sr_type_desc', 'sr_location_county', 'sr_location_map_page', 'sr_location_long', 'sr_updated_date', 'sr_number', 'sr_location_

In [41]:
url = 'https://data.austintexas.gov/resource/xwdj-i9he.json'
limit = 100
offset = 800_001

params = {
    "$limit": limit,
    "$offset": offset
}

# Make the GET request to the API
response = requests.get(url, params=params)
response.raise_for_status()  # Raise an HTTPError for bad responses
data_json = response.json()
py_table = pa.Table.from_pylist(data_json)
py_table.column_names

['sr_method_received_desc',
 'sr_closed_date',
 'sr_status_desc',
 'sr_type_desc',
 'sr_updated_date',
 'sr_number',
 'sr_status_date',
 'sr_created_date']

In [48]:
len(data_json[1])

22