In [1]:
# --- Pipeline configuration -------------------------------------------------
# Source API that returns one random postcode per request
api_get = "https://api.postcodes.io/random/postcodes"

# S3 locations: freshly extracted files land first, then move to raw-hist
land_base_path = "s3://mojap-land/open_data/postcodes_example/"
raw_hist_base_path = "s3://mojap-raw-hist/open_data/postcodes_example/"

# Bucket handed to the Glue job
job_bucket = "alpha-curated-postcodes-example"

In [2]:
import json
import os
from datetime import datetime

from urllib.request import urlopen

def unpack_data(data):
    """Flatten one API response into a single-level dict (one tabular row).

    Every key of ``data['result']`` except ``'codes'`` is copied as-is; each
    entry of the nested ``'codes'`` mapping is then added under the key
    ``'codes_<name>'``.

    Raises:
        KeyError: if ``'result'`` or ``result['codes']`` is missing.
    """
    result = data['result']
    # Top-level fields first, preserving the order they arrive in
    flat_row = {field: value for field, value in result.items() if field != 'codes'}
    # Then the nested codes, prefixed so they cannot be confused with top-level fields
    flat_row.update({"codes_" + name: code for name, code in result["codes"].items()})
    return flat_row

# Get the run timestamp of this script - use it as the file partition.
# int() drops the fractional part, so the partition is in whole epoch seconds.
run_timestamp = int(datetime.now().timestamp())

# Request the API 10 times (one random postcode per request)
data = []
for i in range(0, 10):
    # The context manager closes the HTTP response even if parsing fails
    with urlopen(api_get) as f:
        # Parse the whole body rather than assuming the JSON sits on the first line
        row = json.loads(f.read())

    new_row = unpack_data(row)
    # Record the request order so rows can be told apart downstream
    new_row['index'] = i
    data.append(new_row)

print(data[0])

{'postcode': 'TR1 9XL', 'quality': 5, 'eastings': 182961, 'northings': 43918, 'country': 'England', 'nhs_ha': 'South West', 'longitude': -5.046154, 'latitude': 50.25527, 'european_electoral_region': 'South West', 'primary_care_trust': 'Cornwall and Isles of Scilly', 'region': 'South West', 'lsoa': 'Cornwall 044A', 'msoa': 'Cornwall 044', 'incode': '9XL', 'outcode': 'TR1', 'parliamentary_constituency': 'Truro and Falmouth', 'admin_district': 'Cornwall', 'parish': 'Truro', 'admin_county': None, 'admin_ward': 'Truro Boscawen & Redannick', 'ced': None, 'ccg': 'NHS Kernow', 'nuts': 'Cornwall and Isles of Scilly', 'codes_admin_district': 'E06000052', 'codes_admin_county': 'E99999999', 'codes_admin_ward': 'E05013354', 'codes_parish': 'E04011600', 'codes_parliamentary_constituency': 'E14001003', 'codes_ccg': 'E38000089', 'codes_ccg_id': '11N', 'codes_ced': 'E99999999', 'codes_nuts': 'TLK30', 'codes_lsoa': 'E01018807', 'codes_msoa': 'E02003910', 'codes_lau2': 'E06000052', 'index': 0}


In [3]:
import os
import gzip

from gluejobutils.s3 import (
    s3_path_to_bucket_key,
    s3_resource
)

def write_dicts_to_jsonl_gz(data, s3_path):
    """Serialise dicts as gzip-compressed JSON Lines and upload them to S3.

    Each item of ``data`` becomes one JSON document per line; lines are
    separated by ``'\\n'`` with no trailing newline.

    Args:
        data: iterable of JSON-serialisable dicts. An empty iterable results
            in an empty (zero-line) compressed object rather than an error.
        s3_path: full ``s3://bucket/key`` destination path.
    """
    # join() builds the payload in one pass and copes with empty input,
    # unlike indexing data[0] and appending to a string in a loop
    file_as_string = '\n'.join(json.dumps(d) for d in data)
    b, k = s3_path_to_bucket_key(s3_path)
    compressed_out = gzip.compress(file_as_string.encode('utf-8'))
    s3_resource.Object(b, k).put(Body=compressed_out)

# land_base_path is defined once in the configuration cell at the top of the
# notebook; it is deliberately not redefined here so the two cannot drift apart.

# S3 keys always use '/', so build the path explicitly instead of relying on
# the operating system's path separator via os.path.join
partition_folder = f'file_land_timestamp={run_timestamp}'
file_name = f'random_postcodes_{run_timestamp}.jsonl.gz'
s3_out = '/'.join([land_base_path.rstrip('/'), 'random_postcodes', partition_folder, file_name])
write_dicts_to_jsonl_gz(data, s3_out)

In [4]:
# Quick sanity check on the batch held in memory: at least 10 rows expected.
# Note: the `error` flag set on failure is reset to False by the land
# validation cell further down, so this check only reports - it does not stop the run.
len_data = len(data)
size_ok = len_data >= 10
if size_ok:
    print(f"TEST DATA SIZE: PASSED (size {len_data})")
else:
    error = True
    print(f"TEST DATA SIZE: FAILED (size {len_data})")

TEST DATA SIZE: PASSED (size 10)


In [5]:
import pandas as pd
from gluejobutils import s3
from scripts.utils import (
    read_jsonl_from_s3
)

table_land_path = os.path.join(land_base_path, 'random_postcodes/')

error = False
# The table metadata is a plain JSON document, so read it with json rather than
# coercing it into a DataFrame just to pull out the column names
with open('meta_data/raw/random_postcodes.json') as meta_file:
    meta = json.load(meta_file)
colnames = [c['name'] for c in meta['columns']]

# Get all partitions then test each one
all_data_paths = s3.get_filepaths_from_s3_folder(table_land_path)
if len(all_data_paths) == 0:
    raise ValueError(f"Was expecting data in land but nothing was found in the folder: {table_land_path}")

for data_path in all_data_paths:
    print(f'TESTING {data_path}')
    # Use a dedicated name so the batch built earlier in the notebook (`data`)
    # is not silently replaced by whatever file happens to be read last
    landed_rows = read_jsonl_from_s3(data_path, compressed=True)

    # Let's say we always expect at least 10 records
    len_data = len(landed_rows)
    if len_data < 10:
        error = True
        print(f"TEST DATA SIZE: FAILED (size {len_data})")
    else:
        print(f"TEST DATA SIZE: PASSED (size {len_data})")

    # We might want to check the data against our meta data (if we expect all the columns to exist)
    # If there is an error wait to test the rest of the data so you can see which other rows fail before raising an error
    error_str = ''
    for i, row in enumerate(landed_rows):
        # sorted() keeps the log output stable between runs
        col_mismatch = sorted(set(row.keys()).symmetric_difference(colnames))
        if len(col_mismatch) > 0:
            error_str += f"row {i}: col mismatch: {', '.join(col_mismatch)}\n"
            error = True

    if error_str != '':
        print(error_str)

    if error:
        raise ValueError("Raising error due to one of the tests not passing. See log.")

    print("All tests passed!")
    print("Now writing to raw and deleting from land...")
    # Map the land path onto raw-hist using the configured base paths. Refuse to
    # continue if the file is not under the land prefix: otherwise the "copy"
    # would target the source itself and the delete below would lose the data.
    if not data_path.startswith(land_base_path):
        raise ValueError(f"Refusing to move {data_path}: it is not under the land path {land_base_path}")
    raw_hist_out = raw_hist_base_path + data_path[len(land_base_path):]
    s3.copy_s3_object(data_path, raw_hist_out)
    s3.delete_s3_object(data_path)
    print("Done.")

TESTING s3://mojap-land/open_data/postcodes_example/random_postcodes/file_land_timestamp=1655057864/random_postcodes_1655057864.jsonl.gz
TEST DATA SIZE: PASSED (size 10)
All tests passed!
Now writing to raw and deleting from land...
Done.


In [6]:
from etl_manager.etl import GlueJob
import datetime

# job_bucket is defined once in the configuration cell at the top of the
# notebook; it is deliberately not redefined here so the two cannot drift apart.
iam_role = "airflow-postcodes-example-role"
github_tag = "v0.0.1"
# Snapshot date passed to the job, formatted as YYYY-MM-DD
snapshot_date = datetime.datetime.now().strftime("%Y-%m-%d")

# Get job parameters for specific glue job
job_args = {"--github_tag": github_tag, "--snapshot_date": snapshot_date}
job = GlueJob("etl_pipeline_example_job/", bucket=job_bucket, job_role=iam_role, job_arguments=job_args)

print(f'Starting job "{job.job_name}"...')
try:
    job.run_job()
    job.wait_for_completion(verbose=True)
finally:
    # Always tidy up the job created for this run, even when the run fails or
    # is interrupted (pattern recommended in the etl_manager README)
    job.cleanup()

Starting job "etl_pipeline_example_job"...
2022-06-12 19:18:42: Job State: RUNNING | Execution Time: 3 (s) | Error: n/a
2022-06-12 19:18:52: Job State: RUNNING | Execution Time: 13 (s) | Error: n/a
2022-06-12 19:19:03: Job State: RUNNING | Execution Time: 24 (s) | Error: n/a
2022-06-12 19:19:13: Job State: RUNNING | Execution Time: 34 (s) | Error: n/a
2022-06-12 19:19:23: Job State: RUNNING | Execution Time: 44 (s) | Error: n/a
2022-06-12 19:19:34: Job State: RUNNING | Execution Time: 54 (s) | Error: n/a
2022-06-12 19:19:44: Job State: RUNNING | Execution Time: 64 (s) | Error: n/a
2022-06-12 19:19:54: Job State: RUNNING | Execution Time: 75 (s) | Error: n/a
2022-06-12 19:20:04: Job State: RUNNING | Execution Time: 85 (s) | Error: n/a
2022-06-12 19:20:14: Job State: RUNNING | Execution Time: 95 (s) | Error: n/a
2022-06-12 19:20:24: Job State: SUCCEEDED | Execution Time: 99 (s) | Error: n/a


In [7]:
from etl_manager.meta import read_database_folder

# Build the database object from the metadata files in meta_data/curated/
db = read_database_folder('meta_data/curated/')
# Delete the Glue database and create it again, in that order, so the
# definition matches the metadata just loaded.
# NOTE(review): presumably delete tolerates a database that does not exist yet - confirm
db.delete_glue_database()
db.create_glue_database()

# NOTE(review): presumably registers the partitions written by the Glue job so
# the new data becomes queryable - confirm against the etl_manager docs
db.refresh_all_table_partitions()

In [15]:
# Look up the database and table names from the curated metadata files
with open('meta_data/curated/database.json') as database_file:
    database = json.loads(database_file.read())
database_name = database['name']

with open('meta_data/curated/random_postcodes.json') as table_file:
    table = json.loads(table_file.read())
table_name = table['name']

print(database_name, table_name)

example_postcodes_db random_postcodes


In [13]:
import pydbtools as pydb

# Count the rows now available in the curated table
count_query = f"SELECT COUNT(*) AS Postcodes from {database_name}.{table_name}"
postcode_counts = pydb.read_sql_query(count_query)
print(postcode_counts.values)

[[110]]
