## Adam Curry Week 3
## Assignment 3
For this assignment, you will be working with data from OpenFlights. This data was originally obtained from the OpenFlights GitHub repository and a copy of the original data is found in data/external/openflights/. For this assignment, you will use a dataset derived from that original data. You can find this data in data/processed/openflights/routes.jsonl.gz. The data is compressed with gzip and encoded in the JSON Lines format. Each line represents a single airline route.

The dsc650/assignments/assignment03 directory contains placeholder code and data outputs for this assignment.

### Assignment 3.1
In the first part of the assignment, you will be creating schemas for the route data and encoding the routes.jsonl.gz using Protocol Buffers, Avro, and Parquet.



In [1]:
import os
import sys
import gzip
import json
from pathlib import Path
import csv

import pandas as pd
import s3fs
import pyarrow as pa

import jsonschema
from jsonschema.exceptions import ValidationError


endpoint_url='https://storage.budsc.midwest-datascience.com'
current_dir = Path(os.getcwd()).absolute()
schema_dir = current_dir.joinpath('schemas')
results_dir = current_dir.joinpath('results')
results_dir.mkdir(parents=True, exist_ok=True)


def read_jsonl_data():
    """Fetch the routes dataset and decode it into a list.

    Opens ``data/processed/openflights/routes.jsonl.gz`` anonymously through
    the S3-compatible endpoint stored in ``endpoint_url``, decompresses it
    with gzip, and parses each line with ``json.loads``.

    Returns:
        list: one decoded JSON value per line of the source file.
    """
    src_data_path = 'data/processed/openflights/routes.jsonl.gz'
    filesystem = s3fs.S3FileSystem(
        anon=True,
        client_kwargs={'endpoint_url': endpoint_url},
    )

    parsed = []
    with filesystem.open(src_data_path, 'rb') as compressed, \
            gzip.open(compressed, 'rb') as lines:
        # Iterating the gzip stream yields the same lines as readlines().
        for raw_line in lines:
            parsed.append(json.loads(raw_line))
    return parsed

In [2]:
# Load the records from https://storage.budsc.midwest-datascience.com/data/processed/openflights/routes.jsonl.gz 
records = read_jsonl_data()

In [8]:
# Dump every loaded record to ./routes_out.json (indented, keys sorted) so the
# data can be inspected outside the notebook.
with open('./routes_out.json','w') as f:
   json.dump(records, f, indent=2, sort_keys=True)

In [12]:
#schema_dir

PosixPath('/home/jovyan/dsc650/dsc650/assignments/assignment03/schemas')

In [44]:
#record = records[0]
#record_json = json.dumps(record)
#record_json

'{"airline": {"airline_id": 410, "name": "Aerocondor", "alias": "ANA All Nippon Airways", "iata": "2B", "icao": "ARD", "callsign": "AEROCONDOR", "country": "Portugal", "active": true}, "src_airport": {"airport_id": 2965, "name": "Sochi International Airport", "city": "Sochi", "country": "Russia", "iata": "AER", "icao": "URSS", "latitude": 43.449902, "longitude": 39.9566, "altitude": 89, "timezone": 3.0, "dst": "N", "tz_id": "Europe/Moscow", "type": "airport", "source": "OurAirports"}, "dst_airport": {"airport_id": 2990, "name": "Kazan International Airport", "city": "Kazan", "country": "Russia", "iata": "KZN", "icao": "UWKD", "latitude": 55.606201171875, "longitude": 49.278701782227, "altitude": 411, "timezone": 3.0, "dst": "N", "tz_id": "Europe/Moscow", "type": "airport", "source": "OurAirports"}, "codeshare": false, "equipment": ["CR2"]}'

In [48]:
"""schema_path = schema_dir.joinpath('routes-schema.json')
validation_csv_path = results_dir.joinpath('validation-results.csv')
with open(schema_path) as f:
    schema = json.load(f)

with open(validation_csv_path, 'w') as f:
    for i, record in enumerate(record_json):
        try:
            jsonschema.validate(instance=record,schema=schema)
        except ValidationError as e:
            print(e)"""


### 3.1a. JSON Schema
Create a JSON Schema in the schemas/routes-schema.json file to describe a route and validate the data in routes.jsonl.gz using the jsonschema library.

In [4]:
def validate_jsonl_data(records):
    """Validate every record against ``schemas/routes-schema.json``.

    Each record that fails validation is printed and logged as one CSV row
    in ``results/validation-results.csv`` with the columns ``row`` (index of
    the record in *records*), ``path`` (location of the offending value,
    ``/``-separated), ``instance`` (the offending value) and ``message``.
    Validation continues past failures so the file lists every invalid
    record, not just the first one.

    Args:
        records: iterable of decoded route records.

    Returns:
        The message of the first validation failure, or ``None`` when every
        record is valid.
    """
    schema_path = schema_dir.joinpath('routes-schema.json')
    validation_csv_path = results_dir.joinpath('validation-results.csv')

    with open(schema_path) as f:
        schema = json.load(f)

    first_failure = None
    # newline='' lets the csv module control the line endings itself.
    with open(validation_csv_path, 'w', newline='') as f:
        results = csv.writer(f)
        results.writerow(['row', 'path', 'instance', 'message'])
        for i, record in enumerate(records):
            try:
                jsonschema.validate(instance=record, schema=schema)
            except ValidationError as e:
                detail = e.message
                results.writerow([
                    i,
                    '/'.join(str(part) for part in e.path),
                    str(e.instance),
                    detail,
                ])
                print(detail)
                if first_failure is None:
                    first_failure = detail

    return first_failure

validate_jsonl_data(records)

### 3.1b. Avro
Use the fastavro library to create results/routes.avro with the schema provided.



In [7]:
import fastavro
from fastavro.schema import load_schema
from fastavro import writer

def create_avro_dataset(records):
    """Encode *records* as Avro in ``results/routes.avro``.

    The schema is loaded from ``schemas/routes.avsc``. As a side effect the
    output location is published as the module-level name ``data_path``.
    """
    global data_path
    data_path = results_dir.joinpath('routes.avro')

    avro_schema = load_schema(schema_dir.joinpath('routes.avsc'))
    with open(data_path, 'wb') as avro_file:
        writer(avro_file, avro_schema, records)
        
create_avro_dataset(records)

In [15]:
# see if loaded
import pandas as pd 
# Read the Avro file back to confirm it was written correctly.
# NOTE: this rebinds the module-level `records` to the rows decoded from the
# Avro file, so later cells operate on those rows rather than on the list
# returned by read_jsonl_data().
with open(data_path,mode='rb') as fp:
    # Create an Avro reader over the open file
    reader = fastavro.reader(fp)
    # Load all records into memory
    records = [r for r in reader]
    # Build a pandas.DataFrame from the records
    df = pd.DataFrame.from_records(records)
    # Show the first row as a quick sanity check
    print(df.head(1))


                                             airline  \
0  {'airline_id': 410, 'name': 'Aerocondor', 'ali...   

                                         src_airport  \
0  {'airport_id': 2965, 'name': 'Sochi Internatio...   

                                         dst_airport  codeshare  stops  \
0  {'airport_id': 2990, 'name': 'Kazan Internatio...      False      0   

  equipment  
0     [CR2]  


### 3.1c. Parquet
Create a Parquet dataset in results/routes.parquet using Apache Arrow.


In [20]:
from pyarrow.json import read_json
import pyarrow.parquet as pq

def create_parquet_dataset():
    """Re-download the routes data and store it as ``results/routes.parquet``.

    The JSON Lines records are loaded into a pandas DataFrame, converted to
    an Arrow table, and written with ``compression='none'``. The first
    DataFrame row and the Arrow table are printed along the way.
    """
    parquet_output_path = results_dir.joinpath('routes.parquet')
    src_data_path = 'data/processed/openflights/routes.jsonl.gz'
    filesystem = s3fs.S3FileSystem(
        anon=True,
        client_kwargs={'endpoint_url': endpoint_url},
    )

    with filesystem.open(src_data_path, 'rb') as compressed, \
            gzip.open(compressed, 'rb') as lines:
        rows = [json.loads(raw_line) for raw_line in lines]

        # Go through pandas to obtain a simple tabular view first.
        frame = pd.DataFrame(rows)
        print(frame.head(1))

        # Convert to Apache Arrow, then persist as Parquet.
        arrow_table = pa.Table.from_pandas(frame)
        print(arrow_table)
        pq.write_table(arrow_table, parquet_output_path, compression='none')

create_parquet_dataset()

                                             airline  \
0  {'airline_id': 410, 'name': 'Aerocondor', 'ali...   

                                         src_airport  \
0  {'airport_id': 2965, 'name': 'Sochi Internatio...   

                                         dst_airport  codeshare equipment  
0  {'airport_id': 2990, 'name': 'Kazan Internatio...      False     [CR2]  
pyarrow.Table
airline: struct<active: bool, airline_id: int64, alias: string, callsign: string, country: string, iata: string, icao: string, name: string>
  child 0, active: bool
  child 1, airline_id: int64
  child 2, alias: string
  child 3, callsign: string
  child 4, country: string
  child 5, iata: string
  child 6, icao: string
  child 7, name: string
src_airport: struct<airport_id: int64, altitude: int64, city: string, country: string, dst: string, iata: string, icao: string, latitude: double, longitude: double, name: string, source: string, timezone: double, type: string, tz_id: string>
  child 0, airport_i

### 3.1d. Protocol Buffers
Using the generated code found in dsc650/assignments/assignment03/routes_pb2.py, create results/routes.pb using Protocol Buffers.


In [None]:
"""sys.path.insert(0, os.path.abspath('routes_pb2'))

import routes_pb2
import snappy

routes = routes_pb2.Routes()
for record in records:
    route = routes_pb2.Route()
    airin = route.airline"""

In [18]:
sys.path.insert(0, os.path.abspath('routes_pb2'))

import routes_pb2
import snappy

def _airport_to_proto_obj(airport):
    """Convert an airport dict into a ``routes_pb2.Airport`` message.

    Args:
        airport: mapping of airport attributes, or ``None``.

    Returns:
        The populated message, or ``None`` when *airport* is ``None`` or has
        no ``airport_id``.
    """
    if airport is None or airport.get('airport_id') is None:
        return None

    message = routes_pb2.Airport()
    message.airport_id = airport.get('airport_id')

    # These attributes are copied only when present and truthy.
    optional_fields = (
        'name', 'city', 'iata', 'icao', 'altitude',
        'timezone', 'dst', 'tz_id', 'type', 'source',
    )
    for field_name in optional_fields:
        value = airport.get(field_name)
        if value:
            setattr(message, field_name, value)

    # Coordinates are assigned unconditionally.
    message.latitude = airport.get('latitude')
    message.longitude = airport.get('longitude')

    return message


def _airline_to_proto_obj(airline):
    """Convert an airline dict into a ``routes_pb2.Airline`` message.

    Args:
        airline: mapping of airline attributes, or ``None``.

    Returns:
        The populated message, or ``None`` when *airline* is ``None`` or
        lacks a truthy ``name`` or ``airline_id``.
    """
    # Guard None the same way _airport_to_proto_obj does, instead of failing
    # with AttributeError on `.get`.
    if airline is None:
        return None
    if not airline.get('name') or not airline.get('airline_id'):
        return None

    obj = routes_pb2.Airline()
    obj.airline_id = airline.get('airline_id')
    # `name` is known to be truthy here because of the guard above.
    obj.name = airline.get('name')

    # Remaining attributes are copied only when present and truthy.
    for field_name in ('alias', 'iata', 'icao', 'callsign', 'country', 'active'):
        value = airline.get(field_name)
        if value:
            setattr(obj, field_name, value)
    return obj


def create_protobuf_dataset(records):
    """Encode *records* with Protocol Buffers and write them to disk.

    Produces ``results/routes.pb`` (raw serialized ``Routes`` message) and
    ``results/routes.pb.snappy`` (the same bytes compressed with snappy).

    For each record the ``airline``, ``src_airport`` and ``dst_airport``
    sub-messages are built by the helper functions and copied in full with
    ``CopyFrom``, so every field the helpers populate is kept. A sub-message
    is left unset when its helper returns ``None``; ``codeshare`` is set when
    the record carries a non-None value.

    Args:
        records: iterable of route dicts.
    """
    routes = routes_pb2.Routes()
    for record in records:
        route = routes_pb2.Route()

        airline_data = record.get('airline')
        if airline_data is not None:
            airline = _airline_to_proto_obj(airline_data)
            # The helper returns None for airlines without a name/id.
            if airline is not None:
                route.airline.CopyFrom(airline)

        for field_name in ('src_airport', 'dst_airport'):
            # The helper returns None for a missing airport or airport_id.
            airport = _airport_to_proto_obj(record.get(field_name))
            if airport is not None:
                getattr(route, field_name).CopyFrom(airport)

        codeshare = record.get('codeshare')
        if codeshare is not None:
            route.codeshare = codeshare

        # NOTE(review): other record keys (e.g. 'equipment') are still not
        # copied; confirm against routes.proto whether Route defines them.
        routes.route.append(route)

    # Serialize once and reuse the bytes for both output files.
    payload = routes.SerializeToString()

    data_path = results_dir.joinpath('routes.pb')
    with open(data_path, 'wb') as f:
        f.write(payload)

    compressed_path = results_dir.joinpath('routes.pb.snappy')
    with open(compressed_path, 'wb') as f:
        f.write(snappy.compress(payload))

create_protobuf_dataset(records)

### 3.1e. Output Sizes
Compare the output sizes of the different formats. Populate the results in results/comparison.csv. Compare compressed and uncompressed sizes if possible.

In [55]:
#results_dir = current_dir.joinpath('results')

# Collect the on-disk size (in bytes) of each encoded dataset.
avro = os.path.getsize(results_dir.joinpath('routes.avro'))
parquet = os.path.getsize(results_dir.joinpath('routes.parquet'))
pb = os.path.getsize(results_dir.joinpath('routes.pb'))
# Deliberately not named `snappy`: that would rebind the imported snappy
# module and break any later call to create_protobuf_dataset().
pb_snappy = os.path.getsize(results_dir.joinpath('routes.pb.snappy'))

# Pair every size with its file name so the rows stay identifiable.
my_list = [
    ('routes.avro', avro),
    ('routes.parquet', parquet),
    ('routes.pb', pb),
    ('routes.pb.snappy', pb_snappy),
]
df = pd.DataFrame(my_list, columns=['file', 'size_bytes'])

In [56]:
# Save the size table to results/comparison.csv; the index and the header row
# are both omitted from the file.
df.to_csv(results_dir.joinpath('comparison.csv'), index=False, header=False)

## 3.2

### 3.2.a Simple Geohash Index

In [59]:
import pygeohash

def create_hash_dirs(records):
    """Bucket records by source-airport geohash and write one file per bucket.

    Every record whose ``src_airport`` carries both a latitude and a
    longitude gets a ``geohash`` key added in place. Records with a geohash
    are grouped by its first three characters and each group is written, as
    gzip-compressed JSON Lines, to
    ``results/geoindex/<c1>/<c1c2>/<c1c2c3>.jsonl.gz``. Each output directory
    is printed as it is processed.

    Args:
        records: list of route dicts; mutated by adding ``geohash``.
    """
    geoindex_dir = results_dir.joinpath('geoindex')
    geoindex_dir.mkdir(exist_ok=True, parents=True)

    hash_index = {}
    for record in records:
        src_airport = record.get('src_airport') or {}
        latitude = src_airport.get('latitude')
        longitude = src_airport.get('longitude')
        # Compare against None rather than truthiness: 0.0 is a valid
        # coordinate (equator / prime meridian) and must not be skipped.
        if latitude is not None and longitude is not None:
            record['geohash'] = pygeohash.encode(latitude, longitude)

        geohash = record.get('geohash')
        if geohash:
            hash_index.setdefault(geohash[:3], []).append(record)

    # Sorted so directories are created and printed in a stable order.
    for key in sorted(hash_index):
        output_dir = geoindex_dir.joinpath(key[:1], key[:2])
        print(output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)
        output_path = output_dir.joinpath('{}.jsonl.gz'.format(key))
        with gzip.open(output_path, 'w') as f:
            json_output = '\n'.join(json.dumps(value) for value in hash_index[key])
            f.write(json_output.encode('utf-8'))
    
create_hash_dirs(records)

/home/jovyan/dsc650/dsc650/assignments/assignment03/results/geoindex/2/2e
/home/jovyan/dsc650/dsc650/assignments/assignment03/results/geoindex/2/2e
/home/jovyan/dsc650/dsc650/assignments/assignment03/results/geoindex/2/2e
/home/jovyan/dsc650/dsc650/assignments/assignment03/results/geoindex/2/2h
/home/jovyan/dsc650/dsc650/assignments/assignment03/results/geoindex/2/2h
/home/jovyan/dsc650/dsc650/assignments/assignment03/results/geoindex/2/2h
/home/jovyan/dsc650/dsc650/assignments/assignment03/results/geoindex/2/2j
/home/jovyan/dsc650/dsc650/assignments/assignment03/results/geoindex/2/2j
/home/jovyan/dsc650/dsc650/assignments/assignment03/results/geoindex/2/2j
/home/jovyan/dsc650/dsc650/assignments/assignment03/results/geoindex/2/2j
/home/jovyan/dsc650/dsc650/assignments/assignment03/results/geoindex/2/2j
/home/jovyan/dsc650/dsc650/assignments/assignment03/results/geoindex/2/2k
/home/jovyan/dsc650/dsc650/assignments/assignment03/results/geoindex/2/2k
/home/jovyan/dsc650/dsc650/assignments

### 3.2.b Simple Search Feature

In [42]:

#Find unique elements, preserving their order. Remembers all elements ever seen.
#https://iteration-utilities.readthedocs.io/en/latest/generated/unique_everseen.html
from iteration_utilities import unique_everseen

def airport_search(latitude, longitude, max_distance_m=625441):
    """Print the source airports whose geohash lies near a location.

    The query point is encoded as a 3-character geohash and compared with
    the 12-character geohash of every distinct source airport found in the
    module-level ``records`` using
    ``pygeohash.geohash_approximate_distance``. Airports whose approximate
    distance does not exceed the threshold are printed, one dict per line,
    in order of first appearance.

    Args:
        latitude: latitude of the query point.
        longitude: longitude of the query point.
        max_distance_m: threshold applied to the approximate distance
            (default 625441, the value previously hard-coded).

    Returns:
        None. Results are printed.
    """
    max_distance = max_distance_m / 1000
    srcgeoval = pygeohash.encode(latitude, longitude, precision=3)

    # Track airports already handled in a set so de-duplication is O(1) per
    # record instead of a linear scan over a list.
    seen = set()
    for route in records:
        src = route.get('src_airport')
        if src is None:
            continue

        name = src['name']
        lat = src['latitude']
        lon = src['longitude']
        identity = (name, lat, lon)
        if identity in seen:
            continue
        seen.add(identity)

        geohval = pygeohash.encode(lat, lon, precision=12)
        distance = pygeohash.geohash_approximate_distance(srcgeoval, geohval) / 1000
        if distance <= max_distance:
            # NOTE(review): the value has been divided by 1000 although the
            # key says "(m)"; the key is kept as-is so the output is unchanged.
            print({
                "Airport": name,
                "Geoval": geohval,
                "Latitude": lat,
                "Longitude": lon,
                "Distance(m)": distance,
            })


airport_search( 41.154443, -96.042238)

{'Airport': 'Southeast Iowa Regional Airport', 'Geoval': '9zr0n2k7mg7s', 'Latitude': 40.783199310302734, 'Longitude': -91.12550354003906, 'Distance(m)': 625.441}
{'Airport': 'Minneapolis-St Paul International/Wold-Chamberlain Airport', 'Geoval': '9zvxqdtbwe4q', 'Latitude': 44.882, 'Longitude': -93.221802, 'Distance(m)': 625.441}
{'Airport': 'Kirksville Regional Airport', 'Geoval': '9znhn4fce9cb', 'Latitude': 40.09349822998047, 'Longitude': -92.5448989868164, 'Distance(m)': 625.441}
{'Airport': 'Quincy Regional Baldwin Field', 'Geoval': '9zp5hz655dz7', 'Latitude': 39.94269943, 'Longitude': -91.19460297, 'Distance(m)': 625.441}
{'Airport': 'Waterloo Regional Airport', 'Geoval': '9zw617k1hfpk', 'Latitude': 42.55709838867188, 'Longitude': -92.40029907226562, 'Distance(m)': 625.441}
{'Airport': 'The Eastern Iowa Airport', 'Geoval': '9zqy30fhp6nu', 'Latitude': 41.88470077514648, 'Longitude': -91.71080017089844, 'Distance(m)': 625.441}
{'Airport': 'Dubuque Regional Airport', 'Geoval': '9zx3pz