In [21]:
import os
import sys
import pandas as pd
import logging
import json
from google.cloud import bigquery
from hashlib import md5
from typing import List


# SETUP

# -- BigQuery target ---------------------------------------------------------
PROJECT_NAME = "deb-01-372120"   # project part of the fully-qualified dataset id
DATASET_NAME = "air_travel"      # dataset part of the fully-qualified dataset id

# -- local input -------------------------------------------------------------
DATA_DIR = "data/air_travel/"    # folder that contains the tickets file
DEFAULT_TICKETS_FILE = os.path.join(DATA_DIR, "tickets.json")


# **** TABLE SCHEMAS ****

# Maps a logical table key to its BigQuery table name and column schema.
# Every column is declared REQUIRED, so rows with missing values cannot be loaded.
TABLE_METADATA = {
    'airlines': {
        'table_name': 'airlines',
        'schema': [
            bigquery.SchemaField('name', 'string', mode='REQUIRED'),
            bigquery.SchemaField('iata', 'string', mode='REQUIRED'),
            bigquery.SchemaField('icao', 'string', mode='REQUIRED'),
            bigquery.SchemaField('callsign', 'string', mode='REQUIRED'),
            bigquery.SchemaField('country', 'string', mode='REQUIRED')
        ]
    },
    'airports': {
        'table_name': 'airports',
        # NOTE(review): the tickets data carries an `iata` code per airport
        # (`origin.iata` / `destination.iata`), but this schema has no `iata`
        # field -- confirm whether that omission is intended.
        'schema': [
            bigquery.SchemaField('name', 'string', mode='REQUIRED'),
            bigquery.SchemaField('city', 'string', mode='REQUIRED'),
            bigquery.SchemaField('country', 'string', mode='REQUIRED'),
            bigquery.SchemaField('icao', 'string', mode='REQUIRED'),
            # spelled `latitude` to match the source columns (`origin.latitude`);
            # coordinates are fractional degrees, so they need a floating-point type
            bigquery.SchemaField('latitude', 'float64', mode='REQUIRED'),
            bigquery.SchemaField('longitude', 'float64', mode='REQUIRED'),
            bigquery.SchemaField('altitude', 'int64', mode='REQUIRED'),
            bigquery.SchemaField('tz_timezone', 'string', mode='REQUIRED')
        ]
    },
    'passengers': {
        'table_name': 'passengers',
        'schema': [
            bigquery.SchemaField('record_id', 'string', mode='REQUIRED'),
            bigquery.SchemaField('first_name', 'string', mode='REQUIRED'),
            bigquery.SchemaField('last_name', 'string', mode='REQUIRED'),
            bigquery.SchemaField('gender', 'string', mode='REQUIRED'),
            bigquery.SchemaField('birth_date', 'datetime', mode='REQUIRED'),
            bigquery.SchemaField('email', 'string', mode='REQUIRED'),
            bigquery.SchemaField('street', 'string', mode='REQUIRED'),
            bigquery.SchemaField('city', 'string', mode='REQUIRED'),
            bigquery.SchemaField('state', 'string', mode='REQUIRED'),
            bigquery.SchemaField('zip', 'string', mode='REQUIRED'),
            bigquery.SchemaField('start_date', 'datetime', mode='REQUIRED'),
            bigquery.SchemaField('end_date', 'datetime', mode='REQUIRED')
        ]
    }
}


# **** SETUP LOGGING ****
# setup logging and logger
# Configure the root logger: INFO threshold, records written to stdout so they
# appear as cell output.
logging.basicConfig(
    format='[%(levelname)-5s][%(asctime)s][%(module)s:%(lineno)04d] : %(message)s',
    level=logging.INFO,
    stream=sys.stdout
)
# getLogger() without a name returns the root logger on every Python version;
# getLogger('root') only does so from Python 3.9 on (older versions create a
# separate, non-root logger that merely happens to be named "root").
logger: logging.Logger = logging.getLogger()
# Lower the threshold after basicConfig so DEBUG records are emitted as well.
logger.setLevel(logging.DEBUG)


# **** BIGQUERY CLIENT ****
# Build the shared BigQuery client with the library's default constructor
# arguments; no project or credentials are passed explicitly here.
logger.debug("Creating bigquery client")
client = bigquery.Client()

# Last statement of the setup cell.
logger.info("Setup Completed")


[DEBUG][2023-01-16 11:28:51,171][2541736974:0077] : Creating bigquery client
[INFO ][2023-01-16 11:28:51,307][2541736974:0080] : Setup Completed


In [2]:
# Create the target dataset; exists_ok=True means an already existing dataset
# is not treated as an error.
dataset_id = ".".join((PROJECT_NAME, DATASET_NAME))

dataset_spec = bigquery.Dataset(dataset_id)
dataset_spec.location = "US"

# `dataset` is whatever create_dataset() hands back for that id
dataset = client.create_dataset(dataset_spec, exists_ok=True)

logger.info(f"Created air travel dataset: {dataset.full_dataset_id}")

[INFO ][2023-01-16 08:32:21,614][2632008059:0007] : Created air travel dataset: deb-01-372120:air_travel


In [24]:
# Load the tickets file (one JSON document per line) into a flat DataFrame and
# verify that every expected column is present.
filename = DEFAULT_TICKETS_FILE
logger.debug(f"attempting to process: {filename}")

# fail fast when the file is missing
assert os.path.exists(filename), f"Data file does not exists: '{filename}'"
# fail fast when the file is too small to contain data
# NOTE(review): the 78-byte threshold is not explained anywhere in this notebook -- confirm what it stands for
assert os.path.getsize(filename) > 78, f"Data file size incorrect; does not seem to contain data: '{filename}'"

# Parse one JSON document per non-blank line. The `with` block closes the file
# handle deterministically, the encoding is pinned so the result does not depend
# on the platform default, and blank lines (e.g. a trailing newline) are skipped
# instead of crashing json.loads.
with open(filename, 'r', encoding='utf-8') as tickets_file:
    data = [json.loads(line) for line in tickets_file if line.strip()]

# nested objects are flattened into dotted column names (e.g. 'airline.name')
df = pd.json_normalize(data)
logger.info(f"loaded {len(df.index)} rows from: {filename}")

# check schema: contains all expected columns?
expected_columns = [
    'eticket_num', 'confirmation', 'ticket_date', 'price', 'seat', 'status',
    'airline.name', 'airline.iata', 'airline.icao', 'airline.callsign', 'airline.country',
    'origin.name', 'origin.city', 'origin.country', 'origin.iata', 'origin.icao',
    'origin.latitude', 'origin.longitude', 'origin.altitude', 'origin.tz_timezone',
    'destination.name', 'destination.city', 'destination.country', 'destination.iata', 'destination.icao',
    'destination.latitude', 'destination.longitude', 'destination.altitude', 'destination.tz_timezone',
    'passenger.first_name', 'passenger.last_name', 'passenger.gender', 'passenger.birth_date',
    'passenger.email', 'passenger.street', 'passenger.city', 'passenger.state', 'passenger.zip',
]
for col in expected_columns:
    # membership is tested directly on the column index (no per-iteration list copy)
    assert col in df.columns, f"Data file missing required column: {col}"

# keep the tickets DataFrame under its own name for the cells that follow
tickets_df = df
display(tickets_df.head(n=10))

[DEBUG][2023-01-16 11:41:13,988][829190179:0003] : attempting to process: data/air_travel/tickets.json
[INFO ][2023-01-16 11:41:15,913][829190179:0014] : loaded 4096 rows from: data/air_travel/tickets.json


Unnamed: 0,eticket_num,confirmation,ticket_date,price,seat,status,airline.name,airline.iata,airline.icao,airline.callsign,...,passenger.last_name,passenger.gender,passenger.birth_date,passenger.email,passenger.street,passenger.city,passenger.state,passenger.zip,origin,destination
0,498-938211-0795,ZVFDC4,2022-03-23,723.42,31I,active,China Eastern Airlines,MU,CES,CHINA EASTERN,...,Brown,M,1969-02-17,robert.brown.69@hotmail.com,5007 Thomas Way,Lake Hollystad,DC,20027,,
1,482-850738-6048,IL5GUI,2022-03-23,765.18,29B,active,Hawaiian Airlines,HA,HAL,HAWAIIAN,...,Kent,F,1998-08-05,laura.kent.98@hotmail.com,13991 Davis Village,North Catherineborough,PA,16516,,
2,275-207321-8092,CYEFBC,2022-03-21,753.89,26I,active,Wizz Air,W6,WZZ,WIZZ AIR,...,Tucker,F,1965-01-22,lisa.tucker.65@hotmail.com,04135 Marvin Via,North Kristabury,MA,1093,,
3,246-793315-3102,ZNGPC2,2022-03-22,793.89,15A,active,AirAsia,AK,AXM,ASIAN EXPRESS,...,Yates,NB,1975-03-31,matthew.yates.75@yahoo.com,76045 Samantha Road Suite 111,Lake Jeffrey,DE,19898,,
4,091-128904-1226,MGSBD9,2022-03-24,820.25,17F,active,Xiamen Airlines,MF,CXA,XIAMEN AIR,...,Villanueva,NB,1945-08-14,megan.villanueva.45@hotmail.com,848 Melissa Springs Suite 947,Kellerstad,TX,76177,,
5,115-196069-8963,XFYQC0,2022-03-23,892.69,18C,active,Air New Zealand,NZ,ANZ,NEW ZEALAND,...,Hall,NB,1944-08-31,sarah.hall.44@gmail.com,75420 Michael Mountains Suite 485,New Victoria,HI,96727,,
6,396-673460-1326,N5UOOZ,2022-03-23,889.53,3C,active,Jeju Air,7C,JJA,JEJU AIR,...,Thompson,M,1968-05-02,seth.thompson.68@yahoo.com,22455 Higgins Junction Apt. 042,New Keith,OR,97405,,
7,380-894599-8109,PAA19Y,2022-03-22,706.78,7D,active,American Airlines,AA,AAL,AMERICAN,...,Garcia,F,1950-02-12,jennifer.garcia.50@gmail.com,6607 Sharp Common,Chadstad,VA,22121,,
8,614-960971-2686,EF4BHJ,2022-03-23,486.4,24J,active,Juneyao Airlines,HO,DKH,JUNEYAO AIRLINES,...,Clark,F,1991-11-09,becky.clark.91@gmail.com,691 Jones Cliffs,Michaelburgh,TX,76003,,
9,481-321233-0702,FVM9EE,2022-03-23,855.93,16A,active,Royal Air Maroc,AT,RAM,ROYALAIR MAROC,...,Cook,M,1976-07-29,ronald.cook.76@hotmail.com,93328 Davis Island,Rodriguezside,MD,21408,,


In [29]:
# Build the airlines dimension: one row per distinct airline found in the tickets.
logger.debug("getting unique airlines...")

# ticket columns that together describe an airline
cols = ['airline.name', 'airline.iata', 'airline.icao', 'airline.callsign', 'airline.country']

# Keep only the airline columns, drop rows with a missing value in any of them,
# de-duplicate and sort. This yields the same rows, in the same order, as grouping
# the tickets by `cols`, but without aggregating every other ticket column only to
# discard the result.
df = (
    tickets_df[cols]
    .dropna()
    .drop_duplicates()
    .sort_values(cols)
    .reset_index(drop=True)
)
# strip the 'airline.' prefix from the column names
df = df.rename(columns={'airline.name': 'name', 'airline.iata': 'iata', 'airline.icao': 'icao', 'airline.callsign': 'callsign', 'airline.country': 'country'})

logger.info(f"airlines dim - found {len(df.index)} rows")
display(df)

[DEBUG][2023-01-16 11:55:05,711][1702493371:0005] : getting unique airlines...
[INFO ][2023-01-16 11:55:06,417][1702493371:0015] : airlines dim - found 48 rows


Unnamed: 0,name,iata,icao,callsign,country
0,Air Canada,AC,ACA,AIR CANADA,Canada
1,Air China,CA,CCA,AIR CHINA,China
2,Air France,AF,AFR,AIRFRANS,France
3,Air New Zealand,NZ,ANZ,NEW ZEALAND,New Zealand
4,AirAsia,AK,AXM,ASIAN EXPRESS,Malaysia
5,Alaska Airlines,AS,ASA,Inc.,ALASKA
6,Allegiant Air,G4,AAY,ALLEGIANT,United States
7,American Airlines,AA,AAL,AMERICAN,United States
8,British Airways,BA,BAW,SPEEDBIRD,United Kingdom
9,Cape Air,9K,KAP,CAIR,United States
