In [14]:
import boto3
import sagemaker
import pandas as pd
from pyathena import connect

# Initialize S3 and SageMaker session
session = boto3.session.Session()
region = session.region_name
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

s3 = boto3.Session().client(service_name="s3", region_name=region)

# Configuration variables
s3_bucket = bucket
s3_staging_dir = f"s3://{s3_bucket}/athena/staging"
database_name = "db_airline_delay_cause"
dev_table_name = "development_data"
prod_table_name = "production_data"
development_data_location = f"s3://{s3_bucket}/development_data.csv"
production_data_location = f"s3://{s3_bucket}/production_data.csv"



In [12]:
# Load required %store variables
%store -r airline_dev_and_prod_datasets_available
%store -r ingest_create_athena_db_passed
%store -r ingest_create_athena_table_tsv_passed
%store -r s3_private_path_tsv
%store -r setup_dependencies_passed
%store -r setup_s3_bucket_passed

# Validate the required conditions
if not airline_dev_and_prod_datasets_available:
    raise RuntimeError("Development and production datasets are not available.")
if not ingest_create_athena_db_passed or not ingest_create_athena_table_tsv_passed:
    raise RuntimeError("Athena database or table creation did not pass.")
if not setup_dependencies_passed or not setup_s3_bucket_passed:
    raise RuntimeError("Setup dependencies or S3 bucket setup did not pass.")

# Connect to Athena
connection = connect(s3_staging_dir=s3_staging_dir, region_name=region)

In [16]:
s3_private_path_tsv

's3://sagemaker-us-east-1-607916531205/airline-delay-cause/csv'

In [19]:
# SQL for creating production table
statement_prod = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
    year string,
    month string,
    carrier string,
    carrier_name string,
    airport string,
    airport_name string,
    arr_flights float,
    arr_del15 float,
    carrier_ct float,
    weather_ct float,
    nas_ct float,
    security_ct float,
    late_aircraft_ct float,
    arr_cancelled int,
    arr_diverted int,
    arr_delay int,
    carrier_delay int,
    weather_delay int,
    nas_delay int,
    security_delay int,
    late_aircraft_delay int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' LOCATION '{}' 
TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')""".format(
    database_name, prod_table_name, s3_private_path_tsv)

# Execute SQL for production table
print("Executing production table creation SQL...")
print(statement_prod)
pd.read_sql(statement_prod, connection)



Executing production table creation SQL...
CREATE EXTERNAL TABLE IF NOT EXISTS db_airline_delay_cause.production_data(
    year string,
    month string,
    carrier string,
    carrier_name string,
    airport string,
    airport_name string,
    arr_flights float,
    arr_del15 float,
    carrier_ct float,
    weather_ct float,
    nas_ct float,
    security_ct float,
    late_aircraft_ct float,
    arr_cancelled int,
    arr_diverted int,
    arr_delay int,
    carrier_delay int,
    weather_delay int,
    nas_delay int,
    security_delay int,
    late_aircraft_delay int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '
' LOCATION 's3://sagemaker-us-east-1-607916531205/airline-delay-cause/csv' 
TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')


  pd.read_sql(statement_prod, connection)


In [23]:
# Sample query for production data
carrier = "9E"
statement_query_prod = """SELECT * FROM {}.{} WHERE carrier = '{}' LIMIT 100""".format(
    database_name, prod_table_name, carrier)
print("Executing sample query for production data...")
print(statement_query_prod)
df_prod = pd.read_sql(statement_query_prod, connection)
print(df_prod.head(5))


Executing sample query for production data...
SELECT * FROM db_airline_delay_cause.production_data WHERE carrier = '9E' LIMIT 100


  df_prod = pd.read_sql(statement_query_prod, connection)


   year month carrier       carrier_name airport                 airport_name  \
0  2024     9      9E  Endeavor Air Inc.     ABE  "Allentown/Bethlehem/Easton   
1  2024     9      9E  Endeavor Air Inc.     AEX                  "Alexandria   
2  2024     9      9E  Endeavor Air Inc.     AGS                     "Augusta   
3  2024     9      9E  Endeavor Air Inc.     ALB                      "Albany   
4  2024     9      9E  Endeavor Air Inc.     ATL                     "Atlanta   

  arr_flights  arr_del15  carrier_ct  weather_ct  ...  security_ct  \
0        None       81.0         7.0        5.61  ...         0.84   
1        None       81.0         6.0        4.43  ...         1.13   
2        None      133.0        12.0        3.49  ...         3.84   
3        None       73.0         4.0        0.82  ...         0.23   
4        None     2204.0       280.0       61.70  ...        88.36   

   late_aircraft_ct  arr_cancelled  arr_diverted  arr_delay  carrier_delay  \
0             

In [18]:
# SQL for creating development table
statement_dev = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
    year string,
    month string,
    carrier string,
    carrier_name string,
    airport string,
    airport_name string,
    arr_flights float,
    arr_del15 float,
    carrier_ct float,
    weather_ct float,
    nas_ct float,
    security_ct float,
    late_aircraft_ct float,
    arr_cancelled int,
    arr_diverted int,
    arr_delay int,
    carrier_delay int,
    weather_delay int,
    nas_delay int,
    security_delay int,
    late_aircraft_delay int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' LOCATION '{}' 
TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')""".format(
    database_name, dev_table_name, s3_private_path_tsv)

# Execute SQL for development table
print("Executing development table creation SQL...")
print(statement_dev)
pd.read_sql(statement_dev, connection)



Executing development table creation SQL...
CREATE EXTERNAL TABLE IF NOT EXISTS db_airline_delay_cause.development_data(
    year string,
    month string,
    carrier string,
    carrier_name string,
    airport string,
    airport_name string,
    arr_flights float,
    arr_del15 float,
    carrier_ct float,
    weather_ct float,
    nas_ct float,
    security_ct float,
    late_aircraft_ct float,
    arr_cancelled int,
    arr_diverted int,
    arr_delay int,
    carrier_delay int,
    weather_delay int,
    nas_delay int,
    security_delay int,
    late_aircraft_delay int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '
' LOCATION 's3://sagemaker-us-east-1-607916531205/airline-delay-cause/csv' 
TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')


  pd.read_sql(statement_dev, connection)


In [22]:
# Sample query for development data
carrier = "9E"
statement_query_dev = """SELECT * FROM {}.{} WHERE carrier = '{}' LIMIT 100""".format(
    database_name, dev_table_name, carrier)
print("Executing sample query for development data...")
print(statement_query_dev)
df_dev = pd.read_sql(statement_query_dev, connection)
print(df_dev.head(5))


Executing sample query for development data...
SELECT * FROM db_airline_delay_cause.development_data WHERE carrier = '9E' LIMIT 100


  df_dev = pd.read_sql(statement_query_dev, connection)


   year month carrier       carrier_name airport                 airport_name  \
0  2024     9      9E  Endeavor Air Inc.     ABE  "Allentown/Bethlehem/Easton   
1  2024     9      9E  Endeavor Air Inc.     AEX                  "Alexandria   
2  2024     9      9E  Endeavor Air Inc.     AGS                     "Augusta   
3  2024     9      9E  Endeavor Air Inc.     ALB                      "Albany   
4  2024     9      9E  Endeavor Air Inc.     ATL                     "Atlanta   

  arr_flights  arr_del15  carrier_ct  weather_ct  ...  security_ct  \
0        None       81.0         7.0        5.61  ...         0.84   
1        None       81.0         6.0        4.43  ...         1.13   
2        None      133.0        12.0        3.49  ...         3.84   
3        None       73.0         4.0        0.82  ...         0.23   
4        None     2204.0       280.0       61.70  ...        88.36   

   late_aircraft_ct  arr_cancelled  arr_diverted  arr_delay  carrier_delay  \
0             

In [24]:
# Show tables
statement_show = "SHOW TABLES in {}".format(database_name)
print("Fetching list of tables in the database...")
df_show = pd.read_sql(statement_show, connection)
print(df_show.head(5))


Fetching list of tables in the database...


  df_show = pd.read_sql(statement_show, connection)


                  tab_name
0  airline_delay_cause_csv
1         development_data
2          production_data


In [26]:
# Check Glue Catalog

from IPython.core.display import display, HTML

display(
    HTML(
        '<b>Review <a target="top" href="https://console.aws.amazon.com/glue/home?region={}#">AWS Glue Catalog</a></b>'.format(
            region
        )
    )
)



  from IPython.core.display import display, HTML


In [29]:
# Store query availability flags
athena_query_for_production_data_available = True
athena_query_for_development_data_available = True
%store athena_query_for_production_data_available
%store athena_query_for_development_data_available

Stored 'athena_query_for_production_data_available' (bool)
Stored 'athena_query_for_development_data_available' (bool)


In [30]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [None]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}