In [4]:
import os

import boto3
import sagemaker
import pandas as pd
from pyathena import connect

# --- AWS sessions -----------------------------------------------------------
# The boto3 session supplies the region; the SageMaker session supplies the
# default bucket that every path below is built on.
session = boto3.session.Session()
region = session.region_name
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# S3 client used later for uploading the exported CSV files.
s3 = boto3.Session().client(
    service_name="s3",
    region_name=region,
)

# --- Notebook configuration -------------------------------------------------
s3_bucket = bucket
s3_staging_dir = f"s3://{s3_bucket}/athena/staging"  # where Athena writes query results
database_name = "db_airline_delay_cause"
table_name = "airline_delay_cause_csv"
# S3 URIs of the two exported splits (object keys sit at the bucket root).
test_output_s3 = f"s3://{s3_bucket}/development_data.csv"
production_output_s3 = f"s3://{s3_bucket}/production_data.csv"

# --- Athena connection ------------------------------------------------------
connection = connect(
    s3_staging_dir=s3_staging_dir,
    region_name=region,
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [8]:
# Split the airline-delay table by year into a development set and a
# production set, save both as local CSVs, then upload them to S3.
#
# NOTE: 2004-2024 inclusive spans 21 calendar years (not 20). The ranges below
# put 12 years (2004-2015, ~57%) into development and 9 years (2016-2024,
# ~43%) into production, so this only approximates a 60/40 split by year
# count; the number of rows per year is not checked here.
DEV_YEARS = (2004, 2015)    # inclusive bounds, used with SQL BETWEEN
PROD_YEARS = (2016, 2024)   # inclusive bounds, used with SQL BETWEEN
LOCAL_DATA_DIR = "data"

query_test = f"""
    SELECT * FROM {database_name}.{table_name}
    WHERE CAST(year AS INT) BETWEEN {DEV_YEARS[0]} AND {DEV_YEARS[1]}
"""
query_production = f"""
    SELECT * FROM {database_name}.{table_name}
    WHERE CAST(year AS INT) BETWEEN {PROD_YEARS[0]} AND {PROD_YEARS[1]}
"""

# DataFrame.to_csv does not create missing parent directories, so make sure
# the local output folder exists first (no-op when it is already there).
os.makedirs(LOCAL_DATA_DIR, exist_ok=True)
dev_csv_path = f"{LOCAL_DATA_DIR}/development_data.csv"
prod_csv_path = f"{LOCAL_DATA_DIR}/production_data.csv"

# Execute Athena Queries
# NOTE(review): `connection` is a raw DBAPI connection, for which pandas
# emits a UserWarning in read_sql; the queries still run.
print("Running development data query...")
test_df = pd.read_sql(query_test, connection)
test_df.to_csv(dev_csv_path, index=False)

print("Running production data query...")
production_df = pd.read_sql(query_production, connection)
production_df.to_csv(prod_csv_path, index=False)

# Upload to S3 (object keys are placed at the bucket root)
print("Uploading development data to S3...")
s3.upload_file(dev_csv_path, s3_bucket, "development_data.csv")

print("Uploading production data to S3...")
s3.upload_file(prod_csv_path, s3_bucket, "production_data.csv")

print("Split complete! Files uploaded to S3.")


Running development data query...


  test_df = pd.read_sql(query_test, connection)


Running production data query...


  production_df = pd.read_sql(query_production, connection)


Uploading development data to S3...
Uploading production data to S3...
Split complete! Files uploaded to S3.


In [11]:
# Completion flag for this notebook: set unconditionally to True and persisted
# with IPython's %store magic so it can be restored later via `%store -r`.
airline_dev_and_prod_datasets_available = True
%store airline_dev_and_prod_datasets_available
# A bare %store lists every variable currently held in the store.
%store

Stored 'airline_dev_and_prod_datasets_available' (bool)
Stored variables and their in-db values:
airline_dev_and_prod_datasets_available             -> True
ingest_create_athena_db_passed                      -> True
ingest_create_athena_table_tsv_passed               -> True
s3_private_path_tsv                                 -> 's3://sagemaker-us-east-1-607916531205/airline-del
setup_dependencies_passed                           -> True
setup_s3_bucket_passed                              -> True


# Release Resources

In [12]:
%%html

<!-- Displays a notice, then programmatically clicks a hidden button tagged with
     data-commandlinker-command="kernelmenu:shutdown" (presumably picked up by
     the JupyterLab command linker to shut the kernel down; confirm in your UI). -->
<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
// Best effort: if the button is missing or the click fails, do nothing.
try {
    // Declared with const so the lookup result does not leak onto window
    // as an implicit global.
    const shutdownButtons = document.getElementsByClassName("sm-command-button");
    shutdownButtons[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [None]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}