Step 1: Create AWS Session and clients

In [None]:
import boto3
from botocore.config import Config

# No access key / secret key is passed here on purpose: boto3 resolves
# credentials through its default provider chain (environment variables, the
# shared credentials/config files, or an attached IAM role), so no secret has
# to be pasted into the notebook.
aws_session = boto3.Session(region_name='us-east-1')

# Retry and timeout policy; it is handed to the Athena client only.
cfg = Config(retries={"max_attempts": 10, "mode": "standard"}, read_timeout=60, connect_timeout=10)

# Clients shared by the cells below.
s3_client = aws_session.client('s3')
glue_client = aws_session.client('glue')

athena  = aws_session.client("athena", config=cfg)


Step 2: Extract the data from Postgres

In [52]:
import psycopg2
import pandas as pd
from io import StringIO
import logging

def extract_postgres_to_s3(
        table_name,
        schema="public",
        where_clause=None,
        partition_date=None,
        pg_config=None
    ):
    """Copy a Postgres table to S3 as a single CSV object.

    Runs ``SELECT * FROM {schema}.{table_name}`` (plus an optional WHERE),
    loads the result into a DataFrame, serialises it as CSV and uploads it with
    the module-level ``s3_client``.

    Args:
        table_name: Table to read. Interpolated into the SQL text as-is.
        schema: Schema that owns the table. Interpolated into the SQL as-is.
        where_clause: Optional raw SQL condition appended after ``WHERE``.
            It is concatenated verbatim, so only pass trusted text.
        partition_date: Optional value; when given, a ``dt=<value>`` folder is
            inserted into the S3 key.
        pg_config: Optional keyword arguments for ``psycopg2.connect``. When
            omitted (or empty), the built-in local defaults are used.

    Returns:
        The ``s3://bucket/key`` URI of the uploaded object, or ``None`` when the
        query returned no rows (nothing is uploaded in that case).

    Raises:
        Exception: Any connection, query or upload error is logged and
            re-raised unchanged.
    """

    # Default configs
    # NOTE(review): this fallback embeds a database password in the notebook;
    # prefer passing `pg_config` built from environment variables or a secret store.
    pg_config = pg_config or {
        'host': 'localhost',
        'port': '5432',
        'user': 'postgres',
        'password': '4518',
        'database': 'kaggle_practice'
    }

    bucket='golu-aws-project-bucket'


    logging.basicConfig(level=logging.INFO)

    # Bound before the try so the finally clause can tell whether connect() succeeded.
    conn = None
    try:
        conn = psycopg2.connect(**pg_config)
        query = f"SELECT * FROM {schema}.{table_name}"
        if where_clause:
            query += f" WHERE {where_clause}"

        logging.info(f"Running query: {query}")

        df = pd.read_sql(query, conn)
        if df.empty:
            logging.warning("No data found.")
            return None


        # S3 key path: raw/<table>[/dt=<partition_date>]/<table>.csv
        key = f"raw/{table_name}"
        if partition_date:
            key += f"/dt={partition_date}"
        key += f"/{table_name}.csv"

        csv_buffer = StringIO()
        df.to_csv(csv_buffer, index=False)

        s3_client.put_object(
            Bucket=bucket,
            Key=key,
            Body=csv_buffer.getvalue()
        )

        s3_uri = f"s3://{bucket}/{key}"
        logging.info(f"‚úÖ Uploaded to: {s3_uri}")
        return s3_uri

    except Exception as e:
        logging.error(f"‚ùå Failed: {e}")
        raise

    finally:
        # Release the connection on every path: success, "no data" and errors.
        if conn is not None:
            conn.close()

# Export the Olist customers table (no filter, no date partition); the cell
# displays the returned S3 URI.
extract_postgres_to_s3(table_name="customers", schema="olist_brazil_e_commerce")



INFO:root:Running query: SELECT * FROM olist_brazil_e_commerce.customers
  df = pd.read_sql(query, conn)
INFO:root:‚úÖ Uploaded to: s3://golu-aws-project-bucket/raw/customers/customers.csv


's3://golu-aws-project-bucket/raw/customers/customers.csv'

In [94]:
# Install the MySQL driver imported by the RDS extraction cell, together with pandas and boto3.
%pip install mysql-connector-python pandas boto3

Collecting mysql-connector-python
  Downloading mysql_connector_python-9.5.0-cp313-cp313-macosx_14_0_arm64.whl.metadata (7.5 kB)
Downloading mysql_connector_python-9.5.0-cp313-cp313-macosx_14_0_arm64.whl (17.6 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m17.6/17.6 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: mysql-connector-python
Successfully installed mysql-connector-python-9.5.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


Step 2 (alternative source): Extract the data from RDS MySQL

In [None]:
import mysql.connector
import pandas as pd
from io import StringIO
import logging
import boto3



def extract_rds_mysql_to_s3(
        table_name,
        database=None,
        where_clause=None,
        partition_date=None,
        rds_config=None,
        storage_class='STANDARD_IA'  # Options: STANDARD, INTELLIGENT_TIERING, STANDARD_IA, ONEZONE_IA, GLACIER, DEEP_ARCHIVE
    ):

    # RDS MySQL configuration
    rds_config = rds_config or {
        'host': 'brazil-e-commerce-1.cc76oy40c2av.us-east-1.rds.amazonaws.com',  # Your RDS endpoint
        'port': 3306,
        'user': 'admin',  # Your master username
        'password': '',  # Your master password
        'database': 'brazil_e_commerce'  # Your database name
    }

    bucket = 'golu-aws-project-bucket'

    # Configure logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    conn = None
    try:
        # Connect to RDS MySQL
        logging.info(f"üîå Connecting to RDS MySQL: {rds_config['host']}")
        conn = mysql.connector.connect(
            host=rds_config['host'],
            port=rds_config['port'],
            user=rds_config['user'],
            password=rds_config['password'],
            database=rds_config['database'],
            connection_timeout=10  # Connection timeout in seconds
        )
        logging.info("‚úÖ Successfully connected to RDS")

        # Build query - MySQL uses backticks for identifiers with special chars
        db_prefix = f"`{database or rds_config['database']}`."
        query = f"SELECT * FROM {db_prefix}`{table_name}`"
        if where_clause:
            query += f" WHERE {where_clause}"

        logging.info(f"üìä Running query: {query}")

        # Execute query and load into DataFrame
        df = pd.read_sql(query, conn)
        
        if df.empty:
            logging.warning("‚ö†Ô∏è No data found for the given query.")
            return None

        logging.info(f"‚úÖ Extracted {len(df)} rows and {len(df.columns)} columns")

        # Build S3 key path
        key = f"raw/RDS/brazil_e_commerce/{table_name}"
        if partition_date:
            key += f"/dt={partition_date}"
        key += f"/{table_name}.csv"

        # Convert DataFrame to CSV in memory
        csv_buffer = StringIO()
        df.to_csv(csv_buffer, index=False)

        logging.info(f"üì§ Uploading to S3 with storage class: {storage_class}")

        # Upload to S3 with specified storage class
        s3_client.put_object(
            Bucket=bucket,
            Key=key,
            Body=csv_buffer.getvalue(),
            StorageClass=storage_class,
            ServerSideEncryption='AES256',  # Enable encryption at rest
            Metadata={
                'source': 'rds-mysql',
                'table': table_name,
                'database': database or rds_config['database'],
                'extracted_rows': str(len(df))
            }
        )

        s3_uri = f"s3://{bucket}/{key}"
        logging.info(f"‚úÖ Successfully uploaded to: {s3_uri}")
        logging.info(f"üì¶ Storage Class: {storage_class}")
        logging.info(f"üìè File size: {len(csv_buffer.getvalue())} bytes")
        
        return s3_uri

    except mysql.connector.Error as e:
        logging.error(f"‚ùå Database error: {e}")
        logging.error("Check: 1) RDS endpoint, 2) Security group rules, 3) DB credentials")
        raise
    
    except boto3.exceptions.Boto3Error as e:
        logging.error(f"‚ùå S3 upload failed: {e}")
        raise
    
    except Exception as e:
        logging.error(f"‚ùå Unexpected error: {e}")
        raise
    
    finally:
        if conn and conn.is_connected():
            conn.close()
            logging.info("üîå Database connection closed")


# ============================================================
# USAGE EXAMPLES
# ============================================================

# Example 1: export the customers table from RDS with every default setting.
# A failure is reported on stdout instead of stopping the notebook.
try:
    customers_request = {"table_name": "customers"}
    s3_uri = extract_rds_mysql_to_s3(**customers_request)
    print(f"Data uploaded to: {s3_uri}")
except Exception as exc:
    print(f"Extraction failed: {exc}")

# # Example 2: With date filter and partition (if you have date columns)
# try:
#     s3_uri = extract_rds_mysql_to_s3(
#         table_name="customers",
#         where_clause="customer_state = 'SP'",
#         partition_date="2025-11-08",
#         storage_class='INTELLIGENT_TIERING'  # Auto-optimize storage costs
#     )
# except Exception as e:
#     print(f"Extraction failed: {e}")

# # Example 3: With custom RDS config
# custom_rds_config = {
#     'host': 'brazil-e-commerce-1.cc76oy40c2av.us-east-1.rds.amazonaws.com',
#     'port': 3306,
#     'user': 'admin',
#     'password': '',
#     'database': 'brazil_e_commerce'
# }

# try:
#     s3_uri = extract_rds_mysql_to_s3(
#         table_name="customers",
#         rds_config=custom_rds_config,
#         storage_class='STANDARD_IA'
#     )
# except Exception as e:
#     print(f"Extraction failed: {e}")

# # Example 4: Extract multiple tables (when you have more tables)
# tables = ['olist_customers_dataset', 'orders', 'order_items', 'products', 'sellers']

# for table in tables:
#     try:
#         logging.info(f"\n{'='*60}")
#         logging.info(f"Processing table: {table}")
#         logging.info(f"{'='*60}")
        
#         s3_uri = extract_rds_mysql_to_s3(
#             table_name=table,
#             storage_class='STANDARD_IA'
#         )
#         logging.info(f"‚úÖ {table} completed successfully\n")
#     except Exception as e:
#         logging.error(f"‚ùå {table} failed: {e}\n")
#         continue

Step 3: Crawler

In [53]:
# Create the Glue crawler that catalogues the raw customers CSV.

crawler_name = "crawler_07"
role = "arn:aws:iam::180294202865:role/glue_role_to_give_full_access_to_s3"
database_name = "mydb_01"
s3_target_path = "s3://golu-aws-project-bucket/raw/customers/"  # adjust path as needed
table_prefix = "raw2_"

# The whole crawler definition is assembled up front so the API call stays short.
crawler_definition = {
    "Name": crawler_name,
    "Role": role,
    "DatabaseName": database_name,
    "Description": "Crawler created via boto3 from Jupyter notebook",
    "Targets": {"S3Targets": [{"Path": s3_target_path}]},
    "TablePrefix": table_prefix,
    "Classifiers": [],
    "RecrawlPolicy": {"RecrawlBehavior": "CRAWL_EVERYTHING"},
    "SchemaChangePolicy": {
        "UpdateBehavior": "UPDATE_IN_DATABASE",
        "DeleteBehavior": "DEPRECATE_IN_DATABASE",
    },
    "Configuration": '{"Version":1.0,"CreatePartitionIndex":true}',
}

# Any failure is printed rather than raised, so the notebook keeps running.
try:
    response = glue_client.create_crawler(**crawler_definition)
    http_status = response.get('ResponseMetadata', {}).get('HTTPStatusCode')
    print(f"Crawler '{crawler_name}' created. Response HTTPStatusCode: {http_status}")
except Exception as exc:
    print("Failed to create crawler:", exc)

Crawler 'crawler_07' created. Response HTTPStatusCode: 200


In [54]:
# List Crawlers ::
# get_crawlers returns its results in pages; walk every page so crawlers beyond
# the first page are not silently missing from the listing.
crawlers = []
for response in glue_client.get_paginator("get_crawlers").paginate():
    crawlers.extend(response['Crawlers'])

for crawler in crawlers:
    print(f"Crawler Name: {crawler['Name']}, Crawler State: {crawler['State']}")

Crawler Name: CRAWLER_04, Crawler State: READY
Crawler Name: crawler_01, Crawler State: READY
Crawler Name: crawler_02, Crawler State: READY
Crawler Name: crawler_03, Crawler State: READY
Crawler Name: crawler_06, Crawler State: READY
Crawler Name: crawler_07, Crawler State: READY
Crawler Name: crawler_refined_03, Crawler State: READY


In [55]:
# Start the crawler
# Uses `crawler_name` from the creation cell (as the status cell does) rather
# than a second hard-coded copy of the name, so the two cannot drift apart.
response = glue_client.start_crawler(Name=crawler_name)
print(response)

{'ResponseMetadata': {'RequestId': '7f42fe9a-bb1e-4190-9663-1dc2bcf9bdb7', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Thu, 06 Nov 2025 10:29:17 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '2', 'connection': 'keep-alive', 'x-amzn-requestid': '7f42fe9a-bb1e-4190-9663-1dc2bcf9bdb7', 'cache-control': 'no-cache'}, 'RetryAttempts': 0}}


In [58]:
# Look up the crawler named by `crawler_name` and report its current state.
crawler_details = glue_client.get_crawler(Name=crawler_name)["Crawler"]
state = crawler_details["State"]
print(f"Current Crawler State: {state}")


Current Crawler State: READY


Step 4: Glue Job

In [59]:
# Upload the Glue job script from the local machine to S3.
# NOTE(review): the source file is an absolute path on one developer's machine.
glue_script_upload = {
    "Filename": r"/Users/pavanhalde/Downloads/glue_job_07.py",
    "Bucket": 'golu-aws-project-bucket',
    "Key": 'scripts/glue_job_07.py',
}
s3_client.upload_file(**glue_script_upload)

# Note: In industry we use CI/CD pipelines to automate such tasks

In [60]:
# Register the Glue ETL job that runs the script uploaded to S3.
# The definition is kept in one mapping so the API call is a single line.
glue_job_definition = {
    "Name": "glue_job_07",
    "Role": "arn:aws:iam::180294202865:role/glue_role_to_give_full_access_to_s3",
    "Command": {
        "Name": "glueetl",
        "ScriptLocation": "s3://golu-aws-project-bucket/scripts/glue_job_07.py",
    },
    "GlueVersion": "4.0",
    "WorkerType": "G.1X",
    "NumberOfWorkers": 2,
    "ExecutionProperty": {"MaxConcurrentRuns": 1},
}
response = glue_client.create_job(**glue_job_definition)

In [61]:
# Listing jobs in account:
# get_jobs returns its results in pages; walk every page so jobs beyond the
# first page are not silently missing from the listing.
jobs = []
for response in glue_client.get_paginator("get_jobs").paginate():
    jobs.extend(response['Jobs'])

for job in jobs:
    print(f"Job Name: {job['Name']}")

Job Name: glue_job_01
Job Name: glue_job_02
Job Name: glue_job_03
Job Name: glue_job_04
Job Name: glue_job_05
Job Name: glue_job_06
Job Name: glue_job_07


In [62]:
# Launch the Glue job and keep the run ID; the status cell needs it to look
# the run up.
start_run_args = {"JobName": "glue_job_07"}
response = glue_client.start_job_run(**start_run_args)
job_run_id = response["JobRunId"]
print(f"Job run started with ID: {job_run_id}")


Job run started with ID: jr_6129933cc70187784ab134707c1109656a01e0e399abb147d0ac8eeb9aebeafb


In [64]:
# Fetch the run started above (identified by `job_run_id`) and print its state.
run_lookup = {"JobName": "glue_job_07", "RunId": job_run_id}
response = glue_client.get_job_run(**run_lookup)
job_run = response["JobRun"]
run_state = job_run["JobRunState"]
print(f"Job Run Status: {run_state}")


Job Run Status: SUCCEEDED


Step 5: Crawler for refined data

In [65]:
# Create the Glue crawler that catalogues the refined output of the Glue job.

crawler_name = "crawler_refined_04"
role = "arn:aws:iam::180294202865:role/glue_role_to_give_full_access_to_s3"
database_name = "mydb_01"
s3_target_path = "s3://golu-aws-project-bucket/output/brazil_e_commerce/customer_refined/"
table_prefix = "refined_"

# The whole crawler definition is assembled up front so the API call stays short.
crawler_definition = {
    "Name": crawler_name,
    "Role": role,
    "DatabaseName": database_name,
    "Description": "Crawler created via boto3 from Jupyter notebook",
    "Targets": {"S3Targets": [{"Path": s3_target_path}]},
    "TablePrefix": table_prefix,
    "Classifiers": [],
    "RecrawlPolicy": {"RecrawlBehavior": "CRAWL_EVERYTHING"},
    "SchemaChangePolicy": {
        "UpdateBehavior": "UPDATE_IN_DATABASE",
        "DeleteBehavior": "DEPRECATE_IN_DATABASE",
    },
    "Configuration": '{"Version":1.0,"CreatePartitionIndex":true}',
}

# Any failure is printed rather than raised, so the notebook keeps running.
try:
    response = glue_client.create_crawler(**crawler_definition)
    http_status = response.get('ResponseMetadata', {}).get('HTTPStatusCode')
    print(f"Crawler '{crawler_name}' created. Response HTTPStatusCode: {http_status}")
except Exception as exc:
    print("Failed to create crawler:", exc)

Crawler 'crawler_refined_04' created. Response HTTPStatusCode: 200


In [66]:
# List Crawlers ::
# get_crawlers returns its results in pages; walk every page so crawlers beyond
# the first page are not silently missing from the listing.
crawlers = []
for response in glue_client.get_paginator("get_crawlers").paginate():
    crawlers.extend(response['Crawlers'])

for crawler in crawlers:
    print(f"Crawler Name: {crawler['Name']}, Crawler State: {crawler['State']}")

Crawler Name: CRAWLER_04, Crawler State: READY
Crawler Name: crawler_01, Crawler State: READY
Crawler Name: crawler_02, Crawler State: READY
Crawler Name: crawler_03, Crawler State: READY
Crawler Name: crawler_06, Crawler State: READY
Crawler Name: crawler_07, Crawler State: READY
Crawler Name: crawler_refined_03, Crawler State: READY
Crawler Name: crawler_refined_04, Crawler State: READY


In [67]:
# Start the crawler
# Uses `crawler_name` from the creation cell (as the status cell does) rather
# than a second hard-coded copy of the name, so the two cannot drift apart.
response = glue_client.start_crawler(Name=crawler_name)
print(response)

{'ResponseMetadata': {'RequestId': '9d37261e-c734-4df7-93a9-68cb00154164', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Thu, 06 Nov 2025 11:00:02 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '2', 'connection': 'keep-alive', 'x-amzn-requestid': '9d37261e-c734-4df7-93a9-68cb00154164', 'cache-control': 'no-cache'}, 'RetryAttempts': 0}}


In [70]:
# Look up the crawler named by `crawler_name` and report its current state.
crawler_details = glue_client.get_crawler(Name=crawler_name)["Crawler"]
state = crawler_details["State"]
print(f"Current Crawler State: {state}")


Current Crawler State: READY


Step 6: Validation through Athena 

In [71]:

import time


# Settings shared by every Athena validation cell below.
WORKGROUP = "primary"
OUTPUT_LOCATION = "s3://golu-aws-project-bucket/athena-results/"  # For Athena query results only
DATABASE = "mydb_01"  # Your Glue database name
RAW_TABLE = "raw2_customers"  # Original raw table
REFINED_TABLE = "refined_customer_refined"  # Table created by crawler (with prefix)

# Section banner: a 60-character rule above and below the title.
banner_rule = "=" * 60
for banner_line in (banner_rule, "üîç VALIDATING GLUE JOB TRANSFORMATIONS", banner_rule):
    print(banner_line)

def run_athena_query(sql, description):
    """Run a SQL statement in Athena, print the result table and return its rows.

    The query runs in ``DATABASE`` / ``WORKGROUP`` and writes its result files
    to ``OUTPUT_LOCATION`` (module-level settings), using the module-level
    ``athena`` client. The function polls every 2 seconds until the query
    reaches a terminal state.

    Args:
        sql: SQL text to execute. Only its first 100 characters are echoed.
        description: Title printed above the query output.

    Returns:
        A list of rows, each a list of cell strings (``None`` for cells without
        a value), excluding the header row. Returns ``None`` when the query
        ends as FAILED or CANCELLED.
    """
    print(f"\n{'='*60}")
    print(f"üìä {description}")
    print(f"{'='*60}")
    print(f"Query: {sql[:100]}...")

    # Start query execution
    resp = athena.start_query_execution(
        QueryString=sql,
        QueryExecutionContext={"Database": DATABASE},
        WorkGroup=WORKGROUP,
        ResultConfiguration={"OutputLocation": OUTPUT_LOCATION}
    )
    qid = resp["QueryExecutionId"]
    print(f"Query ID: {qid}")

    # Wait for completion. The full status record is kept so a failure can be
    # reported without asking the service a second time.
    print("‚è≥ Waiting for query to complete...", end="")
    while True:
        detail = athena.get_query_execution(QueryExecutionId=qid)["QueryExecution"]["Status"]
        status = detail["State"]
        if status in ("SUCCEEDED", "FAILED", "CANCELLED"):
            break
        print(".", end="", flush=True)
        time.sleep(2)

    print(f"\nStatus: {status}")

    if status != "SUCCEEDED":
        print(f"‚ùå Query failed: {detail}")
        return None

    # Get results. They arrive in pages; follow NextToken until it runs out so
    # large result sets are not cut off after the first page.
    result_rows = []
    next_token = None
    while True:
        page_args = {"QueryExecutionId": qid}
        if next_token:
            page_args["NextToken"] = next_token
        page = athena.get_query_results(**page_args)
        result_rows.extend(page["ResultSet"]["Rows"])
        next_token = page.get("NextToken")
        if not next_token:
            break

    # The first row carries the column names. A response without any rows is
    # treated as an empty result instead of raising IndexError.
    if result_rows:
        headers = [c.get("VarCharValue", "") for c in result_rows[0]["Data"]]
        rows = [[c.get("VarCharValue", None) for c in r["Data"]]
                for r in result_rows[1:]]
    else:
        headers, rows = [], []

    # Display results. Only a missing value is shown as NULL; an empty string
    # is printed as the empty string it is.
    print("\n" + " | ".join(headers))
    print("-" * 60)
    for row in rows:
        print(" | ".join("NULL" if cell is None else str(cell) for cell in row))
    print(f"\n‚úÖ Rows returned: {len(rows)}")

    return rows


üîç VALIDATING GLUE JOB TRANSFORMATIONS


In [72]:

# ------------------------------------------------------------
# VALIDATION 1: is customer_city upper-cased in the refined table?
# Rows are bucketed by whether customer_city equals UPPER(customer_city).
# ------------------------------------------------------------
sql_uppercase_check = f"""
SELECT 
    CASE 
        WHEN customer_city = UPPER(customer_city) THEN 'PASS - All Uppercase'
        ELSE 'FAIL - Not Uppercase'
    END AS uppercase_validation,
    COUNT(*) AS count
FROM {DATABASE}.{REFINED_TABLE}
GROUP BY 1;
"""
run_athena_query(
    sql=sql_uppercase_check,
    description="Validation 1: Check Uppercase Transformation",
)



üìä Validation 1: Check Uppercase Transformation
Query: 
SELECT 
    CASE 
        WHEN customer_city = UPPER(customer_city) THEN 'PASS - All Uppercase'
   ...
Query ID: a5dba1a4-b501-4f8e-b90f-0027ab6511b7
‚è≥ Waiting for query to complete....
Status: SUCCEEDED

uppercase_validation | count
------------------------------------------------------------
PASS - All Uppercase | 41746

‚úÖ Rows returned: 1


[['PASS - All Uppercase', '41746']]

In [34]:

# ------------------------------------------------------------
# VALIDATION 2: which states remain in the refined table?
# Counts rows per customer_state, largest group first.
# ------------------------------------------------------------
sql_state_check = f"""
SELECT 
    customer_state,
    COUNT(*) AS count
FROM {DATABASE}.{REFINED_TABLE}
GROUP BY customer_state
ORDER BY count DESC;
"""
run_athena_query(
    sql=sql_state_check,
    description="Validation 2: Verify State Filter (Should be SP only)",
)



üìä Validation 2: Verify State Filter (Should be SP only)
Query: 
SELECT 
    customer_state,
    COUNT(*) AS count
FROM mydb_01.refined_customer_refined
GROUP BY cu...
Query ID: 465d56d4-02be-477c-8a9e-b5e7bc97e132
‚è≥ Waiting for query to complete....
Status: SUCCEEDED

customer_state | count
------------------------------------------------------------
SP | 41746

‚úÖ Rows returned: 1


[['SP', '41746']]

In [35]:

# ============================================================
# VALIDATION 3: Check if customer_unique_id column was dropped
# ============================================================
# A one-row SELECT * is enough: only the header row (the column names) is used.
sql_column_check = f"""
SELECT * 
FROM {DATABASE}.{REFINED_TABLE} 
LIMIT 1;
"""
print(f"\n{'='*60}")
print("üìä Validation 3: Check Dropped Columns")
print(f"{'='*60}")
print("Query: Fetching table schema...")

resp = athena.start_query_execution(
    QueryString=sql_column_check,
    QueryExecutionContext={"Database": DATABASE},
    WorkGroup=WORKGROUP,
    ResultConfiguration={"OutputLocation": OUTPUT_LOCATION}
)
qid = resp["QueryExecutionId"]

# Wait for completion, keeping the full status record for failure reporting.
while True:
    execution_status = athena.get_query_execution(QueryExecutionId=qid)["QueryExecution"]["Status"]
    status = execution_status["State"]
    if status in ("SUCCEEDED", "FAILED", "CANCELLED"):
        break
    time.sleep(2)

if status == "SUCCEEDED":
    results = athena.get_query_results(QueryExecutionId=qid)
    headers = [c.get("VarCharValue", "") for c in results["ResultSet"]["Rows"][0]["Data"]]
    print("\nColumns in refined table:")
    for i, col in enumerate(headers, 1):
        print(f"  {i}. {col}")

    if "customer_unique_id" in headers:
        print("\n‚ùå FAIL: customer_unique_id column still exists!")
    else:
        print("\n‚úÖ PASS: customer_unique_id column was successfully dropped!")
else:
    # A FAILED or CANCELLED query must be reported; otherwise this validation
    # prints neither PASS nor FAIL and looks as if it had never been run.
    reason = execution_status.get("StateChangeReason", "no reason reported")
    print(f"\nFAIL: schema query ended in state {status}: {reason}")



üìä Validation 3: Check Dropped Columns
Query: Fetching table schema...

Columns in refined table:
  1. customer_id
  2. customer_zip_code_prefix
  3. customer_city
  4. customer_state

‚úÖ PASS: customer_unique_id column was successfully dropped!


In [36]:

# ------------------------------------------------------------
# VALIDATION 4: row counts of the raw table and the refined table,
# returned as one two-row result.
# ------------------------------------------------------------
sql_compare_counts = f"""
SELECT 
    'Raw Table' AS source,
    COUNT(*) AS row_count
FROM {DATABASE}.{RAW_TABLE}
UNION ALL
SELECT 
    'Refined Table (SP only)' AS source,
    COUNT(*) AS row_count
FROM {DATABASE}.{REFINED_TABLE};
"""
run_athena_query(
    sql=sql_compare_counts,
    description="Validation 4: Compare Raw vs Refined Row Counts",
)



üìä Validation 4: Compare Raw vs Refined Row Counts
Query: 
SELECT 
    'Raw Table' AS source,
    COUNT(*) AS row_count
FROM mydb_01.raw_customers
UNION ALL
S...
Query ID: 520f4847-135e-4832-b2bd-9d68ca3f868a
‚è≥ Waiting for query to complete....
Status: SUCCEEDED

source | row_count
------------------------------------------------------------
Refined Table (SP only) | 41746
Raw Table | 99441

‚úÖ Rows returned: 2


[['Refined Table (SP only)', '41746'], ['Raw Table', '99441']]

In [37]:

# ------------------------------------------------------------
# VALIDATION 5: eyeball ten rows of the refined table.
# ------------------------------------------------------------
sql_sample = f"""
SELECT 
    customer_id,
    customer_zip_code_prefix,
    customer_city,
    customer_state
FROM {DATABASE}.{REFINED_TABLE}
LIMIT 10;
"""
run_athena_query(
    sql=sql_sample,
    description="Validation 5: Sample Refined Data (First 10 rows)",
)



üìä Validation 5: Sample Refined Data (First 10 rows)
Query: 
SELECT 
    customer_id,
    customer_zip_code_prefix,
    customer_city,
    customer_state
FROM m...
Query ID: 532e971a-80df-4e7e-8e01-56cc20ed80d4
‚è≥ Waiting for query to complete....
Status: SUCCEEDED

customer_id | customer_zip_code_prefix | customer_city | customer_state
------------------------------------------------------------
06b8999e2fba1a1fbc88172c00ba8bc7 | 14409.0 | FRANCA | SP
18955e83d337fd6b2def6b18a428ac77 | 9790.0 | SAO BERNARDO DO CAMPO | SP
4e7b3e00288586ebd08712fdd0374a03 | 1151.0 | SAO PAULO | SP
b2b6027bc5c5109e529d4dc6358b12c3 | 8775.0 | MOGI DAS CRUZES | SP
4f2d8ab171c80ec8364f7c12e35b23ad | 13056.0 | CAMPINAS | SP
fd826e7cf63160e536e0908c76c3f441 | 4534.0 | SAO PAULO | SP
b2d1536598b73a9abd18e0d75d92f0a3 | 18682.0 | LENCOIS PAULISTA | SP
eabebad39a88bb6f5b52376faec28612 | 5704.0 | SAO PAULO | SP
206f3129c0e4d7d0b9550426023f0a08 | 13412.0 | PIRACICABA | SP
c5c61596a3b6bd0cee5766992c48a9a1 | 7124

[['06b8999e2fba1a1fbc88172c00ba8bc7', '14409.0', 'FRANCA', 'SP'],
 ['18955e83d337fd6b2def6b18a428ac77', '9790.0', 'SAO BERNARDO DO CAMPO', 'SP'],
 ['4e7b3e00288586ebd08712fdd0374a03', '1151.0', 'SAO PAULO', 'SP'],
 ['b2b6027bc5c5109e529d4dc6358b12c3', '8775.0', 'MOGI DAS CRUZES', 'SP'],
 ['4f2d8ab171c80ec8364f7c12e35b23ad', '13056.0', 'CAMPINAS', 'SP'],
 ['fd826e7cf63160e536e0908c76c3f441', '4534.0', 'SAO PAULO', 'SP'],
 ['b2d1536598b73a9abd18e0d75d92f0a3', '18682.0', 'LENCOIS PAULISTA', 'SP'],
 ['eabebad39a88bb6f5b52376faec28612', '5704.0', 'SAO PAULO', 'SP'],
 ['206f3129c0e4d7d0b9550426023f0a08', '13412.0', 'PIRACICABA', 'SP'],
 ['c5c61596a3b6bd0cee5766992c48a9a1', '7124.0', 'GUARULHOS', 'SP']]

In [None]:

# ------------------------------------------------------------
# VALIDATION 6: the ten cities with the most customers in the refined table,
# followed by a closing summary of the checks in this notebook.
# ------------------------------------------------------------
sql_top_cities = f"""
SELECT 
    customer_city,
    customer_state,
    COUNT(*) AS customer_count
FROM {DATABASE}.{REFINED_TABLE}
GROUP BY customer_city, customer_state
ORDER BY customer_count DESC
LIMIT 10;
"""
run_athena_query(
    sql=sql_top_cities,
    description="Validation 6: Top 10 Cities in Refined Data",
)

# The summary is a fixed list; it is printed whatever the queries returned.
closing_rule = "=" * 60
print("\n" + closing_rule)
print("‚úÖ ALL VALIDATIONS COMPLETE!")
print(closing_rule)
print("\nSummary of Validations:")
for summary_line in (
    "1. ‚úì Uppercase transformation on customer_city",
    "2. ‚úì State filter (SP only)",
    "3. ‚úì Column drop (customer_unique_id)",
    "4. ‚úì Row count comparison",
    "5. ‚úì Sample data inspection",
    "6. ‚úì Top cities analysis",
):
    print(summary_line)