# Data Validation Notebook

This notebook performs data quality validation on processed data.

## Parameters
- `execution_date`: Date of the pipeline execution (e.g. `2024-01-01`)
- `s3_bucket`: Name of the S3 bucket containing the processed data
- `aws_endpoint_url`: Endpoint URL used for AWS API calls (points at LocalStack)

In [None]:
    "# Parameters - these will be injected by Papermill\n",
    "execution_date = '2024-01-01'\n",
    "s3_bucket = 'processed-data'\n",
    "aws_endpoint_url = 'http://localstack-service.localstack:4566'"

In [None]:
import pandas as pd
import boto3
import numpy as np
from datetime import datetime
import os
import json

# Export placeholder AWS credentials and a default region for LocalStack.
os.environ.update({
    'AWS_ACCESS_KEY_ID': 'test',
    'AWS_SECRET_ACCESS_KEY': 'test',
    'AWS_DEFAULT_REGION': 'us-east-1',
})

# Echo the run parameters so they are recorded in the notebook output.
print(
    f"Execution Date: {execution_date}",
    f"S3 Bucket: {s3_bucket}",
    f"AWS Endpoint: {aws_endpoint_url}",
    sep="\n",
)

In [None]:
# Initialize S3 client against the configured endpoint (LocalStack), using
# the same placeholder credentials and region as the environment setup.
s3_client = boto3.client(
    's3',
    endpoint_url=aws_endpoint_url,
    aws_access_key_id='test',
    aws_secret_access_key='test',
    region_name='us-east-1'
)

# List objects in the processed data bucket.
# A single list_objects_v2 call returns at most 1000 keys, so the listing is
# paged to make the reported total accurate for larger buckets. Only the
# running count and the first few entries are kept in memory.
PREVIEW_LIMIT = 10  # number of objects printed as a sample
try:
    paginator = s3_client.get_paginator('list_objects_v2')
    object_count = 0
    preview = []
    for page in paginator.paginate(Bucket=s3_bucket):
        # Pages of an empty bucket carry no 'Contents' key.
        contents = page.get('Contents', [])
        object_count += len(contents)
        if len(preview) < PREVIEW_LIMIT:
            preview.extend(contents[:PREVIEW_LIMIT - len(preview)])

    if object_count:
        print(f"Found {object_count} objects in bucket:")
        for obj in preview:  # Show first 10
            print(f"  - {obj['Key']} ({obj['Size']} bytes)")
    else:
        print("No objects found in bucket")
except Exception as e:
    # The error is reported but not re-raised, so later cells still run.
    print(f"Error accessing bucket: {e}")

## Summary

Data validation completed!

This notebook validates the quality and integrity of processed data.