# In [None]:
# Serverless Data Lake on AWS (GitHub Ready)

# This document provides a conceptual implementation for a serverless data lake on AWS.
# It demonstrates how components like S3, Lambda, Glue, and Athena can be integrated
# and managed through a GitHub-driven deployment.

# Project Overview:
# This project establishes a serverless data lake architecture designed to ingest,
# process, and query data efficiently. Raw data is landed in an S3 bucket, processed
# by AWS Glue for ETL, and made queryable via AWS Athena. AWS Lambda functions handle
# event-driven processing, such as triggering Glue jobs upon new data arrival.

# GitHub Repository Structure Suggestion:
# Your GitHub repository for this project might look like this:
# my-data-lake/
# ├── lambda_functions/
# │   └── data_ingestion_trigger/
# │       ├── lambda_function.py   # Python code for Lambda
# │       └── requirements.txt     # Lambda dependencies
# ├── glue_scripts/
# │   └── transform_data.py        # Python/PySpark script for Glue ETL
# ├── cloudformation/
# │   └── data_lake_stack.yml      # CloudFormation template for infrastructure
# ├── data/
# │   └── raw_data_example.csv     # Example raw data
# └── README.md

# --- AWS Lambda Function Example (lambda_functions/data_ingestion_trigger/lambda_function.py) ---
# This Lambda function is triggered by S3 object creation events and starts an AWS Glue job.

import json
import os
from urllib.parse import unquote_plus

import boto3

def lambda_handler(event, context):
    """
    Start an AWS Glue job run for every S3 object listed in an S3 event notification.

    Args:
        event: Lambda event payload. Expected to carry a 'Records' list whose
            entries expose ['s3']['bucket']['name'] and ['s3']['object']['key'].
        context: Lambda context object (unused).

    Returns:
        A dict with 'statusCode' and a JSON-encoded 'body':
          * 200 - a Glue job run was started for every record.
          * 400 - the event carried no records (e.g. an S3 test event).
          * 500 - a Glue call failed; runs started for earlier records in the
            same event are not rolled back.

    Raises:
        KeyError: if a record lacks the expected S3 bucket/key fields.
    """
    print("Received event:", json.dumps(event))

    # Events without records (for example the test event S3 emits when a
    # notification is configured) carry nothing to process.
    records = (event or {}).get('Records')
    if not records:
        print("No S3 records found in event; nothing to process.")
        return {
            'statusCode': 400,
            'body': json.dumps("No S3 records found in event; nothing to process.")
        }

    # One notification may batch several objects, so collect all of them
    # instead of reading only the first record. Object keys arrive URL-encoded
    # (a space is delivered as '+'), so decode them before handing them to Glue.
    s3_objects = [
        (record['s3']['bucket']['name'], unquote_plus(record['s3']['object']['key']))
        for record in records
    ]

    glue_client = boto3.client('glue')
    glue_job_name = os.environ.get('GLUE_JOB_NAME', 'your-glue-etl-job') # Set this as an env var in Lambda config
    s3_output_location = os.environ.get('S3_PROCESSED_DATA_BUCKET', 's3://your-processed-data-bucket/processed/')

    try:
        for s3_bucket, s3_key in s3_objects:
            print(f"New object '{s3_key}' detected in bucket '{s3_bucket}'.")
            response = glue_client.start_job_run(
                JobName=glue_job_name,
                Arguments={
                    '--S3_INPUT_BUCKET': s3_bucket,
                    '--S3_INPUT_KEY': s3_key,
                    '--S3_OUTPUT_LOCATION': s3_output_location
                }
            )
            print(f"Successfully started Glue job '{glue_job_name}'. Run ID: {response['JobRunId']}")
    except Exception as e:
        # Top-level boundary: report the failure in the response rather than
        # crashing the invocation.
        # NOTE(review): for asynchronous S3 invocations the return value is
        # presumably not inspected, so this path will not trigger a retry - confirm
        # whether re-raising is preferable for this deployment.
        print(f"Error triggering Glue job: {e}")
        return {
            'statusCode': 500,
            'body': json.dumps(f"Error triggering Glue job: {str(e)}")
        }

    return {
        'statusCode': 200,
        'body': json.dumps(f"Glue job '{glue_job_name}' triggered successfully.")
    }

# --- AWS Glue ETL Script Example (glue_scripts/transform_data.py) ---
# This PySpark script reads data from S3, performs a simple transformation, and writes it back to S3.

"""
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from pyspark.sql.functions import current_timestamp

# Glue ETL script: reads one CSV object from S3, adds a processing timestamp
# column, and writes the result to the output location as Parquet.

# @params: [JOB_NAME, S3_INPUT_BUCKET, S3_INPUT_KEY, S3_OUTPUT_LOCATION]
# Option names are listed WITHOUT the leading '--'. The caller passes them as
# '--S3_INPUT_BUCKET' etc., and the resolved values are read back below under
# the bare names.
args = getResolvedOptions(sys.argv, [
    'JOB_NAME',
    'S3_INPUT_BUCKET',
    'S3_INPUT_KEY',
    'S3_OUTPUT_LOCATION'
])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

s3_input_bucket = args['S3_INPUT_BUCKET']
s3_input_key = args['S3_INPUT_KEY']
s3_output_location = args['S3_OUTPUT_LOCATION']

print(f"Reading data from s3://{s3_input_bucket}/{s3_input_key}")

# Read raw data from S3 (e.g., CSV)
datasource = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    connection_options={"paths": [f"s3://{s3_input_bucket}/{s3_input_key}"]},
    format="csv",
    format_options={"withHeader": True, "separator": ","},
    transformation_ctx="datasource_raw"
)

# Example Transformation: Convert to a different format (e.g., Parquet) and add a timestamp
print("Applying transformations...")
transformed_df = datasource.toDF()
# current_timestamp comes from pyspark.sql.functions (imported above).
transformed_df = transformed_df.withColumn("processing_timestamp", current_timestamp())

# Write processed data back to S3 in Parquet format
print(f"Writing processed data to {s3_output_location}")
glueContext.write_dynamic_frame.from_options(
    # Convert the Spark DataFrame back into a DynamicFrame for the Glue writer.
    frame=DynamicFrame.fromDF(transformed_df, glueContext, "transformed_df"),
    connection_type="s3",
    connection_options={"path": s3_output_location},
    format="parquet",
    transformation_ctx="datasink_processed"
)

job.commit()
print("Glue job completed successfully.")
"""

# --- AWS CloudFormation Template Example (cloudformation/data_lake_stack.yml) ---
# This template defines the AWS resources for your data lake.
# Using Infrastructure as Code (IaC) like CloudFormation (or Terraform) is crucial for GitHub deployment.

"""
AWSTemplateFormatVersion: '2010-09-09'
Description: A serverless data lake architecture on AWS.

Parameters:
  RawDataBucketName:
    Type: String
    Default: your-raw-data-bucket-name-unique
    Description: Name for the S3 bucket to store raw data.
  ProcessedDataBucketName:
    Type: String
    Default: your-processed-data-bucket-name-unique
    Description: Name for the S3 bucket to store processed data.
  GlueJobName:
    Type: String
    Default: MyDataLakeETLJob
    Description: Name for the AWS Glue ETL job.
  # Declared so that the '!Ref CodeBucket' references below resolve.
  CodeBucket:
    Type: String
    Default: your-data-lake-code-bucket
    Description: Name of the existing S3 bucket that holds the Lambda ZIP file and the Glue script.

Resources:
  # S3 Bucket for Raw Data
  RawDataBucket:
    Type: AWS::S3::Bucket
    # The invoke permission must exist before S3 validates the notification target.
    DependsOn: S3LambdaPermission
    Properties:
      BucketName: !Ref RawDataBucketName
      Tags:
        - Key: Project
          Value: DataLake
      NotificationConfiguration:
        LambdaConfigurations:
          - Event: s3:ObjectCreated:*
            Function: !GetAtt GlueTriggerLambdaFunction.Arn
            Filter:
              S3Key:
                Rules:
                  - Name: suffix
                    Value: .csv # Trigger only for CSV files

  # S3 Bucket for Processed Data
  ProcessedDataBucket:
    Type: AWS::S3::Bucket
    Properties:
      BucketName: !Ref ProcessedDataBucketName
      Tags:
        - Key: Project
          Value: DataLake

  # IAM Role for Lambda Function
  LambdaExecutionRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service: lambda.amazonaws.com
            Action: sts:AssumeRole
      ManagedPolicyArns:
        - arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole
      Policies:
        - PolicyName: LambdaS3GlueAccess
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
              # Object-level actions use object ARNs (bucket/*).
              - Effect: Allow
                Action:
                  - s3:GetObject
                Resource: !Join ['', ['arn:aws:s3:::', !Ref RawDataBucketName, '/*']]
              - Effect: Allow
                Action:
                  - s3:PutObject
                Resource: !Join ['', ['arn:aws:s3:::', !Ref ProcessedDataBucketName, '/*']]
              # s3:ListBucket is a bucket-level action and needs the bucket ARN itself.
              - Effect: Allow
                Action:
                  - s3:ListBucket
                Resource:
                  - !Join ['', ['arn:aws:s3:::', !Ref RawDataBucketName]]
                  - !Join ['', ['arn:aws:s3:::', !Ref ProcessedDataBucketName]]
              - Effect: Allow
                Action:
                  - glue:StartJobRun
                Resource: !Sub 'arn:aws:glue:${AWS::Region}:${AWS::AccountId}:job/${GlueJobName}'

  # Lambda Function to Trigger Glue Job
  GlueTriggerLambdaFunction:
    Type: AWS::Lambda::Function
    Properties:
      FunctionName: !Sub 'data-lake-glue-trigger-${AWS::StackName}'
      Handler: lambda_function.lambda_handler
      Runtime: python3.9
      Code:
        S3Bucket: !Ref CodeBucket # Assuming you upload your code to a separate S3 bucket
        S3Key: lambda_functions/data_ingestion_trigger.zip
      MemorySize: 128
      Timeout: 30
      Role: !GetAtt LambdaExecutionRole.Arn
      Environment:
        Variables:
          GLUE_JOB_NAME: !Ref GlueJobName
          S3_PROCESSED_DATA_BUCKET: !Join ['', ['s3://', !Ref ProcessedDataBucketName, '/processed/']]

  # Permission for S3 to invoke Lambda
  S3LambdaPermission:
    Type: AWS::Lambda::Permission
    Properties:
      FunctionName: !GetAtt GlueTriggerLambdaFunction.Arn
      Action: lambda:InvokeFunction
      Principal: s3.amazonaws.com
      SourceAccount: !Ref AWS::AccountId
      # Built from the name parameter (not !GetAtt RawDataBucket.Arn) so that this
      # permission does not depend on the bucket, which in turn depends on it.
      SourceArn: !Sub 'arn:aws:s3:::${RawDataBucketName}'

  # IAM Role for Glue Job
  GlueExecutionRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service: glue.amazonaws.com
            Action: sts:AssumeRole
      ManagedPolicyArns:
        - arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole
      Policies:
        - PolicyName: GlueS3Access
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
              - Effect: Allow
                Action:
                  - s3:GetObject
                  - s3:PutObject
                Resource:
                  - !Join ['', ['arn:aws:s3:::', !Ref RawDataBucketName, '/*']]
                  - !Join ['', ['arn:aws:s3:::', !Ref ProcessedDataBucketName, '/*']]
              - Effect: Allow
                Action:
                  - s3:ListBucket
                Resource:
                  - !Join ['', ['arn:aws:s3:::', !Ref RawDataBucketName]]
                  - !Join ['', ['arn:aws:s3:::', !Ref ProcessedDataBucketName]]
              # The job must be able to read its own script from the code bucket.
              - Effect: Allow
                Action:
                  - s3:GetObject
                Resource: !Join ['', ['arn:aws:s3:::', !Ref CodeBucket, '/*']]

  # AWS Glue ETL Job
  GlueETLJob:
    Type: AWS::Glue::Job
    Properties:
      Name: !Ref GlueJobName
      Command:
        Name: glueetl
        ScriptLocation: !Join ['', ['s3://', !Ref CodeBucket, '/glue_scripts/transform_data.py']] # Assuming code in S3
        PythonVersion: 3
      Role: !GetAtt GlueExecutionRole.Arn
      GlueVersion: '3.0'
      WorkerType: Standard
      NumberOfWorkers: 2
      Timeout: 60

  # AWS Glue Data Catalog Database (for Athena)
  DataLakeDatabase:
    Type: AWS::Glue::Database
    Properties:
      CatalogId: !Ref AWS::AccountId
      DatabaseInput:
        Name: my_data_lake_db
        Description: Database for the serverless data lake

Outputs:
  RawDataBucketArn:
    Description: ARN of the S3 bucket for raw data
    Value: !GetAtt RawDataBucket.Arn
  ProcessedDataBucketArn:
    Description: ARN of the S3 bucket for processed data
    Value: !GetAtt ProcessedDataBucket.Arn
  GlueJobNameOutput:
    Description: Name of the AWS Glue ETL Job
    Value: !Ref GlueJobName
  LambdaFunctionArn:
    Description: ARN of the Lambda function that triggers the Glue job
    Value: !GetAtt GlueTriggerLambdaFunction.Arn
"""

# --- AWS Setup and GitHub Integration Steps ---

# 1.  **Create an S3 Bucket for Code:**
#     * Create an S3 bucket (e.g., `your-data-lake-code-bucket`) to store your Lambda function ZIP files and Glue scripts.
#     * This bucket is referenced in the CloudFormation template (`CodeBucket`).

# 2.  **Package and Upload Code:**
#     * **Lambda Function:**
#         * Navigate to `lambda_functions/data_ingestion_trigger/` locally.
#         * Install dependencies: `pip install -r requirements.txt -t .`
#         * Zip the contents: `zip -r ../data_ingestion_trigger.zip .`
#         * Upload `data_ingestion_trigger.zip` to `s3://your-data-lake-code-bucket/lambda_functions/`.
#     * **Glue Script:**
#         * Upload `glue_scripts/transform_data.py` to `s3://your-data-lake-code-bucket/glue_scripts/`.

# 3.  **Create IAM Roles:**
#     * The CloudFormation template defines `LambdaExecutionRole` and `GlueExecutionRole` with necessary permissions. When you deploy the CloudFormation stack, these roles will be created.

# 4.  **Deploy CloudFormation Stack:**
#     * Go to AWS CloudFormation in the AWS Management Console.
#     * Click "Create stack" -> "With new resources (standard)".
#     * Upload your `cloudformation/data_lake_stack.yml` file.
#     * Provide values for parameters like `RawDataBucketName` and `ProcessedDataBucketName` (ensure they are globally unique).
#     * Acknowledge IAM resource creation and create the stack.
#     * This will provision your S3 buckets, Lambda function, Glue job, and necessary IAM roles.

# 5.  **Set up S3 Event Notification (if not done by CloudFormation):**
#     * If your CloudFormation template doesn't automatically configure S3 event notifications (it does in the example above), you would manually configure the `RawDataBucket` to send `ObjectCreated` events to your `GlueTriggerLambdaFunction`.

# 6.  **Push to GitHub:**
#     * Once your AWS resources are set up (preferably via CloudFormation), commit your entire `my-data-lake` project to your GitHub repository.
#     * You can use **GitHub Actions** or **AWS CodePipeline** to automate the deployment of your CloudFormation stack and code updates.

#     * **Example GitHub Actions Workflow (.github/workflows/deploy-data-lake.yml):**
#         # This is a simplified example. In a real scenario, you'd use OIDC for authentication
#         # and more robust actions.
#         """
#         name: Deploy Data Lake
#         on:
#           push:
#             branches:
#               - main
#             paths:
#               - 'cloudformation/**'
#               - 'lambda_functions/**'
#               - 'glue_scripts/**'

#         jobs:
#           deploy:
#             runs-on: ubuntu-latest
#             steps:
#               - name: Checkout code
#                 uses: actions/checkout@v3

#               - name: Configure AWS credentials
#                 uses: aws-actions/configure-aws-credentials@v2
#                 with:
#                   aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
#                   aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
#                   aws-region: us-east-1 # Replace with your AWS region

#               - name: Upload Lambda function code to S3
#                 run: |
#                   cd lambda_functions/data_ingestion_trigger/
#                   pip install -r requirements.txt -t .
#                   zip -r ../data_ingestion_trigger.zip .
#                   aws s3 cp ../data_ingestion_trigger.zip s3://your-data-lake-code-bucket/lambda_functions/data_ingestion_trigger.zip

#               - name: Upload Glue script to S3
#                 run: |
#                   aws s3 cp glue_scripts/transform_data.py s3://your-data-lake-code-bucket/glue_scripts/transform_data.py

#               - name: Deploy CloudFormation stack
#                 uses: aws-actions/aws-cloudformation-github-deploy@v1
#                 with:
#                   name: my-data-lake-stack
#                   template: cloudformation/data_lake_stack.yml
#                   parameter-overrides: |
#                     RawDataBucketName=your-raw-data-bucket-name-unique
#                     ProcessedDataBucketName=your-processed-data-bucket-name-unique
#                   capabilities: CAPABILITY_IAM,CAPABILITY_NAMED_IAM
#         """

# Workflow:
# 1.  A developer pushes new raw data (e.g., `raw_data_example.csv`) to the `RawDataBucket` via S3 console, API, or programmatic upload.
# 2.  The S3 `ObjectCreated` event triggers the `GlueTriggerLambdaFunction`.
# 3.  The Lambda function starts the `GlueETLJob`, passing the S3 input and output paths as arguments.
# 4.  The Glue job reads the raw data, transforms it, and writes the processed data to the `ProcessedDataBucket`.
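# 5.  Users can then query the processed data with AWS Athena through the `my_data_lake_db` Glue Data Catalog database, once a table over the `ProcessedDataBucket` location has been registered (e.g., by a Glue crawler or a `CREATE EXTERNAL TABLE` statement); the example template creates only the database, not the table.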

# Considerations:
# * **Schema Evolution:** Implement strategies for handling schema changes in your data, such as using Glue Schema Registry or evolving your Glue ETL scripts.
# * **Data Partitioning:** For large datasets, partition your data in S3 (e.g., by date) to optimize Athena query performance and cost.
# * **Security:** Ensure proper IAM policies are in place, and data is encrypted at rest and in transit.
# * **Monitoring:** Use CloudWatch to monitor Lambda invocations, Glue job runs, and S3 bucket activity.