In [1]:

import boto3
from botocore.exceptions import ClientError
import os
from pathlib import Path
import json

In [None]:
BUCKET_NAME = 'soccer-database-project' 
REGION = 'us-east-1'
CSV_DIRECTORY = r'D:\Intern\AWS Lambda\Dataset' 
S3_FOLDER = 'raw-dataset'
LAMBDA_FUNCTION_NAME = 'data-cleaning-function'  # UPDATE THIS
AWS_ACCOUNT_ID = ''

In [None]:
AWS_SECRET_ACCESS_KEY=''
AWS_ACCESS_KEY_ID='' 



In [4]:
def get_aws_clients(aws_access_key=AWS_ACCESS_KEY_ID, aws_secret_key=AWS_SECRET_ACCESS_KEY, region='us-east-1'):
    """Initialize AWS clients for S3 and Lambda"""
    if aws_access_key and aws_secret_key:
        s3 = boto3.client('s3', aws_access_key_id=aws_access_key, 
                         aws_secret_access_key=aws_secret_key, region_name=region)
        lambda_client = boto3.client('lambda', aws_access_key_id=aws_access_key,
                                    aws_secret_access_key=aws_secret_key, region_name=region)
    else:
        s3 = boto3.client('s3', region_name=region)
        lambda_client = boto3.client('lambda', region_name=region)
    
    return s3, lambda_client

In [5]:
def create_s3_bucket(bucket_name, aws_access_key=AWS_ACCESS_KEY_ID, 
                    aws_secret_key=AWS_SECRET_ACCESS_KEY, region='us-east-1'):
    """Create S3 bucket if it doesn't exist"""
    s3, _ = get_aws_clients(aws_access_key, aws_secret_key, region)
    
    try:
        s3.head_bucket(Bucket=bucket_name)
        print(f"‚úì Bucket '{bucket_name}' already exists")
        return True
    except ClientError as e:
        error_code = e.response['Error']['Code']
        if error_code == '404':
            try:
                if region == 'us-east-1':
                    s3.create_bucket(Bucket=bucket_name)
                else:
                    s3.create_bucket(
                        Bucket=bucket_name,
                        CreateBucketConfiguration={'LocationConstraint': region}
                    )
                print(f"‚úì Successfully created bucket '{bucket_name}' in region '{region}'")
                return True
            except ClientError as create_error:
                print(f"‚úó Error creating bucket: {create_error}")
                return False
        else:
            print(f"‚úó Error checking bucket: {e}")
            return False


In [6]:
def upload_csv_to_s3(file_path, bucket_name, s3_key=None, aws_access_key=AWS_ACCESS_KEY_ID, 
                     aws_secret_key=AWS_SECRET_ACCESS_KEY, region='us-east-1'):
    """Upload a single CSV file to S3"""
    
    # Initialize S3 client
    if aws_access_key and aws_secret_key:
        s3 = boto3.client('s3', aws_access_key_id=aws_access_key,
                         aws_secret_access_key=aws_secret_key, region_name=region)
    else:
        s3 = boto3.client('s3', region_name=region)
    
    # If s3_key not provided, use the filename
    if s3_key is None:
        s3_key = os.path.basename(file_path)
    
    try:
        # Upload the file
        s3.upload_file(file_path, bucket_name, s3_key)
        print(f"‚úì Successfully uploaded {os.path.basename(file_path)} to s3://{bucket_name}/{s3_key}")
        return True
    except FileNotFoundError:
        print(f"‚úó Error: File {file_path} not found")
        return False
    except ClientError as e:
        print(f"‚úó Error uploading {file_path}: {e}")
        return False


In [7]:
def upload_multiple_csvs(directory, bucket_name, s3_folder='', aws_access_key=AWS_ACCESS_KEY_ID,
                        aws_secret_key=AWS_SECRET_ACCESS_KEY, region='us-east-1'):
    """Upload all CSV files from a directory to S3"""
    
    results = {'success': 0, 'failed': 0, 'files': []}
    
    # Get all CSV files in directory
    csv_files = list(Path(directory).glob('*.csv'))
    
    if not csv_files:
        print(f"No CSV files found in {directory}")
        return results
    
    print(f"Found {len(csv_files)} CSV file(s) to upload\n")
    
    for csv_file in csv_files:
        # Construct S3 key with optional folder
        s3_key = f"{s3_folder}/{csv_file.name}" if s3_folder else csv_file.name
        
        success = upload_csv_to_s3(
            str(csv_file), 
            bucket_name, 
            s3_key,
            aws_access_key,
            aws_secret_key,
            region
        )
        
        if success:
            results['success'] += 1
            results['files'].append(str(csv_file))
        else:
            results['failed'] += 1
    
    print(f"\n--- Upload Summary ---")
    print(f"Total: {len(csv_files)} | Success: {results['success']} | Failed: {results['failed']}")
    
    return results

In [8]:
def add_lambda_permission(lambda_function_name, bucket_name, account_id, 
                         aws_access_key=AWS_ACCESS_KEY_ID, aws_secret_key=AWS_SECRET_ACCESS_KEY,
                         region='us-east-1'):
    """Add permission for S3 to invoke Lambda"""
    _, lambda_client = get_aws_clients(aws_access_key, aws_secret_key, region)
    
    statement_id = f'S3InvokePermission-{bucket_name}'
    
    try:
        # Remove existing permission if it exists
        try:
            lambda_client.remove_permission(
                FunctionName=lambda_function_name,
                StatementId=statement_id
            )
            print(f"‚úì Removed existing permission")
        except ClientError:
            pass  # Permission doesn't exist, continue
        
        # Add new permission
        lambda_client.add_permission(
            FunctionName=lambda_function_name,
            StatementId=statement_id,
            Action='lambda:InvokeFunction',
            Principal='s3.amazonaws.com',
            SourceArn=f'arn:aws:s3:::{bucket_name}',
            SourceAccount=account_id
        )
        print(f"‚úì Added Lambda permission for S3 to invoke function")
        return True
        
    except ClientError as e:
        print(f"‚úó Error adding Lambda permission: {e}")
        return False

In [9]:
def configure_s3_trigger(bucket_name, lambda_function_name, region, account_id,
                        prefix='raw-dataset/', 
                        aws_access_key=AWS_ACCESS_KEY_ID, aws_secret_key=AWS_SECRET_ACCESS_KEY):
    """Configure S3 to trigger Lambda on file upload"""
    s3, _ = get_aws_clients(aws_access_key, aws_secret_key, region)
    
    lambda_arn = f'arn:aws:lambda:{region}:{account_id}:function:{lambda_function_name}'
    
    notification_configuration = {
        'LambdaFunctionConfigurations': [
            {
                'Id': 'TriggerLambdaOnUpload',
                'LambdaFunctionArn': lambda_arn,
                'Events': ['s3:ObjectCreated:*'],
                'Filter': {
                    'Key': {
                        'FilterRules': [
                            {'Name': 'prefix', 'Value': prefix}
                        ]
                    }
                }
            }
        ]
    }
    
    try:
        s3.put_bucket_notification_configuration(
            Bucket=bucket_name,
            NotificationConfiguration=notification_configuration
        )
        print(f"‚úì Successfully configured S3 trigger for Lambda")
        print(f"  ‚Üí Watching: s3://{bucket_name}/{prefix}")
        print(f"  ‚Üí Triggers: {lambda_function_name}")
        return True
        
    except ClientError as e:
        print(f"‚úó Error configuring S3 trigger: {e}")
        print(f"  Make sure Lambda permissions are set first!")
        return False


In [10]:
def verify_trigger_configuration(bucket_name, aws_access_key=AWS_ACCESS_KEY_ID, 
                                aws_secret_key=AWS_SECRET_ACCESS_KEY, region='us-east-1'):
    """Verify the trigger is configured correctly"""
    s3, _ = get_aws_clients(aws_access_key, aws_secret_key, region)
    
    try:
        response = s3.get_bucket_notification_configuration(Bucket=bucket_name)
        
        if 'LambdaFunctionConfigurations' in response and response['LambdaFunctionConfigurations']:
            print(f"\n‚úì Active S3 Event Notifications:")
            for config in response['LambdaFunctionConfigurations']:
                print(f"  ‚Üí ID: {config.get('Id')}")
                print(f"  ‚Üí Lambda: {config.get('LambdaFunctionArn')}")
                print(f"  ‚Üí Events: {config.get('Events')}")
                if 'Filter' in config:
                    filters = config['Filter'].get('Key', {}).get('FilterRules', [])
                    for f in filters:
                        print(f"  ‚Üí {f['Name']}: {f['Value']}")
            return True
        else:
            print(f"\n‚ö† No event notifications configured for bucket '{bucket_name}'")
            return False
            
    except ClientError as e:
        print(f"‚úó Error checking trigger configuration: {e}")
        return False


In [11]:
def setup_complete_pipeline(bucket_name=BUCKET_NAME, lambda_function_name=LAMBDA_FUNCTION_NAME,
                           account_id=AWS_ACCOUNT_ID, region=REGION):
    """Complete setup: bucket, permissions, and trigger"""
    
    print("=" * 70)
    print("üöÄ AWS Lambda + S3 Pipeline Setup")
    print("=" * 70)
    
    # Step 1: Create bucket
    print("\n[1/4] Creating S3 bucket...")
    if not create_s3_bucket(bucket_name, region=region):
        print("‚ùå Setup failed at bucket creation")
        return False
    
    # Step 2: Add Lambda permission
    print("\n[2/4] Adding Lambda permission for S3...")
    if not add_lambda_permission(lambda_function_name, bucket_name, account_id, region=region):
        print("‚ùå Setup failed at Lambda permission")
        return False
    
    # Step 3: Configure S3 trigger
    print("\n[3/4] Configuring S3 event notification...")
    if not configure_s3_trigger(bucket_name, lambda_function_name, region, account_id):
        print("‚ùå Setup failed at S3 trigger configuration")
        return False
    
    # Step 4: Verify
    print("\n[4/4] Verifying configuration...")
    verify_trigger_configuration(bucket_name, region=region)
    
    print("\n" + "=" * 70)
    print("‚úÖ Setup Complete!")
    print("=" * 70)
    print(f"\nüìÅ Upload files to: s3://{bucket_name}/raw-dataset/")
    print(f"‚ö° Lambda will automatically process them!")
    
    return True


In [12]:
if __name__ == "__main__":
    print("=" * 70)
    print("üèóÔ∏è  S3 + Lambda Pipeline Configuration")
    print("=" * 70)
    
    # Step 1: Create/Verify Bucket
    print("\n=== Step 1: Creating S3 Bucket ===")
    bucket_created = create_s3_bucket(
        bucket_name=BUCKET_NAME,
        region=REGION,
        aws_access_key=AWS_ACCESS_KEY_ID,
        aws_secret_key=AWS_SECRET_ACCESS_KEY
    )
    
    if not bucket_created:
        print("‚ùå Failed to create/access bucket. Stopping.")
        exit(1)
    
    # Step 2: List CSV Files
    print("\n=== Step 2: Scanning CSV Files ===")
    csv_files = list(Path(CSV_DIRECTORY).glob('*.csv'))
    print(f"Found {len(csv_files)} CSV file(s) in '{CSV_DIRECTORY}':")
    for i, file in enumerate(csv_files, 1):
        print(f"  {i}. {file.name}")
    
    # Step 3: Setup Lambda Trigger (Optional - uncomment to run)
    print("\n=== Step 3: Lambda Trigger Setup ===")
    print("‚ö†Ô∏è  To enable automatic Lambda triggers:")
    print(f"1. Update LAMBDA_FUNCTION_NAME = '{LAMBDA_FUNCTION_NAME}'")
    print(f"2. Update AWS_ACCOUNT_ID = '{AWS_ACCOUNT_ID}'")
    print("3. Uncomment the setup_complete_pipeline() line below\n")
    
    # Uncomment this line after updating the config:
    setup_complete_pipeline()
    
    # Or verify existing trigger:
    verify_trigger_configuration(BUCKET_NAME, region=REGION)
    
    # Step 4: Upload Files
    print("\n=== Step 4: Upload Files to S3 ===")
    upload_choice = input("Do you want to upload CSV files now? (yes/no): ").strip().lower()
    
    if upload_choice in ['yes', 'y']:
        print("\nüì§ Starting upload...")
        results = upload_multiple_csvs(
            directory=CSV_DIRECTORY,
            bucket_name=BUCKET_NAME,
            s3_folder=S3_FOLDER,
            aws_access_key=AWS_ACCESS_KEY_ID,
            aws_secret_key=AWS_SECRET_ACCESS_KEY,
            region=REGION
        )
        
        if results['success'] > 0:
            print(f"\n‚úÖ Successfully uploaded {results['success']} file(s)!")
            if LAMBDA_FUNCTION_NAME != 'your-lambda-function-name':
                print("‚ö° Lambda function should be processing them now...")
                print("üìä Check CloudWatch Logs to see processing status")
    else:
        print("\n‚è≠Ô∏è  Skipping upload. Run upload_multiple_csvs() when ready.")
    
    print("\n" + "=" * 70)
    print("‚úÖ Script Complete!")
    print("=" * 70)

üèóÔ∏è  S3 + Lambda Pipeline Configuration

=== Step 1: Creating S3 Bucket ===
‚úì Successfully created bucket 'soccer-database-project' in region 'us-east-1'

=== Step 2: Scanning CSV Files ===
Found 5 CSV file(s) in 'D:\Intern\AWS Lambda\Dataset':
  1. League.csv
  2. Player.csv
  3. Player_Attributes.csv
  4. Team.csv
  5. Team_Attributes.csv

=== Step 3: Lambda Trigger Setup ===
‚ö†Ô∏è  To enable automatic Lambda triggers:
1. Update LAMBDA_FUNCTION_NAME = 'data-cleaning-function'
2. Update AWS_ACCOUNT_ID = '311353793773'
3. Uncomment the setup_complete_pipeline() line below

üöÄ AWS Lambda + S3 Pipeline Setup

[1/4] Creating S3 bucket...
‚úì Bucket 'soccer-database-project' already exists

[2/4] Adding Lambda permission for S3...
‚úì Removed existing permission
‚úì Added Lambda permission for S3 to invoke function

[3/4] Configuring S3 event notification...
‚úì Successfully configured S3 trigger for Lambda
  ‚Üí Watching: s3://soccer-database-project/raw-dataset/
  ‚Üí Triggers: 