diff --git a/.gitignore b/.gitignore index 03664de..f7bea21 100644 --- a/.gitignore +++ b/.gitignore @@ -33,4 +33,5 @@ override.tf.json .terraformrc terraform.rc -downloaded_package_* \ No newline at end of file +downloaded_package_* +MEMORY-DEMO-QUICKSTART.md diff --git a/modules/scenarios/main.tf b/modules/scenarios/main.tf index 0fa1d6d..90819b1 100644 --- a/modules/scenarios/main.tf +++ b/modules/scenarios/main.tf @@ -65,3 +65,25 @@ module "vpc" { Environment = "dev" } } + +# Memory optimization demo scenario +module "memory_optimization" { + source = "./memory-optimization" + + # Control whether this scenario is enabled + enabled = var.enable_memory_optimization_demo + + # Use the VPC created above instead of default VPC + use_default_vpc = false + vpc_id = module.vpc.vpc_id + subnet_ids = module.vpc.public_subnets + + # Demo configuration + name_prefix = "scenarios-memory-demo" + container_memory = var.memory_optimization_container_memory + number_of_containers = var.memory_optimization_container_count + + # Context for the demo + days_until_black_friday = var.days_until_black_friday + days_since_last_memory_change = 423 +} diff --git a/modules/scenarios/memory-optimization/.terraform.lock.hcl b/modules/scenarios/memory-optimization/.terraform.lock.hcl new file mode 100644 index 0000000..7dc9923 --- /dev/null +++ b/modules/scenarios/memory-optimization/.terraform.lock.hcl @@ -0,0 +1,45 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. 
+ +provider "registry.terraform.io/hashicorp/aws" { + version = "5.100.0" + constraints = "~> 5.0" + hashes = [ + "h1:wOhTPz6apLBuF7/FYZuCoXRK/MLgrNprZ3vXmq83g5k=", + "zh:054b8dd49f0549c9a7cc27d159e45327b7b65cf404da5e5a20da154b90b8a644", + "zh:0b97bf8d5e03d15d83cc40b0530a1f84b459354939ba6f135a0086c20ebbe6b2", + "zh:1589a2266af699cbd5d80737a0fe02e54ec9cf2ca54e7e00ac51c7359056f274", + "zh:6330766f1d85f01ae6ea90d1b214b8b74cc8c1badc4696b165b36ddd4cc15f7b", + "zh:7c8c2e30d8e55291b86fcb64bdf6c25489d538688545eb48fd74ad622e5d3862", + "zh:99b1003bd9bd32ee323544da897148f46a527f622dc3971af63ea3e251596342", + "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", + "zh:9f8b909d3ec50ade83c8062290378b1ec553edef6a447c56dadc01a99f4eaa93", + "zh:aaef921ff9aabaf8b1869a86d692ebd24fbd4e12c21205034bb679b9caf883a2", + "zh:ac882313207aba00dd5a76dbd572a0ddc818bb9cbf5c9d61b28fe30efaec951e", + "zh:bb64e8aff37becab373a1a0cc1080990785304141af42ed6aa3dd4913b000421", + "zh:dfe495f6621df5540d9c92ad40b8067376350b005c637ea6efac5dc15028add4", + "zh:f0ddf0eaf052766cfe09dea8200a946519f653c384ab4336e2a4a64fdd6310e9", + "zh:f1b7e684f4c7ae1eed272b6de7d2049bb87a0275cb04dbb7cda6636f600699c9", + "zh:ff461571e3f233699bf690db319dfe46aec75e58726636a0d97dd9ac6e32fb70", + ] +} + +provider "registry.terraform.io/hashicorp/random" { + version = "3.7.2" + constraints = "~> 3.1" + hashes = [ + "h1:Lmv2TxyKKm9Vt4uxcPZHw1uf0Ax/yYizJlilbLSZN8E=", + "zh:14829603a32e4bc4d05062f059e545a91e27ff033756b48afbae6b3c835f508f", + "zh:1527fb07d9fea400d70e9e6eb4a2b918d5060d604749b6f1c361518e7da546dc", + "zh:1e86bcd7ebec85ba336b423ba1db046aeaa3c0e5f921039b3f1a6fc2f978feab", + "zh:24536dec8bde66753f4b4030b8f3ef43c196d69cccbea1c382d01b222478c7a3", + "zh:29f1786486759fad9b0ce4fdfbbfece9343ad47cd50119045075e05afe49d212", + "zh:4d701e978c2dd8604ba1ce962b047607701e65c078cb22e97171513e9e57491f", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + 
"zh:7b8434212eef0f8c83f5a90c6d76feaf850f6502b61b53c329e85b3b281cba34", + "zh:ac8a23c212258b7976e1621275e3af7099e7e4a3d4478cf8d5d2a27f3bc3e967", + "zh:b516ca74431f3df4c6cf90ddcdb4042c626e026317a33c53f0b445a3d93b720d", + "zh:dc76e4326aec2490c1600d6871a95e78f9050f9ce427c71707ea412a2f2f1a62", + "zh:eac7b63e86c749c7d48f527671c7aee5b4e26c10be6ad7232d6860167f99dbb0", + ] +} diff --git a/modules/scenarios/memory-optimization/README.md b/modules/scenarios/memory-optimization/README.md new file mode 100644 index 0000000..b75203a --- /dev/null +++ b/modules/scenarios/memory-optimization/README.md @@ -0,0 +1,355 @@ +# Memory Optimization Demo - The Friday Afternoon Trap + +This Terraform module demonstrates a realistic scenario where a seemingly simple memory optimization leads to a production outage. It's designed to show how Overmind catches hidden risks that traditional infrastructure tools miss. + +## 🎯 The Scenario + +**The Setup**: It's Friday afternoon, 7 days before Black Friday. Your Java application is running on 15 ECS Fargate containers, each allocated 2048MB of memory. CloudWatch monitoring shows an average memory usage of only 800MB per container. + +**The Temptation**: Your CFO wants cost reductions before the holiday season. You calculate that reducing memory from 2GB to 1GB per container would save **$750/month** ($9,000/year). + +**The Trap**: The application is configured with `-Xmx1536m` (1536MB Java heap) plus 256MB overhead, requiring 1792MB total. Reducing to 1024MB will cause immediate OutOfMemoryError crashes. + +**The Hidden Impact**: What appears to be a simple 2-resource change actually affects 47+ resources and risks a complete outage during peak season. + +## 📊 Cost Analysis + +``` +Current State: +- 15 containers × 2GB × $50/GB/month = $1,500/month +- Annual cost: $18,000 + +"Optimized" State: +- 15 containers × 1GB × $50/GB/month = $750/month +- Annual cost: $9,000 +- Savings: $9,000/year (50% reduction!)
+``` + +## 🏗️ Infrastructure Created + +This module creates a complete, isolated environment: + +- **ECS Cluster** with Container Insights enabled +- **ECS Service** running 15 Tomcat containers with Java heap trap +- **Application Load Balancer** with 5-second deregistration (no rollback time!) +- **CloudWatch Alarms** that will fire when containers crash +- **Security Groups** and networking for realistic production setup +- **SNS Topic** for alert notifications + +## 🚀 Quick Start + +### 1. Deploy the Safe Configuration + +```hcl +# Create: standalone-demo.tf +module "memory_optimization_demo" { + source = "./modules/scenarios/memory-optimization" + + enabled = true + container_memory = 2048 # SAFE - meets Java requirements + + # Optional customizations + name_prefix = "my-memory-demo" + number_of_containers = 15 + use_default_vpc = true +} + +output "demo_info" { + value = module.memory_optimization_demo.demo_status +} + +output "app_url" { + value = module.memory_optimization_demo.alb_url +} +``` + +```bash +terraform init +terraform apply +``` + +### 2. Verify the Application Works + +```bash +# Get the ALB URL +terraform output app_url + +# Test the application (should return Tomcat default page) +curl $(terraform output -raw app_url) +``` + +### 3. Create the Breaking Change + +```bash +# Create a feature branch +git checkout -b memory-optimization + +# Edit your module call to use the "optimized" memory +# Change container_memory from 2048 to 1024 +``` + +```hcl +module "memory_optimization_demo" { + source = "./modules/scenarios/memory-optimization" + + enabled = true + container_memory = 1024 # DANGEROUS - will cause OOM! + + name_prefix = "my-memory-demo" + number_of_containers = 15 + use_default_vpc = true +} +``` + +### 4. 
See the "Simple" Change + +```bash +terraform plan +``` + +**Terraform shows**: 2 resources to change +- `aws_ecs_task_definition.app[0]` (memory: 2048 → 1024) +- `aws_ecs_service.app[0]` (task definition ARN update) + +**Reality**: 47+ resources will be affected when all containers crash! + +### 5. Apply and Watch the Crash (Optional) + +⚠️ **Warning**: This will actually break the application! + +```bash +terraform apply +``` + +**What happens**: +1. All 15 containers restart with new memory limit +2. Java tries to allocate 1536MB heap in 1024MB container +3. Immediate OutOfMemoryError on startup +4. Containers crash in a loop +5. ALB health checks fail +6. CloudWatch alarms fire +7. Service becomes unavailable + +### 6. Check the Damage + +```bash +# View failed containers +aws ecs describe-services --cluster $(terraform output -raw cluster_name) --services $(terraform output -raw service_name) + +# Check logs for OOM errors +aws logs filter-log-events --log-group-name $(terraform output -raw log_group_name) --filter-pattern "OutOfMemoryError" + +# Monitor CloudWatch alarms (they should be firing) +aws cloudwatch describe-alarms --alarm-names $(terraform output -raw cluster_name)-* +``` + +### 7. 
Fix and Cleanup + +```bash +# Fix: Change container_memory back to 2048 or higher +# In your module call: +container_memory = 2048 + +terraform apply + +# Or completely clean up +terraform destroy +``` + +## 📋 Module Configuration + +### Required Variables + +```hcl +variable "enabled" { + description = "Toggle module on/off" + type = bool + default = true +} + +variable "container_memory" { + description = "Memory in MB (2048 = safe, 1024 = breaks)" + type = number + default = 2048 +} +``` + +### VPC Configuration Options + +**Option 1: Use Default VPC (Recommended for demo)** +```hcl +use_default_vpc = true +``` + +**Option 2: Create Standalone VPC** +```hcl +use_default_vpc = false +create_standalone_vpc = true +``` + +**Option 3: Use Existing VPC** +```hcl +use_default_vpc = false +create_standalone_vpc = false +vpc_id = "vpc-12345" +subnet_ids = ["subnet-12345", "subnet-67890"] +``` + +## 🔍 Understanding the Trap + +### Why Monitoring Misleads + +1. **CloudWatch shows 800MB average usage** - This is the `memoryReservation` setting, not actual usage +2. **GC cycles hide peak usage** - During garbage collection, memory spikes to ~1.8GB +3. **Container Insights don't show JVM internals** - They see container limits, not heap requirements + +### The Java Memory Model + +``` +Container Memory Limit: 1024MB (after "optimization") +├── Java Heap (-Xmx): 1536MB ❌ DOESN'T FIT! 
+├── Metaspace: ~100MB +├── Direct Memory: ~50MB +├── Code Cache: ~50MB +├── OS Overhead: ~100MB +└── Buffer: ~56MB +``` + +**Total Required**: ~1792MB +**Container Limit**: 1024MB +**Result**: OutOfMemoryError + +### Why Rollback Fails + +- **ALB deregistration delay**: 5 seconds (industry standard: 300s) +- **All containers restart simultaneously**: No gradual rollout +- **No circuit breaker**: Disabled to show realistic deployment +- **Black Friday timing**: Change 7 days before 10x traffic spike + +## 🎯 What Overmind Would Catch + +While `terraform plan` shows only 2 changing resources, Overmind would reveal: + +### Direct Dependencies (47+ resources) +- ECS Task Definition changes +- ECS Service deployment +- ALB Target Group health checks +- CloudWatch Alarms triggering +- Auto Scaling Group reactions +- Service Discovery updates +- IAM role assumptions +- CloudWatch Log streams + +### Hidden Impacts +- Connected microservices lose connectivity +- Database connection pools drain +- Circuit breakers in dependent services trip +- Load balancer health checks cascade +- Monitoring dashboards show degradation +- Cost allocation tags become inaccurate + +### Business Risk Analysis +- **Timing Risk**: 7 days before Black Friday +- **Blast Radius**: All 15 production containers +- **Recovery Time**: Limited by 5s deregistration delay +- **Customer Impact**: Complete service unavailability +- **Financial Impact**: Lost revenue during peak season + +## 🛡️ Safety Features + +This demo includes several safety mechanisms: + +1. **Isolated Resources**: All resources use unique names with random suffix +2. **Module Toggle**: Set `enabled = false` to disable everything +3. **No Shared Infrastructure**: Won't affect existing resources +4. **Quick Cleanup**: `terraform destroy` removes everything +5. **Cost Controls**: Small instance sizes and short retention periods + +## 📚 Learning Objectives + +After running this demo, you'll understand: + +1. 
**How simple changes can have complex impacts** +2. **Why monitoring can be misleading** +3. **The importance of understanding application internals** +4. **How timing and context affect risk** +5. **Why change blast radius analysis is critical** +6. **The value of tools that reveal hidden dependencies** + +## 🔧 Troubleshooting + +### Common Issues + +**"No default VPC found"** +```hcl +# Create a standalone VPC instead +use_default_vpc = false +create_standalone_vpc = true +``` + +**"Insufficient permissions"** +- Ensure your AWS credentials have ECS, ALB, and CloudWatch permissions + +**"Resource already exists"** +- The random suffix should prevent conflicts +- Try changing the `name_prefix` variable + +### Validation Commands + +```bash +# Check if module is valid +terraform validate + +# Check current resource status +terraform show + +# Verify infrastructure +aws ecs list-clusters +aws elbv2 describe-load-balancers +``` + +## 💡 Extending the Demo + +### Add More Realistic Scenarios + +1. **Database connections**: Add RDS with connection limits +2. **Service mesh**: Include Istio/Envoy sidecar overhead +3. **Logging overhead**: Add log shipping memory usage +4. **Monitoring agents**: Include DataDog/New Relic agents + +### Customize for Your Environment + +```hcl +module "memory_optimization_demo" { + source = "./modules/scenarios/memory-optimization" + + # Scale the scenario + number_of_containers = 50 + + # Use your network + use_default_vpc = false + vpc_id = var.production_vpc_id + subnet_ids = var.private_subnet_ids + + # Adjust timing + days_until_black_friday = 3 + days_since_last_memory_change = 180 + + # Custom naming + name_prefix = "prod-memory-test" +} +``` + +## 📞 Support + +This module is designed for demonstration purposes. For production use cases: + +1. Always test memory changes in staging first +2. Use gradual deployment strategies +3. Implement proper monitoring and alerting +4. Understand your application's memory requirements +5. 
Use tools like Overmind to analyze change impact + +--- + +**Remember**: The best time to prevent an outage is before it happens. This demo shows why infrastructure dependency analysis is critical for production changes. \ No newline at end of file diff --git a/modules/scenarios/memory-optimization/ecs.tf b/modules/scenarios/memory-optimization/ecs.tf new file mode 100644 index 0000000..04009fd --- /dev/null +++ b/modules/scenarios/memory-optimization/ecs.tf @@ -0,0 +1,208 @@ +# ecs.tf +# Following guide.md requirements for memory optimization demo +# Create ECS resources showing Java heap trap from guide.md +# Java needs 1536MB heap but container will only have 1024MB after change + +# ECS Cluster with container insights +resource "aws_ecs_cluster" "main" { + count = var.enabled ? 1 : 0 + name = "${local.name_prefix}-cluster" + + setting { + name = "containerInsights" + value = var.enable_container_insights ? "enabled" : "disabled" + } + + tags = merge(local.common_tags, { + Name = "${local.name_prefix}-cluster" + Description = "ECS cluster for memory optimization demo - all ${var.number_of_containers} containers will restart on memory change" + }) +} + +# ECS Task Execution Role +resource "aws_iam_role" "ecs_execution_role" { + count = var.enabled ? 1 : 0 + name = "${local.name_prefix}-ecs-execution-role" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = "ecs-tasks.amazonaws.com" + } + } + ] + }) + + tags = local.common_tags +} + +resource "aws_iam_role_policy_attachment" "ecs_execution_role_policy" { + count = var.enabled ? 1 : 0 + role = aws_iam_role.ecs_execution_role[0].name + policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy" +} + +# ECS Task Role (for the application itself) +resource "aws_iam_role" "ecs_task_role" { + count = var.enabled ? 
1 : 0 + name = "${local.name_prefix}-ecs-task-role" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = "ecs-tasks.amazonaws.com" + } + } + ] + }) + + tags = local.common_tags +} + +# CloudWatch Log Group +resource "aws_cloudwatch_log_group" "app" { + count = var.enabled ? 1 : 0 + name = "/ecs/${local.name_prefix}" + retention_in_days = 7 + + tags = merge(local.common_tags, { + Name = "${local.name_prefix}-logs" + Description = "Logs will show OOM kills when memory is reduced to 1024MB" + }) +} + +# ECS Task Definition - THE TRAP IS HERE! +resource "aws_ecs_task_definition" "app" { + count = var.enabled ? 1 : 0 + family = "${local.name_prefix}-task" + network_mode = "awsvpc" + requires_compatibilities = ["FARGATE"] + cpu = var.cpu_units + memory = var.container_memory + execution_role_arn = aws_iam_role.ecs_execution_role[0].arn + task_role_arn = aws_iam_role.ecs_task_role[0].arn + + container_definitions = jsonencode([ + { + name = "tomcat-app" + image = "tomcat:9-jre11" + + # THE CRITICAL CONFIGURATION - Java heap size that will cause OOM! + environment = [ + { + name = "JAVA_OPTS" + # THIS IS THE TRAP! JVM configured for 1536MB heap + 256MB overhead = 1792MB total + # When container_memory changes to 1024MB, this will cause immediate OOM kills + value = "-Xmx${var.java_heap_size_mb}m -Xms${var.java_heap_size_mb}m -XX:+UseG1GC -XX:MaxGCPauseMillis=200" + }, + { + name = "CATALINA_OPTS" + value = "-Djava.security.egd=file:/dev/./urandom" + } + ] + + # MISLEADING METRIC! 
This shows only 800MB average, hiding the real requirement + memoryReservation = 800 + + # Health check with enough time for JVM startup + healthCheck = { + command = [ + "CMD-SHELL", + "curl -f http://localhost:${var.application_port}/ || exit 1" + ] + interval = 30 + timeout = 5 + retries = 3 + startPeriod = var.health_check_grace_period + } + + portMappings = [ + { + containerPort = var.application_port + hostPort = var.application_port + protocol = "tcp" + } + ] + + logConfiguration = { + logDriver = "awslogs" + options = { + "awslogs-group" = aws_cloudwatch_log_group.app[0].name + "awslogs-region" = data.aws_region.current.name + "awslogs-stream-prefix" = "ecs" + } + } + + essential = true + } + ]) + + tags = merge(local.common_tags, { + Name = "${local.name_prefix}-task" + Description = "Task definition showing Java heap trap - needs ${var.java_heap_size_mb + 256}MB but will get ${var.container_memory}MB" + + # Critical warning tags + "warning:java-heap-size" = "${var.java_heap_size_mb}MB" + "warning:memory-overhead" = "256MB (metaspace + OS)" + "warning:total-required" = "${var.java_heap_size_mb + 256}MB" + "warning:container-memory" = "${var.container_memory}MB" + "warning:will-oom-on-1024" = "true" + }) +} + +# ECS Service - All containers will restart when memory changes +resource "aws_ecs_service" "app" { + count = var.enabled ? 1 : 0 + name = "${local.name_prefix}-service" + cluster = aws_ecs_cluster.main[0].id + task_definition = aws_ecs_task_definition.app[0].arn + desired_count = var.number_of_containers + launch_type = "FARGATE" + + # Rolling deployment configuration - ALL containers will restart! 
+ deployment_controller { + type = "ECS" + } + + deployment_circuit_breaker { + enable = false + rollback = false + } + + network_configuration { + subnets = local.subnet_ids + security_groups = [aws_security_group.ecs_tasks[0].id] + assign_public_ip = true + } + + load_balancer { + target_group_arn = aws_lb_target_group.app[0].arn + container_name = "tomcat-app" + container_port = var.application_port + } + + depends_on = [ + aws_lb_listener.app + ] + + tags = merge(local.common_tags, { + Name = "${local.name_prefix}-service" + Description = "ECS service with ${var.number_of_containers} containers - ALL will restart when memory changes" + + # Impact warning tags + "impact:containers-affected" = tostring(var.number_of_containers) + "impact:deployment-type" = "rolling" + "impact:black-friday-risk" = "all containers restart during peak season" + }) +} + +# Data source for current AWS region +data "aws_region" "current" {} \ No newline at end of file diff --git a/modules/scenarios/memory-optimization/main.tf b/modules/scenarios/memory-optimization/main.tf new file mode 100644 index 0000000..cab799e --- /dev/null +++ b/modules/scenarios/memory-optimization/main.tf @@ -0,0 +1,151 @@ +# main.tf +# Following guide.md requirements for memory optimization demo +# This creates a self-contained module showing how memory reduction breaks Java apps + +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + random = { + source = "hashicorp/random" + version = "~> 3.1" + } + } +} + +# Generate random suffix for resource uniqueness +resource "random_id" "suffix" { + count = var.enabled ? 1 : 0 + byte_length = 4 +} + +# Data sources for VPC configuration +data "aws_vpc" "default" { + count = var.enabled && var.use_default_vpc ? 1 : 0 + default = true +} + +data "aws_subnets" "default" { + count = var.enabled && var.use_default_vpc ? 
1 : 0 + filter { + name = "vpc-id" + values = [data.aws_vpc.default[0].id] + } +} + +data "aws_availability_zones" "available" { + count = var.enabled ? 1 : 0 + state = "available" +} + +# Create standalone VPC if needed +resource "aws_vpc" "standalone" { + count = var.enabled && var.create_standalone_vpc ? 1 : 0 + cidr_block = "10.0.0.0/16" + enable_dns_hostnames = true + enable_dns_support = true + + tags = merge(local.common_tags, { + Name = "${local.name_prefix}-vpc" + }) +} + +resource "aws_internet_gateway" "standalone" { + count = var.enabled && var.create_standalone_vpc ? 1 : 0 + vpc_id = aws_vpc.standalone[0].id + + tags = merge(local.common_tags, { + Name = "${local.name_prefix}-igw" + }) +} + +resource "aws_subnet" "standalone" { + count = var.enabled && var.create_standalone_vpc ? 2 : 0 + vpc_id = aws_vpc.standalone[0].id + cidr_block = "10.0.${count.index + 1}.0/24" + availability_zone = data.aws_availability_zones.available[0].names[count.index] + map_public_ip_on_launch = true + + tags = merge(local.common_tags, { + Name = "${local.name_prefix}-subnet-${count.index + 1}" + }) +} + +resource "aws_route_table" "standalone" { + count = var.enabled && var.create_standalone_vpc ? 1 : 0 + vpc_id = aws_vpc.standalone[0].id + + route { + cidr_block = "0.0.0.0/0" + gateway_id = aws_internet_gateway.standalone[0].id + } + + tags = merge(local.common_tags, { + Name = "${local.name_prefix}-rt" + }) +} + +resource "aws_route_table_association" "standalone" { + count = var.enabled && var.create_standalone_vpc ? 2 : 0 + subnet_id = aws_subnet.standalone[count.index].id + route_table_id = aws_route_table.standalone[0].id +} + +# Local calculations and configurations +locals { + # Resource naming with random suffix + random_suffix = var.enabled ? random_id.suffix[0].hex : "" + name_prefix = "${var.name_prefix}-${local.random_suffix}" + + # VPC configuration based on mode + vpc_id = var.enabled ? ( + var.use_default_vpc ? 
data.aws_vpc.default[0].id : + var.create_standalone_vpc ? aws_vpc.standalone[0].id : + var.vpc_id + ) : null + + subnet_ids = var.enabled ? ( + var.use_default_vpc ? data.aws_subnets.default[0].ids : + var.create_standalone_vpc ? aws_subnet.standalone[*].id : + var.subnet_ids + ) : [] + + # Cost calculations (flat per-GB monthly rate used for the demo story) + cost_per_gb_month = 50 + current_memory_gb = var.container_memory / 1024 + current_cost_month = local.current_memory_gb * var.number_of_containers * local.cost_per_gb_month + + # The "optimized" memory that would break everything + optimized_memory = 1024 + optimized_memory_gb = local.optimized_memory / 1024 + optimized_cost_month = local.optimized_memory_gb * var.number_of_containers * local.cost_per_gb_month + + monthly_savings = local.current_cost_month - local.optimized_cost_month + + # The critical calculation: Will this work? + java_heap_mb = var.java_heap_size_mb # Same value ecs.tf passes to -Xmx/-Xms, so the tags cannot drift from the JVM flags + java_overhead_mb = 256 # Metaspace + OS overhead + required_memory_mb = local.java_heap_mb + local.java_overhead_mb + will_it_work = var.container_memory >= local.required_memory_mb + + # Common tags for all resources + common_tags = { + Environment = "demo" + Project = "memory-optimization-trap" + Scenario = "friday-afternoon-optimization" + CreatedBy = "terraform" + Purpose = "demonstrate-hidden-risks" + + # Context tags that tell the story + "demo:current-memory" = "${var.container_memory}MB" + "demo:java-heap-size" = "${local.java_heap_mb}MB" + "demo:required-total-memory" = "${local.required_memory_mb}MB" + "demo:will-optimization-work" = tostring(local.will_it_work) + "demo:monthly-savings" = "${local.monthly_savings} USD" # "$${" is the escape for a literal "${", so the old value was never interpolated + "demo:days-until-black-friday" = tostring(var.days_until_black_friday) + "demo:last-memory-change" = "${var.days_since_last_memory_change} days ago" + "demo:risk-level" = local.will_it_work ?
"low" : "CRITICAL" + } +} \ No newline at end of file diff --git a/modules/scenarios/memory-optimization/monitoring.tf b/modules/scenarios/memory-optimization/monitoring.tf new file mode 100644 index 0000000..386f270 --- /dev/null +++ b/modules/scenarios/memory-optimization/monitoring.tf @@ -0,0 +1,170 @@ +# monitoring.tf +# Following guide.md requirements for memory optimization demo +# Create CloudWatch alarms that will fire when containers OOM +# Reference the memory optimization scenario from guide.md + +# SNS Topic for alarm notifications +resource "aws_sns_topic" "alerts" { + count = var.enabled ? 1 : 0 + name = "${local.name_prefix}-alerts" + + tags = merge(local.common_tags, { + Name = "${local.name_prefix}-alerts" + Description = "SNS topic for memory optimization demo alerts - will fire when containers OOM" + }) +} + +# CloudWatch Alarm for high memory utilization +resource "aws_cloudwatch_metric_alarm" "high_memory_utilization" { + count = var.enabled ? 1 : 0 + alarm_name = "${local.name_prefix}-high-memory" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = "2" + metric_name = "MemoryUtilization" + namespace = "AWS/ECS" + period = "300" + statistic = "Average" + threshold = "80" + alarm_description = "This metric monitors ECS memory utilization - WILL FIRE when containers run out of memory" + alarm_actions = [aws_sns_topic.alerts[0].arn] + ok_actions = [aws_sns_topic.alerts[0].arn] + + dimensions = { + ServiceName = aws_ecs_service.app[0].name + ClusterName = aws_ecs_cluster.main[0].name + } + + tags = merge(local.common_tags, { + Name = "${local.name_prefix}-memory-alarm" + Description = "Memory alarm - will trigger when Java heap (${var.java_heap_size_mb}MB) exceeds container limit (${var.container_memory}MB)" + + # Alarm context tags + "alarm:trigger-condition" = "memory > 80%" + "alarm:java-heap-configured" = "${var.java_heap_size_mb}MB" + "alarm:container-memory" = "${var.container_memory}MB" + "alarm:will-fire-after-change" = 
tostring(var.container_memory < var.java_heap_size_mb + 256) + }) +} + +# CloudWatch Alarm for low task count (containers crashing) - requires Container Insights on the cluster +resource "aws_cloudwatch_metric_alarm" "low_task_count" { + count = var.enabled ? 1 : 0 + alarm_name = "${local.name_prefix}-low-task-count" + comparison_operator = "LessThanThreshold" + evaluation_periods = "2" + metric_name = "RunningTaskCount" + namespace = "ECS/ContainerInsights" # RunningTaskCount is a Container Insights metric; it is not published under AWS/ECS + period = "300" + statistic = "Average" + threshold = var.number_of_containers * 0.8 # 80% of expected tasks + alarm_description = "This metric monitors ECS running task count - WILL FIRE when containers crash due to OOM" + alarm_actions = [aws_sns_topic.alerts[0].arn] + ok_actions = [aws_sns_topic.alerts[0].arn] + + dimensions = { + ServiceName = aws_ecs_service.app[0].name + ClusterName = aws_ecs_cluster.main[0].name + } + + tags = merge(local.common_tags, { + Name = "${local.name_prefix}-task-count-alarm" + Description = "Task count alarm - will trigger when containers crash after memory reduction" + + # Alarm context tags + "alarm:expected-tasks" = tostring(var.number_of_containers) + "alarm:threshold-tasks" = tostring(var.number_of_containers * 0.8) + "alarm:crash-cause" = "OOM when memory reduced to ${var.container_memory}MB" + "alarm:black-friday-impact" = "service degradation ${var.days_until_black_friday} days before peak" + }) +} + +# CloudWatch Alarm for high CPU (JVM struggling with limited memory) +resource "aws_cloudwatch_metric_alarm" "high_cpu_utilization" { + count = var.enabled ?
1 : 0 + alarm_name = "${local.name_prefix}-high-cpu" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = "3" + metric_name = "CPUUtilization" + namespace = "AWS/ECS" + period = "300" + statistic = "Average" + threshold = "80" + alarm_description = "This metric monitors ECS CPU utilization - will spike when JVM struggles with insufficient memory" + alarm_actions = [aws_sns_topic.alerts[0].arn] + + dimensions = { + ServiceName = aws_ecs_service.app[0].name + ClusterName = aws_ecs_cluster.main[0].name + } + + tags = merge(local.common_tags, { + Name = "${local.name_prefix}-cpu-alarm" + Description = "CPU alarm - will spike when JVM struggles with insufficient memory for garbage collection" + + # Technical explanation tags + "technical:gc-pressure" = "high when heap approaches container limit" + "technical:jvm-behavior" = "CPU spikes before OOM crash" + "technical:memory-thrashing" = "frequent GC when memory constrained" + }) +} + +# CloudWatch Alarm for ALB target health +resource "aws_cloudwatch_metric_alarm" "unhealthy_targets" { + count = var.enabled ? 
1 : 0 + alarm_name = "${local.name_prefix}-unhealthy-targets" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = "2" + metric_name = "UnHealthyHostCount" + namespace = "AWS/ApplicationELB" + period = "60" + statistic = "Average" + threshold = "0" + alarm_description = "This metric monitors ALB unhealthy targets - will fire when containers become unresponsive" + alarm_actions = [aws_sns_topic.alerts[0].arn] + + dimensions = { + TargetGroup = aws_lb_target_group.app[0].arn_suffix + LoadBalancer = aws_lb.app[0].arn_suffix + } + + tags = merge(local.common_tags, { + Name = "${local.name_prefix}-unhealthy-targets-alarm" + Description = "Unhealthy targets alarm - will fire when containers fail health checks after OOM" + + # Impact tags + "impact:user-experience" = "failed requests during container crashes" + "impact:deregistration-time" = "${var.deregistration_delay}s (no rollback time)" + "impact:business-risk" = "outage ${var.days_until_black_friday} days before Black Friday" + }) +} + +# CloudWatch Log Insights query for OOM events (for troubleshooting) +resource "aws_cloudwatch_query_definition" "oom_events" { + count = var.enabled ? 1 : 0 + name = "${local.name_prefix}-oom-analysis" + + log_group_names = [ + aws_cloudwatch_log_group.app[0].name + ] + + query_string = <<-EOT + fields @timestamp, @message + | filter @message like /OutOfMemoryError/ + | sort @timestamp desc + | limit 100 + EOT +} + +# Custom metric for demo purposes - memory pressure indicator +resource "aws_cloudwatch_log_metric_filter" "memory_pressure" { + count = var.enabled ? 
1 : 0 + name = "${local.name_prefix}-memory-pressure" + log_group_name = aws_cloudwatch_log_group.app[0].name + pattern = "[timestamp, requestId, level=\"ERROR\", message=\"*OutOfMemoryError*\"]" + + metric_transformation { + name = "JavaOOMErrors" + namespace = "MemoryOptimization/Demo" + value = "1" + } +} \ No newline at end of file diff --git a/modules/scenarios/memory-optimization/networking.tf b/modules/scenarios/memory-optimization/networking.tf new file mode 100644 index 0000000..c6c18e6 --- /dev/null +++ b/modules/scenarios/memory-optimization/networking.tf @@ -0,0 +1,143 @@ +# networking.tf +# Following guide.md requirements for memory optimization demo +# Create ALB, security groups, target groups with dangerous configurations + +# Application Load Balancer (resource name capped at the 32-character ELB limit) +resource "aws_lb" "app" { + count = var.enabled ? 1 : 0 + name = "${substr(local.name_prefix, 0, 28)}-alb" # 28 + len("-alb") = 32; unchanged for prefixes that already fit + internal = false + load_balancer_type = "application" + security_groups = [aws_security_group.alb[0].id] + subnets = local.subnet_ids + + enable_deletion_protection = false + + tags = merge(local.common_tags, { + Name = "${local.name_prefix}-alb" + Description = "ALB for memory optimization demo - will route to failing containers after memory change" + + # Context tags + "context:black-friday-traffic" = "10x normal load expected" + "context:capacity-planning" = "load balancer configured for high traffic" + }) +} + +# Target Group - DANGEROUS CONFIGURATION! +resource "aws_lb_target_group" "app" { + count = var.enabled ? 1 : 0 + name = "${substr(local.name_prefix, 0, 29)}-tg" # 29 + len("-tg") = 32, the target group name limit + port = var.application_port + protocol = "HTTP" + vpc_id = local.vpc_id + target_type = "ip" + + # CRITICAL RISK: 5 second deregistration = no time for rollback!
+ deregistration_delay = var.deregistration_delay + + health_check { # probe every 30s with a 5s timeout; 2 consecutive passes/failures change target state + enabled = true + healthy_threshold = 2 + unhealthy_threshold = 2 + timeout = 5 + interval = 30 + path = "/" + matcher = "200" # NOTE(review): only HTTP 200 on "/" counts as healthy - confirm the container image serves 200 at the root path + port = "traffic-port" + protocol = "HTTP" + } + + tags = merge(local.common_tags, { + Name = "${local.name_prefix}-tg" + Description = "Target group with ${var.deregistration_delay}s deregistration - NO TIME FOR ROLLBACK!" + + # Risk warning tags + "risk:deregistration-delay" = "${var.deregistration_delay}s" + "risk:rollback-capability" = "none" + "risk:black-friday-timing" = "change ${var.days_until_black_friday} days before peak" + }) +} + +# ALB Listener - plain HTTP on port 80, forwards everything to the app target group +resource "aws_lb_listener" "app" { + count = var.enabled ? 1 : 0 + load_balancer_arn = aws_lb.app[0].arn + port = "80" + protocol = "HTTP" + + default_action { + type = "forward" + + forward { + target_group { + arn = aws_lb_target_group.app[0].arn + } + } + } + + tags = merge(local.common_tags, { + Name = "${local.name_prefix}-listener" + }) +} + +# Security Group for ALB - HTTP/80 open to 0.0.0.0/0, unrestricted egress +resource "aws_security_group" "alb" { + count = var.enabled ? 1 : 0 + name = "${local.name_prefix}-alb-sg" + description = "Security group for ALB - allows public HTTP access" + vpc_id = local.vpc_id + + ingress { + description = "HTTP from internet" + from_port = 80 + to_port = 80 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + egress { + description = "All outbound traffic" + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = merge(local.common_tags, { + Name = "${local.name_prefix}-alb-sg" + Description = "ALB security group - public access for Black Friday capacity testing" + }) +} + +# Security Group for ECS Tasks +resource "aws_security_group" "ecs_tasks" { + count = var.enabled ?
1 : 0 + name = "${local.name_prefix}-ecs-sg" + description = "Security group for ECS tasks - allows ALB access" + vpc_id = local.vpc_id + + ingress { + description = "HTTP from ALB" + from_port = var.application_port + to_port = var.application_port + protocol = "tcp" + security_groups = [aws_security_group.alb[0].id] # source is the ALB security group, not a CIDR range + } + + egress { + description = "All outbound traffic" + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = merge(local.common_tags, { + Name = "${local.name_prefix}-ecs-sg" + Description = "ECS tasks security group - containers will crash after memory optimization" + + # Warning tags + "warning:containers-affected" = "${var.number_of_containers} containers" + "warning:crash-behavior" = "immediate OOM after memory reduction" + }) +} \ No newline at end of file diff --git a/modules/scenarios/memory-optimization/outputs.tf b/modules/scenarios/memory-optimization/outputs.tf new file mode 100644 index 0000000..7ea771e --- /dev/null +++ b/modules/scenarios/memory-optimization/outputs.tf @@ -0,0 +1,169 @@ +# outputs.tf +# Following guide.md requirements for memory optimization demo +# Module outputs showing demo status and instructions (every output in this file is null when var.enabled is false) + +output "alb_url" { + description = "URL to access the application" + value = var.enabled ? "http://${aws_lb.app[0].dns_name}" : null +} + +output "demo_status" { + description = "Object showing current vs required memory, cost calculations, and risk assessment" + value = var.enabled ?
{ + # Memory analysis + current_memory_mb = var.container_memory + required_memory_mb = local.required_memory_mb + java_heap_size_mb = var.java_heap_size_mb + memory_overhead_mb = 256 + will_it_work = local.will_it_work + + # Cost calculations - built with format() because "$${" is the HCL escape for a literal "${" and would print the template text instead of the amount + current_cost_month = format("$%s", local.current_cost_month) + optimized_cost_month = format("$%s", local.optimized_cost_month) + monthly_savings = format("$%s", local.monthly_savings) + annual_savings = format("$%s", local.monthly_savings * 12) + + # Risk assessment + risk_level = local.will_it_work ? "LOW" : "CRITICAL" + containers_affected = var.number_of_containers + days_until_black_friday = var.days_until_black_friday + deregistration_delay = "${var.deregistration_delay} seconds" + rollback_capability = var.deregistration_delay > 30 ? "possible" : "insufficient" + + # Business impact + business_context = { + timing = "Friday afternoon change" + black_friday_risk = "${var.days_until_black_friday} days until peak traffic" + last_memory_change = "${var.days_since_last_memory_change} days ago" + traffic_multiplier = "10x expected on Black Friday" + change_window = "unsafe - too close to peak season" + } + + # Technical details + technical_analysis = { + jvm_configuration = "JAVA_OPTS=-Xmx${var.java_heap_size_mb}m -Xms${var.java_heap_size_mb}m" + container_limit = "${var.container_memory}MB" + memory_gap = "${var.container_memory - local.required_memory_mb}MB" + oom_prediction = var.container_memory < local.required_memory_mb ? "IMMEDIATE" : "none" + gc_behavior = var.container_memory < local.required_memory_mb ? "thrashing before crash" : "normal" + } + } : null +} + +output "instructions" { + description = "How to break and fix the demo" + value = var.enabled ?
{ + demo_flow = { # step_1's SAFE/UNSAFE label follows local.will_it_work so it stays truthful for any container_memory value + step_1 = "Current state: Deploy with container_memory = ${var.container_memory}MB (${local.will_it_work ? "SAFE" : "UNSAFE"})" + step_2 = "Create branch: git checkout -b memory-optimization" + step_3 = "Change: Set container_memory = 1024 in variables or module call" + step_4 = "Plan: terraform plan (shows 2 resources changing)" + step_5 = "Reality: All ${var.number_of_containers} containers will crash immediately" + step_6 = "Overmind: Would reveal 47+ resources affected by this change" + step_7 = "Fix: Change container_memory back to 2048MB or higher" + step_8 = "Cleanup: terraform destroy when done" + } + + breaking_change = { + what_to_change = "container_memory variable from ${var.container_memory} to 1024" + where_to_change = [ + "variables.tf default value", + "module call parameter", + "terraform.tfvars file" + ] + terraform_command = "terraform apply -var='container_memory=1024'" + } + + explanation = { + why_it_breaks = [ + "Java heap configured for ${var.java_heap_size_mb}MB (-Xmx${var.java_heap_size_mb}m)", + "JVM needs ${var.java_heap_size_mb}MB heap + 256MB overhead = ${local.required_memory_mb}MB total", + "Container memory limit of 1024MB < ${local.required_memory_mb}MB required", + "Result: Immediate OutOfMemoryError on container startup" + ] + + why_monitoring_misleads = [ + "CloudWatch shows memoryReservation = 800MB (misleading!)", + "Average memory usage appears low due to GC cycles", + "P99 memory usage during GC spikes to ~1.8GB", + "Container insights don't show JVM heap requirements" + ] + + why_timing_matters = [ + "Change scheduled ${var.days_until_black_friday} days before Black Friday", + "Black Friday brings 10x normal traffic load", + "All ${var.number_of_containers} containers restart simultaneously", + "${var.deregistration_delay}s deregistration = no rollback time", + "Last memory change was ${var.days_since_last_memory_change} days ago (stale knowledge)" + ] + } + + overmind_insights = { + visible_changes = "2 resources (task definition, service)"
+ hidden_impacts = [ + "ALB target group health checks", + "CloudWatch alarms triggering", + "Auto Scaling reactions", + "Service discovery updates", + "Log stream disruptions", + "Connected microservices affected", + "Database connection pooling impact" + ] + total_affected_resources = "47+ resources in typical production environment" + } + } : null +} + +output "cluster_name" { + description = "ECS cluster name for monitoring" + value = var.enabled ? aws_ecs_cluster.main[0].name : null +} + +output "service_name" { + description = "ECS service name for monitoring" + value = var.enabled ? aws_ecs_service.app[0].name : null +} + +output "log_group_name" { + description = "CloudWatch log group name to check for OOM errors" + value = var.enabled ? aws_cloudwatch_log_group.app[0].name : null +} + +output "cost_analysis" { + description = "Detailed cost breakdown showing the financial motivation for the risky change" + value = var.enabled ? { + current_configuration = { + memory_per_container = "${var.container_memory}MB" + number_of_containers = var.number_of_containers + memory_cost_per_gb = format("$%s/month", local.cost_per_gb_month) # format(): "$${" is the HCL escape for a literal "${" and would not interpolate + total_monthly_cost = format("$%s/month", local.current_cost_month) + annual_cost = format("$%s/year", local.current_cost_month * 12) + } + + proposed_optimization = { + memory_per_container = "1024MB" + number_of_containers = var.number_of_containers + memory_cost_per_gb = format("$%s/month", local.cost_per_gb_month) + total_monthly_cost = format("$%s/month", local.optimized_cost_month) + annual_cost = format("$%s/year", local.optimized_cost_month * 12) + } + + savings_projection = { + monthly_savings = format("$%s", local.monthly_savings) + annual_savings = format("$%s", local.monthly_savings * 12) + percentage_saved = "${floor((local.monthly_savings / local.current_cost_month) * 100)}%" + } + + business_pressure = { + motivation = "Significant cost savings opportunity identified" + timing_pressure = "CFO wants cost reductions before Black Friday" + appears_safe = "Monitoring shows only 800MB average
usage" + hidden_risk = "JVM actually needs ${local.required_memory_mb}MB total" + } + } : null +} + +output "resource_tags" { + description = "Common tags applied to all resources for tracking demo context" + value = var.enabled ? local.common_tags : null +} \ No newline at end of file diff --git a/modules/scenarios/memory-optimization/variables.tf b/modules/scenarios/memory-optimization/variables.tf new file mode 100644 index 0000000..5d18376 --- /dev/null +++ b/modules/scenarios/memory-optimization/variables.tf @@ -0,0 +1,132 @@ +# variables.tf +# Following guide.md requirements for memory optimization demo +# Create variables for memory optimization demo as specified in guide.md + +variable "enabled" { + description = "Toggle module on/off" + type = bool + default = true +} + +variable "name_prefix" { + description = "Unique prefix for resources" + type = string + default = "memory-opt-demo" +} + +variable "container_memory" { + description = "Memory in MB allocated to each container (will change from 2048 to 1024 to break)" + type = number + default = 2048 + + validation { + condition = var.container_memory >= 512 && var.container_memory <= 30720 + error_message = "Container memory must be between 512 MB and 30 GB." + } +} + +variable "number_of_containers" { + description = "ECS service desired count - number of containers to run" + type = number + default = 15 + + validation { + condition = var.number_of_containers >= 1 && var.number_of_containers <= 100 + error_message = "Number of containers must be between 1 and 100."
+ } +} + +variable "use_default_vpc" { + description = "Use account's default VPC (mutually exclusive with create_standalone_vpc)" + type = bool + default = true +} + +variable "create_standalone_vpc" { + description = "Create an isolated VPC for this demo (mutually exclusive with use_default_vpc)" + type = bool + default = false +} + +variable "vpc_id" { + description = "VPC ID to use (only when both use_default_vpc and create_standalone_vpc are false)" + type = string + default = null +} + +variable "subnet_ids" { + description = "Subnet IDs to use (only when both use_default_vpc and create_standalone_vpc are false)" + type = list(string) + default = [] +} + +variable "days_until_black_friday" { + description = "Context for urgency - days until Black Friday traffic spike" + type = number + default = 7 +} + +variable "days_since_last_memory_change" { + description = "Shows staleness - days since last memory configuration change" + type = number + default = 423 +} + +# Additional configuration variables - heap size, deregistration delay and port are range-checked at plan time, like container_memory above +variable "java_heap_size_mb" { + description = "Java heap size in MB (this is the trap - app is configured with -Xmx1536m)" + type = number + default = 1536 + + validation { + condition = var.java_heap_size_mb > 0 + error_message = "Java heap size must be greater than 0 MB." + } +} + +variable "enable_container_insights" { + description = "Enable CloudWatch Container Insights for the ECS cluster" + type = bool + default = true +} + +variable "health_check_grace_period" { + description = "Health check grace period in seconds (JVM needs time to start)" + type = number + default = 120 +} + +variable "deregistration_delay" { + description = "ALB target deregistration delay in seconds (5 seconds = no rollback time!)" + type = number + default = 5 + + validation { + condition = var.deregistration_delay >= 0 && var.deregistration_delay <= 3600 + error_message = "Deregistration delay must be between 0 and 3600 seconds." + } +} + +variable "application_port" { + description = "Port the Tomcat application listens on" + type = number + default = 8080 + + validation { + condition = var.application_port >= 1 && var.application_port <= 65535 + error_message = "Application port must be between 1 and 65535." + } +} + +variable "cpu_units" { + description = "CPU units for ECS task (1024 = 1 vCPU)" + type = number + default = 1024 +} + +# Tags +variable "additional_tags" { + description = "Additional tags to apply to all resources" +
type = map(string) + default = {} +} \ No newline at end of file diff --git a/modules/scenarios/memory_optimization/guide.md b/modules/scenarios/memory_optimization/guide.md new file mode 100644 index 0000000..c612ed2 --- /dev/null +++ b/modules/scenarios/memory_optimization/guide.md @@ -0,0 +1,235 @@ +GitHub Copilot Prompt for Self-Contained Memory Optimization Demo +# Create a self-contained Terraform module for memory optimization demo + +## Context +I need to add a demo scenario to an existing terraform-example repository that shows how Overmind catches hidden risks. The demo must be completely self-contained and not affect any existing infrastructure. The scenario shows a Friday afternoon memory optimization to save costs that would actually cause a production outage. + +## Requirements + +### Module Location +Create all files in: `modules/scenarios/memory-optimization/` +This should be a completely isolated module that can be enabled/disabled without affecting anything else. + +### Directory Structure +modules/scenarios/memory-optimization/ +├── README.md # Demo instructions +├── main.tf # Module entry point with VPC logic +├── variables.tf # All input variables +├── outputs.tf # Module outputs +├── ecs.tf # ECS cluster, service, task definition +├── networking.tf # ALB, security groups, target groups +└── monitoring.tf # CloudWatch alarms + +### Core Scenario +Create infrastructure demonstrating: +1. Java application in ECS Fargate with 15 containers +2. Currently allocated 2048MB memory per container ($4000/month) +3. Java heap configured for 1536MB (-Xmx1536m) +4. CloudWatch showing "only 800MB average usage" +5. Changing to 1024MB would "save $2000/month" +6.
But the app needs 1536MB heap + 256MB overhead = 1792MB, so a 1024MB container crashes + +### Key Variables in variables.tf +```hcl +variable "enabled" - Toggle module on/off (default: true) +variable "name_prefix" - Unique prefix for resources (default: "memory-opt-demo") +variable "container_memory" - Memory in MB (default: 2048, will change to 1024) +variable "number_of_containers" - ECS service count (default: 15) +variable "use_default_vpc" - Use default VPC (default: true) +variable "days_until_black_friday" - Context for urgency (default: 7) +variable "days_since_last_memory_change" - Shows staleness (default: 423) +``` +Module Requirements +main.tf + +Use locals for all internal calculations +Support three VPC modes: + +use_default_vpc = true (use account's default VPC) +create_standalone_vpc = true (create isolated VPC) +Use provided vpc_id and subnet_ids + + +Generate random suffix for resource uniqueness +Calculate cost savings: current vs proposed memory +Add common tags to all resources showing the scenario context + +ecs.tf + +ECS cluster with container insights enabled +Task definition with: + +Fargate launch type +Public Tomcat image (tomcat:9-jre11) +JAVA_OPTS="-Xmx1536m -Xms1536m" (THE TRAP!) +memoryReservation: 800 (misleading metric) +Health check with 120s startup time for JVM +Container memory from var.container_memory + + +ECS service with: + +desired_count = var.number_of_containers (15) +Rolling update deployment +Tags showing this will affect all containers + + +IAM roles for ECS execution and task + +networking.tf + +ALB with public access +Target group with: + +deregistration_delay = 5 seconds (no rollback!)
+Health check on port 8080 + + +Security groups for ALB and ECS +Tags mentioning Black Friday capacity needs + +monitoring.tf + +CloudWatch alarm for memory > 80% +CloudWatch alarm for task count < expected +Tags explaining alarms will fire when OOM occurs + +outputs.tf +```hcl +output "alb_url" - URL to access the application +output "demo_status" - Object showing: + - current vs required memory + - will_it_work boolean + - cost calculations + - risk assessment +output "instructions" - How to break and fix the demo +``` +Integration Points +The module should be usable in three ways: + +Standalone file (create as example): + +```hcl +# standalone-demo.tf +module "memory_optimization_demo" { + source = "./modules/scenarios/memory-optimization" + enabled = true + container_memory = 2048 # Change to 1024 to break +} +``` + +Part of existing scenarios (if they have a pattern): + +```hcl +module "scenarios" { + memory_optimization = { + enabled = true + } +} +``` + +Targeted deployment: + +```bash +terraform apply -target='module.memory_optimization_demo' +``` +Demo Flow in README.md +Include instructions for: + +Deploy initial setup with 2048MB +Create branch and change to 1024MB +Run terraform plan (shows 2 changes) +Overmind reveals 47 resources affected +Explanation of why it breaks (JVM heap > container memory) +How to clean up + +Resource Naming + +All resources must use: ${var.name_prefix}-resourcetype-${random_suffix} +This ensures no conflicts with existing infrastructure +Example: "memory-opt-demo-cluster-abc123" + +Cost Calculations +Include realistic cost math: + +Cost per GB: $50/month +Current: 2GB × 15 containers × $50 = $4000/month +"Optimized": 1GB × 15 containers × $50 = $2000/month +Savings: $2000/month + +Critical Comments +Add comments explaining: + +Why 800MB average is misleading (P99 is 1.8GB during GC) +Why JVM needs 1536MB heap + 256MB metaspace + 256MB OS +Why Black Friday timing matters (10x traffic) +Why 5-second deregistration prevents rollback +Why all 15 containers restarting
is risky + +Make it Production-Like + +Use real container images (tomcat:9-jre11) +Real AWS resources (not local testing) +Realistic configurations (proper health checks, security groups) +Actual cost calculations +Production-like tags and metadata + +Safety Features + +Module can be disabled with enabled=false +All resources have unique names with random suffix +No hardcoded values that could conflict +Clean destroy with terraform destroy -target + +Expected Behavior +When container_memory changes from 2048 to 1024: + +Terraform plan: 2 resources to change +Reality: Application crashes immediately (JVM OOM) +Overmind catches: 47 resources affected, multiple critical risks + +Output Format +Generate complete, working Terraform files that: + +Are production-quality with proper error handling +Include extensive comments explaining the demo +Can be deployed immediately without modifications +Won't interfere with any existing infrastructure +Tell the story of why this change seems safe but isn't + + +--- + +## How to Use This with Copilot + +1. 
**Create the module directory**: +```bash +mkdir -p modules/scenarios/memory-optimization +cd modules/scenarios/memory-optimization +``` + +Create a context file: + +```bash +# Save the prompt above as: +echo "# Memory Optimization Demo Module Requirements" > .copilot-context.md +# Paste the entire prompt into this file +``` + +Generate each file with Copilot: + +For each file, start with a comment referencing the context: +```hcl +# main.tf +# Following .copilot-context.md requirements for memory optimization demo +# This creates a self-contained module showing how memory reduction breaks Java apps + +# Copilot will now understand the full context and generate appropriate code +``` + +Helpful Copilot triggers: + +```hcl +# In variables.tf: +# Create variables for memory optimization demo as specified in .copilot-context.md +# Include container_memory that will change from 2048 to 1024 + +# In ecs.tf: +# Create ECS resources showing Java heap trap from .copilot-context.md +# Java needs 1536MB heap but container will only have 1024MB after change + +# In monitoring.tf: +# Create CloudWatch alarms that will fire when containers OOM +# Reference the memory optimization scenario from .copilot-context.md +``` + +Test the generation: + +```bash +# After Copilot generates files, validate: +terraform init +terraform validate +terraform plan +``` \ No newline at end of file diff --git a/modules/scenarios/outputs.tf b/modules/scenarios/outputs.tf new file mode 100644 index 0000000..badba59 --- /dev/null +++ b/modules/scenarios/outputs.tf @@ -0,0 +1,39 @@ +# outputs.tf +# Outputs for the scenarios module + +# Memory optimization demo outputs +output "memory_optimization_demo_status" { + description = "Status and analysis of the memory optimization demo" + value = var.enable_memory_optimization_demo ? module.memory_optimization.demo_status : null +} + +output "memory_optimization_demo_url" { + description = "URL to access the memory optimization demo application" + value = var.enable_memory_optimization_demo ?
module.memory_optimization.alb_url : null +} + +output "memory_optimization_demo_instructions" { + description = "Instructions for running the memory optimization demo" + value = var.enable_memory_optimization_demo ? module.memory_optimization.instructions : null +} + +output "memory_optimization_cost_analysis" { + description = "Cost analysis for the memory optimization scenario" + value = var.enable_memory_optimization_demo ? module.memory_optimization.cost_analysis : null +} + +# VPC information (useful for other integrations) - passed straight through from module.vpc +output "vpc_id" { + description = "ID of the VPC created for scenarios" + value = module.vpc.vpc_id +} + +output "public_subnet_ids" { + description = "IDs of the public subnets" + value = module.vpc.public_subnets +} + +output "private_subnet_ids" { + description = "IDs of the private subnets" + value = module.vpc.private_subnets +} \ No newline at end of file diff --git a/modules/scenarios/variables.tf b/modules/scenarios/variables.tf index 75eeb5b..4f6befc 100644 --- a/modules/scenarios/variables.tf +++ b/modules/scenarios/variables.tf @@ -4,3 +4,28 @@ variable "example_env" { default = "github" type = string } + +# Memory optimization demo variables - forwarded to module "memory_optimization" in main.tf +variable "enable_memory_optimization_demo" { + description = "Enable the memory optimization demo scenario" + type = bool + default = false +} + +variable "memory_optimization_container_memory" { + description = "Memory allocation for containers in the demo (2048 = safe, 1024 = breaks)" + type = number + default = 2048 +} + +variable "memory_optimization_container_count" { + description = "Number of containers to run in the memory optimization demo" + type = number + default = 15 +} + +variable "days_until_black_friday" { + description = "Days until Black Friday (demo context)" + type = number + default = 7 +}