From b3bb1d1cad76f1a6cddf4c94d240705f8d58c427 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Tue, 26 Mar 2024 16:57:35 +0000
Subject: [PATCH] storage controller: make direct tenant creation more robust (#7247)

## Problem

- Creations were not idempotent (unique key violation)
- Creations waited for reconciliation, which control plane blocks while an
  operation is in flight

## Summary of changes

- Handle unique key constraint violation as an OK situation: if we're creating
  the same tenant ID and shard count, it's reasonable to assume this is a
  duplicate creation.
- Make the wait for reconcile during creation tolerate failures: this is
  similar to location_conf, where the cloud control plane blocks our
  notification calls until it is done with calling into our API (in future
  this constraint is expected to relax as the cloud control plane learns to
  run multiple operations concurrently for a tenant)
---
 .../attachment_service/src/service.rs         | 31 +++++++++++++++++++++++++------
 test_runner/fixtures/neon_fixtures.py         |  3 +++
 test_runner/regress/test_sharding_service.py  |  5 +++++
 3 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index aa930014b280..925910253b9c 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -1523,6 +1523,8 @@ impl Service {
         &self,
         create_req: TenantCreateRequest,
     ) -> Result<TenantCreateResponse, ApiError> {
+        let tenant_id = create_req.new_tenant_id.tenant_id;
+
         // Exclude any concurrent attempts to create/access the same tenant ID
         let _tenant_lock = self
             .tenant_op_locks
@@ -1531,7 +1533,12 @@ impl Service {
 
         let (response, waiters) = self.do_tenant_create(create_req).await?;
 
-        self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?;
+        if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
+            // Avoid deadlock: reconcile may fail while notifying compute, if the cloud control plane refuses to
+            // accept compute notifications while it is in the process of creating. Reconciliation will
+            // be retried in the background.
+            tracing::warn!(%tenant_id, "Reconcile not done yet while creating tenant ({e})");
+        }
         Ok(response)
     }
 
@@ -1610,13 +1617,25 @@ impl Service {
                 splitting: SplitState::default(),
             })
             .collect();
-        self.persistence
+
+        match self
+            .persistence
             .insert_tenant_shards(persist_tenant_shards)
             .await
-            .map_err(|e| {
-                // TODO: distinguish primary key constraint (idempotent, OK), from other errors
-                ApiError::InternalServerError(anyhow::anyhow!(e))
-            })?;
+        {
+            Ok(_) => {}
+            Err(DatabaseError::Query(diesel::result::Error::DatabaseError(
+                DatabaseErrorKind::UniqueViolation,
+                _,
+            ))) => {
+                // Unique key violation: this is probably a retry. Because the shard count is part of the unique key,
+                // if we see a unique key violation it means that the creation request's shard count matches the previous
+                // creation's shard count.
+                tracing::info!("Tenant shards already present in database, proceeding with idempotent creation...");
+            }
+            // Any other database error is unexpected and a bug.
+            Err(e) => return Err(ApiError::InternalServerError(anyhow::anyhow!(e))),
+        };
 
         let (waiters, response_shards) = {
             let mut locked = self.inner.write().unwrap();
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index f22ce10c2024..3d60f9bef58f 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2126,6 +2126,8 @@ def tenant_create(
             shard_params = {"count": shard_count}
             if shard_stripe_size is not None:
                 shard_params["stripe_size"] = shard_stripe_size
+            else:
+                shard_params["stripe_size"] = 32768
 
             body["shard_parameters"] = shard_params
 
@@ -2139,6 +2141,7 @@ def tenant_create(
             json=body,
             headers=self.headers(TokenScope.PAGE_SERVER_API),
         )
+        response.raise_for_status()
         log.info(f"tenant_create success: {response.json()}")
 
     def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]:
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index b7488cadd6db..fc6c13766766 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -89,6 +89,11 @@ def test_sharding_service_smoke(
     for tid in tenant_ids:
         env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant)
 
+    # Repeating a creation should be idempotent (we are just testing it doesn't return an error)
+    env.storage_controller.tenant_create(
+        tenant_id=next(iter(tenant_ids)), shard_count=shards_per_tenant
+    )
+
     for node_id, count in get_node_shard_counts(env, tenant_ids).items():
         # we used a multiple of pagservers for the total shard count,
         # so expect equal number on all pageservers
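
The core of the idempotency change is the match on diesel's error type: a unique-key violation on insert is read as evidence of a duplicate creation rather than as a failure. Below is a minimal standalone sketch of that pattern, assuming only the `diesel` crate; `insert_is_idempotent` is a hypothetical helper for illustration, not code from this patch.

```rust
use diesel::result::{DatabaseErrorKind, Error as DieselError};

/// Interpret the result of an INSERT: `Ok(true)` means the row was freshly
/// inserted, `Ok(false)` means it already existed (a retried creation), and
/// any other database error is propagated as a real failure.
fn insert_is_idempotent(result: Result<(), DieselError>) -> Result<bool, DieselError> {
    match result {
        Ok(()) => Ok(true),
        // Unique-key violation: the same key was inserted before, so treat
        // this as a duplicate of an earlier, successful creation.
        Err(DieselError::DatabaseError(DatabaseErrorKind::UniqueViolation, _)) => Ok(false),
        // Anything else is unexpected and should surface to the caller.
        Err(e) => Err(e),
    }
}
```

Because the shard count is part of the unique key, hitting the unique-violation arm implies the retried request asked for the same shard count as the original creation, which is why the service can safely proceed and report success.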