From 9f9ab18f81220ef115c84faa18799adb9e5849a5 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Wed, 7 Sep 2022 16:23:37 -0400 Subject: [PATCH] Check if CRDB is telling us to retry the txn Match against DieselDatabaseErrorKind::SerializationFailure and check if it's a case where a CRDB transaction didn't fail but needs to be retried. This can happen if there's too much database contention, and requires clients to retry the request. Return HTTP 429 in this case to signal to clients that a retry is required. --- common/src/api/external/error.rs | 32 ++++++++++++++++++++++- nexus/src/app/session.rs | 3 ++- nexus/src/db/datastore/region.rs | 44 ++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 2 deletions(-) diff --git a/common/src/api/external/error.rs b/common/src/api/external/error.rs index dc1459f46aa..c50612c53fa 100644 --- a/common/src/api/external/error.rs +++ b/common/src/api/external/error.rs @@ -61,6 +61,9 @@ pub enum Error { #[error("Type version mismatch! {internal_message}")] TypeVersionMismatch { internal_message: String }, + + #[error("Too many requests! {internal_message}")] + TooManyRequests { internal_message: String }, } /// Indicates how an object was looked up (for an `ObjectNotFound` error) @@ -108,7 +111,8 @@ impl Error { /// retried pub fn retryable(&self) -> bool { match self { - Error::ServiceUnavailable { .. } => true, + Error::ServiceUnavailable { .. } + | Error::TooManyRequests { .. } => true, Error::ObjectNotFound { .. } | Error::ObjectAlreadyExists { .. } @@ -174,6 +178,15 @@ impl Error { Error::TypeVersionMismatch { internal_message: message.to_owned() } } + /// Generates an [`Error::TooManyRequests`] with a specific message. + /// + /// TooManyRequests is returned when the control plane can't keep up with + /// requests, but there wasn't an associated error. It's meant to signal to + /// users that the request can be re-attempted. + pub fn too_many_requests(message: &str) -> Error { + Error::TooManyRequests { internal_message: message.to_owned() } + } + /// Given an [`Error`] with an internal message, return the same error with /// `context` prepended to it to provide more context /// @@ -188,6 +201,7 @@ impl Error { | Error::InvalidRequest { .. } | Error::InvalidValue { .. } | Error::Forbidden => self, + Error::Unauthenticated { internal_message } => { Error::Unauthenticated { internal_message: format!( @@ -223,6 +237,14 @@ impl Error { ), } } + Error::TooManyRequests { internal_message } => { + Error::TooManyRequests { + internal_message: format!( + "{}: {}", + context, internal_message + ), + } + } } } } @@ -317,6 +339,14 @@ impl From for HttpError { Error::TypeVersionMismatch { internal_message } => { HttpError::for_internal_error(internal_message) } + + Error::TooManyRequests { internal_message } => { + HttpError::for_client_error( + Some(String::from("TooManyRequests")), + http::StatusCode::TOO_MANY_REQUESTS, + internal_message, + ) + } } } } diff --git a/nexus/src/app/session.rs b/nexus/src/app/session.rs index 7ac23cf5ce3..2ce612df3c1 100644 --- a/nexus/src/app/session.rs +++ b/nexus/src/app/session.rs @@ -155,7 +155,8 @@ impl super::Nexus { | Error::InternalError { .. } | Error::ServiceUnavailable { .. } | Error::MethodNotAllowed { .. } - | Error::TypeVersionMismatch { .. } => { + | Error::TypeVersionMismatch { .. } + | Error::TooManyRequests { .. } => { Reason::UnknownError { source: error } } })?; diff --git a/nexus/src/db/datastore/region.rs b/nexus/src/db/datastore/region.rs index ad354b7bb8e..41df536acfa 100644 --- a/nexus/src/db/datastore/region.rs +++ b/nexus/src/db/datastore/region.rs @@ -20,7 +20,10 @@ use crate::db::model::Zpool; use crate::external_api::params; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; +use async_bb8_diesel::{ConnectionError, PoolError}; use diesel::prelude::*; +use diesel::result::DatabaseErrorKind as DieselDatabaseErrorKind; +use diesel::result::Error as DieselError; use omicron_common::api::external::DeleteResult; use omicron_common::api::external::Error; use uuid::Uuid; @@ -314,6 +317,47 @@ impl DataStore { ) => Error::unavail( "Not enough available space to allocate disks", ), + TxnError::Pool(PoolError::Connection( + ConnectionError::Query(DieselError::DatabaseError( + DieselDatabaseErrorKind::SerializationFailure, + ref boxed_error, + )), + )) => { + // Extract the boxed_error from the serialization failure, + // and check if cockroachdb returned a case where the + // transaction didn't fail, but needs to be retried. + // + // From https://www.cockroachlabs.com/docs/v21.2/transactions#client-side-intervention: + // + // To indicate that a transaction must be retried, + // CockroachDB signals an error with the code 40001 and an + // error message that begins with the string "retry + // transaction". + // + // Note I also saw: + // + // restart transaction: TransactionRetryWithProtoRefreshError: TransactionRetryError: retry txn (RETRY_SERIALIZABLE ... + // + // with a link to https://www.cockroachlabs.com/docs/v21.2/transaction-retry-error-reference.html#retry_serializable + // + // Check for "restart transaction" and "retry transaction" + // here, and let users know when too many transactions are + // in flight and causing retryable failures. + if boxed_error.message().starts_with("restart transaction") + || boxed_error + .message() + .starts_with("retry transaction") + { + Error::too_many_requests("transaction must be retried") + } else { + // other types of serialization failure should be + // considered an internal error + Error::internal_error(&format!( + "Transaction error: {}", + e + )) + } + } _ => { Error::internal_error(&format!("Transaction error: {}", e)) }