Skip to content

Commit

Permalink
compute,storage: set SO_REUSEADDR when binding to the HTTP port
Browse files Browse the repository at this point in the history
In scenarios where replicas and sources are rapidly killed and restarted,
computed and storaged may fail to bind to their assigned HTTP port
if that port has just been freed by some other process.

Previously, this would cause the process to panic and be restarted
by the process orchestrator. Now that all panics are fatal, set the
SO_REUSEADDR socket option so that bind() succeeds instead.

Relates to: MaterializeInc#15336
  • Loading branch information
philip-stoev committed Dec 1, 2022
1 parent b39aeb9 commit 222de04
Show file tree
Hide file tree
Showing 7 changed files with 26 additions and 13 deletions.
3 changes: 3 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions src/compute/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ mz-repr = { path = "../repr" }
mz-service = { path = "../service" }
mz-storage-client = { path = "../storage-client" }
mz-timely-util = { path = "../timely-util" }
nix = "0.26.1"
once_cell = "1.16.0"
prometheus = { version = "0.13.3", default-features = false }
scopeguard = "1.1.0"
Expand Down
15 changes: 9 additions & 6 deletions src/compute/src/bin/computed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0.

use std::net::SocketAddr;
use std::net::{SocketAddr, TcpListener};
use std::os::unix::io::AsRawFd;
use std::path::PathBuf;
use std::process;

use axum::routing;
use nix::sys::socket::{self, sockopt::ReusePort};
use once_cell::sync::Lazy;
use tracing::info;

Expand Down Expand Up @@ -119,9 +121,10 @@ async fn run(args: Args) -> Result<(), anyhow::Error> {
"serving internal HTTP server on {}",
args.internal_http_listen_addr
);
mz_ore::task::spawn(
|| "computed_internal_http_server",
axum::Server::bind(&args.internal_http_listen_addr).serve(
mz_ore::task::spawn(|| "computed_internal_http_server", {
let listener = TcpListener::bind(args.internal_http_listen_addr)?;
socket::setsockopt(listener.as_raw_fd(), ReusePort, &true)?;
axum::Server::from_tcp(listener)?.serve(
mz_prof::http::router(&BUILD_INFO)
.route(
"/api/livez",
Expand Down Expand Up @@ -154,8 +157,8 @@ async fn run(args: Args) -> Result<(), anyhow::Error> {
}),
)
.into_make_service(),
),
);
)
});
}

let config = mz_compute::server::Config {
Expand Down
1 change: 1 addition & 0 deletions src/orchestrator-process/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ async-trait = "0.1.58"
chrono = { version = "0.4.23", default_features = false, features = ["clock"] }
futures = "0.3.25"
itertools = "0.10.5"
libc = "0.2.137"
mz-orchestrator = { path = "../orchestrator" }
mz-ore = { path = "../ore", features = ["async"] }
mz-pid-file = { path = "../pid-file" }
Expand Down
2 changes: 1 addition & 1 deletion src/orchestrator-process/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,7 @@ async fn supervise(
if panic_on_child_crash {
if let Ok(status) = status {
match status.signal() {
Some(6) | Some(7) | Some(11) => {
Some(libc::SIGABRT) | Some(libc::SIGBUS) | Some(libc::SIGSEGV) => {
panic!("{} crashed: {:?}; Environmentd will panic itself because --orchestrator-process-panic-on-child-crash is in effect.", full_id, status);
}
_ => {}
Expand Down
1 change: 1 addition & 0 deletions src/storage/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ mz-secrets = { path = "../secrets" }
mz-service = { path = "../service" }
mz-storage-client = { path = "../storage-client" }
mz-timely-util = { path = "../timely-util" }
nix = "0.26.1"
once_cell = { version = "1.16.0" }
postgres-protocol = { git = "https://github.com/MaterializeInc/rust-postgres" }
prometheus = { version = "0.13.3", default-features = false }
Expand Down
16 changes: 10 additions & 6 deletions src/storage/src/bin/storaged.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@
// by the Apache License, Version 2.0.

use std::env;
use std::net::SocketAddr;
use std::net::{SocketAddr, TcpListener};
use std::os::unix::io::AsRawFd;
use std::path::PathBuf;
use std::process;

use anyhow::{bail, Context};
use axum::routing;
use mz_cloud_resources::AwsExternalIdPrefix;
use nix::sys::socket::{self, sockopt::ReusePort};
use once_cell::sync::Lazy;
use tracing::info;

Expand Down Expand Up @@ -156,9 +158,11 @@ async fn run(args: Args) -> Result<(), anyhow::Error> {
"serving internal HTTP server on {}",
args.internal_http_listen_addr
);
mz_ore::task::spawn(
|| "storaged_internal_http_server",
axum::Server::bind(&args.internal_http_listen_addr).serve(

mz_ore::task::spawn(|| "storaged_internal_http_server", {
let listener = TcpListener::bind(args.internal_http_listen_addr)?;
socket::setsockopt(listener.as_raw_fd(), ReusePort, &true)?;
axum::Server::from_tcp(listener)?.serve(
mz_prof::http::router(&BUILD_INFO)
.route(
"/api/livez",
Expand Down Expand Up @@ -191,8 +195,8 @@ async fn run(args: Args) -> Result<(), anyhow::Error> {
}),
)
.into_make_service(),
),
);
)
});
}

let secrets_reader = args
Expand Down

0 comments on commit 222de04

Please sign in to comment.