From 3747a091b58a13c64cd69ccb69e2c6634ceb5bb6 Mon Sep 17 00:00:00 2001 From: Floris Bruynooghe Date: Wed, 13 Mar 2024 14:05:51 +0100 Subject: [PATCH] fix(iroh-net): Work around broken windows DNS configuration (#2075) ## Description This actively refuses to use the `fec0:0:0:ffff::1`, `fec0:0:0:ffff::2` and `fec0:0:0:ffff::3` DNS servers if the system has them configured. Windows by default adds 3 IPv6 site-local anycast addresses to the DNS servers: `fec0:0:0:ffff::1`, `fec0:0:0:ffff::2` and `fec0:0:0:ffff::3`. Supposedly Microsoft DNS servers by default listen on those. These are present as soon as there's an IPv6 interface configured it seems, even for a loopback interface which is extremely common if not the default. Our hickory-resolver loads the system configuration, which includes these 3 IPv6 DNS servers. When it needs to make a DNS query it selects a random nameserver and tries this. If that fails it will try another one. For the next query there is bias, it will remember which servers to avoid or use. So if you get lucky and your first query falls on an actual DNS server then you are good. If you get unlucky, recovering is a bit of a tussle because: Inside netcheck we do DNS queries with a 1s timeout, this is because all the probes have a 3s timeout. However hickory-resolver has a 5s timeout configured, so its queries stay alive longer than ours. This means almost all subsequent DNS queries will end up reusing an existing connection to one of those bad servers if you are unlucky to land on one. The interplay of these timeouts and the connection reuse makes recovering to a good DNS server a rather tough prospect for netcheck. It probably would eventually, given enough netcheck runs (which run at intervals of ~30s). The odds of these nameservers being the sole way of having working DNS are basically zero. The odds of these nameservers breaking the resolver are about 50%. So remove these deprecated things. 
## Notes & open questions Unfortunately the resolver returned by `get_resolver()` does not have an API that allows to test it. But the test would basically be the inverse logic of the logic that removes the bad servers so perhaps not that useful anyway. Closes #2069 Closes https://github.com/n0-computer/dumbpipe/issues/17 ## Change checklist - [x] Self-review. - [x] Documentation updates if relevant. - [x] Tests if relevant. --- iroh-net/src/dns.rs | 36 ++++++++++++++++++++++++++---- iroh-net/src/netcheck/reportgen.rs | 1 - 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/iroh-net/src/dns.rs b/iroh-net/src/dns.rs index c8e668702a..ac675a7c66 100644 --- a/iroh-net/src/dns.rs +++ b/iroh-net/src/dns.rs @@ -1,4 +1,4 @@ -use std::net::IpAddr; +use std::net::{IpAddr, Ipv6Addr}; use std::time::Duration; use anyhow::Result; @@ -8,14 +8,44 @@ use once_cell::sync::Lazy; pub static DNS_RESOLVER: Lazy = Lazy::new(|| get_resolver().expect("unable to create DNS resolver")); +/// Deprecated IPv6 site-local anycast addresses still configured by windows. +/// +/// Windows still configures these site-local addresses as soon even as an IPv6 loopback +/// interface is configured. We do not want to use these DNS servers, the chances of them +/// being usable are almost always close to zero, while the chance of DNS configuration +/// **only** relying on these servers and not also being configured normally are also almost +/// zero. The chance of the DNS resolver accidentally trying one of these and taking a +/// bunch of timeouts to figure out they're no good are on the other hand very high. +const WINDOWS_BAD_SITE_LOCAL_DNS_SERVERS: [IpAddr; 3] = [ + IpAddr::V6(Ipv6Addr::new(0xfec0, 0, 0, 0xffff, 0, 0, 0, 1)), + IpAddr::V6(Ipv6Addr::new(0xfec0, 0, 0, 0xffff, 0, 0, 0, 2)), + IpAddr::V6(Ipv6Addr::new(0xfec0, 0, 0, 0xffff, 0, 0, 0, 3)), +]; + /// Get resolver to query MX records. /// /// We first try to read the system's resolver from `/etc/resolv.conf`. 
/// This does not work at least on some Androids, therefore we fallback /// to the default `ResolverConfig` which uses eg. to google's `8.8.8.8` or `8.8.4.4`. fn get_resolver() -> Result { - let (config, mut options) = + let (system_config, mut options) = hickory_resolver::system_conf::read_system_conf().unwrap_or_default(); + + // Copy all of the system config, but strip the bad windows nameservers. Unfortunately + // there is no easy way to do this. + let mut config = hickory_resolver::config::ResolverConfig::new(); + if let Some(name) = system_config.domain() { + config.set_domain(name.clone()); + } + for name in system_config.search() { + config.add_search(name.clone()); + } + for nameserver_cfg in system_config.name_servers() { + if !WINDOWS_BAD_SITE_LOCAL_DNS_SERVERS.contains(&nameserver_cfg.socket_addr.ip()) { + config.add_name_server(nameserver_cfg.clone()); + } + } + // lookup IPv4 and IPv6 in parallel options.ip_strategy = hickory_resolver::config::LookupIpStrategy::Ipv4thenIpv6; @@ -101,7 +131,6 @@ mod tests { use super::*; #[tokio::test] - #[cfg_attr(target_os = "windows", ignore = "flaky")] async fn test_dns_lookup_basic() { let _logging = iroh_test::logging::setup(); let res = DNS_RESOLVER.lookup_ip(NA_DERP_HOSTNAME).await.unwrap(); @@ -111,7 +140,6 @@ mod tests { } #[tokio::test] - #[cfg_attr(target_os = "windows", ignore = "flaky")] async fn test_dns_lookup_ipv4_ipv6() { let _logging = iroh_test::logging::setup(); let res = lookup_ipv4_ipv6(NA_DERP_HOSTNAME, Duration::from_secs(5)) diff --git a/iroh-net/src/netcheck/reportgen.rs b/iroh-net/src/netcheck/reportgen.rs index 4574ba716b..8e413e6ca8 100644 --- a/iroh-net/src/netcheck/reportgen.rs +++ b/iroh-net/src/netcheck/reportgen.rs @@ -1309,7 +1309,6 @@ mod tests { // // TODO: Not sure what about IPv6 pings using sysctl. 
#[tokio::test] - #[cfg_attr(target_os = "windows", ignore = "flaky")] async fn test_icmp_probe_eu_derper() { let _logging_guard = iroh_test::logging::setup(); let pinger = Pinger::new();