Skip to content

Commit

Permalink
Additions to TCP/DNS stage check, fix 1027 (#1063)
Browse files Browse the repository at this point in the history
* Catch exception, try again

* Increase timeout

* Increase timeout again

* Fix

* Add finally

* Strip newline char

* Add exponential backup for dns check

* fix

* Add global check settings, wording

* Make prompt use multiple lines instead of one extremely long one

* typo fix

Co-authored-by: t-potts <tylerjacobpotts@gmail.com>
  • Loading branch information
iameskild and tylerpotts committed Feb 15, 2022
1 parent fb916b9 commit 5bdf138
Showing 1 changed file with 53 additions and 24 deletions.
77 changes: 53 additions & 24 deletions qhub/deploy.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@

logger = logging.getLogger(__name__)

# check and retry settings
NUM_ATTEMPTS = 10
TIMEOUT = 10 # seconds


def deploy_configuration(
config,
Expand Down Expand Up @@ -433,20 +437,25 @@ def provision_04_kubernetes_ingress(stage_outputs, config, check=True):
def check_04_kubernetes_ingress(stage_outputs, qhub_config):
directory = "stages/04-kubernetes-ingress"

def _attempt_tcp_connect(host, port, num_attempts=3, timeout=5):
# normalize hostname to ip address
host = socket.gethostbyname(host)

def _attempt_tcp_connect(host, port, num_attempts=NUM_ATTEMPTS, timeout=TIMEOUT):
for i in range(num_attempts):
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.settimeout(5)
result = s.connect_ex((host, port))
if result == 0:
print(f"Attempt {i+1} succedded to connect to tcp://{host}:{port}")
return True
s.close()
print(f"Attempt {i+1} failed to connect to tcp tcp://{host}:{port}")
try:
# normalize hostname to ip address
ip = socket.gethostbyname(host)
s.settimeout(5)
result = s.connect_ex((ip, port))
if result == 0:
print(f"Attempt {i+1} succedded to connect to tcp://{ip}:{port}")
return True
print(f"Attempt {i+1} failed to connect to tcp tcp://{ip}:{port}")
except socket.gaierror:
print(f"Attempt {i+1} failed to get IP for {host}...")
finally:
s.close()

time.sleep(timeout)

return False

tcp_ports = {
Expand All @@ -459,6 +468,7 @@ def _attempt_tcp_connect(host, port, num_attempts=3, timeout=5):
}
ip_or_name = stage_outputs[directory]["load_balancer_address"]["value"]
host = ip_or_name["hostname"] or ip_or_name["ip"]
host = host.strip("\n")

for port in tcp_ports:
if not _attempt_tcp_connect(host, port):
Expand Down Expand Up @@ -512,17 +522,19 @@ def provision_ingress_dns(
)

if check:
check_ingress_dns(stage_outputs, config)
check_ingress_dns(stage_outputs, config, disable_prompt)


def check_ingress_dns(stage_outputs, config):
def check_ingress_dns(stage_outputs, config, disable_prompt):
directory = "stages/04-kubernetes-ingress"

ip_or_name = stage_outputs[directory]["load_balancer_address"]["value"]
ip = socket.gethostbyname(ip_or_name["hostname"] or ip_or_name["ip"])
domain_name = config["domain"]

def _attempt_dns_lookup(domain_name, ip, num_attempts=12, timeout=5):
def _attempt_dns_lookup(
domain_name, ip, num_attempts=NUM_ATTEMPTS, timeout=TIMEOUT
):
for i in range(num_attempts):
try:
resolved_ip = socket.gethostbyname(domain_name)
Expand All @@ -542,11 +554,25 @@ def _attempt_dns_lookup(domain_name, ip, num_attempts=12, timeout=5):
time.sleep(timeout)
return False

if not _attempt_dns_lookup(domain_name, ip):
print(
f"ERROR: After stage directory={directory} DNS domain={domain_name} does not point to ip={ip}"
)
sys.exit(1)
attempt = 0
while not _attempt_dns_lookup(domain_name, ip):
sleeptime = 60 * (2 ** attempt)
if not disable_prompt:
input(
f"After attempting to poll the DNS, the record for domain={domain_name} appears not to exist, "
f"has recently been updated, or has yet to fully propogate. This non-deterministic behavior is likely due to "
f"DNS caching and will likely resolve itself in a few minutes.\n\n\tTo poll the DNS again in {sleeptime} seconds "
f"[Press Enter].\n\n...otherwise kill the process and run the deployment again later..."
)

print(f"Will attempt to poll DNS again in {sleeptime} seconds...")
time.sleep(sleeptime)
attempt += 1
if attempt == 5:
print(
f"ERROR: After stage directory={directory} DNS domain={domain_name} does not point to ip={ip}"
)
sys.exit(1)


def provision_05_kubernetes_keycloak(stage_outputs, config, check=True):
Expand Down Expand Up @@ -589,8 +615,8 @@ def _attempt_keycloak_connection(
realm_name,
client_id,
verify=False,
num_attempts=3,
timeout=5,
num_attempts=NUM_ATTEMPTS,
timeout=TIMEOUT,
):
for i in range(num_attempts):
try:
Expand Down Expand Up @@ -665,8 +691,8 @@ def _attempt_keycloak_connection(
client_id,
qhub_realm,
verify=False,
num_attempts=5,
timeout=10,
num_attempts=NUM_ATTEMPTS,
timeout=TIMEOUT,
):
for i in range(num_attempts):
try:
Expand Down Expand Up @@ -784,14 +810,17 @@ def check_07_kubernetes_services(stage_outputs, config):

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def _attempt_connect_url(url, verify=False, num_attempts=3, timeout=5):
def _attempt_connect_url(
url, verify=False, num_attempts=NUM_ATTEMPTS, timeout=TIMEOUT
):
for i in range(num_attempts):
response = requests.get(service_url, verify=verify)
if response.status_code < 400:
print(f"Attempt {i+1} health check succeded for url={url}")
return True
else:
print(f"Attempt {i+1} health check failed for url={url}")
time.sleep(timeout)
return False

services = stage_outputs[directory]["service_urls"]["value"]
Expand Down

0 comments on commit 5bdf138

Please sign in to comment.