Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Additions to TCP/DNS stage check, fix 1027 #1063

Merged
merged 12 commits into from
Feb 15, 2022
74 changes: 50 additions & 24 deletions qhub/deploy.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@

logger = logging.getLogger(__name__)

# check and retry settings
NUM_ATTEMPTS = 10
TIMEOUT = 10 # seconds


def deploy_configuration(
config,
Expand Down Expand Up @@ -433,20 +437,25 @@ def provision_04_kubernetes_ingress(stage_outputs, config, check=True):
def check_04_kubernetes_ingress(stage_outputs, qhub_config):
directory = "stages/04-kubernetes-ingress"

def _attempt_tcp_connect(host, port, num_attempts=3, timeout=5):
# normalize hostname to ip address
host = socket.gethostbyname(host)

def _attempt_tcp_connect(host, port, num_attempts=NUM_ATTEMPTS, timeout=TIMEOUT):
for i in range(num_attempts):
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.settimeout(5)
result = s.connect_ex((host, port))
if result == 0:
print(f"Attempt {i+1} succedded to connect to tcp://{host}:{port}")
return True
s.close()
print(f"Attempt {i+1} failed to connect to tcp tcp://{host}:{port}")
try:
# normalize hostname to ip address
ip = socket.gethostbyname(host)
s.settimeout(5)
result = s.connect_ex((ip, port))
if result == 0:
print(f"Attempt {i+1} succedded to connect to tcp://{ip}:{port}")
return True
print(f"Attempt {i+1} failed to connect to tcp tcp://{ip}:{port}")
except socket.gaierror:
print(f"Attempt {i+1} failed to get IP for {host}...")
finally:
s.close()

time.sleep(timeout)

return False

tcp_ports = {
Expand All @@ -459,6 +468,7 @@ def _attempt_tcp_connect(host, port, num_attempts=3, timeout=5):
}
ip_or_name = stage_outputs[directory]["load_balancer_address"]["value"]
host = ip_or_name["hostname"] or ip_or_name["ip"]
host = host.strip("\n")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There appears to be a newline character at the end of `host`, for a reason I have yet to figure out.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be fine. If the newline for some reason doesn't exist in the future, `host.strip("\n")` is simply a no-op, so this won't cause any problems.


for port in tcp_ports:
if not _attempt_tcp_connect(host, port):
Expand Down Expand Up @@ -512,17 +522,19 @@ def provision_ingress_dns(
)

if check:
check_ingress_dns(stage_outputs, config)
check_ingress_dns(stage_outputs, config, disable_prompt)


def check_ingress_dns(stage_outputs, config):
def check_ingress_dns(stage_outputs, config, disable_prompt):
directory = "stages/04-kubernetes-ingress"

ip_or_name = stage_outputs[directory]["load_balancer_address"]["value"]
ip = socket.gethostbyname(ip_or_name["hostname"] or ip_or_name["ip"])
domain_name = config["domain"]

def _attempt_dns_lookup(domain_name, ip, num_attempts=12, timeout=5):
def _attempt_dns_lookup(
domain_name, ip, num_attempts=NUM_ATTEMPTS, timeout=TIMEOUT
):
for i in range(num_attempts):
try:
resolved_ip = socket.gethostbyname(domain_name)
Expand All @@ -542,11 +554,22 @@ def _attempt_dns_lookup(domain_name, ip, num_attempts=12, timeout=5):
time.sleep(timeout)
return False

if not _attempt_dns_lookup(domain_name, ip):
print(
f"ERROR: After stage directory={directory} DNS domain={domain_name} does not point to ip={ip}"
)
sys.exit(1)
attempt = 0
while not _attempt_dns_lookup(domain_name, ip):
sleeptime = 60 * (2 ** attempt)
if not disable_prompt:
input(
f"After attempting to poll the DNS, the record for domain={domain_name} appears not to exist, has recently been updaed, or has yet to fully propogate. This non-deterministic behavior is likely due to DNS caching and will likely resolve itself in a few minutes.\n\n\tTo poll the DNS again in {sleeptime} seconds [Press Enter].\n\n...otherwise kill the process and run the deployment again later..."
)

print(f"Will attempt to poll DNS again in {sleeptime} seconds...")
time.sleep(sleeptime)
attempt += 1
if attempt == 5:
print(
f"ERROR: After stage directory={directory} DNS domain={domain_name} does not point to ip={ip}"
)
sys.exit(1)


def provision_05_kubernetes_keycloak(stage_outputs, config, check=True):
Expand Down Expand Up @@ -589,8 +612,8 @@ def _attempt_keycloak_connection(
realm_name,
client_id,
verify=False,
num_attempts=3,
timeout=5,
num_attempts=NUM_ATTEMPTS,
timeout=TIMEOUT,
):
for i in range(num_attempts):
try:
Expand Down Expand Up @@ -665,8 +688,8 @@ def _attempt_keycloak_connection(
client_id,
qhub_realm,
verify=False,
num_attempts=5,
timeout=10,
num_attempts=NUM_ATTEMPTS,
timeout=TIMEOUT,
):
for i in range(num_attempts):
try:
Expand Down Expand Up @@ -784,14 +807,17 @@ def check_07_kubernetes_services(stage_outputs, config):

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def _attempt_connect_url(url, verify=False, num_attempts=3, timeout=5):
def _attempt_connect_url(
url, verify=False, num_attempts=NUM_ATTEMPTS, timeout=TIMEOUT
):
for i in range(num_attempts):
response = requests.get(service_url, verify=verify)
if response.status_code < 400:
print(f"Attempt {i+1} health check succeded for url={url}")
return True
else:
print(f"Attempt {i+1} health check failed for url={url}")
time.sleep(timeout)
return False

services = stage_outputs[directory]["service_urls"]["value"]
Expand Down