TSG024 - Namenode is in safe mode
=================================

HDFS can get itself into Safe mode. For example, if too many Pods are
recycled too quickly in the Storage Pool, Safe mode may be
automatically enabled.

When starting a Spark session, the user may see the following error
(this is typical of trying to start a PySpark or PySpark3 session in a
notebook from Azure Data Studio):

> The code failed because of a fatal error: Error sending http request
> and maximum retry encountered..
>
> Some things to try: a) Make sure Spark has enough available resources
> for Jupyter to create a Spark context. b) Contact your Jupyter
> administrator to make sure the Spark magics library is configured
> correctly. c) Restart the kernel.

Use this notebook to run a report to understand more about HDFS, and
optionally move the cluster out of Safe mode if it is safe to do so.

Steps
-----

### Common functions

Define helper functions used in this notebook.

In [None]:
# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows
import sys
import os
import re
import platform
import shlex
import shutil
import datetime

from subprocess import Popen, PIPE
from IPython.display import Markdown

def run(cmd, return_output=False, no_output=False, error_hints=None, retry_hints=None, retry_count=0):
    """
    Run shell command, stream stdout, print stderr and optionally return output

    Args:
        cmd: The command line as a single string; it is tokenized with shlex.split.
        return_output: When True, stdout is accumulated and returned instead of
            being printed.
        no_output: When True, the process is started without capturing stdout or
            stderr (so no hint or retry processing takes place).
        error_hints: Optional list of [stderr substring, notebook link, link title]
            entries; when the substring appears in a stderr line a SUGGEST message
            pointing at the link is displayed.
        retry_hints: Optional list of stderr substrings; when one appears in a
            stderr line the command is re-run (up to 5 retries).
        retry_count: Number of retries already performed; used by the recursive
            retry call.

    Returns:
        The captured stdout when return_output is True, otherwise None.

    Raises:
        FileNotFoundError: The executable could not be located or started.
        SystemExit: The command returned a non-zero exit code.
    """
    max_retries = 5
    install_hint = None
    output = ""

    # Work on private copies of the hint lists.  The tool specific hints are appended
    # below; appending to the caller's list (or to a shared default list) would make
    # the hints pile up from one call to the next.  The caller supplied values are kept
    # separately so a retry does not add the tool specific hints a second time.
    #
    caller_error_hints = list(error_hints) if error_hints is not None else []
    caller_retry_hints = list(retry_hints) if retry_hints is not None else []
    error_hints = list(caller_error_hints)
    retry_hints = list(caller_retry_hints)

    # shlex.split is required on bash and for Windows paths with spaces
    #
    cmd_actual = shlex.split(cmd)

    # When running python, use the python in the ADS sandbox ({sys.executable})
    #
    if cmd.startswith("python "):
        cmd_actual[0] = cmd_actual[0].replace("python", sys.executable)

        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail
        # with:
        #
        #       UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)
        #
        # Setting it to a default value of "en_US.UTF-8" enables pip install to complete
        #
        if platform.system() == "Darwin" and "LC_ALL" not in os.environ:
            os.environ["LC_ALL"] = "en_US.UTF-8"

        # NOTE(review): python_hints is not defined in the part of the notebook visible
        # here - confirm it exists before running "python ..." commands through run().
        #
        python_retry_hints, python_error_hints, install_hint = python_hints()
        retry_hints += python_retry_hints
        error_hints += python_error_hints

    if cmd.startswith("kubectl "):
        kubectl_retry_hints, kubectl_error_hints, install_hint = kubectl_hints()
        retry_hints += kubectl_retry_hints
        error_hints += kubectl_error_hints

    if cmd.startswith("azdata "):
        # NOTE(review): azdata_hints is not defined in the part of the notebook visible
        # here - confirm it exists before running "azdata ..." commands through run().
        #
        azdata_retry_hints, azdata_error_hints, install_hint = azdata_hints()
        retry_hints += azdata_retry_hints
        error_hints += azdata_error_hints

    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this
    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound)
    #
    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.
    #
    which_binary = shutil.which(cmd_actual[0])

    if which_binary is None:
        if install_hint is not None:
            display(Markdown(f'SUGGEST: Use {install_hint} to resolve this issue.'))

        raise FileNotFoundError(f"Executable '{cmd_actual[0]}' not found in path (where/which)")

    cmd_actual[0] = which_binary

    start_time = datetime.datetime.now().replace(microsecond=0)

    # Timezone-aware "now" with tzinfo dropped prints exactly like the (deprecated) utcnow()
    #
    utc_now = datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0, tzinfo=None)

    print(f"START: {cmd} @ {start_time} ({utc_now} UTC)")
    print(f"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})")
    print(f"       cwd: {os.getcwd()}")

    # Lines of the form '  "key": "value"' emitted by "azdata notebook run" are rendered
    # as Markdown so the .ipynb file is hyperlinked; compile the pattern once, not per line.
    #
    is_notebook_run = cmd.startswith("azdata notebook run")
    notebook_line_regex = re.compile(r'  "(.*)": "(.*)"')

    # Command-line tools such as CURL and AZDATA HDFS commands output
    # scrolling progress bars, which causes Jupyter to hang forever, to
    # workaround this, use no_output=True
    #
    try:
        if no_output:
            p = Popen(cmd_actual)
        else:
            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)
            with p.stdout:
                for line in iter(p.stdout.readline, b''):
                    line = line.decode()
                    if return_output:
                        output = output + line
                    elif is_notebook_run:
                        # Only the matching lines are shown in this mode
                        match = notebook_line_regex.match(line)
                        if match:
                            key, value = match.group(1), match.group(2)
                            if "HTML" in key:
                                display(Markdown(f' - "{key}": "{value}"'))
                            else:
                                display(Markdown(f' - "{key}": "[{value}]({value})"'))
                    else:
                        print(line, end='')
        p.wait()
    except FileNotFoundError as e:
        if install_hint is not None:
            display(Markdown(f'SUGGEST: Use {install_hint} to resolve this issue.'))

        raise FileNotFoundError(f"Executable '{cmd_actual[0]}' not found in path (where/which)") from e

    if not no_output:
        # Close the stderr pipe when done, including when returning early for a retry
        #
        with p.stderr:
            for line in iter(p.stderr.readline, b''):
                line_decoded = line.decode()

                # azdata emits a single empty line to stderr when doing an hdfs cp, don't
                # print this empty "ERR:" as it confuses.  readline keeps the line terminator,
                # so an "empty" line is one that contains only whitespace.
                #
                if line_decoded.strip() == "":
                    continue

                print(f"ERR: {line_decoded}", end='')

                for error_hint in error_hints:
                    if error_hint[0] in line_decoded:
                        display(Markdown(f'SUGGEST: Use [{error_hint[2]}]({error_hint[1]}) to resolve this issue.'))

                for retry_hint in retry_hints:
                    if retry_hint in line_decoded:
                        if retry_count < max_retries:
                            print(f"RETRY: {retry_count} (due to: {retry_hint})")
                            retry_count = retry_count + 1

                            # Pass the caller supplied hints only; the tool specific hints are
                            # added again by the recursive call.
                            #
                            output = run(cmd, return_output=return_output, error_hints=caller_error_hints, retry_hints=caller_retry_hints, retry_count=retry_count)

                            if return_output:
                                return output
                            else:
                                return

    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time

    if p.returncode != 0:
        raise SystemExit(f'Shell command:\n\n\t{cmd} ({elapsed}s elapsed)\n\nreturned non-zero exit code: {str(p.returncode)}.\n')

    print(f'\nSUCCESS: {elapsed}s elapsed\n')

    if return_output:
        return output

def kubectl_hints():
    """
    Return the hints `run` uses when executing kubectl commands

    Returns:
        A (retry_hints, error_hints, install_hint) tuple where:
          - retry_hints is a list of stderr substrings that cause the command to be retried
          - error_hints is a list of [stderr substring, notebook link, link title] entries
            used to suggest a notebook when the substring is seen on stderr
          - install_hint is a Markdown link suggested when the executable cannot be found
    """
    retry_hints = [
        "A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond",
    ]

    error_hints = [
        ["""no such host""", """../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb""", """TSG010 - Get configuration contexts"""],
        ["""no such host""", """../repair/tsg011-restart-sparkhistory-server.ipynb""", """TSG011 - Restart sparkhistory server"""],
        ["""No connection could be made because the target machine actively refused it""", """../repair/tsg056-kubectl-no-connection-could-be-made.ipynb""", """TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it"""],
    ]

    # The link is embedded in "SUGGEST: Use {install_hint} to resolve this issue.", so it
    # must be a clean Markdown link (no stray trailing quote character).
    install_hint = "[SOP036 - Install kubectl command line interface](../install/sop036-install-kubectl.ipynb)"

    return retry_hints, error_hints, install_hint


# Confirm to the notebook user that the helper cell ran to completion
print("Common functions defined successfully.")

### Instantiate Kubernetes client

In [None]:
# Create the Python Kubernetes client and bind it to the 'api' variable

import os

try:
    from kubernetes import client, config
    from kubernetes.stream import stream

    # When both Kubernetes service variables are present use the in-cluster
    # configuration, otherwise fall back to the local kube config.
    if all(key in os.environ for key in ("KUBERNETES_SERVICE_PORT", "KUBERNETES_SERVICE_HOST")):
        config.load_incluster_config()
    else:
        config.load_kube_config()

    api = client.CoreV1Api()

    print("Kubernetes client instantiated")
except ImportError:
    # The kubernetes module is missing: point at the install notebook, then
    # re-raise so the cell still fails.
    from IPython.display import Markdown
    display(Markdown('SUGGEST: Use [SOP059 - Install Kubernetes Python module](../install/sop059-install-kubernetes-module.ipynb) to resolve this issue.'))
    raise

### Get the namespace for the big data cluster

Get the namespace of the big data cluster from the Kubernetes API.

NOTE: If there is more than one big data cluster in the target
Kubernetes cluster, then set \[0\] to the correct value for the big data
cluster.

In [None]:
# Store the Kubernetes namespace name of the BDC in the 'namespace' variable

try:
    # Namespaces carrying the MSSQL_CLUSTER label; the first one is used
    labelled_namespaces = api.list_namespace(label_selector='MSSQL_CLUSTER')
    namespace = labelled_namespaces.items[0].metadata.name
except IndexError:
    # No labelled namespace was found: suggest the notebooks that help, then
    # re-raise so the cell still fails.
    from IPython.display import Markdown
    for suggestion in (
        'SUGGEST: Use [TSG081 - Get namespaces (Kubernetes)](../monitor-k8s/tsg081-get-kubernetes-namespaces.ipynb) to resolve this issue.',
        'SUGGEST: Use [TSG010 - Get configuration contexts](../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb) to resolve this issue.',
        'SUGGEST: Use [SOP011 - Set kubernetes configuration context](../common/sop011-set-kubernetes-context.ipynb) to resolve this issue.',
    ):
        display(Markdown(suggestion))
    raise

print("The kubernetes namespace for your big data cluster is: " + namespace)

### Get the name of the namenode pod

In [None]:
# Ask kubectl for the name of the first pod selected by role=namenode in the
# big data cluster namespace, capturing the command output.
namenode_pod = run(
    f'kubectl get pod --selector=role=namenode -n {namespace} -o jsonpath={{.items[0].metadata.name}}',
    return_output=True,
)

print("Namenode pod name: " + namenode_pod)

### Get the `hdfs dfsadmin` report

In [None]:
# Pod and container in which the hdfs command is executed (reused by later cells)
name = namenode_pod
container = 'hadoop'

command = 'hdfs dfsadmin -report'

# Run the command through a shell in the container, capturing stdout and stderr
string = stream(
    api.connect_get_namespaced_pod_exec,
    name,
    namespace,
    command=['/bin/sh', '-c', command],
    container=container,
    stderr=True,
    stdout=True,
)

print(string)

### Set the text that identifies this issue

In [None]:
# Text looked for in the `hdfs dfsadmin -report` output to identify this issue
precondition_text = 'Safe mode is ON'

### PRECONDITION CHECK

In [None]:
# Stop the notebook unless the report output contains the precondition text
if precondition_text in string:
    print("PRECONDITION MATCH: 'tsg024-name-node-is-in-safe-mode' is a match for an active problem in this cluster")
else:
    raise Exception("PRECONDITION NON-MATCH: 'tsg024-name-node-is-in-safe-mode' is not a match for an active problem")

Resolution
----------

### Move the namenode out of safe mode

In [None]:
# Run `hdfs dfsadmin -safemode leave` in the same pod and container as the report
command = 'hdfs dfsadmin -safemode leave'

string = stream(
    api.connect_get_namespaced_pod_exec,
    name,
    namespace,
    command=['/bin/sh', '-c', command],
    container=container,
    stderr=True,
    stdout=True,
)

print(string)

### Validate - Verify the namenode is no longer in safe mode

Validate that the text ‘Safe mode is ON’ is no longer in the
`hdfs dfsadmin -report` output

In [None]:
# Re-run the report and fail the notebook if the precondition text is still present
command = 'hdfs dfsadmin -report'

string = stream(
    api.connect_get_namespaced_pod_exec,
    name,
    namespace,
    command=['/bin/sh', '-c', command],
    container=container,
    stderr=True,
    stdout=True,
)

if precondition_text in string:
    raise SystemExit("FAILED - hdfs dfsadmin -report output still contains: " + precondition_text)
else:
    print("SUCCESS - hdfs dfsadmin -report output no longer contains: " + precondition_text)

In [None]:
# Final marker showing that every cell in the notebook ran
print("Notebook execution complete.")