## Automatic SLA Reporting Prototype

This notebook is for development and testing of an automated script for pulling data from Thanos and compiling an SLA report to be emailed to Legal.

First, we verify that we trust the Red Hat CAs (Thanos uses an SSL cert signed by those CAs, and OpenSSL will throw a fit if it doesn't trust them).
Solution derived from https://incognitjoe.github.io/adding-certs-to-requests.html

API client docs: https://prometheus-api-client-python.readthedocs.io/en/latest/source/prometheus_api_client.html#prometheus_api_client.prometheus_connect.PrometheusConnect

Obtain an API key: https://datahub.psi.redhat.com/console/catalog

Other help: https://help.datahub.redhat.com/docs/interacting-with-telemetry-data

Example config file: https://gist.github.com/abyrne55/3dec0d59a31c170ab6d3c824f4ffe2a9

In [1]:
import certifi
import requests
import yaml
import prometheus_api_client
from string import Template
from tabulate import tabulate
import jwt

In [32]:
class UnifiedHybridClient:
    """
    Small client for the UHC HTTP API.

    The issuer URL and the client id needed to obtain access tokens are
    read from the claims of the offline (refresh) token itself.

    :param api_url: (str) base URL of the UHC API
    :param offline_token: (str) offline token, encoded as a JWT
    :param public_key: (str) optional key used to verify the signature
        of the offline token. When omitted the token is decoded without
        any verification.
    :param timeout: (float) seconds to wait on each HTTP request before
        giving up
    """

    def __init__(self, api_url, offline_token, public_key=None, timeout=60):
        self.api_url = api_url
        self.offline_token = offline_token.strip()
        self.public_key = public_key.strip() if public_key is not None else public_key
        self.timeout = timeout

        # Extract info from the offline token.
        # The legacy `verify=` keyword is no longer honoured by PyJWT 2.x,
        # so every check is switched through `options`, which both the
        # 1.x and 2.x series understand.
        verify = self.public_key is not None
        ot_decoded = jwt.decode(
            self.offline_token,
            self.public_key if verify else "",
            algorithms=["RS256"],
            options={
                "verify_signature": verify,
                "verify_exp": verify,
                "verify_nbf": verify,
                "verify_iat": verify,
                "verify_iss": verify,
                # The audience is what we are trying to learn from the
                # token, so there is no expected value to check against.
                "verify_aud": False,
            },
        )
        self.iss_url = ot_decoded['iss']
        self.client_id = ot_decoded['aud']

    @staticmethod
    def _json_or_raise(response):
        """
        Return the decoded JSON body of a successful response.

        :param response: a response object exposing ``status_code``,
            ``content`` and ``json()``
        :raises Exception: if the status code is anything other than 200
        """
        if response.status_code != 200:
            raise Exception("HTTP Status Code {} ({})".format(
                response.status_code,
                response.content
            ))
        return response.json()

    def __get_access_token(self):
        """
        Exchange the offline token for a short-lived access token.

        :returns: (str) the access token issued by the token endpoint
        :raises Exception: if the token endpoint does not answer with 200
        """
        response = requests.post(
            "{}/protocol/openid-connect/token".format(self.iss_url),
            data={
                'grant_type': "refresh_token",
                'client_id': self.client_id,
                'refresh_token': self.offline_token
            },
            headers={
                'accept': "application/json",
            },
            timeout=self.timeout,
        )
        # Check the status first so a rejected refresh is reported with
        # the server's answer instead of a KeyError on "access_token".
        return self._json_or_raise(response)["access_token"]

    def search_clusters(self, query):
        """
        Query a list of clusters from the UHC HTTP API.

        :param query: (str) Specifies the search criteria. This syntax of 
            this parameter is similar to the syntax of the WHERE clause of 
            an SQL statement, but using the names of the attributes of the
            cluster instead of the names of the columns of a table.
        :returns: the decoded JSON body of the response
        :raises Exception: if either the token endpoint or the clusters
            endpoint does not answer with 200
        """
        response = requests.get(
            "{}/api/clusters_mgmt/v1/clusters".format(self.api_url),
            verify=True,
            headers={
                'accept': "application/json",
                "Authorization": "Bearer " + self.__get_access_token(),
            },
            params={'search': query},
            timeout=self.timeout,
        )
        return self._json_or_raise(response)

In [34]:
def check_ssl_certs(url, bundle_path="RHCertBundle.pem", timeout=30):
    """
    Checks if the Red Hat SSL CA certs are installed by connecting
    to a URL that uses them.

    If the connection fails with an SSL error, the certificates in
    ``bundle_path`` are appended to the CA file reported by
    ``certifi.where()``. The bundle is appended only if it is not
    already present there, so repeated SSL failures do not keep
    growing the store.

    :param url: (str) an HTTPS URL utilizing Red Hat-signed certificates
    :param bundle_path: (str) path of the PEM file holding the extra CA
        certificates
    :param timeout: (float) seconds to wait for the test request
    """
    try:
        requests.get(url, timeout=timeout)
    except requests.exceptions.SSLError:
        cafile = certifi.where()
        with open(bundle_path, "rb") as infile:
            customca = infile.read()
        with open(cafile, "rb") as store:
            existing = store.read()

        wanted = customca.strip()
        if wanted and wanted not in existing:
            with open(cafile, "ab") as outfile:
                # Keep the PEM markers on their own lines if the store
                # does not end with a newline.
                if existing and not existing.endswith(b"\n"):
                    outfile.write(b"\n")
                outfile.write(customca)


def format_sli(sli, sla):
    """
    Wraps an SLI value in a ``<span>`` whose CSS class reflects how the
    value compares to its SLA.

    The class is "danger" when the SLI is below the SLA, "caution" when
    it is less than 0.01 above it, and "success" otherwise.

    :param sli: (float) the current value of the SLI 
    :param sla: (float) the minimum "good" value of the SLI
    :returns: a formatted HTML string
    """
    margin = sli - sla

    # First threshold the margin falls under decides the class;
    # anything at or above the last threshold is a success.
    css_class = "success"
    for ceiling, label in ((0, "danger"), (0.01, "caution")):
        if margin < ceiling:
            css_class = label
            break

    template = "<span class='{}'>{}&#37;</span>"
    return template.format(css_class, sli)


# Load the report configuration (API endpoints, tokens and SLA rules)
with open("sla_report_config.yml", "r") as f:
    config = yaml.safe_load(f)

# Connect to Telemeter-LTS, checking first that its certificate is trusted
telemeter_cfg = config["api"]["telemeter"]
check_ssl_certs(telemeter_cfg["url"])
pc = prometheus_api_client.prometheus_connect.PrometheusConnect(
    url=telemeter_cfg["url"],
    headers={"Authorization": "bearer " + telemeter_cfg["token"]},
    disable_ssl=False,
)

In [39]:
# Ask UHC for the managed clusters and build, for each cluster name,
# the PromQL label matcher that selects its metrics by external id
uhc = UnifiedHybridClient(
    api_url=config['api']['uhc']['url'],
    offline_token=config['api']['uhc']['token'],
)
managed_clist = uhc.search_clusters("managed = 't'")
selected = dict(
    (cluster['name'], "_id='{}'".format(cluster['external_id']))
    for cluster in managed_clist['items']
)
display(selected)

{'osd-v4stg-aws': "_id='c03103eb-1571-498d-b1fd-70587b445faa'",
 'osd-v4prod-aws': "_id='18e66bcf-3090-4519-a188-4ffb63fb6104'",
 'lucky-managed': "_id='de278845-2ed3-4a60-b4b5-a7ba6c47c615'",
 'kb-jun20-wkld': "_id='1e2c79de-e2d7-4a65-841a-8748d54c70ec'",
 'osd4-demo': "_id='bbf0bfaf-0dbf-4de3-9564-4faeb67245ad'",
 'tiwillia-test-07162019': "_id='7426f45a-4bde-4281-af18-035205b74bec'",
 'openshift-web': "_id='f03d1084-a017-4cc0-84ae-d24a17375b50'",
 'nmalik-hive': "_id='89c9de43-98f0-4543-b32f-4d6e62ba2946'",
 'jh-osd-1': "_id='b321e992-7b78-4391-94da-e2737c167564'",
 'demo31managed': "_id='7c27b47c-b7d8-4948-8b80-80b11ba7fd5b'",
 'prod31managed': "_id='05872657-a037-45f8-bfe2-1a97ab5ace33'",
 'jeder-rh-managed': "_id='f7a7bc3b-6ee1-4f9a-8e64-e773446e6ff2'",
 'allhands': "_id='f251fa2b-04a8-4277-975f-e596120bca67'"}

In [41]:
table = []
# Run the queries and populate the table: one row per cluster, and for
# every rule an "SLA" cell followed by a "Perf." cell.
for name, selector in selected.items():
    row = [name]
    for rule in config["rules"]:
        # The SLA target depends only on the rule, so compute it before
        # querying. The failure branch below can then always report the
        # target of the current rule.
        sla = float(rule["sla"]) * 100

        # Every rule key except the query itself, plus the cluster
        # selector, is available as a $placeholder in the query template.
        query_params = {
            **{k: v for k, v in rule.items() if k != "query"},
            **{"sel": selector},
        }
        query = Template(rule["query"]).substitute(**query_params)
        try:
            query_res = pc.custom_query(query)
            sli = round(float(query_res[0]["value"][1]) * 100, 4)
            row += [str(sla) + '&#37;', format_sli(sli, sla)]
        except Exception:
            # Best effort: a failed or empty query leaves the cell blank.
            # Catching Exception (not a bare except) keeps the kernel
            # interruptible while the loop runs.
            print("Query failed:" + str(query))
            row += [str(sla) + '&#37;', ""]
    table.append(row)


from IPython.core.display import HTML

# Header row: the cluster name, then an "SLA" and a "Perf." column per rule
headers = ['Cluster'] + [
    rule["name"] + suffix
    for rule in config["rules"]
    for suffix in (" SLA", " Perf.")
]

# Styles for the classes that format_sli puts on each value
css = """
<style>
    .danger {
        color: red;
        font-weight: bold;
    }
    .caution {
        color: darkorange;
        font-weight: bold;
    }
    .success {
        color: green;
    }
</style>
"""

# Render the stylesheet followed by the HTML table in the cell output
display(
    HTML(
        css
        + tabulate(table, headers, tablefmt="html", stralign="center")
    )
)

Query failed:clamp_max(
  sum_over_time(
    (
      sum(up{service='etcd',_id='89c9de43-98f0-4543-b32f-4d6e62ba2946'}) > bool 0
    )[7d:1m]
  ) / (7 * 24 * 60) > 0, 1
)

Query failed:clamp_max(
  sum_over_time(
    (
      sum(up{service='etcd',_id='b321e992-7b78-4391-94da-e2737c167564'}) > bool 0
    )[7d:1m]
  ) / (7 * 24 * 60) > 0, 1
)

Query failed:clamp_max(
  sum_over_time(
    (
      sum(up{service='etcd',_id='7c27b47c-b7d8-4948-8b80-80b11ba7fd5b'}) > bool 0
    )[7d:1m]
  ) / (7 * 24 * 60) > 0, 1
)

Query failed:clamp_max(
  sum_over_time(
    (
      sum(up{service='etcd',_id='05872657-a037-45f8-bfe2-1a97ab5ace33'}) > bool 0
    )[7d:1m]
  ) / (7 * 24 * 60) > 0, 1
)

Query failed:clamp_max(
  sum_over_time(
    (
      sum(up{service='etcd',_id='f7a7bc3b-6ee1-4f9a-8e64-e773446e6ff2'}) > bool 0
    )[7d:1m]
  ) / (7 * 24 * 60) > 0, 1
)

Query failed:clamp_max(
  sum_over_time(
    (
      sum(up{service='etcd',_id='f251fa2b-04a8-4277-975f-e596120bca67'}) > bool 0
    )[7d:1m

Cluster,Control Plane API SLA,Control Plane API Perf.,Control Plane etcd SLA,Control Plane etcd Perf.
osd-v4stg-aws,99.9%,99.9971%,99.9%,99.9901%
osd-v4prod-aws,99.9%,98.6089%,99.9%,100.0%
lucky-managed,99.9%,100.0%,99.9%,99.9702%
kb-jun20-wkld,99.9%,100.0%,99.9%,96.1508%
osd4-demo,99.9%,99.9976%,99.9%,100.0%
tiwillia-test-07162019,99.9%,99.9999%,99.9%,31.2401%
openshift-web,99.9%,100.0%,99.9%,27.7976%
nmalik-hive,99.9%,100.0%,99.9%,
jh-osd-1,99.9%,100.0%,99.9%,
demo31managed,99.9%,100.0%,99.9%,


In [None]:
def custom_query(query: str, params: dict = None,
                 url: str = "https://telemeter-lts.datahub.redhat.com",
                 token: str = None):
    """
    A method to send a custom query to a Prometheus Host.

    This method takes as input a string which will be sent as a query to
    the specified Prometheus Host. This query is a PromQL query.

    :param query: (str) This is a PromQL query, a few examples can be found
                    at https://prometheus.io/docs/prometheus/latest/querying/examples/

    :param params: (dict) Optional dictionary containing GET parameters to be 
                    sent along with the API request, such as "time"

    :param url: (str) Base URL of the Prometheus Host; ``/api/v1/query`` is
                    appended to it

    :param token: (str) Bearer token for the request. When omitted, the token
                    is read from ``config["api"]["telemeter"]["token"]``, the
                    same setting the notebook's PrometheusConnect client uses

    :Returns: (list) A list of metric data received in response of the query sent

    :raises: (Exception) Raised when the response status code is not 200

    """
    # NOTE(review): a literal bearer token used to be embedded in this
    # function. Treat that token as exposed and rotate it.
    if token is None:
        token = config["api"]["telemeter"]["token"]

    # Entries in `params` are applied after "query", so they take
    # precedence if the same key is given twice.
    request_params = {"query": str(query), **(params or {})}

    # using the query API to get raw data
    response = requests.get(
        "{0}/api/v1/query".format(url),
        params=request_params,
        verify=True,
        headers={"Authorization": "bearer " + token},
    )
    if response.status_code != 200:
        raise Exception("HTTP Status Code {} ({})".format(
            response.status_code,
            response.content
        ))

    return response.json()['data']['result']


# Example: for one hard-coded cluster `_id`, the share of API server
# requests over the trailing 7 days that did not get a 5xx response code
# (all requests minus 5xx requests, divided by all requests).
# NOTE(review): the `OR absent(...)` branch presumably supplies a value
# when the ratio yields no series - confirm.
q = """
(
  (
    sum(sum_over_time(code:apiserver_request_count:rate:sum{_id='c03103eb-1571-498d-b1fd-70587b445faa'}[7d])) - 
    sum(sum_over_time(code:apiserver_request_count:rate:sum{code=~'5.*',_id='c03103eb-1571-498d-b1fd-70587b445faa'}[7d]))
  ) / sum(sum_over_time(code:apiserver_request_count:rate:sum{_id='c03103eb-1571-498d-b1fd-70587b445faa'}[7d]))
) OR (absent(code:apiserver_request_count:rate:sum{code=~'5.*',_id='c03103eb-1571-498d-b1fd-70587b445faa'} == 0))
"""

# Evaluate the query at a fixed instant so the result is repeatable
custom_query(query=q, params={'time': "2019-07-17T08:54:00Z"})