## Report FIO results for EBS Benchmark on gp2 and gp3

Steps to run this report:

> TODO

### Description:

Scenario (clusters):
- c1: OCP cluster with 1x gp2
- c2: OCP cluster with 2x gp2 (etcd isolated)
- c3: OCP cluster with 1x gp3
- c4: OCP cluster with 2x gp3 (etcd isolated)

This report aggregates the data collected by the FIO tests, which exercised all control plane disks in the layouts described above.

The (WIP) script that creates the "battery 2" and collects the data is defined [here](https://github.com/mtulio/openshift-cluster-benchmark-lab/blob/init/run-test.sh#L250-L271)

References:
 - [FIO doc](https://fio.readthedocs.io/en/latest/fio_doc.html)
 - This report (notebook): reports/fio-ebs_gp3-b2.ipynb
 - This report (markdown/exported): docs/examples/fio-ebs_gp3-b2.md

In [None]:
# install dependencies
! pip install pandas matplotlib

In [None]:
import os
import json

import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt

import tarfile
from pprint import pprint

In [None]:
# Notebook-wide settings: the job group under analysis and the directory
# holding its collected result files.
job_group = "b3_loop10"
results_path = f"/results/byGroup-{job_group}"

# Sub-directory of the results tree used as destination for unpacked payloads.
parser_path = f"{results_path}/parser"

In [None]:
results_path

In [None]:
def lookup_result_files(base_path, results=None, start_str="", contains_str="", extension="", ignore_str=None):
    """
    Generic lookup of directory entries based on filter criteria.

    An entry of `base_path` is selected when its name starts with
    `start_str`, ends with `extension`, contains `contains_str` and does not
    contain `ignore_str` (when one is given). Empty-string filters match
    every name. The listing is not recursive.

    Args:
        base_path: directory to list (os.listdir).
        results: optional list to append the matches to; a new list is
            created when omitted.
        start_str: required name prefix.
        contains_str: required substring.
        extension: required name suffix.
        ignore_str: names containing this substring are skipped.

    Returns:
        The list of matching entry names (names only, not full paths).
    """
    if results is None:
        # Build a fresh list per call: a `[]` default would be shared by every
        # call and silently accumulate entries from previous lookups.
        results = []

    for res in os.listdir(base_path):
        # check prefix (use the argument, not a notebook-level global)
        if not res.startswith(start_str):
            continue

        # check extension
        if not res.endswith(extension):
            continue

        # check filter
        if contains_str not in res:
            continue

        # ignore strings
        if ignore_str is not None and ignore_str in res:
            continue

        results.append(res)
    return results

In [None]:
# Custom node alias builder. To get shorter columns =)
# Default subnets assumed for the cluster. `cidr_3o_start` / `cidr_3o_end` are
# the inclusive bounds of the third IP octet covered by each /20 block
# (16 values per subnet), which is what the hostname lookup below matches on.
ocp_default_subnets = [
    {
        "azName": "us-east-1a-public",
        "azId": "use1-az4",
        "cidr": "10.0.0.0/20",
        "cidr_3o_start": 0,
        # 10.0.0.0/20 ends at 10.0.15.255; a larger bound would shadow the
        # other public subnets because the first matching entry wins.
        "cidr_3o_end": 15
    },
    {
        "azName": "us-east-1b-public",
        "azId": "use1-az6",
        "cidr": "10.0.16.0/20",
        "cidr_3o_start": 16,
        "cidr_3o_end": 31
    },
    {
        "azName": "us-east-1c-public",
        # NOTE(review): the 1c private subnet below uses "use1-az2" - one of
        # the two azId values is probably wrong; confirm against the account.
        "azId": "use1-az1",
        "cidr": "10.0.32.0/20",
        "cidr_3o_start": 32,
        "cidr_3o_end": 47
    },
    {
        "azName": "us-east-1a-private",
        "azId": "use1-az4",
        "cidr": "10.0.128.0/20",
        "cidr_3o_start": 128,
        "cidr_3o_end": 143
    },
    {
        "azName": "us-east-1b-private",
        "azId": "use1-az6",
        "cidr": "10.0.144.0/20",
        "cidr_3o_start": 144,
        "cidr_3o_end": 159
    },
    {
        "azName": "us-east-1c-private",
        "azId": "use1-az2",
        "cidr": "10.0.160.0/20",
        "cidr_3o_start": 160,
        "cidr_3o_end": 175
    }
]

def locate_azId_by_hostname(hostname):
    """
    Assume AzId by hostname. OCP, by default, will deploy first on AzName=a and so on, with standard cidr,
    so discovering it in us-east-1 in a standard IPI is easy.

    Args:
        hostname: node name in the form 'ip-A-B-C-D.ec2.internal'.

    Returns:
        Tuple (azId, azName, netIp) where netIp is '{3rd octet}-{4th octet}'.
        When no subnet in `ocp_default_subnets` covers the third octet the
        tuple is ('AzNotFound', 'NA', netIp).

    Raises:
        IndexError / ValueError: when the hostname does not follow the
        expected 'ip-A-B-C-D' layout.
    """
    hostname_ip = hostname.split('ip-')[1].split('.ec2.internal')[0]
    octets = hostname_ip.split('-')
    hostname_ip3o = int(octets[2])
    hostname_netIp = f"{hostname_ip3o}-{int(octets[3])}"
    for net in ocp_default_subnets:
        if net['cidr_3o_start'] <= hostname_ip3o <= net['cidr_3o_end']:
            return (net['azId'], net['azName'], hostname_netIp)

    return ('AzNotFound', 'NA', hostname_netIp)


def find_node_alias_by_hostname(hostname="", add_prefix="", add_suffix="", fmt="azId"):
    """
    Build a short alias for a node, based on its hostname.

    Hostnames that start with 'ip-' are resolved through
    locate_azId_by_hostname(); any other hostname is returned untouched
    (prefix and suffix are not applied in that case).

    Args:
        hostname: node hostname.
        add_prefix: text prepended to the alias.
        add_suffix: text appended to the alias.
        fmt: alias layout, one of:
            "azId" (default, also used for unknown values) -> '{azId}'
            "azId_ipNet"      -> '{azId}_{ip3o}-{ip4o}'
            "azIdShort_ipNet" -> '{azIdShort}_{ip3o}-{ip4o}'
            "ipNet_azIdShort" -> '{ip3o}-{ip4o}_{azIdShort}'
            where azIdShort is the part of azId after the '-'.

    Returns:
        The alias string.
    """
    # assuming all AWS node hostname starts with 'ip-...'
    if not hostname.startswith('ip-'):
        # default: no transformations
        return hostname

    azId, azName, netIp = locate_azId_by_hostname(hostname)
    # Take the last '-' separated piece: same result as index 1 for ids such
    # as 'use1-az4', but does not raise IndexError for the 'AzNotFound'
    # fallback, which has no '-'.
    azIdShort = azId.split('-')[-1]

    if fmt == "azId_ipNet":
        alias = f"{azId}_{netIp}"
    elif fmt == "azIdShort_ipNet":
        alias = f"{azIdShort}_{netIp}"
    elif fmt == "ipNet_azIdShort":
        alias = f"{netIp}_{azIdShort}"
    else:  # "azId"
        alias = f"{azId}"

    return f"{add_prefix}{alias}{add_suffix}"

In [None]:
class Node(object):
    """A cluster node together with the metric samples collected for it."""

    # (stored key, keyword argument it is read from), in storage order
    _METRIC_FIELDS = (
        ("job_name", "job_name"),
        ("job_group", "job_group"),
        ("task_name", "task_name"),
        ("task_group", "task_group"),
        ("task_execId", "execId"),
        ("timestamp", "timestamp"),
        ("metric", "metric"),
        ("value", "value"),
    )

    def __init__(self, name, cluster):
        # identity of the node
        self.node_name = name
        self.cluster = cluster

        # start empty; assigned by other code after construction
        self.node_alias = ""
        self.cluster_full = ""

        # list of metric records (dicts) appended by add_metric()
        self.metrics = []

    def add_metric(self, **kwargs):
        """
        Append one metric record to `self.metrics`.

        Required keyword arguments: job_name, job_group, task_name,
        task_group, execId (stored as 'task_execId'), timestamp, metric and
        value. A missing one raises KeyError; extra ones are ignored.
        """
        record = {key: kwargs[source] for key, source in self._METRIC_FIELDS}
        self.metrics.append(record)


class Nodes(object):
    """Registry of Node objects, keyed by node name."""

    def __init__(self):
        self.nodes = {}

    def add_node(self, node, cluster):
        """
        Register `node` (a hostname) for `cluster` when it is not known yet.

        A new entry gets its alias from find_node_alias_by_hostname() with
        the cluster name as prefix. Already registered nodes are left as-is.
        """
        if node in self.nodes:
            return

        entry = Node(node, cluster)
        self.nodes[node] = entry
        print(f"Node [{node}] added")
        # default alias format; pass fmt="azIdShort_ipNet" for the variant
        # that also carries the IP suffix
        entry.node_alias = find_node_alias_by_hostname(hostname=node, add_prefix=f"{cluster}_")

    def get_node(self, node):
        """Return the registered Node for `node`; KeyError when unknown."""
        return self.nodes[node]

In [None]:
def parser_results_fio_runtime(node, data_path, job_info):
    """
    Parse a FIO runtime (stdout) log and record one 'load1' metric per sample.

    Two kinds of lines are understood; everything else is skipped.

    Header line, which sets the task name for the samples that follow and,
    the first time, the node's `cluster_full`:
    #cluster=c1gp2x1> Running task [fio_psync_randwrite] on node [ip-10-0-142-138.ec2.internal], registering on log file ./.local/results/byGroup-b3_loop1/fio_stdout-c1-ip-10-0-142-138.ec2.internal.txt

    Sample line, '[execId] <=> host <=> date <=> uptime output':
    [0] <=> ip-10-0-142-138 <=> Thu Sep  9 13:51:16 UTC 2021 <=>  13:51:16 up 32 min,  0 users,  load average: 1.27, 0.83, 1.21
    [1] <=> ip-10-0-142-138 <=> Thu Sep  9 14:03:16 UTC 2021 <=>  14:03:16 up 44 min,  0 users,  load average: 0.87, 2.34, 2.93

    Args:
        node: object exposing `cluster_full` and `add_metric(**kwargs)`.
        data_path: path of the log file.
        job_info: tuple (job_name, job_group).
    """
    job_name, job_group = job_info
    field_sep = ' <=> '
    running_task = ""

    with open(data_path) as log:
        for entry in log:
            if 'Running task [' in entry:
                # header: [...] Running task [fio_psync_randwrite] [...]
                running_task = entry.split('Running task [')[1].split(']')[0]
                if node.cluster_full == "":
                    node.cluster_full = entry.split('#cluster=')[1].split('>')[0]
            elif entry.startswith('['):
                # sample: execution id, timestamp and the first load average
                fields = entry.split(field_sep)
                exec_id = fields[0].replace('[', '').replace(']', '')
                load_first = fields[3].split('load average: ')[1].split(',')[0]
                node.add_metric(
                    job_name=job_name,
                    job_group=job_group,
                    task_name=running_task,
                    task_group="fio_runtime",
                    execId=exec_id,
                    timestamp=fields[2],
                    metric='load1',
                    value=load_first,
                )

## FIO Payload (Sample)

Payload sample to build the metric parser fn()

> From task result `fio_psync_randwrite`


```python
{'disk_util': [{'in_queue': 3112979,
                'name': 'nvme1n1',
                'read_ios': 3,
                'read_merges': 0,
                'read_ticks': 2,
                'util': 99.986104,
                'write_ios': 570382,
                'write_merges': 1005,
                'write_ticks': 3112977}],
 'fio version': 'fio-3.6',
 'global options': {'bs': '16k',
                    'direct': '1',
                    'directory': '/var/lib/etcd/_benchmark',
                    'ioengine': 'psync',
                    'numjobs': '16',
                    'runtime': '180',
                    'rw': 'randwrite',
                    'size': '1G'},
 'jobs': [{'ctx': 532537,
           'elapsed': 181,
           'error': 0,
           'eta': 0,
           'groupid': 0,
           'iodepth_level': {'1': 100.0,
                             '16': 0.0,
                             '2': 0.0,
                             '32': 0.0,
                             '4': 0.0,
                             '8': 0.0,
                             '>=64': 0.0},
           'job options': {'name': 'fio_io_1'},
           'jobname': 'fio_io_1',
           'latency_depth': 1,
           'latency_ms': {'10': 98.602706,
                          '100': 0.0,
                          '1000': 0.0,
                          '2': 0.702802,
                          '20': 0.390593,
                          '2000': 0.0,
                          '250': 0.0,
                          '4': 0.23666,
                          '50': 0.01,
                          '500': 0.0,
                          '750': 0.0,
                          '>=2000': 0.0},
           'latency_ns': {'10': 0.0,
                          '100': 0.0,
                          '1000': 0.0,
                          '2': 0.0,
                          '20': 0.0,
                          '250': 0.0,
                          '4': 0.0,
                          '50': 0.0,
                          '500': 0.0,
                          '750': 0.0},
           'latency_percentile': 100.0,
           'latency_target': 0,
           'latency_us': {'10': 0.0,
                          '100': 0.0,
                          '1000': 0.064028,
                          '2': 0.0,
                          '20': 0.0,
                          '250': 0.0,
                          '4': 0.0,
                          '50': 0.0,
                          '500': 0.0,
                          '750': 0.01},
           'latency_window': 0,
           'majf': 0,
           'minf': 143,
           'read': {'bw': 0,
                    'bw_agg': 0.0,
                    'bw_bytes': 0,
                    'bw_dev': 0.0,
                    'bw_max': 0,
                    'bw_mean': 0.0,
                    'bw_min': 0,
                    'bw_samples': 0,
                    'clat_ns': {'max': 0,
                                'mean': 0.0,
                                'min': 0,
                                'percentile': {'1.000000': 0,
                                               '10.000000': 0,
                                               '20.000000': 0,
                                               '30.000000': 0,
                                               '40.000000': 0,
                                               '5.000000': 0,
                                               '50.000000': 0,
                                               '60.000000': 0,
                                               '70.000000': 0,
                                               '80.000000': 0,
                                               '90.000000': 0,
                                               '95.000000': 0,
                                               '99.000000': 0,
                                               '99.500000': 0,
                                               '99.900000': 0,
                                               '99.950000': 0,
                                               '99.990000': 0},
                                'stddev': 0.0},
                    'drop_ios': 0,
                    'io_bytes': 0,
                    'io_kbytes': 0,
                    'iops': 0.0,
                    'iops_max': 0,
                    'iops_mean': 0.0,
                    'iops_min': 0,
                    'iops_samples': 0,
                    'iops_stddev': 0.0,
                    'lat_ns': {'max': 0, 'mean': 0.0, 'min': 0, 'stddev': 0.0},
                    'runtime': 0,
                    'short_ios': 0,
                    'slat_ns': {'max': 0, 'mean': 0.0, 'min': 0, 'stddev': 0.0},
                    'total_ios': 0},
           'sync': {'lat_ns': {'max': 0,
                               'mean': 0.0,
                               'min': 0,
                               'percentile': {'1.000000': 0,
                                              '10.000000': 0,
                                              '20.000000': 0,
                                              '30.000000': 0,
                                              '40.000000': 0,
                                              '5.000000': 0,
                                              '50.000000': 0,
                                              '60.000000': 0,
                                              '70.000000': 0,
                                              '80.000000': 0,
                                              '90.000000': 0,
                                              '95.000000': 0,
                                              '99.000000': 0,
                                              '99.500000': 0,
                                              '99.900000': 0,
                                              '99.950000': 0,
                                              '99.990000': 0},
                               'stddev': 0.0},
                    'total_ios': 0},
           'sys_cpu': 0.408606,
           'trim': {'bw': 0,
                    'bw_agg': 0.0,
                    'bw_bytes': 0,
                    'bw_dev': 0.0,
                    'bw_max': 0,
                    'bw_mean': 0.0,
                    'bw_min': 0,
                    'bw_samples': 0,
                    'clat_ns': {'max': 0,
                                'mean': 0.0,
                                'min': 0,
                                'percentile': {'1.000000': 0,
                                               '10.000000': 0,
                                               '20.000000': 0,
                                               '30.000000': 0,
                                               '40.000000': 0,
                                               '5.000000': 0,
                                               '50.000000': 0,
                                               '60.000000': 0,
                                               '70.000000': 0,
                                               '80.000000': 0,
                                               '90.000000': 0,
                                               '95.000000': 0,
                                               '99.000000': 0,
                                               '99.500000': 0,
                                               '99.900000': 0,
                                               '99.950000': 0,
                                               '99.990000': 0},
                                'stddev': 0.0},
                    'drop_ios': 0,
                    'io_bytes': 0,
                    'io_kbytes': 0,
                    'iops': 0.0,
                    'iops_max': 0,
                    'iops_mean': 0.0,
                    'iops_min': 0,
                    'iops_samples': 0,
                    'iops_stddev': 0.0,
                    'lat_ns': {'max': 0, 'mean': 0.0, 'min': 0, 'stddev': 0.0},
                    'runtime': 0,
                    'short_ios': 0,
                    'slat_ns': {'max': 0, 'mean': 0.0, 'min': 0, 'stddev': 0.0},
                    'total_ios': 0},
           'usr_cpu': 0.086839,
           'write': {'bw': 47061,
                     'bw_agg': 6.249132,
                     'bw_bytes': 48190558,
                     'bw_dev': 339.839869,
                     'bw_max': 9184,
                     'bw_mean': 2940.903786,
                     'bw_min': 1760,
                     'bw_samples': 5758,
                     'clat_ns': {'max': 48851113,
                                 'mean': 5437348.329637,
                                 'min': 665825,
                                 'percentile': {'1.000000': 3981312,
                                                '10.000000': 5079040,
                                                '20.000000': 5210112,
                                                '30.000000': 5275648,
                                                '40.000000': 5341184,
                                                '5.000000': 4947968,
                                                '50.000000': 5341184,
                                                '60.000000': 5406720,
                                                '70.000000': 5472256,
                                                '80.000000': 5603328,
                                                '90.000000': 5865472,
                                                '95.000000': 6193152,
                                                '99.000000': 7307264,
                                                '99.500000': 9895936,
                                                '99.900000': 12386304,
                                                '99.950000': 12910592,
                                                '99.990000': 14745600},
                                 'stddev': 693740.770945},
                     'drop_ios': 0,
                     'io_bytes': 8674541568,
                     'io_kbytes': 8471232,
                     'iops': 2941.318297,
                     'iops_max': 574,
                     'iops_mean': 183.771101,
                     'iops_min': 110,
                     'iops_samples': 5758,
                     'iops_stddev': 21.243864,
                     'lat_ns': {'max': 48851490,
                                'mean': 5438129.180468,
                                'min': 667004,
                                'stddev': 693735.52613},
                     'runtime': 180005,
                     'short_ios': 0,
                     'slat_ns': {'max': 0,
                                 'mean': 0.0,
                                 'min': 0,
                                 'stddev': 0.0},
                     'total_ios': 529452}}],
 'time': 'Thu Sep  9 12:29:38 2021',
 'timestamp': 1631190578,
 'timestamp_ms': 1631190578550}
```

In [None]:
def _fio_ns_to_ms(value_ns):
    """Convert a FIO nanosecond value to milliseconds (float)."""
    return float(value_ns) / 1e+6


def _fio_latency_values(op, lat_stats, dist_stats, dist_name):
    """
    Build the latency entries (in ms) of one operation.

    min/max/mean come from `lat_stats`; the p99/p99.9/p99.99 percentiles and
    the stddev come from `dist_stats` and are keyed with `dist_name`
    ('clat' or 'lat').
    """
    percentiles = dist_stats['percentile']
    return {
        f"{op}_lat_ms_min": _fio_ns_to_ms(lat_stats['min']),
        f"{op}_lat_ms_max": _fio_ns_to_ms(lat_stats['max']),
        f"{op}_lat_ms_mean": _fio_ns_to_ms(lat_stats['mean']),
        f"{op}_{dist_name}_ms_p99": _fio_ns_to_ms(percentiles['99.000000']),
        f"{op}_{dist_name}_ms_p99.9": _fio_ns_to_ms(percentiles['99.900000']),
        f"{op}_{dist_name}_ms_p99.99": _fio_ns_to_ms(percentiles['99.990000']),
        f"{op}_{dist_name}_ms_stddev": _fio_ns_to_ms(dist_stats['stddev']),
    }


def parser_results_fio_tasks(node, data_path, job_info):
    """
    FIO payload parser.
    Walk through the fio result dir and load the JSON files with FIO results,
    keeping only the desired metrics of each test.

    For every '*.json' file found under `data_path` one 'collection' metric
    is added to `node`. Its value is a dict with two sections:
      - "global": disk read/write IO counters (first 'disk_util' entry),
        the fio global options and the job name;
      - "values": elapsed time, latency buckets, bandwidth / IOPS / latency
        (converted from ns to ms) for read, write and sync, and CPU usage.
    Only the first entry of 'jobs' and of 'disk_util' is read.

    Args:
        node: object exposing `add_metric(**kwargs)`.
        data_path: directory holding the unpacked FIO results.
        job_info: tuple (job_name, job_group, task_name).

    Raises:
        IndexError: when the job name does not contain 'fio_io_'.
        KeyError: when an expected field is missing from the payload.
    """
    job_name, job_group, task_name = job_info
    task_group = "fio_tasks"

    for root, dirs, files in os.walk(data_path):
        for file in files:
            if not file.endswith(".json"):
                continue

            # read the whole payload first so the file is closed before parsing
            with open(os.path.join(root, file), 'r') as f:
                res_payload = json.load(f)

            job = res_payload['jobs'][0]
            disk = res_payload['disk_util'][0]
            options = res_payload['global options']

            # Extract jobId from the job name (latest standard is fio_io_{id})
            jobId = job['jobname'].split('fio_io_')[1]

            print(job_group, job_name, task_name, task_group, jobId)

            values = {
                "elapsed": job['elapsed'],
                "latency_ms": job['latency_ms'],
            }
            for op in ("read", "write"):
                stats = job[op]
                values[f"{op}_bw"] = stats['bw']
                values[f"{op}_iops"] = stats['iops']
                values[f"{op}_total_ios"] = stats['total_ios']
                # percentiles of read/write come from the completion latency
                values.update(_fio_latency_values(op, stats['lat_ns'], stats['clat_ns'], "clat"))

            sync_stats = job['sync']
            values["sync_total_ios"] = sync_stats['total_ios']
            # sync only reports 'lat_ns', which also carries the percentiles
            values.update(_fio_latency_values("sync", sync_stats['lat_ns'], sync_stats['lat_ns'], "lat"))

            values["cpu_sys"] = job['sys_cpu']
            values["cpu_usr"] = job['usr_cpu']
            values["cpu_ctx"] = job['ctx']

            metrics_collection = {
                "global": {
                    "read_ios": disk['read_ios'],
                    # was read from 'read_ios' by mistake
                    "write_ios": disk['write_ios'],
                    "bs": options['bs'],
                    "ioengine": options['ioengine'],
                    "numjobs": options['numjobs'],
                    "runtime": options['runtime'],
                    "rw": options['rw'],
                    "size": options['size'],
                    "jobname": job['jobname'],
                },
                "values": values,
            }
            node.add_metric(job_name=job_name,
                            job_group=job_group,
                            task_name=task_name,
                            task_group=task_group,
                            execId=jobId,
                            timestamp=res_payload['timestamp'],
                            metric='collection',
                            value=metrics_collection)

In [None]:
def aggregate_metric_collection(data, metric_name, is_collection=True):
    """
    Filter desired {metric_name}, extract the jobs (rows) for each node alias
    (columns), and return the data frame.
    job_Id              | {alias1}    | [...aliasN |]
    {task}#{execId}     | metricValue | [...metricValue |]

    A row is created for every '{task_name}#{task_execId}' pair seen in the
    metrics, even when no value matches `metric_name` (its cells stay empty).

    Args:
        data: object exposing `nodes`, a dict of objects with `node_alias`
            and `metrics` (list of metric records).
        metric_name: for 'collection' records, the key looked up in
            record['value']['values']; for other records, the record's
            'metric' name to match.
        is_collection: when False, 'collection' records are not unpacked and
            are matched by name like any other record.

    Returns:
        pandas.DataFrame with 'job_Id' as first column followed by the node
        aliases sorted by name. With no metrics at all, an empty frame with
        only the 'job_Id' column.
    """
    # local import: wrap the JSON text so read_json gets a file-like object;
    # passing a literal JSON string is deprecated in recent pandas releases
    from io import StringIO

    data_metric = {}
    for node in data.nodes.values():
        for metric in node.metrics:
            job_id = f"{metric['task_name']}#{metric['task_execId']}"
            jid = data_metric.setdefault(job_id, {"job_Id": job_id})

            if not is_collection or metric['metric'] != "collection":
                # plain metric: keep it only when it is the requested one
                if metric['metric'] == metric_name:
                    jid[node.node_alias] = metric['value']
                continue

            # collection metric: pick the requested entry of its values
            jid[node.node_alias] = metric['value']['values'][metric_name]

    data_pd = list(data_metric.values())
    if not data_pd:
        # nothing collected: there is no 'job_Id' column to reorder
        return pd.DataFrame(columns=['job_Id'])

    # create data frame and force job_Id as first column
    df = pd.read_json(StringIO(json.dumps(data_pd)))
    columns = df.columns.drop('job_Id')
    return df.reindex(['job_Id'] + sorted(columns), axis=1)

In [None]:
def _df_style_high(val, value_yellow=None, value_red=None, value_greenS=None, value_greenH=None, invert=False):
    "Data frame styling / cell formating"
    color_map = {
        "green_soft": "#DAF7A6",
        "green_hard": "#02FC11",
        "red_hard": "#FC5A5A",
        "yellow_hard": "#E6ED02",
    }
    color = None

    # ignore 0 values
    if (invert) and (val == 0.0):
        return color
    
    # yellow (high)
    if ((value_yellow != None) and not(invert)) and (val >=  value_yellow):
        color = color_map["yellow_hard"]
    if ((value_yellow != None) and (invert)) and (val <=  value_yellow):
        color = color_map["yellow_hard"]
    
    # red (very high)
    if ((value_red != None) and not(invert))  and (val >=  value_red):
        color = color_map["red_hard"]
    if ((value_red != None) and (invert)) and (val <=  value_red):
        color = color_map["red_hard"]

    # blue (low)
    if ((value_greenS != None) and not(invert))  and (val <=  value_greenS):
        color = color_map["green_soft"]
    if ((value_greenS != None) and (invert)) and (val >=  value_greenS):
        color = color_map["green_soft"]

    # green (very low)
    if ((value_greenH != None) and not(invert))  and (val <=  value_greenH):
        color = color_map["green_hard"]
    if ((value_greenH != None) and (invert)) and (val >=  value_greenH):
        color = color_map["green_hard"]
        
    # default color
    if color == None:
        return color
   
    #return f"color: {color}"
    return f"background-color: {color}"

## Discovery and Load results for 'fio'

In [None]:
# Globals for the discovery/load step.

# Prefix filter for result file names; empty means "no filtering".
# (e.g. a battery id such as "b2")
filter_results_by_battery = ""

# Placeholder for runtime data (custom stdout collected while the FIO jobs
# were running)
fio_runtime = {}

# Names of the FIO runtime log files; filled by the lookup cell
result_fio_runtime_files = []

# Registry holding every node found in the results
nodes = Nodes()

In [None]:
# Find the FIO runtime logs ("fio_stdout" text files) in the results directory
# and show how many were found.
result_fio_runtime_files = lookup_result_files(
    results_path,
    results=result_fio_runtime_files,
    start_str=filter_results_by_battery,
    contains_str="fio_stdout",
    extension=".txt",
)
len(result_fio_runtime_files)

In [None]:
result_fio_runtime_files

In [None]:
# Build metrics from FIO Runtime (stdout parser).
# File names are split as "{task_name}-{job_name}-{node_name}.txt".
for res in result_fio_runtime_files:
    name_parts = res.split('-')
    task_name = f"{name_parts[0]}"
    job_name = f"{name_parts[1]}"
    # the node name is whatever follows "{job_name}-", minus the extension
    node_name = f"{res.split(job_name + '-')[1].split('.txt')[0]}"

    # make sure the node is registered, then feed it the parsed log
    nodes.add_node(node_name, job_name)
    parser_results_fio_runtime(
        nodes.get_node(node_name),
        f"{results_path}/{res}",
        job_info=(job_name, job_group),
    )

In [None]:
nodes.nodes['ip-10-0-166-6.ec2.internal'].node_alias

In [None]:
#nodes.nodes['ip-10-0-166-6.ec2.internal'].metrics

In [None]:
# FIO raw payload: one tar.gz archive per task/cluster/node. The archive names
# are split by the unpack loop as "{task_name}-{job_name}-{node_name}.tar.gz".
# Collect the archive names here and display them.
results_dirs_fio = []
results_dirs_fio = lookup_result_files(
    results_path,
    results=results_dirs_fio,
    start_str="fio_",
    contains_str="fio_",
    extension="tar.gz",
    ignore_str=".txt",
)
results_dirs_fio

In [None]:

# Unpack every FIO result archive and load its metrics into the node registry.
# Archive names are split as "{task_name}-{job_name}-{node_name}.tar.gz".
for res in results_dirs_fio:

    task_name = f"{res.split('-')[0]}"
    job_name = f"{res.split('-')[1]}"
    node_name = f"{res.split(job_name+'-')[1].split('.tar.gz')[0]}"

    nodes.add_node(node_name, job_name)

    # create parser result dir and unpack it
    dest_path_res = f"{parser_path}/{res.split('.tar.gz')[0]}"

    try:
        # Create the destination from Python: with `!mkdir -p f"{...}"` the
        # shell receives the literal `f` glued to the expanded path and
        # creates a stray relative `f/...` directory instead.
        # The results directory must be writable by the notebook user.
        os.makedirs(dest_path_res, exist_ok=True)
        if res.endswith('tar.gz'):
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive; only use it on archives produced by the benchmark run.
            with tarfile.open(f"{results_path}/{res}") as tar:
                tar.extractall(path=dest_path_res)
    except Exception:
        # when the file is not found, or corrupted. Add empty metric
        nodes.get_node(node_name).add_metric(
            job_name=job_name,
            job_group=job_group,
            task_name=task_name,
            task_group="fio_tasks",
            execId='',
            timestamp='',
            metric='empty',
            value=''
        )
        print(job_group, job_name, task_name, "fio_tasks", node_name)
        print(f"ERR, dataset not found or corrupted [{res}]; Empty metric added")
        continue

    # parser
    parser_results_fio_tasks(nodes.get_node(node_name), f"{dest_path_res}", job_info=(job_name, job_group, task_name))

In [None]:
#nodes.nodes[node_name].metrics

In [None]:
#results_fio

## Results for 'fio'

As described, the tests were run on 4 clusters covering two disk layouts (single disk, etcd isolated) using gp2 and gp3. The volumes have the same capacity, using standard values for IOPS and throughput (gp3).

- Total of FIO consecutive tests: 50
- Max IOPS across all jobs: ~1.5k/2k IOPS
- Max IOPS for gp2 device: 386 (capacity=128GiB, throughput*=128 MiB/s)
- Max IOPS for gp3 device: 3000 (capacity=128GiB, throughput=120MiB/s) 

\*[Important note from AWS doc](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ebs-volume-types.html): 
*"The throughput limit is between 128 MiB/s and 250 MiB/s, depending on the volume size. Volumes smaller than or equal to 170 GiB deliver a maximum throughput of 128 MiB/s. Volumes larger than 170 GiB but smaller than 334 GiB deliver a maximum throughput of 250 MiB/s if burst credits are available. Volumes larger than or equal to 334 GiB deliver 250 MiB/s regardless of burst credits. gp2 volumes that were created before December 3, 2018 and that have not been modified since creation might not reach full performance unless you modify the volume."*

____
____
**FIO sync latency p99 in ms (sync_lat_p99_ms)**

Summary of results:
- after the 32nd job the gp2 disks had consumed all burst credits (running higher than max [380 IOPS]) and became slow (5/6x) due to throttling
- the cluster with etcd on a second disk using gp2 was more reliable for a longer period, compared with the single-disk nodes
- gp3 stayed below its max and remained stable until the end of all tests
- gp3 under normal conditions had higher latency than gp2
- Trade-off between reliability (under long, intensive IOPS) and performance (in normal operation)


## Results (Latency)

In [None]:
#nodes.nodes.keys()

In [None]:
#nodes.nodes['ip-10-0-140-177.ec2.internal'].metrics

In [None]:
# Mean latency (ms): one styled table per operation.
# Every operation uses the same "lat_" infix in its metric name.
oper_lat_prefix = dict.fromkeys(("write", "read", "sync"), "lat_")
for op, prefix in oper_lat_prefix.items():
    metric = f"{op}_{prefix}ms_mean"
    title = f"metric ({metric}) by Node(all)"

    df = aggregate_metric_collection(nodes, metric)
    # Style every column except the job identifier.
    df_columns = df.columns.drop('job_Id')

    print(f">> {title}")
    # value_yellow / value_red are forwarded to _df_style_high (defined
    # elsewhere in the notebook) -- presumably highlight thresholds.
    styled = df.style.applymap(
        _df_style_high,
        subset=df_columns,
        value_yellow=5.4,
        value_red=6,
    )
    display(styled)

In [None]:
# Max latency (ms): one styled table per operation, all using the "lat_" infix.
oper_lat_prefix = {name: "lat_" for name in ("write", "read", "sync")}
for op in oper_lat_prefix:
    metric = "{}_{}ms_max".format(op, oper_lat_prefix[op])
    title = f"metric ({metric}) by Node(all)"

    df = aggregate_metric_collection(nodes, metric)
    # The job identifier column is excluded from the styling.
    df_columns = df.columns.drop('job_Id')

    print(f">> {title}")
    # Thresholds are handed to _df_style_high as-is (semantics defined there).
    thresholds = {"value_yellow": 20, "value_red": 50}
    display(df.style.applymap(_df_style_high, subset=df_columns, **thresholds))

## Results (Percentile)

In [None]:
# p99 latency (ms) per operation. The metric names for write/read carry the
# "clat_" infix, while sync carries "lat_".
oper_lat_prefix = dict(write="clat_", read="clat_", sync="lat_")
for op, prefix in oper_lat_prefix.items():
    metric = f"{op}_{prefix}ms_p99"
    title = f"metric ({metric}) by Node(all)"

    df = aggregate_metric_collection(nodes, metric)
    # Apply the styling to all columns but 'job_Id'.
    df_columns = df.columns.drop('job_Id')

    print(f">> {title}")
    styler = df.style.applymap(
        _df_style_high,
        subset=df_columns,
        value_yellow=5,
        value_red=10.0,
    )
    display(styler)

In [None]:
# p99.9 latency (ms) per operation: "clat_" infix for write/read, "lat_" for sync.
oper_lat_prefix = dict(zip(("write", "read", "sync"), ("clat_", "clat_", "lat_")))
for op in oper_lat_prefix:
    metric = op + "_" + oper_lat_prefix[op] + "ms_p99.9"
    title = f"metric ({metric}) by Node(all)"

    df = aggregate_metric_collection(nodes, metric)
    # 'job_Id' identifies the row and is left unstyled.
    df_columns = df.columns.drop('job_Id')

    print(f">> {title}")
    # Threshold values are passed through to _df_style_high unchanged.
    style_args = dict(subset=df_columns, value_yellow=10, value_red=20.0)
    display(df.style.applymap(_df_style_high, **style_args))

In [None]:
# Latency standard deviation (ms) per operation.
# write/read metric names use "clat_", sync uses "lat_".
oper_lat_prefix = {}
oper_lat_prefix["write"] = "clat_"
oper_lat_prefix["read"] = "clat_"
oper_lat_prefix["sync"] = "lat_"
for op, prefix in oper_lat_prefix.items():
    metric = f"{op}_{prefix}ms_stddev"
    title = f"metric ({metric}) by Node(all)"

    df = aggregate_metric_collection(nodes, metric)
    # Everything except the job identifier gets styled.
    df_columns = df.columns.drop('job_Id')

    print(f">> {title}")
    styled = df.style.applymap(
        _df_style_high,
        subset=df_columns,
        value_yellow=1,
        value_red=2,
    )
    display(styled)

## Results (totals)

In [None]:
# IOPS per operation (write and read only). The infix is empty here, so the
# metric names resolve to "write_iops" and "read_iops".
oper_lat_prefix = dict.fromkeys(("write", "read"), "")
for op, prefix in oper_lat_prefix.items():
    metric = f"{op}_{prefix}iops"
    title = f"metric ({metric}) by Node(all)"

    df = aggregate_metric_collection(nodes, metric)
    # Leave the job identifier column out of the styling.
    df_columns = df.columns.drop('job_Id')

    print(f">> {title}")
    # invert=True is forwarded to _df_style_high -- presumably "lower is
    # worse" for this metric; confirm against its definition.
    styled = df.style.applymap(
        _df_style_high,
        subset=df_columns,
        value_yellow=2000,
        value_red=1000,
        value_greenH=2950,
        value_greenS=2900,
        invert=True,
    )
    display(styled)

In [None]:
# Total I/Os per operation. With the empty infix the metric names are
# "write_total_ios", "read_total_ios" and "sync_total_ios".
oper_lat_prefix = {name: "" for name in ("write", "read", "sync")}
for op in oper_lat_prefix:
    metric = "{}_{}total_ios".format(op, oper_lat_prefix[op])
    title = f"metric ({metric}) by Node(all)"

    df = aggregate_metric_collection(nodes, metric)
    # Style all columns but the job identifier.
    df_columns = df.columns.drop('job_Id')

    print(f">> {title}")
    # All thresholds (and invert=True) are handed to _df_style_high as-is.
    thresholds = dict(
        value_yellow=200000,
        value_red=100000,
        value_greenS=500000,
        value_greenH=530000,
        invert=True,
    )
    display(df.style.applymap(_df_style_high, subset=df_columns, **thresholds))

In [None]:
# Bandwidth per operation (write and read only); the empty infix yields the
# metric names "write_bw" and "read_bw".
oper_lat_prefix = dict(write="", read="")
for op, prefix in oper_lat_prefix.items():
    metric = op + "_" + prefix + "bw"
    title = f"metric ({metric}) by Node(all)"

    df = aggregate_metric_collection(nodes, metric)
    # The job identifier column is not styled.
    df_columns = df.columns.drop('job_Id')

    print(f">> {title}")
    styler = df.style.applymap(
        _df_style_high,
        subset=df_columns,
        value_yellow=20000,
        value_red=10000,
        value_greenS=40000,
        value_greenH=47500,
        invert=True,
    )
    display(styler)

In [None]:
# Table for the "cpu_ctx" metric across all nodes.
metric = "cpu_ctx"
title = "metric ({}) by Node(all)".format(metric)

df = aggregate_metric_collection(nodes, metric)
# Style every column except the job identifier.
df_columns = df.columns.drop('job_Id')

print(f">> {title}")
# Thresholds are forwarded to _df_style_high (defined elsewhere in the notebook).
styled = df.style.applymap(
    _df_style_high,
    subset=df_columns,
    value_yellow=530000,
    value_red=537500,
)
display(styled)

In [None]:
# Table for the "cpu_sys" metric across all nodes.
metric = "cpu_sys"
title = f"metric ({metric}) by Node(all)"

df = aggregate_metric_collection(nodes, metric)
# 'job_Id' is excluded from the styled columns.
df_columns = df.columns.drop('job_Id')

print(f">> {title}")
# The four threshold values are passed to _df_style_high unchanged.
thresholds = dict(
    value_yellow=0.25,
    value_red=0.50,
    value_greenS=0.1,
    value_greenH=0.04,
)
display(df.style.applymap(_df_style_high, subset=df_columns, **thresholds))

In [None]:
# Table for the "cpu_usr" metric across all nodes.
metric = "cpu_usr"
title = "metric (" + metric + ") by Node(all)"

df = aggregate_metric_collection(nodes, metric)
# Only the non-identifier columns are styled.
df_columns = df.columns.drop('job_Id')

print(f">> {title}")
styler = df.style.applymap(
    _df_style_high,
    subset=df_columns,
    value_yellow=0.1,
    value_red=0.2,
    value_greenS=0.05,
    value_greenH=0.02,
)
display(styler)

In [None]:
# Table for the "load1" metric across all nodes.
metric = "load1"
title = f"metric ({metric}) by Node(all)"

# Unlike the other report cells, this one passes is_collection=False
# NOTE(review): the flag's meaning lives in aggregate_metric_collection -- confirm there.
df = aggregate_metric_collection(nodes, metric, is_collection=False)
# Style all columns except the job identifier.
df_columns = df.columns.drop('job_Id')

print(f">> {title}")
style_args = dict(
    subset=df_columns,
    value_yellow=2,
    value_red=4,
    value_greenS=1,
    value_greenH=0.5,
)
display(df.style.applymap(_df_style_high, **style_args))