# Post-mortem analysis

Direct links : 

* [Ombt-statistics](#Ombt-statistics)
* [Influxdb-Metrics](#Influxdb-Metrics)

## Get ombt statistics
### Preparation

In [None]:
import json
import os
import glob
import pandas
import statistics

In [None]:
# The path to the env dir of the experimental campaign.
# The params file and every per-run backup directory are resolved relative to it.
RESULT_PATH = "./"

In [None]:
# Inserting some ombt code (this could be removed once ombt is used as a library).
# It is used to recover the global stats from the per-agent stats.
# Per-agent stats are output by the controller in a dedicated log file.
import math

class Stats(object):
    """Manage a single statistic.

    Keeps running aggregates (min, max, total, count, sum of squares) and a
    histogram of the recorded values grouped by powers of 10, so that several
    instances can be merged without keeping the individual samples.
    """
    def __init__(self, min=None, max=None, total=0, count=0,
                 sum_of_squares=0, distribution=None):
        self.min = min
        self.max = max
        self.total = total
        self.count = count
        self.sum_of_squares = sum_of_squares
        # distribution of values grouped by powers of 10:
        # {power: [number of values per leading digit 0..9]}
        self.distribution = distribution or dict()

    @classmethod
    def from_dict(cls, values):
        """Build an instance from a mapping shaped like to_dict()'s output.

        The mapping given by the caller is left untouched.
        """
        values = dict(values)
        if 'distribution' in values:
            # hack alert!
            # when a Stats is passed via an RPC call it appears as if the
            # distribution map's keys are converted from int to str.
            # Fix that by re-indexing the distribution map (rows are copied
            # so the new instance does not share them with the input):
            values['distribution'] = dict(
                (int(k), list(row))
                for k, row in values['distribution'].items())
        return cls(**values)

    def to_dict(self):
        """Return the statistic as a plain dict (histogram rows are copied)."""
        new_dict = dict()
        for a in ["min", "max", "total", "count", "sum_of_squares"]:
            new_dict[a] = getattr(self, a)
        new_dict["distribution"] = dict(
            (k, list(row)) for k, row in self.distribution.items())
        return new_dict

    def update(self, value):
        """Record one sample."""
        self.total += value
        self.count += 1
        self.sum_of_squares += value**2
        # Compare against None explicitly: a recorded min/max of 0 is a
        # legitimate value and must not be treated as "not set yet".
        self.min = value if self.min is None else min(self.min, value)
        self.max = value if self.max is None else max(self.max, value)
        log = int(math.log10(value)) if value >= 1.0 else 0
        base = 10**log
        index = int(value/base)  # 0..9
        if log not in self.distribution:
            self.distribution[log] = [0] * 10
        self.distribution[log][index] += 1

    def reset(self):
        """Forget every recorded sample."""
        self.__init__()

    def average(self):
        """Return the mean of the samples, or 0 when there is none."""
        return (self.total / float(self.count)) if self.count else 0

    def std_deviation(self):
        """Return the standard deviation, or -1 when there is no sample."""
        if not self.count:
            return -1
        variance = ((self.sum_of_squares / float(self.count))
                    - (self.average() ** 2))
        # Rounding can make the variance of (nearly) identical samples
        # slightly negative, which math.sqrt rejects.
        return math.sqrt(max(variance, 0.0))

    def merge(self, stats):
        """Fold another Stats into this one; `stats` is not modified."""
        if stats.min is not None:
            self.min = (stats.min if self.min is None
                        else min(self.min, stats.min))
        if stats.max is not None:
            self.max = (stats.max if self.max is None
                        else max(self.max, stats.max))

        self.total += stats.total
        self.count += stats.count
        self.sum_of_squares += stats.sum_of_squares
        for k, row in stats.distribution.items():
            if k in self.distribution:
                self.distribution[k] = [a + b for a, b in
                                        zip(row, self.distribution[k])]
            else:
                # copy the row: sharing it would let a later update() on
                # either object silently change the other one
                self.distribution[k] = list(row)

    def __str__(self):
        def as_int(value):
            # %i cannot format None (no sample recorded yet)
            return "None" if value is None else "%i" % value

        return "min=%s, max=%s, avg=%f, std-dev=%f" % (as_int(self.min),
                                                       as_int(self.max),
                                                       self.average(),
                                                       self.std_deviation())

    def print_distribution(self):
        """Print one line per histogram bucket, by increasing power of 10."""
        for order in sorted(self.distribution):
            row = self.distribution[order]
            # order=0, index=0 is special case as it is < 1.0, for all orders >
            # 0, index 0 is ignored since everything < 10^order is accounted
            # for in index 9 of the (order - 1) row
            first = 0 if order == 0 else 1
            for index in range(first, len(row)):
                print("[%d..<%d):  %d" %
                      ((10 ** int(order)) * index,
                       (10 ** int(order)) * (index + 1),
                       row[index]))

class TestResults(object):
    """Client results of a test run.

    Holds the start/stop timestamps, the latency statistic, the counts of
    successful and failed message transfers and the error messages seen.
    """
    def __init__(self, start_time=None, stop_time=None, latency=None,
                 msgs_ok=0, msgs_fail=0, errors=None):
        super(TestResults, self).__init__()
        self.start_time = start_time
        self.stop_time = stop_time
        self.latency = latency or Stats()
        self.msgs_ok = msgs_ok  # count of successful msg transfers
        self.msgs_fail = msgs_fail  # count of failed msg transfers
        self.errors = errors or dict()  # error msgs and counts

    @classmethod
    def from_dict(cls, values):
        """Build an instance from a mapping shaped like to_dict()'s output.

        Works on copies so the caller's mapping (and its nested latency
        mapping) can be reused, e.g. passed to from_dict() again.
        """
        values = dict(values)
        if 'latency' in values:
            values['latency'] = Stats.from_dict(dict(values['latency']))
        if 'errors' in values:
            values['errors'] = values['errors'].copy()
        return cls(**values)

    def to_dict(self):
        """Return the results as a plain dict."""
        new_dict = dict()
        for a in ['start_time', 'stop_time', 'msgs_ok', 'msgs_fail']:
            new_dict[a] = getattr(self, a)
        new_dict['latency'] = self.latency.to_dict()
        new_dict['errors'] = self.errors.copy()
        return new_dict

    def error(self, reason):
        """Count one more occurrence of the error `reason`."""
        key = str(reason)
        self.errors[key] = self.errors.get(key, 0) + 1

    def reset(self):
        """Drop everything recorded so far."""
        self.__init__()

    def merge(self, results):
        """Fold another TestResults into this one.

        The merged interval spans from the earliest start to the latest stop.
        """
        # None means "not recorded"; a timestamp of 0 is a real value.
        if results.start_time is not None:
            self.start_time = (results.start_time
                               if self.start_time is None
                               else min(self.start_time, results.start_time))
        if results.stop_time is not None:
            self.stop_time = (results.stop_time
                              if self.stop_time is None
                              else max(self.stop_time, results.stop_time))
        self.msgs_ok += results.msgs_ok
        self.msgs_fail += results.msgs_fail
        self.latency.merge(results.latency)
        for err in results.errors:
            self.errors[err] = self.errors.get(err, 0) + results.errors[err]

    def print_results(self):
        """Print a human readable summary of the results."""
        if self.msgs_fail:
            print("Error: %d message transfers failed"
                  % self.msgs_fail)
        if self.errors:
            print("Error: errors detected:")
            for err in self.errors:
                print("  '%s' (occurred %d times)" % (err, self.errors[err]))

        total = self.msgs_ok + self.msgs_fail
        print("Total Messages: %d" % total)

        if self.start_time is None or self.stop_time is None:
            # nothing was timed: there is no interval/throughput to report
            print("Test Interval: unknown (no start/stop time recorded)")
        else:
            delta_time = self.stop_time - self.start_time
            print("Test Interval: %f - %f (%f secs)" % (self.start_time,
                                                        self.stop_time,
                                                        delta_time))

            if delta_time > 0.0:
                print("Aggregate throughput: %f msgs/sec" % (float(total)/delta_time))

        latency = self.latency
        if latency.count:
            print("Latency %d samples (msecs): Average %f StdDev %f"
                  " Min %f Max %f"
                  % (latency.count,
                     latency.average(), latency.std_deviation(),
                     latency.min, latency.max))
            print("Latency Distribution: ")
            latency.print_distribution()


In [None]:
# Some util functions
def load_stats(param):
    """Load the per-agent stats from the controller output file of a run.

    The file is looked up as ``*controller*.log`` in the run's backup
    directory (``param["backup_dir"]``, relative to RESULT_PATH). Its first
    two lines are each parsed as JSON.

    :param param: parameter dict of the run, must contain "backup_dir"
    :returns: a ``(first_line, second_line)`` tuple of decoded JSON values
        (callers unpack it as clients, servers), or False when the file is
        missing, unreadable or malformed.
    """
    try:
        pattern = os.path.join(RESULT_PATH, param["backup_dir"], "*controller*.log")
        # beware of the files _docker.log that would also match
        # and contains the global stats in a human readable format.
        # Only the file name is inspected so that a "docker" component in the
        # directory part of the path cannot disqualify every candidate.
        candidates = sorted(path for path in glob.glob(pattern)
                            if "docker" not in os.path.basename(path))
        with open(candidates[0]) as f:
            first = json.loads(f.readline())
            second = json.loads(f.readline())
        return first, second
    except (KeyError, IndexError, IOError, OSError, ValueError):
        # no backup dir / no matching file / unreadable file / invalid JSON
        return False
    
def build_agg_results(results):
    """Merge per-agent results into a single aggregated result dict.

    :param results: iterable of dicts holding the TestResults constructor
        fields, with "latency" being a dict of Stats constructor fields
    :returns: the dict form of the merged TestResults plus a "rate" entry
        (messages per second over the merged interval). "rate" is None when
        it cannot be computed (no timestamps or a non-positive duration).

    The input dicts are not modified, so the raw results stay
    JSON-serialisable and can be aggregated again.
    """
    agg = TestResults()
    for result in results:
        fields = dict(result)
        latency = dict(fields["latency"])
        if latency.get("distribution"):
            # copy the histogram rows so the aggregate never shares them
            # with the raw per-agent data
            latency["distribution"] = dict(
                (k, list(row)) for k, row in latency["distribution"].items())
        fields["latency"] = Stats(**latency)
        agg.merge(TestResults(**fields))

    total = agg.msgs_ok + agg.msgs_fail
    rate = None
    if agg.start_time is not None and agg.stop_time is not None:
        duration = agg.stop_time - agg.start_time
        if duration > 0:
            rate = float(total) / duration
    aggregated = agg.to_dict()
    aggregated["rate"] = rate
    return aggregated

def build_msgs_stats(results, msg_type):
    """Summarise one message counter over all the agents.

    :param results: iterable of per-agent result dicts (plain dicts, not
        TestResults instances)
    :param msg_type: key of the counter to summarise
    :returns: dict with the "mean", "min" and "max" of that counter
    """
    counts = []
    for entry in results:
        counts.append(entry[msg_type])
    # stdev is currently not reported
    summary = {"mean": statistics.mean(counts)}
    summary["min"] = min(counts)
    summary["max"] = max(counts)
    return summary

def augment(mydict, myparams, in_key, out_key=None):
    """Store in ``mydict`` the list of ``in_key`` values of every param.

    The list is stored under ``out_key``, which falls back to ``in_key``
    when it is not given (or falsy).
    """
    if not out_key:
        out_key = in_key
    column = []
    for entry in myparams:
        column.append(entry[in_key])
    mydict.update({out_key: column})

In [None]:
# Load the params from the params file (params.json under RESULT_PATH).
# `params` stays an empty list if the file cannot be opened.
params = []
with open(os.path.join(RESULT_PATH, "./params.json")) as f:
    params = json.load(f)

In [None]:
# Which parameters to deal with:
# slice `params` in the loop below to test on a subset only.
# Runs whose controller stats cannot be loaded are reported and left out of
# PARAMS, so that every entry of PARAMS carries the "_ombt_*" / "_agg_*" keys
# that the following cells index unconditionally.
PARAMS = []

for param in params[:]:
    stats = load_stats(param)
    if not stats:
        print("Skipping %s: no usable controller stats" % param.get("backup_dir"))
        continue
    clients, servers = stats
    # what has been seen by ombt
    param["_ombt_clients"] = len(clients)
    param["_ombt_servers"] = len(servers)
    param["_ombt_msgs_sent_ok"] = build_msgs_stats(clients.values(), "msgs_ok")
    param["_ombt_msgs_received_ok"] = build_msgs_stats(servers.values(), "msgs_ok")
    param["_ombt_msgs_sent_fail"] = build_msgs_stats(clients.values(), "msgs_fail")
    param["_ombt_msgs_received_fail"] = build_msgs_stats(servers.values(), "msgs_fail")
    #param["_raw_servers_test_result"] = servers
    #param["_raw_clients_test_result"] = clients
    param["_agg_servers"] = build_agg_results(servers.values())
    param["_agg_clients"] = build_agg_results(clients.values())
    PARAMS.append(param)

In [None]:
# Save the parameters, including the statistics computed above, as JSON
# in params_calculated.json (current working directory).
with open("params_calculated.json", "w") as f:
    json.dump(PARAMS, f)

### Getting some stats

In [None]:
# `extraction` maps a column name to the list of its values, one per run of
# PARAMS (it is turned into a DataFrame afterwards).
extraction = {}
to_extract = ["_ombt_clients", "_ombt_servers", "executor", "call_type", "pause", "version"]
for e in to_extract:
    augment(extraction, PARAMS, e)


def _average_latency(agg):
    """Mean latency of an aggregated result dict; 0 when it holds no sample."""
    latency = agg["latency"]
    count = latency["count"]
    return latency["total"] / float(count) if count else 0


# Rate server side
extraction.update({
    "server_rate": [p["_agg_servers"]["rate"] for p in PARAMS]
})

# Number of message processed correctly by all the servers
extraction.update({
    "server_ok": [p["_agg_servers"]["msgs_ok"] for p in PARAMS]
})

# Number of message processed with a failure by all the servers
extraction.update({
    "server_fail": [p["_agg_servers"]["msgs_fail"] for p in PARAMS]
})

# Average latency server side
extraction.update({
    "server_latency": [_average_latency(p["_agg_servers"]) for p in PARAMS]
})

# Average latency client side
# (the clients' total is divided by the clients' own sample count)
extraction.update({
    "client_latency": [_average_latency(p["_agg_clients"]) for p in PARAMS]
})

# Rate client side
extraction.update({
    "client_rate": [p["_agg_clients"]["rate"] for p in PARAMS]
})

# Number of message processed correctly by all the clients
extraction.update({
    "client_ok": [p["_agg_clients"]["msgs_ok"] for p in PARAMS]
})

# Number of message processed with a failure by all the clients
extraction.update({
    "client_fail": [p["_agg_clients"]["msgs_fail"] for p in PARAMS]
})

# Get a sense of what is happening on each client/server
# min, max, avg of the number of message processed correctly by the clients
extraction.update({
    "per_client_ok": [p["_ombt_msgs_sent_ok"] for p in PARAMS]
})

# min, max, avg of the number of message processed with a failure by the clients
extraction.update({
    "per_client_fail": [p["_ombt_msgs_sent_fail"] for p in PARAMS]
})

# min, max, avg of the number of message processed correctly by the servers
extraction.update({
    "per_server_ok": [p["_ombt_msgs_received_ok"] for p in PARAMS]
})

# min, max, avg of the number of message processed with a failure by the servers
extraction.update({
    "per_server_fail": [p["_ombt_msgs_received_fail"] for p in PARAMS]
})

### Ombt statistics

In [None]:
# Tabular view of the extracted values: one column per key of `extraction`,
# one row per run of PARAMS. The bare `df` displays it in the notebook.
df = pandas.DataFrame(extraction)
df 

In [None]:
import matplotlib.pyplot as pyplot

def plot_distribution(params, client_server, index):
    """Draw the aggregated latency histogram of one run as a bar chart.

    Bars are positioned at log10 of the upper bound of their bucket, so the
    x axis is logarithmic; only the first bar of each power of 10 is
    labelled (with that power).

    :param params: list of parameter dicts holding the "_agg_clients" /
        "_agg_servers" aggregated results
    :param client_server: "client" selects the clients' aggregate, any other
        value selects the servers' one
    :param index: position of the run in `params`
    """
    agent = "_agg_clients" if client_server == "client" else "_agg_servers"
    distribution = params[index][agent]["latency"]["distribution"]
    positions = []
    heights = []
    labels = []
    for power, numbers in distribution.items():
        pw = int(power)  # keys are strings once the data went through JSON
        # `digit` is deliberately not named like the accumulator list: a
        # comprehension variable leaks and overwrites it on Python 2.
        positions.extend([math.log10(digit * 10 ** pw) for digit in range(1, 11)])
        labels.extend([10 ** pw] + 9 * [""])
        heights.extend(numbers)

    # negative width + align='edge': each bar ends at its bucket upper bound
    pyplot.bar(positions, heights, tick_label=labels, align='edge', edgecolor='black', width=-0.05)

## Recovering metrics from influxdb


### Preparation

In [None]:
import docker
from influxdb import InfluxDBClient

# Docker client configured from the environment; used further down to start
# (and remove) a temporary influxdb container for each run.
client = docker.from_env()

In [None]:
import shutil
import tarfile
import time
from datetime import datetime
import subprocess

# Fields of the `rabbitmq_overview` series whose maximum is queried for each
# run. The commented-out fields are currently not queried.
RABBITMQ_OVERVIEW = [
        "messages_delivered",
#        "messages_ready",
#        "messages_unacked",
#        "messages_acked",
#        "messages_published",
        "queues",
        "connections",
        "consumers",
        "exchanges"
    ]
# Fields of the `rabbitmq_node` series whose maximum is queried for each run.
RABBITMQ_NODE = [
    "mem_used",
    "fd_used",
    "sockets_used"
]

# For every run: unpack its influxdb backup, serve it from a temporary
# influxdb container, run the queries and store the results in the param
# under "_<serie>" -> {field: value}.
for param in PARAMS:
    # get experimentation boundaries
    # NOTE(review): the start is the later (max) of the client and server
    # start times, not the earlier one -- confirm this is intended.
    start_time = max(param['_agg_clients']['start_time'], param['_agg_servers']['start_time'])
    stop_time = max(param['_agg_clients']['stop_time'], param['_agg_servers']['stop_time'])
    duration = stop_time - start_time
    start_utc = datetime.utcfromtimestamp(start_time)
    stop_utc = datetime.utcfromtimestamp(stop_time)
    print("start=%s, stop=%s" % (start_utc, stop_utc))
    tar = os.path.join(RESULT_PATH, param['backup_dir'], 'influxdb-data.tar.gz')
    # Extract into the current working directory; the `with` block closes
    # the archive as soon as the extraction is done.
    with tarfile.open(tar) as archive:
        archive.extractall()
    # docker run --name influxdb -v $(pwd)/influxdb-data:/var/lib/influxdb -p 8083:8083 -p 8086:8086 -ti influxdb
    # Evaluate the "load" of ombt-server/bus :
    # we take the min of the usage_idle of all host in the groups ombt-server/bus
    QUERIES = []
    for role in ['ombt-server', 'bus']:
        key = "min_usage_idle_%s" % role.replace("-", "_")
        query = "SELECT min(usage_idle) as %s FROM (SELECT mean(usage_idle) as usage_idle from cpu WHERE role='%s' and time>='%s' AND time<='%s' GROUP BY host)" % (key, role, start_utc, stop_utc)
        QUERIES.append([
            "cpu", (key), query
        ])

    # Evaluate some rabbitmq metrics
    # Take the max of each metric over the whole series
    # (NOTE(review): these queries carry no time bounds, unlike the cpu ones)
    # NOTE(msimonin): we could first group the metrics in a 10s interval and then take the max
    for field in RABBITMQ_OVERVIEW:
        QUERIES.append([
            "rabbitmq_overview",
            field,
            "SELECT max(%s) as %s from rabbitmq_overview" % (field, field)
        ])
    for field in RABBITMQ_NODE:
        QUERIES.append([
            "rabbitmq_node",
            field,
            "SELECT max(%s) as %s from rabbitmq_node" % (field, field)
        ])

    print(QUERIES)

    # Stays None if the container could not be started, so that the cleanup
    # below does not hide the original error behind a NameError.
    container = None
    try:
        container = client.containers.run(
            'influxdb:latest',
            detach=True,
            ports={'8086/tcp': 8086, '8083/tcp': 8083},
            volumes={os.path.join(os.getcwd(), 'influxdb-data'): {'bind': '/var/lib/influxdb', 'mode': 'rw'}}
        )
        influx = InfluxDBClient(database='telegraf', timeout=600)
        # TODO(msimonin): make a tcp socket retry test on port 8083
        time.sleep(15)

        for serie, key, query in QUERIES:
            result = influx.query(query)
            print(result)
            result = list(result.get_points())[0]
            key_param = "_%s" % serie
            param.setdefault(key_param, {})
            # setdefault: a value already stored for this key is kept
            param[key_param].setdefault(key, result[key])

    except Exception as e:
        # best effort: report the failure and go on with the next run
        print(e)
    finally:
        if container is not None:
            container.remove(force=True)
        # always drop the extracted data before handling the next run
        subprocess.check_call(["sudo", "rm", "-rf", "influxdb-data"])

In [None]:
# Add one column per metric recovered from influxdb (cpu and rabbitmq).
# QUERIES is the list built by the last iteration of the loop above; its
# (serie, field) pairs are built the same way for every run.
# A run whose influxdb step failed gets None instead of aborting the cell.
for serie, field, _ in QUERIES:
    extraction.update({
        field: [p.get("_%s" % serie, {}).get(field) for p in PARAMS]
    })


### Influxdb Metrics

In [None]:
# Rebuild the table from `extraction`, which now also holds the influxdb columns.
df = pandas.DataFrame(extraction)
# Raise the display limit so that up to 1000 columns are shown.
pandas.options.display.max_columns = 1000
df

In [None]:
# Save the whole table as JSON in all_stats.json (current working directory).
with open("all_stats.json", "w") as f:
    f.write(df.to_json())

# Graphs

In [None]:
# Reload the saved statistics as plain Python data.
# NOTE(review): the next cell rebinds `all_stats` to a DataFrame read from
# the same file.
with open("all_stats.json", "r") as f:
    all_stats = json.load(f)

In [None]:
# Read all_stats.json back as a pandas DataFrame and display it.
all_stats = pandas.read_json("all_stats.json")
all_stats

In [None]:
# Client-side message rate of the rpc-call runs: one line per number of
# servers, plotted against the number of clients on shared axes.
servers = [250, 500, 750, 1000]
ax = None
for server in servers:
    extract = df.loc[
        (df.call_type == "rpc-call") & (df["_ombt_servers"] == server),
        ["_ombt_clients", "client_rate"]
    ]
    kwargs = dict(x="_ombt_clients", y="client_rate")
    if ax:
        # draw on the axes created by the first plot
        kwargs["ax"] = ax
    ax = extract.plot(**kwargs)
ax.legend(list(map(str, servers)))
ax.set_title("rpc call message rate (msg/s)")

In [None]:
# Server-side message rate of the rpc-cast runs: one line per number of
# servers, plotted against the number of clients on shared axes.
servers = [250, 500, 750, 1000]
ax = None
for server in servers:
    extract = df.loc[
        (df.call_type == "rpc-cast") & (df["_ombt_servers"] == server),
        ["_ombt_clients", "server_rate"]
    ]
    kwargs = dict(x="_ombt_clients", y="server_rate")
    if ax:
        # draw on the axes created by the first plot
        kwargs["ax"] = ax
    ax = extract.plot(**kwargs)
ax.legend(list(map(str, servers)))
ax.set_title("rpc cast message rate (msg/s)")

In [None]:
# Client-side latency of the rpc-call runs: one line per number of servers,
# plotted against the number of clients on shared axes.
servers = [250, 500, 750, 1000]
ax = None
for server in servers:
    extract = df.loc[
        (df.call_type == "rpc-call") & (df["_ombt_servers"] == server),
        ["_ombt_clients", "client_latency"]
    ]
    kwargs = dict(x="_ombt_clients", y="client_latency")
    if ax:
        # draw on the axes created by the first plot
        kwargs["ax"] = ax
    ax = extract.plot(**kwargs)
ax.legend(list(map(str, servers)))
ax.set_title("rpc call latency (ms)")

In [None]:
# Server-side latency of the rpc-cast runs: one line per number of servers,
# plotted against the number of clients on shared axes.
servers = [250, 500, 750, 1000]
ax = None
for server in servers:
    extract = df.loc[
        (df.call_type == "rpc-cast") & (df["_ombt_servers"] == server),
        ["_ombt_clients", "server_latency"]
    ]
    kwargs = dict(x="_ombt_clients", y="server_latency")
    if ax:
        # draw on the axes created by the first plot
        kwargs["ax"] = ax
    ax = extract.plot(**kwargs)
ax.legend(list(map(str, servers)))
ax.set_title("rpc cast latency (ms)")