In [None]:
! tar -xvzf data.tar.gz

In [10]:
from glob import glob
import pandas as pd

In [8]:
file_paths = {
  "kpa1": {
      "files": glob("./data/kpa1/*.txt"),
      "pods": "./data/kpa1/monitor/pods_log.txt",
      "nodes": "./data/kpa1/monitor/nodes_log.txt"
  },
  "kpa2": {
      "files": glob("./data/kpa2/*.txt"),
      "pods": "./data/kpa2/monitor/pods_log.txt",
      "nodes": "./data/kpa2/monitor/nodes_log.txt"
  },
  "hpa": {
      "files": glob("./data/hpa/*.txt"),
      "pods": "./data/hpa/monitor/pods_log.txt",
      "nodes": "./data/hpa/monitor/nodes_log.txt"
  },
  
} 

In [224]:
traces = pd.read_csv("traces.csv")

In [53]:
def get_pods_data(file_path):
  df_pods = pd.DataFrame(columns=["timestamp", "n_pods", "avg_cpu", "avg_memory"])
  f = open(file_path, "r")
  counter = 0
  cpu = 0
  memory = 0
  for l in f:
    if l[0] == ">":
      if counter > 0:
        cpu /= counter
        memory /= counter
        df_pods = df_pods.append({"timestamp": timestamp, "n_pods":counter,
                                  "avg_cpu": cpu, "avg_memory": memory}, ignore_index=True)
      timestamp = int(l[1:-1])
      counter = 0
      cpu = 0
      memory = 0
    elif "NAME" in l:
      continue
    else:
      counter += 1
      words = l.split()
      cpu += int(words[1][:-1])
      memory += int(words[2][:-2])
  cpu /= counter
  memory /= counter
  df_pods = df_pods.append({"timestamp": timestamp, "n_pods":counter,
                            "avg_cpu": cpu, "avg_memory": memory}, ignore_index=True)
      
  f.close()
  return df_pods

In [54]:
def get_nodes_data(file_path):
  df_nodes = pd.DataFrame(columns=["timestamp", "avg_cpu", "avg_memory"])
  f = open(file_path, "r")
  counter = 0
  cpu = 0
  memory = 0
  for l in f:
    if l[0] == ">":
      if counter > 0:
        cpu /= counter
        memory /= counter
        df_nodes = df_nodes.append({"timestamp": timestamp,
                                  "avg_cpu": cpu, "avg_memory": memory},
                                  ignore_index=True)
      timestamp = int(l[1:-1])
      counter = 0
      cpu = 0
      memory = 0
    elif "NAME" in l or len(l) == 1:
      continue
    else:
      counter += 1
      words = l.split()
      cpu += int(words[1][:-1])
      memory += int(words[3][:-2])
  cpu /= counter
  memory /= counter
  df_pods = df_nodes.append({"timestamp": timestamp,
                            "avg_cpu": cpu, "avg_memory": memory},
                            ignore_index=True)
      
  f.close()
  return df_nodes

In [213]:
from datetime import datetime

def to_timestamp(s):
  format = "%a %b %d %Y %H:%M:%S %Z%z (Mountain Standard Time)"
  return int(datetime.timestamp(datetime.strptime(s,format)))

In [187]:
import shutil
import os
def fix_file(file_path):
  f1 = open(file_path, "r")
  f2 = open("./tmp.txt", "w")

  for l in f1:
    index = l.rfind("[")
    while index > 0:
      print(l)
      l2 = l[:l[1:].find("[")]
      if "Total time" in l2 and l2[-1].isnumeric():
        l2 += " s"
      f2.write(l2+"\n")
      l = l[l[1:].find("[")+1:]
      index = l.rfind("[")
    f2.write(l)
  f1.close()
  f2.close()
  os.remove(file_path)
  os.rename("./tmp.txt", file_path)  

In [221]:
def parse_loadtest(file_path):
  # fix_file(file_path)
  f = open(file_path, "r")
  timestamp = -1
  n_errors = -1
  total_time = -1
  requests_per_second = -1
  mean_latency = -1
  for l in f:
    date_str = l[1:l.find("]")]
    l = l[l.find("]")+1:]
    if "Total errors" in l:
      n_errors = int(l.split()[-1])
    elif "Total time" in l:
      timestamp = to_timestamp(date_str)
      total_time = float(l.split()[-2])
      if l.split()[-1] != "s" and total_time > 0:
        if l.split()[-1] == "ms":
          total_time = total_time/1000
        else:
          print("Error total time not in seconds but in " + l.split()[-1])
    elif "Requests per second" in l:
      requests_per_second = int(l.split()[-1])
    elif "Mean latency" in l:
      mean_latency = float(l.split()[-2])
      if l.split()[-1] != "ms" and total_time > 0:
        if l.split()[-1] == "s":
          mean_latency = mean_latency*1000
        else:
          print("Error total time not in ms but in " + l.split()[-1])
  f.close()
  return timestamp, n_errors, total_time, requests_per_second, mean_latency

In [356]:
def get_requests_data(loadtest_files):
  df_requests = pd.DataFrame(columns=["step", "HashApp", "timestamp", "errors",
                                      "total_time", "rps", "mean_latency"])
  for lf in loadtest_files:
    # print(lf)
    words = lf[lf.rfind("/")+1:].split("_")
    hash_app = words[1][:-4]
    step = int(words[0])
    
    # app_request_load = traces[traces["HashApp"] == hash_app]
    # print(app_request_load)
    # print(app_request_load.Memory.values)
    # if (app_request_load[str(step)].values[0] > 0 and 
    #     app_request_load.Memory.values[0]//app_request_load[str(step)].values[0] == 0):
    #   print(step, hash_app)
    #   continue
    timestamp, n_errors, total_time, requests_per_second, mean_latency = parse_loadtest(lf)
    df_requests = df_requests.append({
        "step": step,
        "HashApp": hash_app,
        "timestamp": timestamp,
        "errors": n_errors,
        "total_time": total_time,
        "rps": requests_per_second,
        "mean_latency": mean_latency
    }, ignore_index=True)
  return df_requests

In [321]:
df_kpa1_pods = get_pods_data(file_paths["kpa1"]["pods"])
df_kpa2_pods = get_pods_data(file_paths["kpa2"]["pods"])
df_hpa_pods = get_pods_data(file_paths["hpa"]["pods"])

In [322]:
df_kpa1_nodes = get_nodes_data(file_paths["kpa1"]["nodes"])
df_kpa2_nodes = get_nodes_data(file_paths["kpa2"]["nodes"])
df_hpa_nodes = get_nodes_data(file_paths["hpa"]["nodes"])

In [357]:
df_kpa1_requests = get_requests_data(file_paths["kpa1"]["files"])
df_kpa2_requests = get_requests_data(file_paths["kpa2"]["files"])
df_hpa_requests = get_requests_data(file_paths["hpa"]["files"])

In [358]:
df_final = pd.DataFrame(columns=[
        "time_step", "total_requests", "total_memory_requested", "total_duration_requested",
])

for c in [str(i) for i in range(1,1441)]:
  rows = traces2[traces2[c] > 0][["Memory", "Duration", c]].sum()
  df_final = df_final.append({
    "time_step": c,
    "total_requests": rows[c],
    "total_memory_requested": rows["Memory"],
    "total_duration_requested": rows["Duration"]
  }, ignore_index=True)
df_final.set_index("time_step", inplace=True)

In [359]:
# extract kpa metrics
def extract_metrics(requests, pods, nodes, prefix="kpa1"):
  timestamp = requests["timestamp"].min()
  time_step = []
  n_pods = []
  pods_avg_cpu = []
  pods_avg_memory = []
  nodes_avg_cpu = []
  nodes_avg_memory = []
  errors = []
  for c in [str(i) for i in range(1,1441)]:
    # timestamp = requests[requests["step"] == c]["timestamp"].min()
    # print(requests[requests["step"] == c]["errors"])
    errors += [requests[requests["step"] == int(c)]["errors"].sum()]
    pod_info = pods[pods["timestamp"] == timestamp].iloc[-1]
    node_info = nodes[nodes["timestamp"] == timestamp].iloc[-1]
    time_step += [c]
    n_pods += [pod_info["n_pods"]]
    pods_avg_cpu += [pod_info["avg_cpu"]]
    pods_avg_memory += [pod_info["avg_memory"]]
    nodes_avg_cpu += [pod_info["avg_cpu"]]
    nodes_avg_memory += [pod_info["avg_memory"]]
    timestamp += 1
  df = pd.DataFrame({
      "time_step": [str(i) for i in range(1,1441)], 
      prefix+"_n_pods": n_pods,
      prefix+"_pods_avg_cpu": pods_avg_cpu,
      prefix+"_pods_avg_memory": pods_avg_memory,
      prefix+"_pods_avg_cpu": nodes_avg_cpu,
      prefix+"_pods_avg_memory": nodes_avg_memory,
      prefix+"_errors": errors
  })
  df.set_index("time_step", inplace=True)

  return df

In [360]:
df_final = pd.concat([df_final, 
           extract_metrics(df_kpa1_requests, df_kpa1_pods, df_kpa1_nodes, "kpa1"),
           extract_metrics(df_kpa2_requests, df_kpa2_pods, df_kpa2_nodes, "kpa2"),
           extract_metrics(df_hpa_requests, df_hpa_pods, df_hpa_nodes, "hpa")], axis=1)

In [361]:
df_final.to_csv("knative_benchmark_dataset.csv")

In [362]:
df_final

Unnamed: 0_level_0,total_requests,total_memory_requested,total_duration_requested,kpa1_n_pods,kpa1_pods_avg_cpu,kpa1_pods_avg_memory,kpa1_errors,kpa2_n_pods,kpa2_pods_avg_cpu,kpa2_pods_avg_memory,kpa2_errors,hpa_n_pods,hpa_pods_avg_cpu,hpa_pods_avg_memory,hpa_errors
time_step,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,223,2489,123,1.0,8.000000,50.000000,193,1.0,13.000000,42.000000,193,1.0,9.0,40.0,193
2,209,1417,22,1.0,8.000000,50.000000,179,1.0,13.000000,42.000000,179,1.0,9.0,40.0,179
3,219,1783,25,1.0,8.000000,50.000000,185,1.0,13.000000,42.000000,185,1.0,9.0,40.0,185
4,183,1897,95,1.0,8.000000,50.000000,155,1.0,13.000000,42.000000,155,1.0,9.0,40.0,155
5,182,1490,83,1.0,8.000000,50.000000,159,1.0,13.000000,42.000000,159,1.0,9.0,40.0,159
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1436,826,2461,121,217.0,17.829493,63.041475,824,232.0,13.232759,63.474138,804,1.0,2430.0,6083.0,819
1437,903,1518,91,217.0,17.829493,63.041475,903,226.0,13.557522,64.243363,903,1.0,2430.0,6083.0,903
1438,1547,1524,21,217.0,17.829493,63.041475,1547,219.0,13.958904,65.319635,1529,1.0,2430.0,6083.0,1547
1439,1316,1632,92,217.0,13.142857,65.525346,1278,217.0,14.078341,65.433180,1316,1.0,2430.0,6083.0,1316


In [409]:
def app_metrics(app, requests, prefix):
  n_requests = []
  errors = []
  total_time = []
  mean_latency = []
  rps = []
  for c in [str(i) for i in range(1,1441)]:
    row = requests[requests["step"] == int(c)][requests["HashApp"] == app["HashApp"]]
    if row.shape[0] > 0:
      row = row.iloc[0]  
      n_requests += [app[c]]
      errors += [row["errors"]]
      total_time += [row["total_time"]]
      mean_latency += [row["mean_latency"]]
    else:
      n_requests += [0]
      errors += [0]
      total_time += [0]
      mean_latency += [0]

  df = pd.DataFrame({
    "time_step":  [str(i) for i in range(1,1441)],
    prefix+"_n_requests": n_requests,
    prefix+"_errors": errors,
    prefix+"_total_time": total_time,
    prefix+"_mean_latency": mean_latency
  })
  df.set_index("time_step", inplace=True)
  return df

In [392]:
app1 = traces[traces["HashApp"] == "3f9fb0df8cc1017c171a7f07afd0fd8ca810f9a6f422a0872c1d3c471eb080b6"].iloc[1]
df_app1 = pd.concat([app_metrics(app1, df_kpa1_requests, "kpa1"),
           app_metrics(app1, df_kpa2_requests, "kpa2"),
           app_metrics(app1, df_hpa_requests, "hpa")], axis=1)

  


In [410]:
app2 = traces[traces["HashApp"] == "4c34c02803614b18d4181a0ea3ff83c76c08d559bda561c2e55849dd8cd8f950"].iloc[0]
df_app2 = pd.concat([app_metrics(app2, df_kpa1_requests, "kpa1"),
           app_metrics(app2, df_kpa2_requests, "kpa2"),
           app_metrics(app2, df_hpa_requests, "hpa")], axis=1)

  


In [411]:
df_app1.to_csv("app1.csv")
df_app2.to_csv("app2.csv")