In [1]:
import json
import pandas as pd
import os
from pandas.io.json import json_normalize

def read_jsonl(relative_path):
    with open(relative_path, "r") as file:
        lines = file.readlines()
    jsons = [json.loads(line) for line in lines]
    return pd.json_normalize(jsons)

def list_files_by_extension(extension, directory_path):
    files = [file for file in os.listdir(directory_path) if file.lower().endswith(extension.lower())]
    sorted_by_name = sorted(files)
    return sorted_by_name

def list_directores_in_path(relative_path):
    return [os.path.join(relative_path, fname) for fname in os.listdir(relative_path) if os.path.isdir(os.path.join(relative_path, fname))]

In [2]:
from functools import reduce
from itertools import chain
import numpy as np

def load_data_run(path):
    print(f"Loading from {path}")
    files = [os.path.join(path, file) for file in ["job_descriptions.jsonl", "sys_info.jsonl"]]
    meta = [read_jsonl(file) for file in files]
    events = read_jsonl(os.path.join(path, "metrics.jsonl"))
    meta = pd.merge(meta[0], meta[1], on="jobId")
    return np.array([meta, events])

def load_from_sources(paths):
    frames = np.array([load_data_run(path) for path in paths])
    return pd.concat(frames[:, 0]), pd.concat(frames[:, 1])

def load_frames_for_base(path, base):
    base_path = os.path.join(path,base)
    categories = list_directores_in_path(base_path)
    frames_paths =list(chain(*[list_directores_in_path(category) for category in categories]))
    return load_from_sources(frames_paths)

In [3]:
data_root = "../data/parsed-logs/"

aws_bases = list_directores_in_path(data_root + "aws")
gcloud_bases = list_directores_in_path(data_root +"gcloud")

In [4]:
df = load_frames_for_base(data_root, "aws")

Loading from ../data/parsed-logs/aws/unassigned/montage__1.0__1.0.0__2020-04-28-22-02-06


  return np.array([meta, events])


Loading from ../data/parsed-logs/aws/unassigned/soykb__134__1.0.0__2020-04-29-04-29-50
Loading from ../data/parsed-logs/aws/unassigned/montage__2.0__1.0.0__2020-04-29-13-04-35
Loading from ../data/parsed-logs/aws/unassigned/montage__0.25__1.0.0__2020-04-29-09-24-06
Loading from ../data/parsed-logs/aws/unassigned/montage__0.25__1.0.0__2020-04-28-21-42-32
Loading from ../data/parsed-logs/aws/unassigned/montage__1.0__1.0.0__2020-04-29-10-48-16
Loading from ../data/parsed-logs/aws/unassigned/montage__0.25__1.0.0__2020-04-28-21-45-01
Loading from ../data/parsed-logs/aws/unassigned/montage__2.0__1.0.0__2020-04-29-12-01-28
Loading from ../data/parsed-logs/aws/unassigned/montage__2.0__1.0.0__2020-04-28-22-24-43
Loading from ../data/parsed-logs/aws/unassigned/montage__0.25__1.0.0__2020-04-29-09-29-33
Loading from ../data/parsed-logs/aws/unassigned/soykb__56__1.0.0__2020-04-29-00-58-21
Loading from ../data/parsed-logs/aws/unassigned/montage__1.0__1.0.0__2020-04-29-10-15-49
Loading from ../data/p

In [5]:
meta, events = df

In [6]:
# filtered_events = events.loc[events["parameter"] != "event"].reset_index()
# filtered_events

In [7]:
# filtered_events.parameter.unique()

{"read": 1225, "write": 1, "readSyscalls": 5, "writeSyscalls": 1, "readReal": 0, "writeReal": 0, "writeCancelled": 0}

In [8]:
io_params = ["read", "write", "readSyscalls", "writeSyscalls", "readReal", "writeReal", "writeCancelled"]
network_params = ["rxBytes", "rxPackets", "rxErrors", "rxDrop", "rxFifo", "rxFrame", "rxCompressed", "rxMulticast", "txBytes", "txPackets", "txErrors", "txDrop", "txFifo", "txColls", "txCarrier", "txCompressed"]

io_agg_kwargs={f"{param}_sum" : (f"value.{param}", "sum") for param in io_params}
network_agg_kwargs={f"{param}_sum" : (f"value.{param}", "sum") for param in network_params}
cpu_agg_kwargs={"cpu_mean": ('value', 'mean'), "cpu_max": ('value', 'max')}
memory_agg_kwargs={"memory_mean": ('value', 'mean'), "memory_max": ('value', 'max')}
ctime_agg_kwargs={"ctime_mean": ('value', 'mean'), "ctime_max": ('value', 'max'), "ctime_sum": ('value', 'sum')}

def extract_meta_from_metrics(frame):
    frame["value"] = pd.to_numeric(frame["value"])
    def postprocess(aggregated_frame, parameter_type):
        aggregated_frame = aggregated_frame.reset_index()
        return aggregated_frame.loc[aggregated_frame["parameter"] == parameter_type].drop("parameter", axis=1)
    
    frame_grp = frame.groupby(['jobId', 'parameter'])
    metrics = [
        postprocess(frame_grp.agg(**io_agg_kwargs), "io"),
        postprocess(frame_grp.agg(**network_agg_kwargs), "network"),          
        postprocess(frame_grp.agg(**cpu_agg_kwargs), "cpu"),
        postprocess(frame_grp.agg(**memory_agg_kwargs), "memory"),
        postprocess(frame_grp.agg(**ctime_agg_kwargs), "ctime")   
    ]   
    return reduce(lambda left, right: pd.merge(left,right,on='jobId'), metrics)

In [9]:
# extract_meta_from_metrics(filtered_events)

In [10]:
# import matplotlib.pyplot as plt
# plt.rcParams['figure.figsize'] = [10, 10]

In [11]:
# meta.groupby('executable').count().sort_values('size').plot.barh(y='size', logx=True, legend=False)

### Todo - zmerge'ować base z metadanymi. Zrobić matrycę pearsona. Podstawowe klasyfikatory. GPT-2. Podział na 

In [12]:
meta.iloc[0]
# meta.columns

workflowName                                                       montage
size                                                                   1.0
version                                                              1.0.0
hyperflowId                                                      0EPinjEE9
jobId                                                      0EPinjEE9-1-225
executable                                                        mDiffFit
args                     [-s, fit.000044.000076.txt, p2mass-atlas-00020...
inputs                   [{'name': 'big_region_20180402_165223_16974.hd...
outputs                  [{'name': 'fit.000044.000076.txt'}, {'name': '...
name                                                              mDiffFit
command                  mDiffFit -s fit.000044.000076.txt p2mass-atlas...
execTimeMs                                                              59
cpu.manufacturer                                                    Intel®
cpu.brand                

In [13]:
def preprocess_metrics(events_frame):
    print("Preprocessing metrics")
    events_frame = events_frame.loc[events_frame["parameter"] != "event"].reset_index()
    return extract_meta_from_metrics(events_frame)

def preprocess_meta(meta_frame):
    print("Preprocessing meta")
    return meta_frame.drop(["hyperflowId", 'version', 'nodeName', 'cpu.socket', 'cpu.speedmin', 
                            'cpu.speedmax', 'cpu.governor', 'cpu.revision', 'cpu.voltage', 'env.nodeName', 
                            'env.podIp', 'env.podServiceAccount', 'env.podName', 'env.podNamespace', 'stdout'], axis=1)

def join_metrics_with_meta(metrics_frame, meta_frame):
    print("Joining metrics with meta")
    return pd.merge(metrics_frame, meta_frame, on="jobId")

def dump_preprocessed(data_frame, path="merged.csv"):
    print("Dumping preprocessed")
    data_frame.to_csv(os.path.join(data_root, path))

In [14]:
def store_intermediate():
    metrics = preprocess_metrics(events)
    metadata = preprocess_meta(meta)
    joint_df = join_metrics_with_meta(metrics, metadata)
    dump_preprocessed(joint_df, "aws.csv")

In [15]:
store_intermediate()

Preprocessing metrics
Preprocessing meta
Joining metrics with meta
Dumping preprocessed
