In [1]:
import json
import pandas as pd
import os
from pandas.io.json import json_normalize

def read_jsonl(relative_path):
    with open(relative_path, "r") as file:
        lines = file.readlines()
    jsons = [json.loads(line) for line in lines]
    return pd.json_normalize(jsons)

def list_files_by_extension(extension, directory_path):
    files = [file for file in os.listdir(directory_path) if file.lower().endswith(extension.lower())]
    sorted_by_name = sorted(files)
    return sorted_by_name

def list_directores_in_path(relative_path):
    return [os.path.join(relative_path, fname) for fname in os.listdir(relative_path) if os.path.isdir(os.path.join(relative_path, fname))]

In [34]:
from functools import reduce
from itertools import chain
import numpy as np

def load_data_run(path):
    print(f"Loading from {path}")
    files = [os.path.join(path, file) for file in ["job_descriptions.jsonl", "sys_info.jsonl"]]
    meta = [read_jsonl(file) for file in files]
    events = read_jsonl(os.path.join(path, "metrics.jsonl"))
    meta = pd.merge(meta[0], meta[1], on="jobId")
    return np.array([meta, events])

def load_from_sources(paths):
    frames = np.array([load_data_run(path) for path in paths])
    return pd.concat(frames[:, 0]), pd.concat(frames[:, 1])

def load_frames_for_base(path, base):
    base_path = os.path.join(path,base)
    categories = list_directores_in_path(base_path)
    frames_paths =list(chain(*[list_directores_in_path(category) for category in categories]))
    return load_from_sources(frames_paths[100:])

In [35]:
data_root = "../data/parsed-logs/"

aws_bases = list_directores_in_path(data_root + "aws")
gcloud_bases = list_directores_in_path(data_root +"gcloud")

In [36]:
df = load_frames_for_base(data_root, "gcloud")

Loading from ../data/parsed-logs/gcloud/n2d-standard-4/montage2__1.0__1.0.0__2020-06-20-21-44-11


  return np.array([meta, events])


Loading from ../data/parsed-logs/gcloud/n2d-standard-4/soykb__56__1.0.0__2020-06-21-21-25-12
Loading from ../data/parsed-logs/gcloud/n2d-standard-4/montage2__0.25__1.0.0__2020-06-20-20-15-27
Loading from ../data/parsed-logs/gcloud/n2d-standard-4/montage__0.25__1.0.0__2020-06-21-14-10-27
Loading from ../data/parsed-logs/gcloud/n2d-standard-4/montage__2.0__1.0.0__2020-06-21-17-04-44
Loading from ../data/parsed-logs/gcloud/n2d-standard-4/soykb__238__1.0.0__2020-06-21-07-11-24
Loading from ../data/parsed-logs/gcloud/n2d-standard-4/montage__0.25__1.0.0__2020-06-20-19-19-24
Loading from ../data/parsed-logs/gcloud/n2d-standard-4/montage2__0.01__1.0.0__2020-06-21-17-17-48
Loading from ../data/parsed-logs/gcloud/n2d-standard-4/soykb__82__1.0.0__2020-06-21-22-38-21
Loading from ../data/parsed-logs/gcloud/n2d-standard-4/montage__1.0__1.0.0__2020-06-21-16-46-00
Loading from ../data/parsed-logs/gcloud/n2d-standard-4/soykb__134__1.0.0__2020-06-22-00-37-22
Loading from ../data/parsed-logs/gcloud/n2d-

In [37]:
meta, events = df

In [38]:
# filtered_events = events.loc[events["parameter"] != "event"].reset_index()
# filtered_events

In [39]:
# filtered_events.parameter.unique()

{"read": 1225, "write": 1, "readSyscalls": 5, "writeSyscalls": 1, "readReal": 0, "writeReal": 0, "writeCancelled": 0}

In [40]:
io_params = ["read", "write", "readSyscalls", "writeSyscalls", "readReal", "writeReal", "writeCancelled"]
network_params = ["rxBytes", "rxPackets", "rxErrors", "rxDrop", "rxFifo", "rxFrame", "rxCompressed", "rxMulticast", "txBytes", "txPackets", "txErrors", "txDrop", "txFifo", "txColls", "txCarrier", "txCompressed"]

io_agg_kwargs={f"{param}_sum" : (f"value.{param}", "sum") for param in io_params}
network_agg_kwargs={f"{param}_sum" : (f"value.{param}", "sum") for param in network_params}
cpu_agg_kwargs={"cpu_mean": ('value', 'mean'), "cpu_max": ('value', 'max')}
memory_agg_kwargs={"memory_mean": ('value', 'mean'), "memory_max": ('value', 'max')}
ctime_agg_kwargs={"ctime_mean": ('value', 'mean'), "ctime_max": ('value', 'max'), "ctime_sum": ('value', 'sum')}

def extract_meta_from_metrics(frame):
    frame["value"] = pd.to_numeric(frame["value"])
    def postprocess(aggregated_frame, parameter_type):
        aggregated_frame = aggregated_frame.reset_index()
        return aggregated_frame.loc[aggregated_frame["parameter"] == parameter_type].drop("parameter", axis=1)
    
    frame_grp = frame.groupby(['jobId', 'parameter'])
    metrics = [
        postprocess(frame_grp.agg(**io_agg_kwargs), "io"),
        postprocess(frame_grp.agg(**network_agg_kwargs), "network"),          
        postprocess(frame_grp.agg(**cpu_agg_kwargs), "cpu"),
        postprocess(frame_grp.agg(**memory_agg_kwargs), "memory"),
        postprocess(frame_grp.agg(**ctime_agg_kwargs), "ctime")   
    ]   
    return reduce(lambda left, right: pd.merge(left,right,on='jobId'), metrics)

In [41]:
# extract_meta_from_metrics(filtered_events)

In [42]:
# import matplotlib.pyplot as plt
# plt.rcParams['figure.figsize'] = [10, 10]

In [43]:
# meta.groupby('executable').count().sort_values('size').plot.barh(y='size', logx=True, legend=False)

### Todo - zmerge'ować base z metadanymi. Zrobić matrycę pearsona. Podstawowe klasyfikatory. GPT-2. Podział na 

In [44]:
meta.iloc[0]
# meta.columns

workflowName                                                      montage2
size                                                                   1.0
version                                                              1.0.0
hyperflowId                                                      0EUZ6gtGK
jobId                                                     0EUZ6gtGK-1-4049
nodeName                                gke-cluster-x-worker-f40a1baf-vlcx
executable                                                        mDiffFit
args                     [-d, -s, 3-fit.000068.000103.txt, p2mass-atlas...
inputs                   [{'name': 'region-oversized.hdr', 'size': 277}...
outputs                  [{'name': '3-fit.000068.000103.txt', 'size': 2...
name                                                              mDiffFit
command                  mDiffFit -d -s 3-fit.000068.000103.txt p2mass-...
execTimeMs                                                             415
env.podIp                

In [45]:
def preprocess_metrics(events_frame):
    print("Preprocessing metrics")
    events_frame = events_frame.loc[events_frame["parameter"] != "event"].reset_index()
    return extract_meta_from_metrics(events_frame)

def preprocess_meta(meta_frame):
    print("Preprocessing meta")
    return meta_frame.drop(["hyperflowId", 'version', 'nodeName', 'cpu.socket', 'cpu.speedmin', 
                            'cpu.speedmax', 'cpu.governor', 'cpu.revision', 'cpu.voltage', 'env.nodeName', 
                            'env.podIp', 'env.podServiceAccount', 'env.podName', 'env.podNamespace', 'stdout'], axis=1)

def join_metrics_with_meta(metrics_frame, meta_frame):
    print("Joining metrics with meta")
    return pd.merge(metrics_frame, meta_frame, on="jobId")

def dump_preprocessed(data_frame, path="merged.csv"):
    print("Dumping preprocessed")
    data_frame.to_csv(os.path.join(data_root, path))

def vectorize_data(data_frame):
    # inputs -> number of inputs
    # outputs -> number of outputs
    pass

In [46]:
def store_intermediate():
    metrics = preprocess_metrics(events)
    metadata = preprocess_meta(meta)
    joint_df = join_metrics_with_meta(metrics, metadata)
    dump_preprocessed(joint_df, "googles150.csv")

In [None]:
store_intermediate()

Preprocessing metrics
