In [11]:
import json
import pandas as pd
import os

def read_jsonl(relative_path):
    """Load a JSON Lines file into a pandas DataFrame.

    Args:
        relative_path: Path (or buffer) handed to ``pd.read_json``.

    Returns:
        The DataFrame produced by ``pd.read_json`` with ``lines=True``.
    """
    # lines=True makes pandas parse every line of the input as its own JSON object.
    frame = pd.read_json(relative_path, lines=True)
    return frame

def list_files_by_extension(extension, directory_path):
    """Return the names of regular files in a directory that have a given extension.

    Args:
        extension: File extension to match, with or without the leading dot
            ("jsonl" and ".jsonl" are equivalent). Matching is case-insensitive.
            An empty string matches every file.
        directory_path: Directory to scan (not recursive).

    Returns:
        Bare file names (not joined with ``directory_path``), sorted by name.
    """
    suffix = extension.lower()
    # Anchor the match on the dot so that "jsonl" does not also match names
    # that merely end in those letters (e.g. "notjsonl").
    if suffix and not suffix.startswith("."):
        suffix = "." + suffix
    return sorted(
        name
        for name in os.listdir(directory_path)
        if name.lower().endswith(suffix)
        # Skip sub-directories whose name happens to carry the extension.
        and os.path.isfile(os.path.join(directory_path, name))
    )

def list_directores_in_path(relative_path):
    """Return the paths of the immediate sub-directories of ``relative_path``.

    Args:
        relative_path: Directory to scan (not recursive).

    Returns:
        A list of ``os.path.join(relative_path, name)`` for every entry that is
        a directory, sorted by entry name.
    """
    # os.listdir makes no ordering guarantee; sort so that callers which slice
    # or index the result get the same directories on every run and platform.
    candidates = (os.path.join(relative_path, name) for name in sorted(os.listdir(relative_path)))
    return [candidate for candidate in candidates if os.path.isdir(candidate)]

In [32]:
from functools import reduce
from itertools import chain

def load_data_run(path):
    """Load every JSON Lines file of one run directory and join them on ``jobId``.

    Args:
        path: Directory containing the run's jsonl files.

    Returns:
        A single DataFrame: the files are read in the order returned by
        ``list_files_by_extension`` and merged left to right with
        ``pd.merge(..., on="jobId")`` (pandas' default inner join).

    Raises:
        ValueError: If ``path`` contains no jsonl files.
    """
    file_names = list_files_by_extension("jsonl", path)
    if not file_names:
        # Without this guard reduce() fails with an opaque
        # "reduce() of empty iterable" TypeError that does not name the directory.
        raise ValueError("No jsonl files found in {!r}".format(path))
    frames = [read_jsonl(os.path.join(path, file_name)) for file_name in file_names]
    return reduce(lambda left, right: pd.merge(left, right, on="jobId"), frames)

def load_from_sources(paths):
    """Load several run directories and stack them into one DataFrame.

    Args:
        paths: Iterable of run directory paths, each passed to ``load_data_run``.

    Returns:
        The row-wise ``pd.concat`` of the per-run DataFrames.
    """
    per_run = []
    for run_path in paths:
        per_run.append(load_data_run(run_path))
    return pd.concat(per_run)

def load_frames_for_base(path, base, limit=10):
    """Load the run directories found two levels below ``<path>/<base>``.

    Args:
        path: Root directory that contains ``base``.
        base: Name of the sub-directory of ``path`` to load (e.g. "aws").
        limit: Maximum number of run directories to load. Defaults to 10;
            pass ``None`` to load every run directory.

    Returns:
        The DataFrame produced by ``load_from_sources`` for the selected
        directories.
    """
    base_path = os.path.join(path, base)
    categories = list_directores_in_path(base_path)
    # Flatten the per-category lists of sub-directories into one list, keeping
    # the order in which list_directores_in_path returned them.
    run_paths = list(chain.from_iterable(list_directores_in_path(category) for category in categories))
    # A slice bound of None means "no bound", so limit=None selects everything.
    return load_from_sources(run_paths[:limit])

In [33]:
# Root of the parsed log data, relative to the notebook's working directory.
data_root = "../data/parsed-logs/"

# Immediate sub-directories of the aws and gcloud data directories.
aws_bases = list_directores_in_path(data_root + "aws")
gcloud_bases = list_directores_in_path(data_root + "gcloud")
# Last expression of the cell, so the loaded aws frame is displayed as the output.
load_frames_for_base(data_root, "aws")

Unnamed: 0,workflowName,size,version,hyperflowId,jobId,executable,args,inputs,outputs,name_x,...,execTimeMs,time,workflowId,parameter,value,name_y,pid,cpu,mem,stdout
0,montage,1.00,1.0.0,0EPinjEE9,0EPinjEE9-1-225,mDiffFit,"[-s, fit.000044.000076.txt, p2mass-atlas-00020...",[{'name': 'big_region_20180402_165223_16974.hd...,"[{'name': 'fit.000044.000076.txt'}, {'name': '...",mDiffFit,...,59.0,2020-04-28T21:59:43.484,0EPinjEE9-1,event,handlerStart,mDiffFit,,"{'manufacturer': 'Intel®', 'brand': 'Xeon® Pla...","{'total': 7994712064, 'free': 2274062336, 'use...",
1,montage,1.00,1.0.0,0EPinjEE9,0EPinjEE9-1-225,mDiffFit,"[-s, fit.000044.000076.txt, p2mass-atlas-00020...",[{'name': 'big_region_20180402_165223_16974.hd...,"[{'name': 'fit.000044.000076.txt'}, {'name': '...",mDiffFit,...,59.0,2020-04-28T21:59:43.502,0EPinjEE9-1,io,"{'read': 1225, 'write': 1, 'readSyscalls': 5, ...",mDiffFit,18.0,"{'manufacturer': 'Intel®', 'brand': 'Xeon® Pla...","{'total': 7994712064, 'free': 2274062336, 'use...",
2,montage,1.00,1.0.0,0EPinjEE9,0EPinjEE9-1-225,mDiffFit,"[-s, fit.000044.000076.txt, p2mass-atlas-00020...",[{'name': 'big_region_20180402_165223_16974.hd...,"[{'name': 'fit.000044.000076.txt'}, {'name': '...",mDiffFit,...,59.0,2020-04-28T21:59:43.503,0EPinjEE9-1,network,"{'name': 'lo', 'rxBytes': 0, 'rxPackets': 0, '...",mDiffFit,18.0,"{'manufacturer': 'Intel®', 'brand': 'Xeon® Pla...","{'total': 7994712064, 'free': 2274062336, 'use...",
3,montage,1.00,1.0.0,0EPinjEE9,0EPinjEE9-1-225,mDiffFit,"[-s, fit.000044.000076.txt, p2mass-atlas-00020...",[{'name': 'big_region_20180402_165223_16974.hd...,"[{'name': 'fit.000044.000076.txt'}, {'name': '...",mDiffFit,...,59.0,2020-04-28T21:59:43.509,0EPinjEE9-1,event,jobStart,mDiffFit,,"{'manufacturer': 'Intel®', 'brand': 'Xeon® Pla...","{'total': 7994712064, 'free': 2274062336, 'use...",
4,montage,1.00,1.0.0,0EPinjEE9,0EPinjEE9-1-225,mDiffFit,"[-s, fit.000044.000076.txt, p2mass-atlas-00020...",[{'name': 'big_region_20180402_165223_16974.hd...,"[{'name': 'fit.000044.000076.txt'}, {'name': '...",mDiffFit,...,59.0,2020-04-28T21:59:43.524,0EPinjEE9-1,io,"{'read': 14785427, 'write': 0, 'readSyscalls':...",mDiffFit,19.0,"{'manufacturer': 'Intel®', 'brand': 'Xeon® Pla...","{'total': 7994712064, 'free': 2274062336, 'use...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458,montage,0.25,1.0.0,j4ikpOmyz,j4ikpOmyz-1-28,mConcatFit,"[statfile_20180402_165339_22325.tbl, fits.tbl, .]","[{'name': 'fit.000001.000002.txt', '_id': 34, ...",[{'name': 'fits.tbl'}],mConcatFit,...,126.0,2020-04-29T09:29:19.477,j4ikpOmyz-1,cpu,0,mConcatFit,18.0,"{'manufacturer': 'Intel®', 'brand': 'Xeon® Pla...","{'total': 16214822912, 'free': 13431709696, 'u...",
459,montage,0.25,1.0.0,j4ikpOmyz,j4ikpOmyz-1-28,mConcatFit,"[statfile_20180402_165339_22325.tbl, fits.tbl, .]","[{'name': 'fit.000001.000002.txt', '_id': 34, ...",[{'name': 'fits.tbl'}],mConcatFit,...,126.0,2020-04-29T09:29:19.477,j4ikpOmyz-1,memory,6696960,mConcatFit,18.0,"{'manufacturer': 'Intel®', 'brand': 'Xeon® Pla...","{'total': 16214822912, 'free': 13431709696, 'u...",
460,montage,0.25,1.0.0,j4ikpOmyz,j4ikpOmyz-1-28,mConcatFit,"[statfile_20180402_165339_22325.tbl, fits.tbl, .]","[{'name': 'fit.000001.000002.txt', '_id': 34, ...",[{'name': 'fits.tbl'}],mConcatFit,...,126.0,2020-04-29T09:29:19.477,j4ikpOmyz-1,ctime,10,mConcatFit,18.0,"{'manufacturer': 'Intel®', 'brand': 'Xeon® Pla...","{'total': 16214822912, 'free': 13431709696, 'u...",
461,montage,0.25,1.0.0,j4ikpOmyz,j4ikpOmyz-1-28,mConcatFit,"[statfile_20180402_165339_22325.tbl, fits.tbl, .]","[{'name': 'fit.000001.000002.txt', '_id': 34, ...",[{'name': 'fits.tbl'}],mConcatFit,...,126.0,2020-04-29T09:29:19.587,j4ikpOmyz-1,event,jobEnd,mConcatFit,,"{'manufacturer': 'Intel®', 'brand': 'Xeon® Pla...","{'total': 16214822912, 'free': 13431709696, 'u...",
