In [1]:
import re
import os
import pandas as pd
import numpy as np
from io import StringIO

In [2]:
# Input files to process: only the .txt entries of raw_data/, in sorted order.
# Other entries (hidden files, checkpoint folders, ...) are ignored because the
# parsing cell opens every listed name as a text file.
directory = sorted(name for name in os.listdir("raw_data") if name.endswith(".txt"))
directory

['H.Wang2020_N10.txt',
 'H.Wang2020_N5.txt',
 'Wang2021.txt',
 'Zheng2014_N10.txt',
 'Zheng2014_N5.txt']

In [3]:
# Parsed runs, filled by the parsing cell:
# {"file_name1": {"1": [DataFrame, ...], "5": [DataFrame, ...], "10": [...], ...}, "file_name2": {...}, ...}
raw_df_dict = dict()

In [4]:
# Build one DataFrame per execution block, grouped by process count and by
# input file, and store them in raw_df_dict.

for txtfile in directory:

    # The dictionary key is the file name without its extension.
    key_filename = os.path.splitext(str(txtfile))[0]
    raw_df_dict[key_filename] = {}

    # Read the whole file first so it is closed before parsing starts.
    with open(f"raw_data/{txtfile}", "r") as file:
        raw_data = file.read()

    # Sections are separated by a blank line; each one belongs to one process count.
    for prc_set in raw_data.split("\n\n"):
        if not prc_set.strip():
            # A trailing blank line produces an empty section with no header
            # number in it; skip it instead of failing on the lookup below.
            continue

        # The process count is the first integer found in the first 20 characters.
        n_process = int(re.findall(r"[0-9]+", prc_set[0:20])[0])
        raw_df_dict[key_filename][str(n_process)] = []

        # Split the section on the "***\n" marker. The text before the first
        # marker is discarded; every remaining piece is parsed as header-less
        # CSV and becomes one DataFrame in the list for this process count.
        for prc_sub in prc_set.split("***\n")[1:]:
            raw_data_df = pd.read_csv(
                StringIO(prc_sub), names=["N", "t_start", "t_end", "t_CPU"]
            ).sort_values("t_start")  # rows ordered by start time, original index kept
            raw_df_dict[key_filename][str(n_process)].append(raw_data_df)
                

In [5]:
# Spot check: the DataFrame at list index 15 under the "800" key of file "Wang2021".
raw_df_dict["Wang2021"]["800"][15]

Unnamed: 0,N,t_start,t_end,t_CPU
0,2,0.000214,0.330036,0.329822
8,7,0.003500,0.629697,0.626197
20,8,0.007048,0.678413,0.671365
12,9,0.011436,0.668854,0.657418
46,75,0.011489,0.730938,0.719449
...,...,...,...,...
772,797,3.787403,4.017749,0.230346
786,798,3.791630,3.996854,0.205224
781,799,3.795665,4.020652,0.224987
785,800,3.799411,4.024886,0.225475


In [6]:
# Per-column containers, one per timing column; all three share the layout
# {"data_file1": {"1": [column, column, ...], "5": [...], "10": [...], ...}, "data_file2": ...}
raw_data_df_start, raw_data_df_end, raw_data_df_CPU = {}, {}, {}

In [7]:
# Data preparation: for every file and every process count, pull the three
# timing columns out of each run DataFrame and collect them in three parallel
# containers:
# - raw_data_df_start gets every "t_start" column
# - raw_data_df_end   gets every "t_end" column
# - raw_data_df_CPU   gets every "t_CPU" column
# Each collected column is one list entry, i.e. one run.

column_targets = (
    ("t_start", raw_data_df_start),
    ("t_end", raw_data_df_end),
    ("t_CPU", raw_data_df_CPU),
)

for file_key, runs_by_count in raw_df_dict.items():
    for column, target in column_targets:
        target[file_key] = {
            count: [run[column] for run in runs]
            for count, runs in runs_by_count.items()
        }
        

In [8]:
# Averaged results, one container per timing column; all three share the layout
# {"data_file1": {"1": <means>, "5": <means>, "10": <means>, ...}, "data_file2": ...}
data_df_start, data_df_end, data_df_CPU = {}, {}, {}

In [9]:
def _mean_per_position(columns_by_file):
    """Average the collected runs position by position.

    Parameters
    ----------
    columns_by_file : dict
        ``{file_key: {process_count: [column, column, ...]}}`` where every
        list entry holds the values of one run.

    Returns
    -------
    dict
        ``{file_key: {process_count: pandas.Series}}``. For each process count
        the runs are stacked as the rows of a DataFrame and the Series holds
        the mean of each column, i.e. of each position across runs.
    """
    return {
        file_key: {
            count: pd.DataFrame(np.array(runs)).mean(axis=0)
            for count, runs in runs_by_count.items()
        }
        for file_key, runs_by_count in columns_by_file.items()
    }


# Build the start / end / CPU averages with the same helper instead of three
# copies of the same loop. The target dicts are updated in place, so each
# file entry is replaced by its freshly computed means.
data_df_start.update(_mean_per_position(raw_data_df_start))
data_df_end.update(_mean_per_position(raw_data_df_end))
data_df_CPU.update(_mean_per_position(raw_data_df_CPU))


In [10]:
# Spot check: averaged start times under the "10" key of file "Wang2021", as a plain list.
data_df_start["Wang2021"]["10"].tolist()

[0.00019248,
 0.0008416399999999993,
 0.004161549999999999,
 0.0064136,
 0.008634499999999996,
 0.01027639,
 0.012180490000000002,
 0.014394930000000002,
 0.017291539999999998,
 0.0224005]

In [11]:
# One table per file and per process count, filled by the next cell:
# {"data_file1": {"1": DataFrame[t_start, t_end, t_CPU], "5": DataFrame[...], ...}, "data_file2": ...}
excel_pages = dict()

In [12]:
# Assemble, for each file and each process count, one table holding the three
# averaged columns side by side. data_df_start is only used as the source of
# the file names and process-count keys; the other two dicts are indexed with
# the same keys.
for file_key, starts_by_count in data_df_start.items():
    ends_by_count = data_df_end[file_key]
    cpu_by_count = data_df_CPU[file_key]

    excel_pages[file_key] = {
        count: pd.DataFrame(data={
            "t_start": mean_start.tolist(),
            "t_end": ends_by_count[count].tolist(),
            "t_CPU": cpu_by_count[count].tolist(),
        })
        for count, mean_start in starts_by_count.items()
    }

In [13]:
# Spot check: all the tables built for file "Wang2021" (the bare expression displays the whole dict).
excel_pages["Wang2021"]

{'1':     t_start     t_end    t_CPU
 0  0.000191  0.011381  0.01119, '5':     t_start     t_end     t_CPU
 0  0.000181  0.019076  0.018895
 1  0.000476  0.016132  0.015656
 2  0.005327  0.024170  0.018844
 3  0.008094  0.026061  0.017968
 4  0.011508  0.026937  0.015429, '10':     t_start     t_end     t_CPU
 0  0.000192  0.035022  0.034829
 1  0.000842  0.038172  0.037331
 2  0.004162  0.046157  0.041996
 3  0.006414  0.047268  0.040855
 4  0.008634  0.048788  0.040154
 5  0.010276  0.049546  0.039270
 6  0.012180  0.050079  0.037898
 7  0.014395  0.050594  0.036199
 8  0.017292  0.051890  0.034599
 9  0.022401  0.053191  0.030790, '100':      t_start     t_end     t_CPU
 0   0.000290  0.328044  0.327755
 1   0.004992  0.434611  0.429619
 2   0.007107  0.446962  0.439856
 3   0.008864  0.445640  0.436775
 4   0.009985  0.446248  0.436263
 ..       ...       ...       ...
 95  0.192327  0.489273  0.296946
 96  0.196297  0.484329  0.288032
 97  0.200622  0.486765  0.286142
 98  0.20887

In [14]:
# Excel export: every .txt input gets its own .xlsx workbook, with one sheet
# per process count.

# The writer does not create missing folders, so make sure the target exists.
os.makedirs("data_timestamp", exist_ok=True)

for txtfile, pages in excel_pages.items():

    with pd.ExcelWriter(f'data_timestamp/{txtfile}.xlsx') as writer:
        for page, page_df in pages.items():
            # Sheet name is the process-count key of the table.
            page_df.to_excel(writer, sheet_name=page)
            

In [15]:
# Excel export times_dict_ts.xlsx: one sheet per .txt input, one column per
# process count, each column holding the averaged end times of that table.

ts_dict_tot = {}  # {"filename1": {"proc_1": t_end column, "proc_5": t_end column, ...}, "filename2": {...}}
for txtfile, pages in excel_pages.items():
    ts_dict_tot[txtfile] = {
        f"proc_{page}": page_df["t_end"] for page, page_df in pages.items()
    }

# The writer does not create missing folders, so make sure the target exists.
os.makedirs("data_timestamp", exist_ok=True)

with pd.ExcelWriter('data_timestamp/times_dict_ts.xlsx') as writer:
    for txtfile, columns in ts_dict_tot.items():
        # Build the sheet once, right where it is written. Columns of
        # different length are aligned on their index by the DataFrame
        # constructor, leaving the shorter ones padded with NaN.
        ts_df = pd.DataFrame(columns)
        ts_df.to_excel(writer, sheet_name=txtfile)


In [16]:
# Data for the cumulative process-count plot.
#
# For every file and every process count, time is sampled every 0.01 from 0 to
# 0.2 past the largest averaged end time. Each sample stores how many
# processes have an averaged end time (rounded to two decimals) at or before it.

def _cumulative_end_counts(t_end):
    """Return the running count of finished processes on a 0.01-step time grid.

    Parameters
    ----------
    t_end : iterable of float
        Averaged end times of one table.

    Returns
    -------
    list of int
        One value per grid step: the number of end times whose two-decimal
        rounding matched this step or an earlier one.
    """
    # The grid is already ascending, so no extra sort is needed.
    t_v = [
        round(x, ndigits=2)
        for x in np.arange(start=0, stop=max(t_end) + 0.2, step=0.01)
    ]

    # Count how many processes finish at each rounded time once, instead of
    # rescanning the whole list of end times for every grid step.
    finished_at = {}
    for x in t_end:
        tick = round(x, ndigits=2)
        finished_at[tick] = finished_at.get(tick, 0) + 1

    v_count = []
    n = 0
    for time_step in t_v:
        n += finished_at.get(time_step, 0)
        v_count.append(n)
    return v_count


def _counter_frame(file_key):
    """Return the counters of one file as a DataFrame with one ``proc_<count>`` column each.

    The columns have different lengths; wrapping them in Series lets the
    DataFrame constructor pad the shorter ones with NaN.
    """
    return pd.DataFrame({
        f"proc_{k}": pd.Series(v) for k, v in prc_counter_dict[file_key].items()
    })


prc_counter_dict = {}
for txtfile, t_dict in excel_pages.items():
    prc_counter_dict[txtfile] = {
        n_proc: _cumulative_end_counts(page["t_end"])
        for n_proc, page in t_dict.items()
    }

df_counter_Wang2021 = _counter_frame("Wang2021")
df_counter_HWang2020_N5 = _counter_frame("H.Wang2020_N5")
df_counter_HWang2020_N10 = _counter_frame("H.Wang2020_N10")
df_counter_Zheng2014_N5 = _counter_frame("Zheng2014_N5")
df_counter_Zheng2014_N10 = _counter_frame("Zheng2014_N10")

# NOTE(review): these take the maximum of the cumulative *count* list of the
# "1000" configuration, yet the value is exported under "general_time_max".
# Confirm whether a maximum time was intended here.
max_max_N5 = round(max(prc_counter_dict["Zheng2014_N5"]["1000"]), ndigits=2)
max_max_N10 = round(max(prc_counter_dict["Zheng2014_N10"]["1000"]), ndigits=2)
df_max_max_N5 = pd.DataFrame({"general_time_max": [max_max_N5]})
df_max_max_N10 = pd.DataFrame({"general_time_max": [max_max_N10]})

# The writer does not create missing folders, so make sure the target exists.
os.makedirs("data_timestamp", exist_ok=True)

with pd.ExcelWriter('data_timestamp/prc_counter_ts.xlsx') as writer:
    df_counter_Wang2021.to_excel(writer, sheet_name="Wang2021")
    df_counter_HWang2020_N5.to_excel(writer, sheet_name="H.Wang2020_N5")
    df_counter_Zheng2014_N5.to_excel(writer, sheet_name="Zheng2014_N5")
    df_counter_HWang2020_N10.to_excel(writer, sheet_name="H.Wang2020_N10")
    df_counter_Zheng2014_N10.to_excel(writer, sheet_name="Zheng2014_N10")
    df_max_max_N5.to_excel(writer, sheet_name="time_max_N5")
    df_max_max_N10.to_excel(writer, sheet_name="time_max_N10")
                                        