## Import libraries

In [1]:
import pandas as pd
import numpy as np

## Read the data

Get a dataframe for each experiment

In [2]:
# Read the results table of every experiment. The frames are produced in the
# same order as the names on the left-hand side, so each name is bound to the
# parquet file listed at the same position.
(
    df_ba,
    df_ba_more,
    df_ba_norel,
    df_er,
    df_er_more,
    df_er_norel,
) = (
    pd.read_parquet(results_path)
    for results_path in (
        "results/BA/results.parquet",
        "results/BA-more/results.parquet",
        "results/BA-norel/results.parquet",
        "results/ER07/results.parquet",
        "results/ER07-more/results.parquet",
        "results/ER07-norel/results.parquet",
    )
)

In [3]:
# Index the experiment dataframes by a short label so that the following cells
# can process all of them in a single loop. Insertion order is the iteration
# order used by those loops.
dfs = dict(
    ba=df_ba,
    er=df_er,
    ba_more=df_ba_more,
    er_more=df_er_more,
    ba_norel=df_ba_norel,
    er_norel=df_er_norel,
)

In [4]:
# Report the shape of each experiment dataframe next to its label.
for name, df in dfs.items():
    print(f"{name} {df.shape}")

ba (10500, 28)
er (10500, 28)
ba_more (2100, 28)
er_more (2100, 28)
ba_norel (4200, 28)
er_norel (4200, 28)


## Set topology

In [5]:
# Tag every row with the graph topology of its experiment. The topology is
# derived from the prefix of the experiment label; prefixes are tried in the
# order they are listed here.
topology_by_prefix = {
    "ba": "Barabasi-Albert",
    "er": "Erdos-Renyi",
}

for name, df in dfs.items():
    matching = [
        topology
        for prefix, topology in topology_by_prefix.items()
        if name.startswith(prefix)
    ]
    if not matching:
        # A label with an unexpected prefix means `dfs` and this table disagree.
        raise ValueError("Unknown topology")
    df["Topology"] = matching[0]

Remove the columns that are not used in the further analysis

In [6]:
# Columns kept for the rest of the analysis; every other column is removed.
cols_to_get = [
    'Output',
    'Inferences',
    'Time',
    'Seed',
    'RepProb',
    'Infr',
    'Flows',
    'Nodes',
    'Edges',
    'Topology',
    'timestamp',
    'time_this_iter_s',
    'time_total_s',
]

for df in dfs.values():
    # Select the labels that are absent from the keep-list and drop them in
    # place, so the module-level dataframes (df_ba, ...) are updated as well.
    unused_cols = df.columns[~df.columns.isin(cols_to_get)]
    df.drop(columns=unused_cols, inplace=True)

Set the types of the columns

In [7]:
def _label_output(value):
    """Keep 'no_result' and 'timeout' as they are; map anything else to 'success'."""
    return value if value in ('no_result', 'timeout') else 'success'


# Target dtype of each numeric column; the casts are applied in this order.
column_dtypes = {
    "Inferences": np.int64,
    "Time": np.float64,
    "Seed": np.int32,
    "RepProb": np.float64,
    "Flows": np.int32,
    "Nodes": np.int16,
    "Edges": np.int32,
}

for df in dfs.values():
    df['Output'] = df['Output'].apply(_label_output)
    # The string "None" is replaced by 0 so the column can be cast to an
    # integer dtype below.
    df["Inferences"] = df["Inferences"].replace({"None": 0})

    # Assign column by column so the dataframes are updated in place.
    for column, dtype in column_dtypes.items():
        df[column] = df[column].astype(dtype)

Statistics on the number of _success_, _timeout_ and _no_result_ labels

In [8]:
# Print, for each experiment, the number of rows labelled "success",
# "timeout" and "no_result" (in that order).
for name, df in dfs.items():
    label_counts = [
        int((df["Output"] == label).sum())
        for label in ("success", "timeout", "no_result")
    ]
    print(name, *label_counts)

ba 5077 5223 200
er 6978 3519 3
ba_more 77 1983 40
er_more 656 1444 0
ba_norel 3830 370 0
er_norel 4200 0 0


Merge all the datasets into two dataframes: one for the _original_ and one for the _modified/new_ methodology

In [9]:
# "norel" experiments: merge both topologies into one dataframe and write it
# to its own parquet file. RepProb is removed from this dataset (it is not one
# of its sort keys either). `DataFrame.drop` returns a new dataframe and leaves
# the caller untouched, so its result must be kept; calling it as a bare
# statement silently leaves the column in the exported file.
df_merged_norel = pd.concat([df_ba_norel, df_er_norel]).drop(columns=["RepProb"])
df_sorted_norel = df_merged_norel.sort_index().sort_values(by=["Topology", "Seed", "Nodes", "Flows"])
df_sorted_norel.to_parquet("results/all-norel.parquet")

# Remaining experiments (base and "more" runs of both topologies): merge them,
# order the rows by experiment parameters and write them to a single file.
df_merged = pd.concat([df_ba, df_er, df_ba_more, df_er_more])
df_sorted = df_merged.sort_index().sort_values(by=["Topology", "Seed", "RepProb", "Nodes", "Flows"])
df_sorted.to_parquet("results/all.parquet")