## Import libraries

In [4]:
from pathlib import Path

import numpy as np
import pandas as pd

## Read the data

Get a dataframe for each experiment

In [5]:
# Load the raw results of both experiments in one go
# (first path -> df_ba, second path -> df_er).
df_ba, df_er = map(
    pd.read_parquet,
    (
        'results/BA-norel/results.parquet',
        'results/ER-norel/results.parquet',
    ),
)

In [7]:
# Distinct seeds present in the BA results (still stored as strings at this point).
seed_values = df_ba["Seed"].unique()
seed_values

array(['110396', '151195', '300997', '10664', '21297', '30997', '70799',
       '90597', '42', '80824'], dtype=object)

In [8]:
# Short tag -> dataframe. The tags ("ba", "er") are what the later cells use
# to tell the two experiments apart.
dfs = dict(ba=df_ba, er=df_er)

In [9]:
# Sanity check: print the (rows, columns) shape of every loaded dataframe.
for name, df in dfs.items():
    print(f"{name} {df.shape}")

ba (1400, 28)
er (1400, 28)


## Set topology

In [10]:
# Tag prefix -> human-readable topology name written into the "Topology" column.
topology_by_prefix = {
    "ba": "Barabasi-Albert",
    "er": "Erdos-Renyi",
}

for name, df in dfs.items():
    for prefix, topology in topology_by_prefix.items():
        if name.startswith(prefix):
            # Same constant value for every row of this dataframe.
            df["Topology"] = topology
            break
    else:
        # No known prefix matched this dataframe's tag.
        raise ValueError("Unknown topology")

Remove the columns that are not needed for the further analysis

In [11]:
# Columns kept for the rest of the analysis; every other column is dropped.
cols_to_get = [
    'Output', 'Inferences', 'Time', 'Seed', 'RepProb', 'Infr', 'Flows',
    'Nodes', 'Edges', 'Topology', 'Timeout', 'timestamp',
    'time_this_iter_s', 'time_total_s',
]

for df in dfs.values():
    # Drop in place: the objects stored in dfs are df_ba / df_er themselves,
    # so they are the ones being trimmed.
    unused_cols = df.columns[~df.columns.isin(cols_to_get)]
    df.drop(columns=unused_cols, inplace=True)

Set the types of the columns

In [12]:
# Target dtype for each column that gets an explicit cast.
column_dtypes = {
    "Inferences": np.int64,
    "Time": np.float64,
    "Seed": np.int32,
    "RepProb": np.float64,
    "Flows": np.int32,
    "Nodes": np.int16,
    "Edges": np.int32,
}

for df in dfs.values():
    # Collapse "Output" into three labels: "{}" becomes 'no_result',
    # 'timeout' / 'no_result' are kept, anything else counts as 'success'.
    output = df['Output'].mask(df['Output'] == "{}", 'no_result')
    df['Output'] = output.where(output.isin(['no_result', 'timeout']), 'success')

    # The string "None" is mapped to 0 so the column can be cast to an integer type.
    df["Inferences"] = df["Inferences"].replace({"None": 0})

    for column, dtype in column_dtypes.items():
        df[column] = df[column].astype(dtype)

    # Every row gets the same timeout value
    # (unit is not stated here -- presumably seconds; TODO confirm).
    df["Timeout"] = 1800

Statistics on the number of _success_, _timeout_ and _no_result_ labels

In [13]:
# Number of rows per "Output" label, reported separately for each experiment.
for name, df in dfs.items():
    label_counts = df.groupby("Output").size()
    print(name, label_counts)

ba Output
success    1366
timeout      34
dtype: int64
er Output
success    1400
dtype: int64


Merge all the datasets into two dataframes: one for the _original_ and one for the _modified/new_ methodology

In [14]:
# Disabled: merging both experiments into a single sorted dataframe.
# Each cleaned dataframe is written to its own file instead (see below).
# df_merged = pd.concat(list(dfs.values()))
# df_sorted = df_merged.sort_index().sort_values(by=["Topology", "Seed", "RepProb", "Nodes", "Flows"])
# df_sorted.to_parquet("results/all.parquet")

# to_parquet does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
Path("results/clean").mkdir(parents=True, exist_ok=True)

df_ba.to_parquet("results/clean/ba-norel.parquet")
df_er.to_parquet("results/clean/er-norel.parquet")