# Notebook for results concatenation and preprocessing

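Important note: during the research we decided to extend the analysis of the
AND protocol. Therefore you can find two types of `csv` files for each method -
one that was calculated originally (`results.csv`) and another one from the
repeated experiments (`results_recalc_and.csv`). Some methods additionally have
`results_recalc_single_layer.csv` and `results_recalc_single_layer_and.csv`,
which are read the same way.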

In [None]:
from pathlib import Path
import pandas as pd

## Random seed selector processing

In [None]:
# Results of the original run of the random seed selector.
random_path_1 = Path("_experiments/all_methods/random/results.csv")
random_df_1 = pd.read_csv(random_path_1)
# The csv was saved together with its index, which comes back as an unnamed
# first column - it carries no information, so get rid of it.
random_df_1 = random_df_1.drop(columns="Unnamed: 0")
print(len(random_df_1))
random_df_1.head()

In [None]:
# Results of the repeated experiments for the AND protocol.
random_path_2 = Path("_experiments/all_methods/random/results_recalc_and.csv")
random_df_2 = pd.read_csv(random_path_2)
# Drop the index column that was written to the csv file.
random_df_2 = random_df_2.drop(columns="Unnamed: 0")
print(len(random_df_2))
random_df_2.head()

In [None]:
# Results stored in the "single layer" recalculation file.
random_path_3 = Path("_experiments/all_methods/random/results_recalc_single_layer.csv")
random_df_3 = pd.read_csv(random_path_3)
# Drop the index column that was written to the csv file.
random_df_3 = random_df_3.drop(columns="Unnamed: 0")
print(len(random_df_3))
random_df_3.head()

In [None]:
# Results stored in the "single layer" recalculation file for the AND protocol.
random_path_4 = Path("_experiments/all_methods/random/results_recalc_single_layer_and.csv")
random_df_4 = pd.read_csv(random_path_4)
# Drop the index column that was written to the csv file.
random_df_4 = random_df_4.drop(columns="Unnamed: 0")
print(len(random_df_4))
random_df_4.head()

In [None]:
# Stack all partial results of the random selector into one dataframe with a
# fresh 0..n-1 index.
random_parts = [random_df_1, random_df_2, random_df_3, random_df_4]
random_df = pd.concat(random_parts, ignore_index=True)

# Sanity check: both printed numbers are expected to be equal.
print(len(random_df), sum(len(part) for part in random_parts))
random_df.head()

In [None]:
# Columns that are aggregated over the repetitions of an experiment instead of
# being used to identify it.
metric_columns = [
    "repetition_run",
    "gain",
    "diffusion_len",
    "active_actors_prct",
    "seed_actors_prct",
]

# Fail early (with the same exception type a failed `set.remove` would give)
# when the input does not have the expected layout.
missing_columns = [col for col in metric_columns if col not in random_df.columns]
if missing_columns:
    raise KeyError(f"Expected metric columns are missing: {missing_columns}")

# Keep the column order of the dataframe itself. Going through `set` made the
# order of both lists (and of the columns derived from them) vary between
# interpreter sessions because of string hash randomisation.
experiment_params = [col for col in random_df.columns if col not in metric_columns]
experiment_metrics = [col for col in random_df.columns if col in metric_columns]

print(f"Columns that are multi-indices: {experiment_params}")
print(f"Columns that have been left: {experiment_metrics}")

In [None]:
# Move all experiment parameters into the index, so that only the columns
# listed in `experiment_metrics` stay as regular columns.
reindexed_df = random_df.set_index(keys=experiment_params)
reindexed_df.head()

In [None]:
# Average every column listed in `experiment_metrics` over all rows that share
# the same index value of `reindexed_df` (i.e. the same experiment parameters).
# NOTE: `experiment_metrics` also contains "repetition_run", so that column is
# averaged as well.
averaged_random_df = pd.DataFrame()
for metric in experiment_metrics:
    # mean of a single metric within each group of identical index values
    avg = reindexed_df.groupby(reindexed_df.index)[str(metric)].mean()
    # attach the averaged metric as one more column of the result
    averaged_random_df = pd.concat([averaged_random_df, avg], axis=1)
# Rebuild a named MultiIndex from the group keys, so that `reset_index` can
# turn the experiment parameters back into regular columns.
averaged_random_df.index = pd.MultiIndex.from_tuples(averaged_random_df.index, names=experiment_params)
averaged_random_df = averaged_random_df.reset_index()

# Label every row with the name of the seed selection method.
averaged_random_df["selection_metric"] = "random"

print(f"Length of averaged random dataframe: {len(averaged_random_df)}")
averaged_random_df.head()
# Optional export of the averaged results, disabled by default:
# averaged_random_df.to_csv("random_avg.csv")

In [None]:
# Averaging should shrink the number of rows by exactly the number of
# repetitions (presumably `repetition_run` is numbered from 1 and every
# parameter combination was repeated the same number of times - verify).
rows_before_averaging = len(reindexed_df)
repetitions = max(reindexed_df["repetition_run"])
assert rows_before_averaging / repetitions == len(averaged_random_df)

## Greedy seed selector processing

Postprocessing of greedy is more complicated. First we need to remove the AND
results of the original experiment, because the extended evaluation of AND 
repeated all computations that had been done in the first experiment.

In [None]:
# Original run - rows computed for the AND protocol are discarded here and
# taken from the recalculated file instead.
greedy_path_1 = Path("_experiments/all_methods/greedy/results.csv")
greedy_df_1 = pd.read_csv(greedy_path_1).drop(columns="Unnamed: 0")
greedy_df_1 = greedy_df_1.loc[greedy_df_1["protocol"] != "AND"]

# Repeated experiments for the AND protocol.
greedy_path_2 = Path("_experiments/all_methods/greedy/results_recalc_and.csv")
greedy_df_2 = pd.read_csv(greedy_path_2).drop(columns="Unnamed: 0")

# For eu_transportation we conducted experiments with seeding budgets up to 40%
# both for the OR and AND protocols, but for all other seed selection methods
# the budgets for OR go only up to 30%, hence the redundant OR rows (budget of
# 31 and more) need to be dropped.
greedy_path_3 = Path("_experiments/all_methods/greedy/results_recalc_eutr.csv")
greedy_df_3 = pd.read_csv(greedy_path_3).drop(columns="Unnamed: 0")
is_redundant_or_row = (
    (greedy_df_3["protocol"] == "OR") & (greedy_df_3["seeding_budget"] >= 31)
)
# despite its name this frame holds the *rows* that are being removed
eutr_to_drop_cols = greedy_df_3.loc[is_redundant_or_row]
greedy_df_3 = greedy_df_3.loc[~is_redundant_or_row]

greedy_parts = [greedy_df_1, greedy_df_2, greedy_df_3]
greedy_df = pd.concat(greedy_parts, ignore_index=True)

# Sanity check: both printed numbers are expected to be equal.
expected_length = sum(len(part) for part in greedy_parts)
print(f"Length of concatenated greedy df: {len(greedy_df)}    {expected_length}")
greedy_df.head()

First we need to remove rows whose `seed_actors_prct` is inconsistent with the
reference data. With the greedy method we cannot jump e.g. directly from a 10%
seeding budget to 15% - we need to evaluate all sizes within that range. For the
other methods we could do that, hence to obtain a consistent visualisation we
will remove these intermediate results.

In [None]:
print(f"Length of raw dataframe: {len(greedy_df)}")
# Averaged results of the random selector define which seed set sizes are
# allowed for a given network.
reference_df = averaged_random_df

for net in greedy_df["network"].unique():

    # seed_actors_prct values present in the reference data for this network
    net_reference = reference_df.loc[reference_df["network"] == net]
    allowed_values = net_reference["seed_actors_prct"].unique()

    # greedy rows of this network whose seed_actors_prct (compared with a
    # precision of two decimal places) is absent from the reference data
    belongs_to_net = greedy_df["network"] == net
    has_allowed_prct = greedy_df["seed_actors_prct"].round(2).isin(
        allowed_values.round(2)
    )
    greedy_rows_to_be_dropped = greedy_df.loc[belongs_to_net & ~has_allowed_prct]

    print(f"Removing {len(greedy_rows_to_be_dropped)} rows for net: {net}")
    greedy_df = greedy_df.drop(greedy_rows_to_be_dropped.index)

# Label every row with the name of the seed selection method.
greedy_df["selection_metric"] = "greedy"

print(f"Length of processed dataframe: {len(greedy_df)}")
greedy_df.head()

The next thing is the alignment of the used seeding budgets. For small networks
seeding budgets like 1% and 2% of the actors of the network can yield the same 
seed set size. Greedy ignores that - it takes 1 actor, 2 actors, 3 actors and so
on. Therefore we need to align these records to obtain the same seeding budgets
as for the other methods.

In [None]:
# Align greedy results with the reference data: every greedy row is emitted
# once per reference row that has the same network, protocol, mi_value and
# seed_actors_prct (the last two compared with a precision of two decimal
# places). The experiment parameters are taken from the reference row, the
# measured values from the greedy row.
greedy_measurements = (
    "repetition_run",
    "diffusion_len",
    "active_actors_prct",
    "gain",
    "selection_metric",
)

# Rounded reference columns do not change between iterations - compute once.
reference_mi = reference_df["mi_value"].round(2)
reference_prct = reference_df["seed_actors_prct"].round(2)

# Records are collected in a list and converted into a dataframe only once;
# concatenating inside the loop copies the whole frame on every iteration.
aligned_records = []

for _, greedy_row in greedy_df.iterrows():

    reference_rows = reference_df.loc[
        (reference_df["network"] == greedy_row["network"]) &
        (reference_df["protocol"] == greedy_row["protocol"]) &
        (reference_mi == round(greedy_row["mi_value"], 2)) &
        (reference_prct == round(greedy_row["seed_actors_prct"], 2))
    ]

    # every greedy row that survived the filtering must have a counterpart
    if len(reference_rows) < 1:
        print(greedy_row)
        raise ValueError("Inconsistency in data!")

    for __, reference_row in reference_rows.iterrows():
        aligned_record = reference_row.to_dict()
        for column in greedy_measurements:
            aligned_record[column] = greedy_row[column]
        aligned_records.append(aligned_record)

# Same column layout as before: columns of the greedy dataframe first, then
# columns that exist only in the reference data (if any). The resulting frame
# has a regular 0..n-1 index instead of an index filled with zeros.
final_columns = [
    *greedy_df.columns,
    *[col for col in reference_df.columns if col not in greedy_df.columns],
]
final_greedy_df = pd.DataFrame(aligned_records, columns=final_columns)

del greedy_df

print(f"Length of final greedy dataframe: {len(final_greedy_df)}")
final_greedy_df.head()

## Processing of the other metrics

In [None]:
root_path = Path("_experiments/all_methods")
final_path = root_path.joinpath("all_results.csv")
# Remove the output of a previous run, it is regenerated at the end.
if final_path.exists():
    final_path.unlink()
# Every sub-directory holds the results of one seed selection method. Stray
# files (e.g. `.DS_Store`) are skipped, because they cannot be read as an
# experiment. Sorting makes the processing order, and thus the row order of the
# final csv, independent of the file system.
experiments = sorted(path for path in root_path.glob("*") if path.is_dir())

In [None]:
def prepare_csv(metric_path):
    """Read and concatenate all partial result files of one selection method.

    Parameters
    ----------
    metric_path : pathlib.Path
        Directory with the results of a single seed selection method. Its
        ``stem`` is stored in the ``selection_metric`` column.

    Returns
    -------
    pandas.DataFrame
        Rows of all partial result files that exist in ``metric_path``, with a
        fresh 0..n-1 index and an additional ``selection_metric`` column.

    Raises
    ------
    ValueError
        If none of the expected result files exists in ``metric_path``.
    """
    print(f"\n\n\nPreparing results for: {metric_path}")
    partial_names = (
        "results.csv",
        "results_recalc_and.csv",
        "results_recalc_single_layer.csv",
        "results_recalc_single_layer_and.csv",
    )
    partial_results = []
    for pr_path in partial_names:
        # Only a missing file is an expected situation. Any other problem
        # (e.g. a corrupted or empty file) must not be reported as "doesn't
        # exist" and silently skipped, so it is allowed to propagate.
        try:
            pr_df = pd.read_csv(metric_path.joinpath(pr_path), index_col=0)
        except FileNotFoundError:
            print(f"File {pr_path} doesn't exist for {metric_path}")
        else:
            partial_results.append(pr_df)
    if not partial_results:
        raise ValueError(f"No result files found in {metric_path}")
    df = pd.concat(partial_results, ignore_index=True)
    df["selection_metric"] = metric_path.stem
    partial_lengths = [len(partial) for partial in partial_results]
    print(f"{metric_path.stem}: " + "+".join(map(str, partial_lengths)) + f"={len(df)}")
    assert len(df) == sum(partial_lengths)
    return df

In [None]:
# Random and greedy have been processed separately above, so every experiment
# whose path contains one of these names is skipped here.
separately_processed = ("random", "greedy")
exp_dfs = [
    prepare_csv(experiment_path)
    for experiment_path in experiments
    if not any(name in str(experiment_path) for name in separately_processed)
]

## Saving final dataframe

In [None]:
# Combine the results of all methods. A new list is built instead of appending
# to `exp_dfs`, so that re-running this cell does not add the random and greedy
# results for a second time.
all_method_dfs = [*exp_dfs, averaged_random_df, final_greedy_df]
# ignore_index gives the combined dataframe a fresh 0..n-1 index
final_df = pd.concat(all_method_dfs, ignore_index=True)
# The repetition number is meaningless once results of all methods are merged.
final_df = final_df.drop("repetition_run", axis=1)
final_df.head()

In [None]:
# Methods that were run as separate "<name>_and" / "<name>_or" variants; both
# variants are stored under the common name. All remaining methods
# (degree_centrality, greedy, k_shell, k_shell_mln, neighbourhood_2_hop_size,
# neighbourhood_size, page_rank, page_rank_mln, random, vote_rank,
# vote_rank_mln) keep their names.
protocol_specific_methods = (
    "cbim",
    "cim",
    "degree_centrality_discount",
    "kpp_shell",
    "neighbourhood_size_discount",
)
mapping_ssm = {
    f"{method}_{protocol}": method
    for method in protocol_specific_methods
    for protocol in ("and", "or")
}

# Replace all raw names in one pass.
final_df["selection_metric"] = final_df["selection_metric"].replace(mapping_ssm)

print(final_df["selection_metric"].unique())
final_df.head()

In [None]:
# Store the merged results of all methods next to the experiment directories.
final_df.to_csv(root_path / "all_results.csv")

## Processing experiments for top methods 

In [None]:
root_path = Path("_experiments/top_methods")
final_path = root_path.joinpath("all_results.csv")
# Remove the output of a previous run, it is regenerated at the end.
if final_path.exists():
    final_path.unlink()
# Every sub-directory holds the results of one seed selection method. Stray
# files (e.g. `.DS_Store`) are skipped, because they cannot be read as an
# experiment. Sorting makes the processing order, and thus the row order of the
# final csv, independent of the file system.
experiments = sorted(path for path in root_path.glob("*") if path.is_dir())

In [None]:
def prepare_csv(metric_path):
    """Load ``results.csv`` of a single selection method.

    Parameters
    ----------
    metric_path : pathlib.Path
        Directory that contains ``results.csv``; its ``stem`` is stored in the
        ``selection_metric`` column.

    Returns
    -------
    pandas.DataFrame
        Content of ``results.csv`` (first csv column used as the index) with an
        additional ``selection_metric`` column.
    """
    method_name = metric_path.stem
    print(f"\nPreparing results for: {metric_path}")
    results = pd.read_csv(metric_path / "results.csv", index_col=0)
    results = results.assign(selection_metric=method_name)
    print(f"{method_name}: {len(results)}")
    return results

In [None]:
# One dataframe per experiment directory, in the order of `experiments`.
exp_dfs = list(map(prepare_csv, experiments))

In [None]:
# Merge the results of all top methods; the repetition number is not needed in
# the merged dataframe.
final_df = pd.concat(exp_dfs).drop(columns="repetition_run")
final_df.head()

In [None]:
# Every top method was run as separate "<name>_and" / "<name>_or" variants;
# both variants are stored under the common name.
protocol_specific_methods = (
    "degree_centrality_discount",
    "neighbourhood_size_discount",
    "page_rank",
    "vote_rank",
    "vote_rank_mln",
)
mapping_ssm = {
    f"{method}_{protocol}": method
    for method in protocol_specific_methods
    for protocol in ("and", "or")
}

# Replace all raw names in one pass.
final_df["selection_metric"] = final_df["selection_metric"].replace(mapping_ssm)

print(final_df["selection_metric"].unique())
final_df.head()

In [None]:
# Store the merged results of the top methods next to the experiment directories.
final_df.to_csv(root_path / "all_results.csv")