# Simple statistical analysis of the results

For the full data set the computations can take a while, so we recommend running this notebook
from the terminal: `jupyter nbconvert --execute visualise.ipynb`

In [1]:
%load_ext autoreload
%autoreload 2

import glob
import re
from itertools import product
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from matplotlib.backends.backend_pdf import PdfPages
from tqdm.notebook import tqdm

from src import visualisation
from src.loaders.net_loader import load_network

## Load data

In [2]:
# Keep only the files below the raw-results directory whose path contains a
# `batch_1/` ... `batch_4/` segment and ends with `.csv`.
batch_csv_pattern = re.compile(r"batch_([1-4])/.*\.csv$")
raw_result_paths = glob.glob(r"data/raw_results/**", recursive=True)
results = visualisation.Results(
    [path for path in raw_result_paths if batch_csv_pattern.search(path)]
)

# Output directory for the PDF and CSV files written further down in this notebook.
workdir = Path("data/processed_results")
workdir.mkdir(parents=True, exist_ok=True)

## Demo

In [3]:
# Parameters of the single case inspected in the demo cells below.
# Which network, and under which spreading protocol:
network_name = "l2_course_net_1"
protocol = "AND"
# Seed-selection method; the cells below also query its `d^`-prefixed counterpart.
ss_method = "random"
# Numeric settings of the case (passed as `seed_budget` and `mi_value` to `get_slice`).
budget = 15
mi_value = 0.10

# Graph of the demo network, needed for the centrality plot.
network_graph = load_network(network_name, as_tensor=False)

In [None]:
# Slice of the results for the demo case with the un-prefixed seed-selection method.
nml_case = {
    "protocol": protocol,
    "mi_value": mi_value,
    "seed_budget": budget,
    "network": network_name,
}
r_slice_nml = results.get_slice(**nml_case, ss_method=ss_method)
r_slice_nml

In [None]:
# Slice of the results for the same demo case, but with the `d^`-prefixed
# seed-selection method (presumably the MDS-based variant - TODO confirm).
mds_case = {
    "protocol": protocol,
    "mi_value": mi_value,
    "seed_budget": budget,
    "network": network_name,
}
r_slice_mds = results.get_slice(**mds_case, ss_method=f"d^{ss_method}")
r_slice_mds

In [None]:
# Single small axis comparing the spreading dynamics of both demo slices.
fig, ax = plt.subplots(figsize=(2, 1.5))
demo_plotter = visualisation.Plotter()

# Both slices are first reduced with `mean_expositions_rec` before plotting.
mean_rec_mds = results.mean_expositions_rec(r_slice_mds)
mean_rec_nml = results.mean_expositions_rec(r_slice_nml)

demo_plotter.plot_single_comparison_dynamics(
    record_mds=mean_rec_mds,
    record_nml=mean_rec_nml,
    actors_nb=results.get_actors_nb(r_slice_mds),
    mi_value=mi_value,
    seed_budget=budget,
    ax=ax,
)

In [None]:
# Degree centralities of the demo network together with their histogram.
all_centralities, histogram = results.prepare_centrality(network_graph, "degree")

# Single small axis comparing both demo slices against those centralities.
fig, ax = plt.subplots(figsize=(2, 1.5))
demo_centrality_plotter = visualisation.Plotter()
demo_centrality_plotter.plot_single_comparison_centralities(
    record_mds=r_slice_mds,
    record_nml=r_slice_nml,
    all_centralities=all_centralities,
    hist_centralities=histogram,
    mi_value=mi_value,
    seed_budget=budget,
    ax=ax,
)

## Plot visualisations of spreading dynamics to PDF

In [7]:
# Plotter driving the page/figure iteration, and the multi-page PDF that collects the pages.
plotter = visualisation.Plotter()
pdf = PdfPages(workdir / "expositions.pdf")

In [None]:
# Render one PDF page per case yielded by `plotter.yield_page()`. Every page is a grid of
# spreading-dynamics comparisons: one row per seed budget, one column per mi value.
# `page_case` is read as (network, protocol, ss_method), `fig_case` as (seed_budget, mi_value).
n_cols = len(plotter._mi_values)

try:
    for page_case in plotter.yield_page():
        print(page_case)
        page_network, page_protocol, page_ssm = page_case[0], page_case[1], page_case[2]

        # Each protocol has its own list of seed budgets, hence its own number of rows.
        page_seed_budgets = (
            plotter._seed_budgets_and if page_protocol == "AND" else plotter._seed_budgets_or
        )
        fig, axs = plt.subplots(
            nrows=len(page_seed_budgets),
            ncols=n_cols,
            figsize=(15, 20),
            squeeze=False,  # keep `axs` two-dimensional even for a single row or column
        )

        for fig_idx, fig_case in tqdm(enumerate(plotter.yield_figure(protocol=page_protocol))):
            # Figures fill the grid row by row.
            row_idx, col_idx = divmod(fig_idx, n_cols)
            case_budget, case_mi = fig_case[0], fig_case[1]
            case_ax = axs[row_idx][col_idx]

            nml_slice = results.get_slice(
                protocol=page_protocol,
                mi_value=case_mi,
                seed_budget=case_budget,
                network=page_network,
                ss_method=page_ssm,
            )
            mds_slice = results.get_slice(
                protocol=page_protocol,
                mi_value=case_mi,
                seed_budget=case_budget,
                network=page_network,
                ss_method=f"d^{page_ssm}",
            )

            # Without data for both variants there is nothing to compare: draw a placeholder.
            if len(nml_slice) == 0 or len(mds_slice) == 0:
                plotter.plot_dummy_fig(
                    mi_value=case_mi,
                    seed_budget=case_budget,
                    ax=case_ax,
                )
            else:
                plotter.plot_single_comparison_dynamics(
                    record_mds=results.mean_expositions_rec(mds_slice),
                    record_nml=results.mean_expositions_rec(nml_slice),
                    actors_nb=results.get_actors_nb(nml_slice),
                    mi_value=case_mi,
                    seed_budget=case_budget,
                    ax=case_ax,
                )

        fig.tight_layout(pad=.5, rect=(0.05, 0.05, 0.95, 0.95))
        fig.suptitle(f"Network: {page_network}, Protocol: {page_protocol}, SSM: {page_ssm}")
        fig.savefig(pdf, format="pdf")
        # Release the figure right away; many large figures are created in this loop.
        plt.close(fig)
finally:
    # Finalise the PDF even if a page fails, so the file handle is not left open.
    pdf.close()

## Plot visualisations of seed distributions to PDF

In [14]:
# Pre-compute, for every network present in the results, its graph and the two centrality
# measures used by the distribution plots. `prepare_centrality` returns a pair that is stored
# as {"centr": <first element>, "hist": <second element>}.
# The loop uses its own variable names so that the demo's `network_name` is not overwritten.
# NOTE: the dictionary name (with its typo) is kept because a later cell reads it.
newtorks_centralities = {}
for net_name in results.raw_df["network"].unique():
    net_graph = load_network(net_name, as_tensor=False)
    net_entry = {"graph": net_graph}
    for centrality_kind in ("degree", "neighbourhood_size"):
        centrality = results.prepare_centrality(net_graph, centrality_kind)
        net_entry[centrality_kind] = {"centr": centrality[0], "hist": centrality[1]}
    newtorks_centralities[net_name] = net_entry

In [15]:
# Fresh plotter and the multi-page PDF that collects the seed-distribution pages.
plotter = visualisation.Plotter()
pdf = PdfPages(workdir / "distributions.pdf")

In [None]:
# Render one PDF page per case yielded by `plotter.yield_page()`. Every page is a grid of
# seed-centrality comparisons: one row per seed budget, one column per mi value.
# `page_case` is read as (network, protocol, ss_method), `fig_case` as (seed_budget, mi_value).
n_cols = len(plotter._mi_values)

# NOTE(review): only the first `max_pages` pages are rendered, which looks like a leftover
# from debugging - set to None to render every page. Confirm the intended value.
max_pages = 2

try:
    for page_idx, page_case in enumerate(plotter.yield_page()):
        print(page_case)
        page_network, page_protocol, page_ssm = page_case[0], page_case[1], page_case[2]

        # `plotter._centralities` gives, per seed-selection method, the key of the
        # centrality measure stored in `newtorks_centralities`.
        centr_name = plotter._centralities[page_ssm]
        network_centrality = newtorks_centralities[page_network][centr_name]

        # Each protocol has its own list of seed budgets, hence its own number of rows.
        page_seed_budgets = (
            plotter._seed_budgets_and if page_protocol == "AND" else plotter._seed_budgets_or
        )
        fig, axs = plt.subplots(
            nrows=len(page_seed_budgets),
            ncols=n_cols,
            figsize=(15, 20),
            squeeze=False,  # keep `axs` two-dimensional even for a single row or column
        )

        for fig_idx, fig_case in tqdm(enumerate(plotter.yield_figure(protocol=page_protocol))):
            # Figures fill the grid row by row.
            row_idx, col_idx = divmod(fig_idx, n_cols)
            case_budget, case_mi = fig_case[0], fig_case[1]
            case_ax = axs[row_idx][col_idx]

            nml_slice = results.get_slice(
                protocol=page_protocol,
                mi_value=case_mi,
                seed_budget=case_budget,
                network=page_network,
                ss_method=page_ssm,
            )
            mds_slice = results.get_slice(
                protocol=page_protocol,
                mi_value=case_mi,
                seed_budget=case_budget,
                network=page_network,
                ss_method=f"d^{page_ssm}",
            )

            # Without data for both variants there is nothing to compare: draw a placeholder.
            if len(nml_slice) == 0 or len(mds_slice) == 0:
                plotter.plot_dummy_fig(
                    mi_value=case_mi,
                    seed_budget=case_budget,
                    ax=case_ax,
                )
            else:
                plotter.plot_single_comparison_centralities(
                    record_mds=mds_slice,
                    record_nml=nml_slice,
                    all_centralities=network_centrality["centr"],
                    hist_centralities=network_centrality["hist"],
                    mi_value=case_mi,
                    seed_budget=case_budget,
                    ax=case_ax,
                )

        fig.tight_layout(pad=.5, rect=(0.05, 0.05, 0.95, 0.95))
        fig.suptitle(f"Network: {page_network}, Protocol: {page_protocol}, SSM: {page_ssm}")
        fig.savefig(pdf, format="pdf")
        # Release the figure right away; many large figures are created in this loop.
        plt.close(fig)

        if max_pages is not None and page_idx + 1 >= max_pages:
            break
finally:
    # Finalise the PDF even if a page fails, so the file handle is not left open.
    pdf.close()

## Statistics of seed sets used in simulations

In [None]:
# A simulated case is a (network, protocol, seed_budget, mi_value, ss_method) tuple.
# Every seed-selection method is analysed twice: `d^`-prefixed first, then un-prefixed.
plotter_cls = visualisation.Plotter
all_ss_methods = [
    *[f"d^{ssm}" for ssm in plotter_cls._ss_methods],
    *plotter_cls._ss_methods,
]

# All cases of the OR protocol (with the OR seed budgets) ...
iterator_or = list(
    product(
        plotter_cls._networks,
        [plotter_cls._protocol_or],
        plotter_cls._seed_budgets_or,
        plotter_cls._mi_values,
        all_ss_methods,
    )
)
# ... and all cases of the AND protocol (with the AND seed budgets).
iterator_and = list(
    product(
        plotter_cls._networks,
        [plotter_cls._protocol_and],
        plotter_cls._seed_budgets_and,
        plotter_cls._mi_values,
        all_ss_methods,
    )
)

# Column names describing a case, in the same order as the tuple elements.
case_columns = ("network", "protocol", "seed_budget", "mi_value", "ss_method")

similarity_list = []
for iterator in (iterator_or, iterator_and):
    for simulated_case in tqdm(iterator):
        seed_sets = results.obtain_seed_sets_for_simulated_case(results.raw_df, *simulated_case)
        similarity = visualisation.analyse_set_similarity(seed_sets)
        # One row per case: the case description followed by its similarity statistics.
        similarity_list.append({**dict(zip(case_columns, simulated_case)), **similarity})

similarity_df = pd.DataFrame(similarity_list)
similarity_df

In [10]:
# Persist the per-case seed-set similarity table next to the other processed results.
similarities_path = workdir / "similarities.csv"
similarity_df.to_csv(similarities_path)

## Statistical analysis of MDS rankings

In [None]:
# Ranking archives of the two batches that are read for minimal dominating sets.
zip_1_path = "data/raw_results/batch_1/rankings.zip"
zip_2_path = "data/raw_results/batch_2/rankings.zip"

# Read both archives (a fresh parser per archive) and concatenate their records in order.
used_mds_list = []
for rankings_zip in (zip_1_path, zip_2_path):
    used_mds_list.extend(visualisation.JSONParser().read_minimal_dominating_sets(rankings_zip))

used_mds_df = pd.DataFrame(used_mds_list)
used_mds_df

In [None]:
# One case per (ss_method, network) pair built from the values present in the MDS table.
iterator_mds = list(
    product(
        used_mds_df["ss_method"].unique(),
        used_mds_df["network"].unique(),
    )
)

# For every case: length statistics of its dominating sets plus their mutual similarity.
mds_similarity_list = []
for case_ss_method, case_network in tqdm(iterator_mds):
    # Select rows and the column in a single `.loc` call instead of chained indexing.
    case_mds = used_mds_df.loc[
        (used_mds_df["ss_method"] == case_ss_method) & (used_mds_df["network"] == case_network),
        "mds",
    ]
    if case_mds.empty:
        # The cartesian product can contain pairs with no recorded set; the length
        # statistics are undefined for them (np.max / np.min raise on empty input).
        continue

    mds_lengths = [len(cm) for cm in case_mds]
    mds_similarity = visualisation.analyse_set_similarity(case_mds)
    mds_similarity_list.append(
        {
            "network": case_network,
            "ss_method": case_ss_method,
            "max_mds_length": np.max(mds_lengths),
            "min_mds_length": np.min(mds_lengths),
            "avg_mds_length": np.mean(mds_lengths),
            "std_mds_length": np.std(mds_lengths),
            **mds_similarity,
        }
    )

mds_similarity_df = pd.DataFrame(mds_similarity_list)
mds_similarity_df

In [13]:
# Persist the MDS similarity summary and the raw table of dominating sets it was built from.
mds_similarity_df.to_csv(workdir / "similarities_mds.csv")
used_mds_df.to_csv(workdir / "used_mds.csv")