In [23]:
import wandb
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import dill
import copy
import numpy as np

# Notebook-wide plotting default: no extra horizontal margin around the data
# on the x axis for every figure created after this line.
plt.rcParams["axes.xmargin"] = 0

In [24]:
def get_sweeps(project):
    """Collect ``(name, id)`` pairs for the sweeps yielded by ``project``.

    Args:
        project: Iterable of sweep objects exposing ``name`` and ``id``
            attributes (the analysis cells pass the result of
            ``wandb.Api().project(...).sweeps()``).

    Returns:
        list[tuple]: One ``(sweep.name, sweep.id)`` tuple per sweep, in the
        reverse of the order in which ``project`` yields them.
    """
    name_id_pairs = [(entry.name, entry.id) for entry in project]
    name_id_pairs.reverse()
    return name_id_pairs


def get_run_links(sweeps, project_name):
    """Look up the run ids that belong to each sweep of a W&B project.

    Args:
        sweeps: Iterable of ``(sweep_name, sweep_id)`` pairs, e.g. the output
            of ``get_sweeps``.
        project_name: Project name; sweeps are addressed as
            ``lucacorbucci/{project_name}/{sweep_id}``.

    Returns:
        list[tuple]: ``(sweep_name, run_ids)`` pairs in the same order as
        ``sweeps``. Each ``run_ids`` list is the reverse of the order in
        which ``sweep.runs`` yields the runs.
    """
    links_per_sweep = []
    for label, sweep_id in sweeps:
        # A fresh API client is created for every sweep lookup.
        api = wandb.Api()
        fetched_sweep = api.sweep(f"lucacorbucci/{project_name}/{sweep_id}")
        run_ids = [member.id for member in fetched_sweep.runs]
        run_ids.reverse()
        links_per_sweep.append((label, run_ids))
    return links_per_sweep


def get_run_data(run_links_per_sweep, project_name):
    """Download the logged history of every run, grouped by sweep.

    Args:
        run_links_per_sweep: Iterable of ``(sweep_name, run_ids)`` pairs,
            e.g. the output of ``get_run_links``.
        project_name: Project name; runs are addressed as
            ``lucacorbucci/{project_name}/{run_id}``.

    Returns:
        list[tuple]: ``(sweep_name, histories)`` pairs in input order, where
        ``histories`` holds one ``pandas.DataFrame`` per run id, built from
        that run's ``scan_history()``.
    """
    downloaded = []
    for label, run_ids in run_links_per_sweep:
        # One progress line per sweep.
        print("Downloading data for a sweep")
        histories = [
            pd.DataFrame(
                wandb.Api().run(f"lucacorbucci/{project_name}/{run_id}").scan_history()
            )
            for run_id in run_ids
        ]
        downloaded.append((label, histories))
    return downloaded


def remove_nan(column_names, dataframe):
    """Build a NaN-free frame from the requested columns of ``dataframe``.

    Every requested column is cleaned independently: its NaN entries are
    dropped, then all columns are truncated to the length of the shortest one
    so they can be combined again. Rows of the result are therefore aligned
    by position among the surviving values, not by the original index.

    Args:
        column_names: Names of the columns to keep. Names that are not
            present in ``dataframe`` are silently ignored.
        dataframe: Source ``pandas.DataFrame``.

    Returns:
        pandas.DataFrame: One column per requested name found in
        ``dataframe`` (in the requested order), all of equal length. An empty
        DataFrame is returned when none of the requested columns exist.
    """
    present = [name for name in column_names if name in dataframe.columns]

    # Drop the NaNs of each column on its own; columns may end up with
    # different lengths at this point.
    cleaned = {name: list(dataframe[name].dropna()) for name in present}

    # ``default=0`` keeps ``min`` from raising ValueError when no requested
    # column is present in the frame.
    shortest = min((len(values) for values in cleaned.values()), default=0)

    # Truncate every column to the common length and rebuild the frame.
    return pd.DataFrame({name: values[:shortest] for name, values in cleaned.items()})


def create_avg_dataset(new_list):
    """Summarise several series as a per-position mean/std table.

    NOTE: another ``create_avg_dataset`` is defined further down in this
    cell; when the cell runs, that later definition replaces this one.

    Args:
        new_list: 2-D array-like (one row per series); the statistics are
            taken across rows (``axis=0``).

    Returns:
        pandas.DataFrame with four columns:
            ``mean``  - mean across series at each position,
            ``dots``  - the mean at positions where it is strictly greater
                        than every earlier mean (and than 0), missing
                        elsewhere,
            ``std``   - standard deviation across series at each position,
            ``index`` - 0-based position.
    """
    stacked_mean = np.mean(new_list, axis=0)
    stacked_std = np.std(new_list, axis=0)

    # Keep a marker only where the mean sets a new record; the record to
    # beat starts at 0.
    record = 0
    record_points = []
    for avg in stacked_mean:
        if not avg > record:
            record_points.append(None)
            continue
        record = avg
        record_points.append(record)

    summary = pd.DataFrame()
    summary["mean"] = stacked_mean
    summary["dots"] = record_points
    summary["std"] = stacked_std
    summary["index"] = list(range(len(stacked_mean)))
    return summary


def prepare_pareto_frontier(data_lists):
    """Replace every series by its running maximum, starting from 0.

    Args:
        data_lists: Iterable of numeric sequences.

    Returns:
        list[list]: For each input sequence, a list of the same length whose
        i-th entry is the largest value seen up to position i, never lower
        than the starting value 0.
    """
    frontiers = []
    for series in data_lists:
        best_so_far = 0
        running = []
        for observed in series:
            # ``>=`` means a tie adopts the newer value.
            best_so_far = observed if observed >= best_so_far else best_so_far
            running.append(best_so_far)
        frontiers.append(copy.deepcopy(running))
    return frontiers


def create_avg_dataset(new_list):
    """Aggregate several series into a mean/std summary frame.

    Args:
        new_list: 2-D array-like with one row per series; mean and standard
            deviation are computed across rows (``axis=0``).

    Returns:
        pandas.DataFrame with the columns ``mean``, ``dots``, ``std`` and
        ``index`` (in that order). ``dots`` repeats the mean only at the
        positions where it is strictly greater than all earlier means and
        than 0, and is missing everywhere else; ``index`` is the 0-based
        position.
    """
    per_step_mean = np.mean(new_list, axis=0)
    per_step_std = np.std(new_list, axis=0)

    # Track the best mean seen so far (initially 0) and emit a marker only
    # when the current mean beats it.
    best = 0
    markers = []
    for current in per_step_mean:
        improved = current > best
        if improved:
            best = current
        markers.append(best if improved else None)

    columns = {
        "mean": per_step_mean,
        "dots": markers,
        "std": per_step_std,
        "index": list(range(0, len(per_step_mean))),
    }
    frame = pd.DataFrame()
    for label, values in columns.items():
        frame[label] = values
    return frame


def extract_last_custom_metrics(sweep):
    """Read the final ``Custom_metric`` value of every run history.

    Args:
        sweep: Iterable of groups, each group being an iterable of run
            history DataFrames (the analysis cells pass the per-sweep lists
            produced by ``get_run_data``).

    Returns:
        list[list]: For each group, the last non-NaN ``Custom_metric`` value
        of each DataFrame, with the string ``"-Infinity"`` replaced by 0.
    """
    collected = []
    for run_frames in sweep:
        finals = []
        for frame in run_frames:
            # remove_nan yields a single-column frame; take its last row.
            metric_rows = remove_nan(["Custom_metric"], frame).values.tolist()
            final_value = metric_rows[-1][0]
            finals.append(0 if final_value == "-Infinity" else final_value)
        collected.append(finals)
    return collected


# Functions to plot
def plot_pareto_frontier(mean_df_tunable, mean_df_fixed, name):
    """Plot, save and display the fixed-vs-tunable lambda comparison.

    For each of the two summary frames this draws the ``mean`` column as a
    line, the non-null ``dots`` entries as markers and a ``mean +/- std``
    shaded band. The figure is written to ``./plot_paper/Dutch/{name}.png``
    (the directory is created if needed) and then shown.

    Args:
        mean_df_tunable: DataFrame with ``index``, ``mean``, ``dots`` and
            ``std`` columns (as built by ``create_avg_dataset``) for the
            tunable-lambda sweep; drawn in blue.
        mean_df_fixed: Same layout for the fixed-lambda sweep; drawn in green.
        name: File stem of the saved PNG.
    """
    import os

    # Set the font size before any text is created so the title, axis labels
    # and legend of this figure all use it (the setting stays in effect for
    # later figures as well).
    plt.rcParams.update({"font.size": 22})

    fig, ax = plt.subplots(figsize=(15, 10))

    series = (
        (mean_df_tunable, "Tunable Lambda", "blue"),
        (mean_df_fixed, "Fixed Lambda", "green"),
    )

    # Mean as a line.
    for frame, label, color in series:
        ax.plot(frame["index"], frame["mean"], label=label, color=color)

    # Markers only where the "dots" column holds a value.
    for frame, _, color in series:
        has_dot = frame["dots"].notnull()
        ax.scatter(
            frame["index"][has_dot],
            frame["dots"][has_dot],
            color=color,
            marker="o",
        )

    # Standard deviation as a shaded band around the mean.
    for frame, _, color in series:
        ax.fill_between(
            frame["index"],
            frame["mean"] - frame["std"],
            frame["mean"] + frame["std"],
            alpha=0.2,
            color=color,
        )

    ax.set_title("Fixed vs Tunable Lambda")
    ax.set_xlabel("Experiments")
    ax.set_ylabel("Maximized Metric")
    ax.legend(loc="upper left")
    ax.grid(True)

    # Save before showing: once the figure has been shown, a later savefig
    # may write an empty image.
    output_dir = os.path.join(".", "plot_paper", "Dutch")
    os.makedirs(output_dir, exist_ok=True)
    fig.savefig(os.path.join(output_dir, f"{name}.png"))
    plt.show()

# Target 0.05 Epsilon 1

In [25]:
# W&B project analysed in this cell.
project_name = "Dutch_005_epsilon_1"

# List the project's sweeps, resolve their run ids and download every run history.
project = wandb.Api().project(project_name).sweeps()
sweeps = get_sweeps(project)
run_links = get_run_links(sweeps, project_name)
data = get_run_data(run_links, project_name)

# Keep the run histories of the two sweeps being compared.
tunable = [histories for label, histories in data if label == "prob_tunable_private"]
fixed = [histories for label, histories in data if label == "prob_fixed_private"]

# Final Custom_metric of each run.
custom_metrics_tunable = extract_last_custom_metrics(tunable)
custom_metrics_fixed = extract_last_custom_metrics(fixed)

# Running maximum over the runs of each sweep.
pareto_list_tunable = prepare_pareto_frontier(custom_metrics_tunable)
pareto_list_fixed = prepare_pareto_frontier(custom_metrics_fixed)

# Mean/std summary across sweeps, then the comparison plot.
df_pareto_tunable = create_avg_dataset(pareto_list_tunable)
df_pareto_fixed = create_avg_dataset(pareto_list_fixed)
plot_pareto_frontier(df_pareto_tunable, df_pareto_fixed, project_name)

Downloading data for a sweep
Downloading data for a sweep
Downloading data for a sweep
Downloading data for a sweep


# Target 0.05 Epsilon 0.5

In [None]:
# W&B project analysed in this cell.
project_name = "Dutch_005_epsilon_05"

# List the project's sweeps, resolve their run ids and download every run history.
project = wandb.Api().project(project_name).sweeps()
sweeps = get_sweeps(project)
run_links = get_run_links(sweeps, project_name)
data = get_run_data(run_links, project_name)

# Keep the run histories of the two sweeps being compared.
tunable = [histories for label, histories in data if label == "prob_tunable_private"]
fixed = [histories for label, histories in data if label == "prob_fixed_private"]

# Final Custom_metric of each run.
custom_metrics_tunable = extract_last_custom_metrics(tunable)
custom_metrics_fixed = extract_last_custom_metrics(fixed)

# Running maximum over the runs of each sweep.
pareto_list_tunable = prepare_pareto_frontier(custom_metrics_tunable)
pareto_list_fixed = prepare_pareto_frontier(custom_metrics_fixed)

# Mean/std summary across sweeps, then the comparison plot.
df_pareto_tunable = create_avg_dataset(pareto_list_tunable)
df_pareto_fixed = create_avg_dataset(pareto_list_fixed)
plot_pareto_frontier(df_pareto_tunable, df_pareto_fixed, project_name)

# Target 0.075 Epsilon 1

In [None]:
# W&B project analysed in this cell.
project_name = "Dutch_0075_epsilon_1"

# List the project's sweeps, resolve their run ids and download every run history.
project = wandb.Api().project(project_name).sweeps()
sweeps = get_sweeps(project)
run_links = get_run_links(sweeps, project_name)
data = get_run_data(run_links, project_name)

# Keep the run histories of the two sweeps being compared.
tunable = [histories for label, histories in data if label == "prob_tunable_private"]
fixed = [histories for label, histories in data if label == "prob_fixed_private"]

# Final Custom_metric of each run.
custom_metrics_tunable = extract_last_custom_metrics(tunable)
custom_metrics_fixed = extract_last_custom_metrics(fixed)

# Running maximum over the runs of each sweep.
pareto_list_tunable = prepare_pareto_frontier(custom_metrics_tunable)
pareto_list_fixed = prepare_pareto_frontier(custom_metrics_fixed)

# Mean/std summary across sweeps, then the comparison plot.
df_pareto_tunable = create_avg_dataset(pareto_list_tunable)
df_pareto_fixed = create_avg_dataset(pareto_list_fixed)
plot_pareto_frontier(df_pareto_tunable, df_pareto_fixed, project_name)

# Target 0.075 Epsilon 0.5

In [None]:
# W&B project analysed in this cell.
project_name = "Dutch_0075_epsilon_05"

# List the project's sweeps, resolve their run ids and download every run history.
project = wandb.Api().project(project_name).sweeps()
sweeps = get_sweeps(project)
run_links = get_run_links(sweeps, project_name)
data = get_run_data(run_links, project_name)

# Keep the run histories of the two sweeps being compared.
tunable = [histories for label, histories in data if label == "prob_tunable_private"]
fixed = [histories for label, histories in data if label == "prob_fixed_private"]

# Final Custom_metric of each run.
custom_metrics_tunable = extract_last_custom_metrics(tunable)
custom_metrics_fixed = extract_last_custom_metrics(fixed)

# Running maximum over the runs of each sweep.
pareto_list_tunable = prepare_pareto_frontier(custom_metrics_tunable)
pareto_list_fixed = prepare_pareto_frontier(custom_metrics_fixed)

# Mean/std summary across sweeps, then the comparison plot.
df_pareto_tunable = create_avg_dataset(pareto_list_tunable)
df_pareto_fixed = create_avg_dataset(pareto_list_fixed)
plot_pareto_frontier(df_pareto_tunable, df_pareto_fixed, project_name)

# Target 0.1 Epsilon 1

In [None]:
# W&B project analysed in this cell.
project_name = "Dutch_01_epsilon_1"

# List the project's sweeps, resolve their run ids and download every run history.
project = wandb.Api().project(project_name).sweeps()
sweeps = get_sweeps(project)
run_links = get_run_links(sweeps, project_name)
data = get_run_data(run_links, project_name)

# Keep the run histories of the two sweeps being compared.
tunable = [histories for label, histories in data if label == "prob_tunable_private"]
fixed = [histories for label, histories in data if label == "prob_fixed_private"]

# Final Custom_metric of each run.
custom_metrics_tunable = extract_last_custom_metrics(tunable)
custom_metrics_fixed = extract_last_custom_metrics(fixed)

# Running maximum over the runs of each sweep.
pareto_list_tunable = prepare_pareto_frontier(custom_metrics_tunable)
pareto_list_fixed = prepare_pareto_frontier(custom_metrics_fixed)

# Mean/std summary across sweeps, then the comparison plot.
df_pareto_tunable = create_avg_dataset(pareto_list_tunable)
df_pareto_fixed = create_avg_dataset(pareto_list_fixed)
plot_pareto_frontier(df_pareto_tunable, df_pareto_fixed, project_name)

# Target 0.1 Epsilon 0.5

In [None]:
# W&B project analysed in this cell.
project_name = "Dutch_01_epsilon_05"

# List the project's sweeps, resolve their run ids and download every run history.
project = wandb.Api().project(project_name).sweeps()
sweeps = get_sweeps(project)
run_links = get_run_links(sweeps, project_name)
data = get_run_data(run_links, project_name)

# Keep the run histories of the two sweeps being compared.
tunable = [histories for label, histories in data if label == "prob_tunable_private"]
fixed = [histories for label, histories in data if label == "prob_fixed_private"]

# Final Custom_metric of each run.
custom_metrics_tunable = extract_last_custom_metrics(tunable)
custom_metrics_fixed = extract_last_custom_metrics(fixed)

# Running maximum over the runs of each sweep.
pareto_list_tunable = prepare_pareto_frontier(custom_metrics_tunable)
pareto_list_fixed = prepare_pareto_frontier(custom_metrics_fixed)

# Mean/std summary across sweeps, then the comparison plot.
df_pareto_tunable = create_avg_dataset(pareto_list_tunable)
df_pareto_fixed = create_avg_dataset(pareto_list_fixed)
plot_pareto_frontier(df_pareto_tunable, df_pareto_fixed, project_name)