In [None]:
# IPython line magic: use the inline matplotlib backend for this notebook.
%matplotlib inline

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sklearn.metrics
import scipy
import adjustText
import matplotlib.ticker

In [None]:
# Set the default plot style
#default_plt_width = 15
#default_plt_height = 10
#plt.rcParams['figure.figsize'] = [default_plt_width, default_plt_height]

In [None]:
# Global seaborn/matplotlib styling applied before any figure is drawn.
# The calls mutate global state, so their order is kept exactly as written.
sns.set_style(style="whitegrid")
sns.set_context(context="paper")
# NOTE(review): sns.set(...) runs after set_style/set_context and may reset
# what they configured -- confirm the resulting look is the intended one.
sns.set(font_scale=1.1)
# NOTE(review): despine is called here before this notebook creates any
# figure; presumably it does not affect the plot drawn later -- confirm.
sns.despine(left=True)
sns.set_style(
    style="ticks",
    rc={
        "xtick.major.size": 8,
        "ytick.major.size": 8,
    },
)
# Colour palette: build it, preview it, then install it as the default.
cmap = sns.color_palette(palette="Set1")
sns.palplot(pal=cmap)
sns.set_palette(palette=cmap)
# printf-style format string for y-axis tick labels (three decimals).
plt_y_axis_fmt_string = "%.3f"

In [None]:
# Default prefix (no extension) of the result files to load; it is reassigned
# by the "Parameters" cell that follows.
filename_prefix = "aug_results_MNIST_3_vs_8_translate_10"

In [None]:
# Parameters
# NOTE(review): overrides the filename_prefix assigned in the previous cell;
# this looks like an injected-parameters cell (e.g. from papermill) -- confirm.
filename_prefix = "aug_results_CIFAR10_0_vs_1_crop_10_loss"


In [None]:
# Cluster counts to analyse: one result pickle is loaded per value, and the
# same values are used to select rows where n_auged equals n_clusters.
n_clusters = [1, 10, 50, 100, 250, 500, 750, 1000]

In [None]:
# Open the "<filename_prefix>.npz" archive holding the summary arrays.
npz_path = "{}.npz".format(filename_prefix)
runs_data = np.load(npz_path)

In [None]:
# Unpack the individual arrays stored in the .npz archive.
baseline_acc = runs_data["no_aug_no_poison_acc"]
poisoned_acc = runs_data["poisoned_acc"]
all_aug_train_poisoned_acc = runs_data["all_aug_train_poisoned_acc"]
n_aug_sample_points = runs_data["n_aug_sample_points"]
n_train = runs_data["n_train"]
VSV_acc = runs_data["VSV_acc"]
# np.int was only an alias of the builtin int; it was deprecated in NumPy 1.20
# and removed in 1.24, where it raises AttributeError. The builtin produces
# the same dtype on every NumPy version.
is_SV = runs_data["is_SV"].astype(int)
# Sum of the integer-cast is_SV flags (presumably a boolean mask, in which
# case this is the number of flagged entries -- confirm).
n_SV = np.sum(is_SV)

In [None]:
# Inspection only: materialise every array of the archive into a dict and
# display it as the cell output.
dict(runs_data)

In [None]:
#runs_data_inf = pd.read_pickle("{}.pkl".format(filename_prefix))
#runs_data_loss = pd.read_pickle("{}.pkl".format(filename_prefix_margin))

In [None]:
# Load one result DataFrame per cluster count and tag each with its count.
# File names are "<prefix>_<k>.pkl"; for prefixes ending in "_loss" the count
# is inserted before that suffix instead ("<prefix minus _loss>_<k>_loss.pkl"),
# because the loss naming convention conflicts with the n_clusters number.
# NOTE: pd.read_pickle unpickles arbitrary objects -- only load trusted files.
is_loss_run = filename_prefix.endswith("_loss")
stem = filename_prefix[:-len("_loss")] if is_loss_run else filename_prefix
suffix = "_loss.pkl" if is_loss_run else ".pkl"

runs_data_clusters = []
for i in n_clusters:
    runs_data_n_clusters = pd.read_pickle("{}_{}{}".format(stem, i, suffix))
    runs_data_n_clusters["n_clusters"] = i
    runs_data_clusters.append(runs_data_n_clusters)

# The un-suffixed pickle is recorded with n_clusters = 0 and appended last.
runs_data_0 = pd.read_pickle("{}.pkl".format(filename_prefix))
runs_data_0["n_clusters"] = 0
runs_data_clusters.append(runs_data_0)

In [None]:
# Stack all per-cluster-count frames (including the n_clusters = 0 one) into
# a single DataFrame; original row indices are kept as-is.
run_df_unprocessed = pd.concat(runs_data_clusters)

In [None]:
# Inspection only: display the combined, unprocessed results frame.
run_df_unprocessed

In [None]:
# For every cluster count k, keep only the rows where both n_clusters and
# n_auged match k, print each selection, and stack the selections.
# NOTE(review): the query compares against quoted (string) literals although
# n_clusters is assigned as an int when the pickles are loaded -- confirm the
# comparison matches rows as intended with the pandas version in use.
fixed_ratio_query = "n_clusters == '{}' & n_auged == '{}'"
fixed_ratio_n_aug_n_clusters = []
for i in n_clusters:
    ret = run_df_unprocessed.query(fixed_ratio_query.format(i, i))
    print(ret)
    fixed_ratio_n_aug_n_clusters.append(ret)
fixed_ratio_n_aug_n_clusters_df = pd.concat(fixed_ratio_n_aug_n_clusters)

In [None]:
# Inspection only: display the rows where n_auged equals n_clusters.
fixed_ratio_n_aug_n_clusters_df

In [None]:
# Baseline rows of the n_clusters = 0 results; reset_index() moves the old
# index into a column and returns a fresh frame.
baseline_0_query = "test_type == 'baseline' & n_clusters == '0'"
baseline_0 = run_df_unprocessed.query(baseline_0_query).reset_index()

In [None]:
# Attach the plot labels: a constant score tag, the clustered flag, and the
# display name that replaces the raw test_type value.
baseline_0 = baseline_0.assign(
    score="baseline",
    clustered=False,
    test_type="Baseline",
)

In [None]:
# Inspection only: display the labelled n_clusters = 0 baseline rows.
baseline_0

In [None]:
# Baseline rows from the fixed-ratio selection (n_auged equal to n_clusters),
# with the old index moved into a column.
baseline_n_clusters = (
    fixed_ratio_n_aug_n_clusters_df
    .query("test_type == 'baseline'")
    .reset_index()
)

In [None]:
# Attach the plot labels for the clustered baseline series.
baseline_n_clusters = baseline_n_clusters.assign(
    score="baseline",
    clustered=True,
    test_type="Baseline Clustered",
)

In [None]:
# Inspection only: display the labelled clustered-baseline rows.
baseline_n_clusters

In [None]:
# "random_proportional" rows of the n_clusters = 0 results, relabelled for the
# plot legend.
# Assigning columns on the direct result of query() triggers pandas'
# SettingWithCopyWarning. reset_index() -- which the other three selections in
# this notebook already use -- returns an independent frame, so the
# assignments below are unambiguous and all four frames share the same layout.
prop_inf_0 = run_df_unprocessed.query("test_type == 'random_proportional' & n_clusters == '0'").reset_index()
prop_inf_0["test_type"] = "Random Proportional Influence"
prop_inf_0["clustered"] = False

In [None]:
# "random_proportional" rows from the fixed-ratio selection, relabelled for
# the plot legend and flagged as clustered.
prop_n_clusters = (
    fixed_ratio_n_aug_n_clusters_df
    .query("test_type == 'random_proportional'")
    .reset_index()
    .assign(
        test_type="Random Proportional Influence Clustered",
        clustered=True,
    )
)

In [None]:
# Combine the four labelled series into the single frame that gets plotted.
frames_to_plot = [baseline_0, baseline_n_clusters, prop_inf_0, prop_n_clusters]
run_df = pd.concat(frames_to_plot)

In [None]:
# Convert index labels to strings and give the plotted columns
# human-readable names (these names are used as the axis variables).
display_column_names = {
    "test_accuracy": "Test Accuracy",
    "n_auged": "Number of Augmented Points",
}
run_df = run_df.rename(index=str, columns=display_column_names)

In [None]:
# Inspection only: display the final frame that feeds the plot.
run_df

In [None]:
# Coordinates of the VSV reference point: x is n_SV, y is VSV_acc.
VSV_x, VSV_y = n_SV, VSV_acc

In [None]:
# Test accuracy vs. number of augmented points, one line per test_type
# (colour and line style both keyed on test_type, ci=95), saved as a PDF.
fig, ax = plt.subplots()
run_plot = sns.lineplot(x="Number of Augmented Points",
                        y="Test Accuracy",
                        hue="test_type",
                        style="test_type",
                        ci=95,
                        data=run_df,
                        markers=True,
                        dashes=True,
                        ax=ax)
# Optional VSV marker/annotation, currently disabled:
#run_plot.scatter(VSV_x, VSV_y, marker="x", color="k", s=20)
# text = run_plot.annotate("VSV", (VSV_x, VSV_y))
#text = run_plot.text(VSV_x, VSV_y, "VSV", fontsize=12)

# Rebuild the legend without the "test_type" title pseudo-entry.
# The previous positional slice (handles[1:]) assumed seaborn always puts that
# pseudo-entry first; on seaborn versions that use a real legend title it
# silently dropped the first data series. Filtering by label removes the
# pseudo-entry when it is present and keeps every series otherwise.
handles, labels = ax.get_legend_handles_labels()
legend_entries = [(handle, label)
                  for handle, label in zip(handles, labels)
                  if label != "test_type"]
ax.legend(handles=[handle for handle, _ in legend_entries],
          labels=[label for _, label in legend_entries])
ax.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter(plt_y_axis_fmt_string))
plt.setp(ax.get_legend().get_texts(), fontsize=11.5)  # legend text size

# Horizontal reference lines for the scalar accuracies loaded from the archive.
# NOTE(review): these lines carry labels but are added after the legend was
# built, so they do not appear in it -- confirm whether that is intended.
#run_plot.axhline(y=baseline_acc,
#                 color="b",
#                 linestyle="--",
#                 label="baseline_acc")
run_plot.axhline(y=poisoned_acc,
                 color="r",
                 linestyle="--",
                 label="poisoned_acc")
run_plot.axhline(y=all_aug_train_poisoned_acc,
                 color="g",
                 linestyle="--",
                 label="all_aug_train_poisoned_acc")
#adjustText.adjust_text([text],
#                       x=[VSV_x],
#                       y=[VSV_y],
#                       add_objects=[run_plot],
#                       expand_points=(0.5, 0.3),
#                       expand_objects=(0.3, 0.3),
#                       ax=ax,
#                       force_objects=(0.2, 0.2))
# Write the figure next to the input files as "<filename_prefix>_joined_cluster.pdf".
run_plot.get_figure().savefig(filename_prefix + "_joined_cluster.pdf",
                              bbox_inches="tight")