In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load every matching result CSV of the results folder into `dct`, keyed by
# the file name without its extension.
results_dir = "data/folder_1_results"
# NOTE(review): os.listdir returns entries in arbitrary order, and the plotting
# cells pair the entries of `dct` with `names` by position - confirm that the
# resulting order matches the labels.
files = os.listdir(results_dir)
dct = {}
for file in files:
    # Keep files whose name contains both "4" and "6" and is not a "bis" variant.
    if "4" in file and "6" in file and "bis" not in file:
        df = pd.read_csv(os.path.join(results_dir, file), index_col=0)
        # Composite identifier "<trace_id>_<ano_id>".
        df["id"] = df["trace_id"].astype(str) + "_" + df["ano_id"].astype(str)
        # splitext strips only the final extension, so dots inside the stem
        # (e.g. "..._threshold_0.7") are preserved; joining the split parts
        # removed them and produced an empty key for names without a dot.
        dct[os.path.splitext(file)[0]] = df

In [None]:
# Report the mean explanation instability of every loaded result set.
for run_name in dct:
    df = dct[run_name]
    mean_instability = df["exp_instability"].mean()
    print(run_name, mean_instability)
    # The ids of the last frame visited stay bound to `idx` after the loop.
    idx = df["id"]

In [None]:
# Report the mean explanation size of every loaded result set.
for run_name in dct:
    df = dct[run_name]
    mean_size = df["exp_size"].mean()
    print(run_name, mean_size)
    # The ids of the last frame visited stay bound to `idx` after the loop.
    idx = df["id"]

In [None]:
# Legend labels of the four configurations. The plotting cells read them by
# position: entry i is used together with the i-th item of `dct`.
names = [
    "ExStream no filtering",
    "ExStream-cluster no filtering",
    "ExStream filtering",
    "ExStream-cluster filtering",
]
# Per-configuration bar styling, index-aligned with `names`:
# fill colour, hatch pattern and edge colour.
colors = ["pink", "red", "lightgreen", "green"]
hatchs = ["///", "", "///", ""]
edgecolors = ["red", "red", "green", "green"]

In [None]:
# Grouped bar chart of explanation instability: one group per id in `idx`,
# one bar per configuration.
fig, ax = plt.subplots(1, 1, figsize=(17, 3))

# `idx` holds the "<trace_id>_<ano_id>" labels left behind by the summary cells.
ind = np.arange(len(idx))
width = 0.2
items = list(dct.items())
# Pair every result set with its label and styling. zip stops at the shortest
# sequence, so fewer than four result sets no longer raise an IndexError.
series = list(zip(items, names, colors, hatchs, edgecolors))
for i, ((name, df), label, color, hatch, edgecolor) in enumerate(series):
    positions = ind + i * width
    ax.bar(
        positions,
        df["exp_instability"],
        color=color,
        width=width,
        label=label,
        hatch=hatch,
        edgecolor=edgecolor,
    )
    # Second pass: thin black outline drawn over the coloured (hatched) bar.
    ax.bar(
        positions,
        df["exp_instability"],
        color="none",
        edgecolor="k",
        width=width,
        zorder=1,
        lw=0.5,
    )

ax.set_title(
    "Explanation instability of ExStream and ExStream-cluster, with and without false positive filtering, for each trace"
)
ax.set_xlabel("Trace ID")
ax.set_ylabel("Instability")
# Centre each tick under its group: the bars of a group are centred on
# ind, ind + width, ..., so the middle is half of the last offset.
ax.set_xticks(ind + width * max(len(series) - 1, 0) / 2)
ax.set_xticklabels(idx, rotation=90)
ax.legend(bbox_to_anchor=(0.6, -0.3))
plt.show()

In [None]:
import seaborn as sns

In [None]:
# One histogram of explanation instability per configuration, stacked vertically.
fig, ax = plt.subplots(4, 1, figsize=(17, 15))
ax = ax.flatten()

items = list(dct.items())

# zip stops at the shortest sequence, so a missing result set leaves an empty
# panel instead of raising an IndexError.
for panel, (name, df), label in zip(ax, items, names):
    sns.histplot(data=df["exp_instability"], ax=panel)
    # Label every panel; without it the four histograms are indistinguishable.
    panel.set_title(label)

# Figure-level title: plt.title would only title the current (last) axes.
fig.suptitle(
    "Explanation instability of ExStream and ExStream-cluster, with and without false positive filtering, for each trace"
)
# Extra vertical room so panel titles do not collide with the x labels above them.
fig.subplots_adjust(hspace=0.5)
# No legend: the histograms carry no labelled artists, so plt.legend() only
# produced a warning and an empty box.
plt.show()

In [None]:
# Box plot of explanation instability: one box per configuration.
fig, ax = plt.subplots(1, 1, figsize=(17, 3))

items = list(dct.items())

sns.boxplot(
    data=[df["exp_instability"].tolist() for name, df in items], ax=ax, palette=colors
)

ax.set_title(
    "Explanation instability of ExStream and ExStream-cluster, with and without false positive filtering, for each trace"
)
# The boxes sit at positions 0..n-1 and each stands for one configuration, so
# label them with the configuration names (falling back to the `dct` key for
# any result set beyond the named ones) instead of calling the axis "Trace ID".
box_labels = names[: len(items)] + [name for name, _ in items[len(names):]]
ax.set_xticks(range(len(items)))
ax.set_xticklabels(box_labels)
ax.set_xlabel("Method")
ax.set_ylabel("Instability")
# No legend: the plot has no labelled artists, so plt.legend() only produced
# a warning and an empty box.
plt.show()

In [None]:
# Grouped bar chart of explanation size: one group per id in `idx`,
# one bar per configuration.
fig, ax = plt.subplots(1, 1, figsize=(15, 5))

# `idx` holds the "<trace_id>_<ano_id>" labels left behind by the summary cells.
ind = np.arange(len(idx))
width = 0.2
items = list(dct.items())
# Pair every result set with its label and styling. zip stops at the shortest
# sequence, so fewer than four result sets no longer raise an IndexError.
series = list(zip(items, names, colors, hatchs, edgecolors))
for i, ((name, df), label, color, hatch, edgecolor) in enumerate(series):
    positions = ind + i * width
    ax.bar(
        positions,
        df["exp_size"],
        color=color,
        width=width,
        label=label,
        hatch=hatch,
        edgecolor=edgecolor,
    )
    # Second pass: thin black outline drawn over the coloured (hatched) bar.
    ax.bar(
        positions,
        df["exp_size"],
        color="none",
        edgecolor="k",
        width=width,
        zorder=1,
        lw=0.5,
    )

ax.set_title(
    "Explanation size of ExStream and ExStream-cluster, with and without false positive filtering, for each trace"
)
ax.set_xlabel("Trace ID")
ax.set_ylabel("Size")
# Centre each tick under its group: the bars of a group are centred on
# ind, ind + width, ..., so the middle is half of the last offset.
ax.set_xticks(ind + width * max(len(series) - 1, 0) / 2)
ax.set_xticklabels(idx, rotation=90)
ax.legend()
plt.show()

# Distances

In [None]:
import logging
from exstream.correlation_filtering import correlated_features_filter
from exstream.false_positive_filtering import false_positive_filter
from exstream.entropy_based_single_reward_feature import (
    entropy_based_single_feature_reward,
    reward_leap_filter,
)
from utils.get_data import get_train_test_data, split_references_and_anomalies

In [None]:
def compute_explanatory_features(distances: dict) -> list:
    """Select the explanatory features from a per-feature mapping.

    With more than one entry the selection is delegated to
    ``reward_leap_filter``; with zero or one entry every key is kept.

    Args:
        distances: Mapping keyed by feature name (the values are presumably
            the per-feature rewards - confirm against the producer).

    Returns:
        The selected features: whatever ``reward_leap_filter`` returns in the
        multi-feature case (presumably a list of feature names - confirm),
        otherwise the list of keys of ``distances``.
    """
    if len(distances) > 1:
        return reward_leap_filter(distances)
    # Nothing to rank with zero or one feature: keep what is there.
    return list(distances.keys())

In [None]:
def get_explanatory_features(
    refs: pd.DataFrame,
    anos: pd.DataFrame,
    cluster: bool,
    correlation_threshold: float,
    false_positive_filtering: bool,
    max_distance: float,
) -> dict:
    """Compute the entropy-based reward of the retained features per anomaly.

    Features are first filtered with ``correlated_features_filter`` on the
    concatenation of ``refs`` and ``anos``. Then, for every unique index value
    of ``anos``, ``false_positive_filter`` is applied and
    ``entropy_based_single_feature_reward`` is evaluated on what remains.

    Args:
        refs: Reference data; the rows of an anomaly are read with
            ``refs.loc[ano]``.
        anos: Anomaly data, indexed by anomaly identifier.
        cluster: Forwarded to ``correlated_features_filter``.
        correlation_threshold: Forwarded to ``correlated_features_filter``.
        false_positive_filtering: Forwarded to ``false_positive_filter``.
        max_distance: Forwarded to ``false_positive_filter``.

    Returns:
        A dict mapping each anomaly index value to the result of
        ``entropy_based_single_feature_reward`` for that anomaly.
    """
    # CORRELATION FILTERING
    all_data = pd.concat([refs, anos])

    logging.info("Filtering correlated features...")
    filtered_features = correlated_features_filter(
        all_data, correlation_threshold=correlation_threshold, cluster=cluster
    )
    logging.debug("Features after correlation filtering: %s", filtered_features)
    # NOTE(review): the last three columns are left out of the count,
    # presumably because they are not features - confirm.
    logging.info(
        "Dropped %s features after correlation filtering",
        len(all_data.columns[:-3]) - len(filtered_features),
    )
    refs = refs.loc[:, filtered_features]
    anos = anos.loc[:, filtered_features]

    explanatory_features = {}

    for ano in anos.index.unique():
        logging.info("Anomaly %s", ano)
        # NOTE(review): the column selection below assumes .loc[ano] yields a
        # DataFrame (several rows per anomaly) - confirm.
        ano_data = anos.loc[ano]
        ano_ref = refs.loc[ano]

        # FALSE POSITIVE FILTERING
        new_filtered_features = false_positive_filter(
            ano_ref, refs, false_positive_filtering, max_distance=max_distance
        )
        logging.debug(
            "Features after false positive filtering: %s", new_filtered_features
        )
        logging.info(
            "Dropped %s features after false positive filtering",
            len(filtered_features) - len(new_filtered_features),
        )

        ano_data = ano_data.loc[:, new_filtered_features]
        ano_ref = ano_ref.loc[:, new_filtered_features]
        ano_all = pd.concat([ano_ref, ano_data], axis=0)

        # ENTROPY BASED SINGLE FEATURE REWARD
        distance = entropy_based_single_feature_reward(ano_ref, ano_data, ano_all)

        # The raw rewards are stored; feature selection is left to the caller.
        explanatory_features[ano] = distance

    return explanatory_features

In [None]:
# Settings of the explanation run.
# Folder and label file name handed to the data-loading helpers.
DATA_FOLDER = "data/folder_1"
LABEL_FILENAME = "labels"
# NOTE(review): not read by any of the visible cells.
VERBOSE = False
# Passed to get_explanatory_features as `correlation_threshold`.
CORRELATION_THRESHOLD = 0.8
# Passed to get_explanatory_features as `max_distance`.
MAX_DISTANCE = 30.0

In [None]:
# Load the reference/anomaly frames and the labels.
logging.info("Importing data...")
refs, anos = split_references_and_anomalies(DATA_FOLDER, LABEL_FILENAME)
_, labels = get_train_test_data(DATA_FOLDER, LABEL_FILENAME)

# Independent copy of the two identifier columns of the labels.
labels_df = labels[["trace_id", "ano_id"]].copy()

# Run the pipeline with clustering and false positive filtering both enabled.
logging.info("Getting explanatory features...")
explanatory_features = get_explanatory_features(
    refs=refs,
    anos=anos,
    cluster=True,
    correlation_threshold=CORRELATION_THRESHOLD,
    false_positive_filtering=True,
    max_distance=MAX_DISTANCE,
)
# One row per anomaly: its index value and the per-feature result.
explanation_rows = list(explanatory_features.items())
explanatory_features_df = pd.DataFrame(
    explanation_rows, columns=["index", "explanation"]
)

# Plot distances

In [None]:
# Bar chart of the per-feature values computed for a single anomaly, with the
# features kept by compute_explanatory_features highlighted in orange.
fig, ax = plt.subplots(figsize=(6, 4))

anomaly_position = 22  # position, in insertion order, of the anomaly to plot
max_label_length = 20  # longer feature names are cut to their last characters

# One-element slice: plots that anomaly if it exists, nothing otherwise.
chosen = list(explanatory_features.items())[anomaly_position : anomaly_position + 1]
for ano, distances in chosen:
    selected_features = compute_explanatory_features(distances)
    # Named `bar_colors` so the global `colors` palette read by the earlier
    # plotting cells is not overwritten when this cell runs.
    bar_colors = [
        "orange" if feature in selected_features else "b"
        for feature in distances.keys()
    ]

    # Shorten only names that are longer than the part kept; the previous
    # threshold of 10 prefixed "..." to names it did not shorten at all.
    features_names = [
        "..." + feature[-max_label_length:]
        if len(feature) > max_label_length
        else feature
        for feature in distances.keys()
    ]
    # Numeric positions keep one bar per feature even when two shortened
    # labels are identical (categorical x values would merge them).
    positions = np.arange(len(features_names))
    ax.bar(positions, list(distances.values()), color=bar_colors)
    ax.set_title(f"Anomaly {ano}")
    ax.set_xticks(positions)
    ax.set_xticklabels(features_names, rotation=90)
    ax.set_xlabel("Previously selected features")
    ax.set_ylabel("Entropy-based reward")
    ax.set_ylim(0, 1)

plt.show()