In [None]:
# For each dataset in datasets, get the parsed variables dataframe from ../../datasets/xyz_extended

import pandas as pd
from dowhy import CausalModel

import os
import sys
import re

# Experiment: for every synthetic xyz log, estimate the ATE of x on y (adjusting
# for z) under all 3 x 3 x 3 combinations of per-machine max/min/mean aggregates,
# then record how the aggregates actually chosen during preparation compare with
# the best and worst combinations. The summary ends up in `results_df`.
datasets = []
directory = "../../datasets_raw/xyz_extended"

# Get the filenames of all the log files in the directory
for filename in os.listdir(directory):
    if filename.endswith(".log"):
        datasets.append(filename)


# Maps dataset filename -> tuple of summary values (named via results_df.columns below)
results = {}

for dataset in datasets:
    # Get the parameters used to generate the log.
    # The JSON sidecar is read positionally, line by line, instead of being parsed:
    # the value after ":" on the 4th line becomes num_total_variables and the value
    # after ":" on the 5th line becomes noise_radius.
    # NOTE(review): this presumably relies on a fixed, pretty-printed key order in
    # the generator's JSON output - confirm.
    with open(os.path.join(directory, dataset.split(".")[0] + ".json")) as f:
        l = f.readlines()
        num_total_variables = int(l[3].split(":")[1].strip().strip(","))
        noise_radius = int(l[4].split(":")[1].strip())

    # Get the variable names for x, y, and z
    parsed_vars = pd.read_pickle(
        "../../datasets/xyz_extended/" + dataset + "_parsed_variables_None_None.pkl"
    )
    # Keep only the rows tagged x, y or z, and only their Name/Tag columns
    parsed_vars = parsed_vars.loc[parsed_vars["Tag"].isin(["x", "y", "z"])][
        ["Name", "Tag"]
    ]
    # Tag -> Name. If a tag occurs on several rows, the last Name wins.
    mapping = {k: v for k, v in zip(parsed_vars["Tag"], parsed_vars["Name"])}

    # Read the corresponding columns from the parsed log and reset the column names to x, y, and z
    parsed_log = pd.read_pickle(
        "../../datasets/xyz_extended/" + dataset + "_parsed_log_None_None.pkl"
    )
    # mapping.values() and mapping.keys() iterate in the same order, so the
    # renamed columns line up with the selected ones.
    data = parsed_log[["machine"] + list(mapping.values())]
    data.columns = ["machine"] + list(mapping.keys())

    # Calculate the max, min and mean for each of xyz for each machine
    agg_list = ["max", "min", "mean"]
    data = data.groupby("machine").agg(
        {
            "x": agg_list,
            "y": agg_list,
            "z": agg_list,
        }
    )
    # Flatten the (variable, aggregate) column pairs into names such as "x_max"
    data.columns = ["_".join(col) for col in data.columns]

    # For each x-based aggregate, for each y-based aggregate, for each z-based aggregate, calculate the ATE of x on y adjusting for z
    # Keyed by the (x_agg, y_agg, z_agg) tuple; the value is the estimated ATE.
    effects = {}
    for x_agg in agg_list:
        for y_agg in agg_list:
            for z_agg in agg_list:
                print("x_agg: ", x_agg, "y_agg: ", y_agg, "z_agg: ", z_agg)
                # Get the data for this combination of aggregates
                data_ = data[
                    [
                        "x_" + x_agg,
                        "y_" + y_agg,
                        "z_" + z_agg,
                    ]
                ]
                # Calculate the ATE of x on y adjusting for z using the dowhy package
                model = CausalModel(
                    data=data_,
                    treatment="x_" + x_agg,
                    outcome="y_" + y_agg,
                    common_causes=["z_" + z_agg],
                )
                identified_estimand = model.identify_effect(
                    proceed_when_unidentifiable=True
                )
                estimate = model.estimate_effect(
                    identified_estimand,
                    method_name="backdoor.linear_regression",
                    test_significance=False,
                )
                print(estimate.value)
                effects[(x_agg, y_agg, z_agg)] = estimate.value
                print("------------------")

    # One row per aggregate combination: the tuple key moves from the index into
    # the "Aggregates" column and the estimate into "ATE".
    effects_df = pd.DataFrame.from_dict(effects, orient="index")
    effects_df.columns = ["ATE"]
    effects_df.reset_index(inplace=True)
    effects_df.rename(columns={"index": "Aggregates"}, inplace=True)
    # NOTE(review): 2.0 is used as the ground-truth effect for every dataset -
    # presumably fixed by the log generator; confirm.
    effects_df["TrueATE"] = 2.0
    # Relative absolute error of the estimate with respect to TrueATE
    effects_df["Error"] = abs(
        (effects_df["ATE"] - effects_df["TrueATE"]) / effects_df["TrueATE"]
    )
    # Ascending sort + fresh 0..n-1 index: row 0 is the best combination and the
    # last row is the worst.
    effects_df.sort_values(by="Error", ascending=True, inplace=True)
    effects_df.reset_index(inplace=True, drop=True)

    print(effects_df)

    # Find out which aggregates were used in practice
    prepared_vars = pd.read_pickle(
        "../../datasets/xyz_extended/"
        + dataset
        + "_prepared_variables_machine_None.pkl"
    )
    # Take the "Agg" of the first prepared variable whose "Base" matches each name.
    # These assignments reuse (overwrite) the loop variables from the grid above.
    x_agg = prepared_vars[prepared_vars["Base"] == mapping["x"]]["Agg"].values[0]
    y_agg = prepared_vars[prepared_vars["Base"] == mapping["y"]]["Agg"].values[0]
    z_agg = prepared_vars[prepared_vars["Base"] == mapping["z"]]["Agg"].values[0]

    # Find index of the row in effects_df that corresponds to the aggregates used in practice
    # Because effects_df is sorted by error, idx is also the rank (0 = best).
    idx = effects_df[effects_df["Aggregates"] == (x_agg, y_agg, z_agg)].index[0]

    print("Dataset: ", dataset)
    print("Aggregates: ", (x_agg, y_agg, z_agg))
    print("Index of chosen aggregates: ", idx)
    last_idx = len(effects_df) - 1
    # Order must match the results_df.columns list assigned after the loop:
    # parameters, chosen aggregates + stats, best row + stats, worst row + stats.
    results[dataset] = (
        num_total_variables,
        noise_radius,
        x_agg,
        y_agg,
        z_agg,
        idx,
        effects_df.loc[idx, "ATE"],
        effects_df.loc[idx, "Error"],
        effects_df.loc[0, "Aggregates"][0],
        effects_df.loc[0, "Aggregates"][1],
        effects_df.loc[0, "Aggregates"][2],
        effects_df.loc[0, "ATE"],
        effects_df.loc[0, "Error"],
        effects_df.loc[last_idx, "Aggregates"][0],
        effects_df.loc[last_idx, "Aggregates"][1],
        effects_df.loc[last_idx, "Aggregates"][2],
        effects_df.loc[last_idx, "ATE"],
        effects_df.loc[last_idx, "Error"],
    )

# One row per dataset; columns named in the same order as the tuples built above
results_df = pd.DataFrame.from_dict(results, orient="index")
results_df.columns = [
    "V",
    "R",
    "x_agg",
    "y_agg",
    "z_agg",
    "idx",
    "ATE",
    "Error",
    "x_agg_best",
    "y_agg_best",
    "z_agg_best",
    "ATE_best",
    "Error_best",
    "x_agg_worst",
    "y_agg_worst",
    "z_agg_worst",
    "ATE_worst",
    "Error_worst",
]
# How much worse the chosen aggregates are than the best combination
results_df["Sub-optimality penalty"] = results_df["Error"] - results_df["Error_best"]
# |Error - Error_worst| / |Error_best - Error_worst|: 0 when the chosen aggregates
# are the worst combination, 1 when they are the best.
results_df["Fraction of gap closed"] =  abs(results_df["Error"] - results_df["Error_worst"])/ abs(results_df["Error_best"] - results_df["Error_worst"])
# NaN (e.g. 0/0 when best and worst errors coincide) is counted as fully closed
results_df["Fraction of gap closed"] = results_df["Fraction of gap closed"].fillna(1.0)
# Order rows by V then R and replace the filename index with 0..n-1
results_df.sort_values(by=['V', 'R'], ascending=[True, True], inplace=True)
results_df.reset_index(inplace=True, drop=True)

In [None]:
# Create a horizontal bar chart of the fraction of gap closed, where there is a label of the worst error on the left and a
# label of the best error on the right

import matplotlib.pyplot as plt
import numpy as np

# Horizontal bar chart of "Fraction of gap closed", one bar per row of results_df.
# Each row is annotated with:
#   - the chosen aggregates' error and fraction of gap closed, at the bar tip;
#   - the best error, just right of the axes (x = 1.01);
#   - the worst error, just left of the axes (x = -0.01);
#   - the R value of the row and, once per group of equal V, the V value.
plt.figure(figsize=(8, 6.5))
plt.barh(
    np.arange(len(results_df)),
    results_df["Fraction of gap closed"],
    color="#7f9aba",
    height=0.5,
)

fontsize = 18

# Add the labels. Rows are addressed by position (.iloc) so that the annotations
# always line up with the bar positions np.arange(len(results_df)) used above,
# whatever the frame's index labels are.
for i in range(len(results_df)):
    # Error of the chosen aggregates, right-aligned just inside the bar tip
    plt.text(
        results_df["Fraction of gap closed"].iloc[i] - 0.01,
        i,
        f'{results_df["Error"].iloc[i] * 100:.2f}% ({results_df["Fraction of gap closed"].iloc[i] * 100:.2f} % of gap)',
        ha="right",
        va="center",
        fontsize=fontsize,
    )

    # Best achievable error, to the right of the axes
    plt.text(
        1.01,
        i,
        f'{results_df["Error_best"].iloc[i] * 100:.2f}%',
        ha="left",
        va="center",
        fontsize=fontsize,
    )

    # Worst error, to the left of the axes
    plt.text(
        -0.01,
        i,
        f'{results_df["Error_worst"].iloc[i] * 100:.2f}%',
        ha="right",
        va="center",
        fontsize=fontsize,
    )

    # Row parameter R
    plt.text(
        -0.4,
        i,
        f'R={results_df["R"].iloc[i]:.0f}',
        ha="left",
        va="center",
        fontsize=fontsize,
    )

    # Label each group of equal V once, on its first row. This replaces a
    # hard-coded "every third row" rule; it yields the same labels when every V
    # has exactly three rows and stays correct when it does not.
    if i == 0 or results_df["V"].iloc[i] != results_df["V"].iloc[i - 1]:
        plt.text(
            -0.44,
            i,
            f'V={results_df["V"].iloc[i]:.0f}',
            ha="right",
            va="center",
            fontsize=fontsize,
        )

# Hide the ticks on both axes; all information is carried by the text labels
plt.yticks([])
plt.xticks([])

# Set x limits from 0 to 1
plt.xlim(0, 1)

# Flip y axis so that row 0 is drawn at the top
plt.gca().invert_yaxis()

# Column headers above the first bar (y = -1.2 lies above row 0 once inverted)
plt.text(
    0,
    -1.2,
    "Worst ATE Error",
    ha="center",
    va="center",
    fontsize=fontsize,
)

plt.text(
    1,
    -1.2,
    "Best ATE Error",
    ha="center",
    va="center",
    fontsize=fontsize,
)

# Add x axis label
plt.xlabel("ATE Error using LOGos-picked Aggregates", fontsize=fontsize)

# Save first, then show: the original `plt.show` lacked parentheses and therefore
# did nothing. Calling show() before savefig() could leave savefig() with an
# empty figure, because the figure may be released once it has been shown.
plt.savefig("xyz_agg_efficiency.png", bbox_inches="tight")
plt.show()

In [None]:
# Mean fraction of the worst-to-best error gap closed across all xyz datasets
results_df.loc[:, "Fraction of gap closed"].mean()

In [None]:
# For each proprietary "normal" log in ../../datasets_raw/proprietary_logs, load its parsed log from ../../datasets/proprietary_logs/proprietary_eval and compare aggregate choices

import pandas as pd
from dowhy import CausalModel

import os
import sys
import re

# Experiment: for every proprietary "normal" log, estimate the ATE of version on
# code under all 3 x 3 combinations of per-user max/min/mean aggregates, then
# record how the (mean, mean) choice compares with the best and worst
# combinations. The summary ends up in `results_df_prop`.
datasets = []
directory = "../../datasets_raw/proprietary_logs"

# Get the filenames of all the log files in the directory
for filename in os.listdir(directory):
    if filename.endswith("normal.log"):
        datasets.append(filename)


# Maps dataset filename -> tuple of summary values (named via results_df_prop.columns below)
results_prop = {}

for dataset in datasets:
    # Get the parameters used to generate the log.
    # The JSON sidecar is read positionally, line by line, instead of being parsed:
    # the value after ":" on the 3rd line becomes faulty_users, while the 5th line
    # is parsed as a bare integer (no "key:" prefix is split off).
    # NOTE(review): this presumably relies on a fixed, pretty-printed layout of the
    # generator's JSON output - confirm.
    with open(os.path.join(directory, dataset.split(".")[0] + ".json")) as f:
        l = f.readlines()
        faulty_users = int(l[2].split(":")[1].strip().strip(","))
        fault_prob = int(l[4].strip().strip(','))

    # Get the variable names for version and code
    # (hard-coded parsed-log column identifiers rather than looked up by tag)
    mapping = {"code":"73b16c0a_196", "version":"30731d4c_11"}

    # Read the corresponding columns from the parsed log and reset the column names to code and version
    parsed_log = pd.read_pickle(
        "../../datasets/proprietary_logs/proprietary_eval/" + dataset + "_parsed_log_None_None.pkl"
    )
    # mapping.values() and mapping.keys() iterate in the same order, so the
    # renamed columns line up with the selected ones.
    data = parsed_log[["User"] + list(mapping.values())]
    data.columns = ["User"] + list(mapping.keys())


    # Calculate the max, min and mean of code and version for each user
    agg_list = ["max", "min", "mean"]
    data = data.groupby("User").agg(
        {
            "code": agg_list,
            "version": agg_list,
        }
    )
    # Flatten the (variable, aggregate) column pairs into names such as "code_max"
    data.columns = ["_".join(col) for col in data.columns]

    # For each code-based aggregate and each version-based aggregate, calculate the ATE of version on code
    # Keyed by the (code_agg, version_agg) tuple; the value is the estimated ATE.
    effects = {}
    for code_agg in agg_list:
        for version_agg in agg_list:
            print("code_agg: ", code_agg, "version_agg: ", version_agg)
            # Get the data for this combination of aggregates
            data_ = data[
                [
                    "code_" + code_agg,
                    "version_" + version_agg,
                ]
            ]
            # Calculate the ATE of version on code using the dowhy package
            # (no common causes are passed to the model here)
            model = CausalModel(
                data=data_,
                treatment="version_" + version_agg,
                outcome="code_" + code_agg,
            )
            identified_estimand = model.identify_effect(
                proceed_when_unidentifiable=True
            )
            estimate = model.estimate_effect(
                identified_estimand,
                method_name="backdoor.linear_regression",
                test_significance=False,
            )
            print(estimate.value)
            effects[(code_agg, version_agg)] = estimate.value
            print("------------------")

    # One row per aggregate combination: the tuple key moves from the index into
    # the "Aggregates" column and the estimate into "ATE".
    effects_df = pd.DataFrame.from_dict(effects, orient="index")
    effects_df.columns = ["ATE"]
    effects_df.reset_index(inplace=True)
    effects_df.rename(columns={"index": "Aggregates"}, inplace=True)
    # NOTE(review): the constants 401, 200, 0.1, 0.9, 15.0 and 14.3 are not
    # explained in this cell - confirm their origin. The numerator simplifies to
    # 201 * (fault_prob / 100) - 20.1, which is zero when fault_prob is 10; the
    # relative error below would then divide by (approximately) zero.
    effects_df["TrueATE"] = (401*(fault_prob/100.0) + 200*(1-(fault_prob/100.0)) - 401*0.1-200*0.9) / (15.0-14.3)
    # Relative absolute error of the estimate with respect to TrueATE
    effects_df["Error"] = abs(
        (effects_df["ATE"] - effects_df["TrueATE"]) / effects_df["TrueATE"]
    )
    # Ascending sort + fresh 0..n-1 index: row 0 is the best combination and the
    # last row is the worst.
    effects_df.sort_values(by="Error", ascending=True, inplace=True)
    effects_df.reset_index(inplace=True, drop=True)


    # Find out which aggregates were used in practice
    # (hard-coded to mean/mean here; these overwrite the loop variables above)
    code_agg = 'mean'
    version_agg = 'mean'

    # Find index of the row in effects_df that corresponds to the aggregates used in practice
    # Because effects_df is sorted by error, idx is also the rank (0 = best).
    idx = effects_df[effects_df["Aggregates"] == (code_agg, version_agg)].index[0]

    print("Dataset: ", dataset)
    print("Aggregates: ", (code_agg, version_agg))
    print("Index of chosen aggregates: ", idx)
    last_idx = len(effects_df) - 1
    # Order must match the results_df_prop.columns list assigned after the loop.
    # faulty_users is scaled by 1/1000 and fault_prob by 1/100 before storing.
    results_prop[dataset] = (
        faulty_users/1000,
        fault_prob/100.0,
        code_agg,
        version_agg,
        idx,
        effects_df.loc[idx, "ATE"],
        effects_df.loc[idx, "Error"],
        effects_df.loc[0, "Aggregates"][0],
        effects_df.loc[0, "Aggregates"][1],
        effects_df.loc[0, "ATE"],
        effects_df.loc[0, "Error"],
        effects_df.loc[last_idx, "Aggregates"][0],
        effects_df.loc[last_idx, "Aggregates"][1],
        effects_df.loc[last_idx, "ATE"],
        effects_df.loc[last_idx, "Error"],
    )

# One row per dataset; columns named in the same order as the tuples built above
results_df_prop = pd.DataFrame.from_dict(results_prop, orient="index")
results_df_prop.columns = [
    "F",
    "p_f",
    "code_agg",
    "version_agg",
    "idx",
    "ATE",
    "Error",
    "code_agg_best",
    "version_agg_best",
    "ATE_best",
    "Error_best",
    "code_agg_worst",
    "version_agg_worst",
    "ATE_worst",
    "Error_worst",
]
# How much worse the chosen aggregates are than the best combination
results_df_prop["Sub-optimality penalty"] = results_df_prop["Error"] - results_df_prop["Error_best"]
# |Error - Error_worst| / |Error_best - Error_worst|: 0 when the chosen aggregates
# are the worst combination, 1 when they are the best.
results_df_prop["Fraction of gap closed"] =  abs(results_df_prop["Error"] - results_df_prop["Error_worst"])/ abs(results_df_prop["Error_best"] - results_df_prop["Error_worst"])
# NaN (e.g. 0/0 when best and worst errors coincide) is counted as fully closed
results_df_prop["Fraction of gap closed"] = results_df_prop["Fraction of gap closed"].fillna(1.0)
# Order rows by F then p_f, both descending, and replace the filename index with 0..n-1
results_df_prop.sort_values(by=['F', 'p_f'], ascending=[False, False], inplace=True)
results_df_prop.reset_index(inplace=True, drop=True)

In [None]:
# Display the per-dataset summary table for the proprietary logs
results_df_prop

In [None]:
# Create a horizontal bar chart of the fraction of gap closed, where there is a label of the worst error on the left and a
# label of the best error on the right

import matplotlib.pyplot as plt
import numpy as np

# Horizontal bar chart of "Fraction of gap closed", one bar per row of
# results_df_prop. Each row is annotated with:
#   - the chosen aggregates' error and fraction of gap closed, at the bar tip;
#   - the best error, just right of the axes (x = 1.01);
#   - the worst error, just left of the axes (x = -0.01);
#   - the p_f value of the row and, once per group of equal F, the F value.
plt.figure(figsize=(8, 6.5))
plt.barh(
    np.arange(len(results_df_prop)),
    results_df_prop["Fraction of gap closed"],
    color="#7FBA82",
    height=0.5,
)

fontsize = 18

# Add the labels. Rows are addressed by position (.iloc) so that the annotations
# always line up with the bar positions np.arange(len(results_df_prop)) used
# above, whatever the frame's index labels are.
for i in range(len(results_df_prop)):
    # Error of the chosen aggregates, right-aligned just inside the bar tip
    plt.text(
        results_df_prop["Fraction of gap closed"].iloc[i] - 0.01,
        i,
        f'{results_df_prop["Error"].iloc[i] * 100:.2f}% ({results_df_prop["Fraction of gap closed"].iloc[i] * 100:.2f} % of gap)',
        ha="right",
        va="center",
        fontsize=fontsize,
    )

    # Best achievable error, to the right of the axes
    plt.text(
        1.01,
        i,
        f'{results_df_prop["Error_best"].iloc[i] * 100:.2f}%',
        ha="left",
        va="center",
        fontsize=fontsize,
    )

    # Worst error, to the left of the axes
    plt.text(
        -0.01,
        i,
        f'{results_df_prop["Error_worst"].iloc[i] * 100:.2f}%',
        ha="right",
        va="center",
        fontsize=fontsize,
    )

    # Row parameter p_f
    plt.text(
        -0.4,
        i,
        f'p_f={results_df_prop["p_f"].iloc[i]:.1f}',
        ha="left",
        va="center",
        fontsize=fontsize,
    )

    # Label each group of equal F once, on its first row. This replaces a
    # hard-coded "every third row" rule; it yields the same labels when every F
    # has exactly three rows and stays correct when it does not.
    if i == 0 or results_df_prop["F"].iloc[i] != results_df_prop["F"].iloc[i - 1]:
        plt.text(
            -0.44,
            i,
            f'F={results_df_prop["F"].iloc[i]:.2f}',
            ha="right",
            va="center",
            fontsize=fontsize,
        )

# Hide the ticks on both axes; all information is carried by the text labels
plt.yticks([])
plt.xticks([])

# Set x limits from 0 to 1
plt.xlim(0, 1)

# Flip y axis so that row 0 is drawn at the top
plt.gca().invert_yaxis()

# Column headers above the first bar (y = -1.2 lies above row 0 once inverted)
plt.text(
    0,
    -1.2,
    "Worst ATE Error",
    ha="center",
    va="center",
    fontsize=fontsize,
)

plt.text(
    1,
    -1.2,
    "Best ATE Error",
    ha="center",
    va="center",
    fontsize=fontsize,
)

# Add x axis label
plt.xlabel("ATE Error using LOGos-picked Aggregates", fontsize=fontsize)

# Save first, then show: the original `plt.show` lacked parentheses and therefore
# did nothing. Calling show() before savefig() could leave savefig() with an
# empty figure, because the figure may be released once it has been shown.
plt.savefig("prop_agg_efficiency.png", bbox_inches="tight")
plt.show()

In [None]:
# Mean fraction of the worst-to-best error gap closed across all proprietary datasets
results_df_prop.loc[:, "Fraction of gap closed"].mean()

In [None]:
# Unweighted average of the two experiments' mean fraction of gap closed
sum(
    frame["Fraction of gap closed"].mean()
    for frame in (results_df_prop, results_df)
) / 2