In [2]:
%load_ext autoreload
%autoreload 2

import os
import sys
import json

import pandas
import numpy


# Project layout, resolved relative to the directory the notebook is run from.
# Every folder string ends with "/" because the cells below build file names
# by plain string concatenation.
__CWD = f"{os.getcwd()}/"
__NOTEBOOKS = __CWD
__SRC = f"{__CWD}../src/"
__DATA = f"{__CWD}../data/"
__EXPERIMENTS = f"{__CWD}../experiments/"
_SYNTHETIC_RUNS_FOLDER = f"{__DATA}synthetic/"

# Let modules living under the source and experiments folders be imported.
sys.path.extend([__SRC, __EXPERIMENTS])


# Never truncate the rows of a displayed DataFrame.
pandas.set_option("display.max_rows", None)

In [None]:
# Install a geckodriver binary for this environment.
# NOTE(review): presumably needed by bokeh's `export_png`, which renders the
# figures through a Selenium-driven browser — confirm before removing.
import geckodriver_autoinstaller

geckodriver_autoinstaller.install()

# Load data

In [None]:
from tqdm import tqdm


# Only the "*validation.json" logs of the synthetic runs are analysed here.
# Sorting makes the row order of `analysis` (and of the exported CSV)
# independent of the order in which the file system lists the folder.
runs = sorted(r for r in os.listdir(_SYNTHETIC_RUNS_FOLDER) if r.endswith("validation.json"))
if not runs:
    raise FileNotFoundError(f"No '*validation.json' run found in {_SYNTHETIC_RUNS_FOLDER}")
results = list()

for f in tqdm(runs):
    with open(f"{_SYNTHETIC_RUNS_FOLDER}{f}", "r") as log:
        run = json.load(log)

    # Flatten the nested log into a single record per run. On a key clash the
    # later update wins: test < configuration < elapsed_time < tree.
    run_dict = dict()
    run_dict.update(run["test"])
    run_dict.update(run["configuration"])
    run_dict["elapsed_time"] = run["elapsed_time"]
    run_dict.update(run["tree"])
    results.append(run_dict)

analysis = pandas.DataFrame(results)
# Univariate runs logged with max_depth == 2 are discarded, while multivariate
# runs logged with max_depth == 2 are relabelled as max_depth == 1.
exclude = (analysis.split == "univariate") & (analysis.max_depth == 2)
# Explicit copy: the relabelling below must write into an independent frame
# rather than into a slice of the unfiltered one (no SettingWithCopyWarning).
analysis = analysis[~exclude].copy()
analysis.loc[(analysis.split == "multivariate") & (analysis.max_depth == 2), "max_depth"] = 1
analysis.to_csv(f"{_SYNTHETIC_RUNS_FOLDER}synthetic_analysis.csv", index=False)

In [None]:
# Quick look at the first flattened run records (one row per run).
analysis.head()

In [12]:
from plots import draw_twovariate_dataset
from bokeh.layouts import row
from bokeh.io import show, output_notebook, export_png, export_svg

from trees.structure.trees import ObliqueTree
import pickle


output_notebook()

correlations_of_interest = ["0.0", "0.5", "1.0"]
slopes_of_interest = ["15", "30", "45", "90"]
types = ["univariate", "multivariate"]
generated_dataset_template = "correlation:{0}_slope:{1}_noise:0.0_maxdepth:1_softmargin:1.0_run:0_type:univariate.data.npy"

# --- Noise-free datasets at each correlation of interest (slope fixed to the first slope of interest) ---
print("Increasing correlation")
datasets = [numpy.load(_SYNTHETIC_RUNS_FOLDER + generated_dataset_template.format(c, slopes_of_interest[0])) for c in correlations_of_interest]
plots = [draw_twovariate_dataset(dataset, width=900, height=480) for dataset in datasets]
show(row(plots))
export_png(row(plots), filename="correlations.png")


# --- Root split learned by the multivariate tree, drawn over each dataset ---
# NOTE(review): the label says "slope", yet the loop sweeps the correlation with
# the slope fixed to 30; the string is left untouched so the output is unchanged.
print("Increasing slope")
# Window inside which the split line is drawn.
left_x = -4
right_x = -left_x
down_y, up_y = -4, +4

plots = list()
for c in [0., 0.2, 0.5, 0.7, 0.9]:
    with open(f"{_SYNTHETIC_RUNS_FOLDER}correlation:{c}_slope:30_noise:0.0_maxdepth:1_softmargin:1e-06_run:0_type:multivariate.validation.json", "r") as log:
        validation = json.load(log)
    tree = ObliqueTree.from_json(validation["tree_structure"])

    # The root split is the line coefficients[0] * x + coefficients[1] * y = bound.
    coefficients, bound = tree.root.hyperplane.coefficients, tree.root.hyperplane.bound
    if coefficients[1] != 0:
        # Solve for y at the two horizontal ends of the window.
        left_point = (left_x, (bound - coefficients[0] * left_x) / coefficients[1])
        right_point = (right_x, (bound - coefficients[0] * right_x) / coefficients[1])
    else:
        # The split ignores the second feature: it is the vertical line
        # x = bound / coefficients[0], drawn across the vertical window.
        split_x = bound / coefficients[0]
        left_point, right_point = (split_x, down_y), (split_x, up_y)

    dataset = numpy.load(f"{_SYNTHETIC_RUNS_FOLDER}correlation:{c}_slope:30_noise:0.0_maxdepth:1_softmargin:1e-06_run:0_type:multivariate.data.npy")
    plot = draw_twovariate_dataset(dataset, width=900, height=480)
    plot.line([left_point[0], right_point[0]], [left_point[1], right_point[1]], line_width=5, color="black", line_dash="dashed")

    plots.append(plot)

show(row(plots))


# --- Noisy datasets (noise 0.2) at each slope of interest, with the flipped samples passed to the plot ---
# NOTE(review): the noise level is fixed here; it is the slope that varies.
print("Increasing noise")
generated_dataset_template = "correlation:{0}_slope:{1}_noise:0.2_maxdepth:1_softmargin:1.0_run:0_type:univariate.data.npy"
generated_noises_samples = "correlation:{0}_slope:{1}_noise:0.2_maxdepth:1_softmargin:1.0_run:0_type:univariate.data.flipped.npy"
datasets = [numpy.load(_SYNTHETIC_RUNS_FOLDER + generated_dataset_template.format(correlations_of_interest[0], s)) for s in slopes_of_interest]
datasets_flips = [numpy.load(_SYNTHETIC_RUNS_FOLDER + generated_noises_samples.format(correlations_of_interest[0], s)) for s in slopes_of_interest]
plots = [draw_twovariate_dataset(dataset, flipped=noise, width=450, height=200) for dataset, noise in zip(datasets, datasets_flips)]
show(row(plots))

Increasing correlation


Increasing slope


Increasing noise


In [10]:
# Noise-free, uncorrelated datasets at increasing slope, exported side by side.
# No learned split is overlaid on these plots, so the cell does not read any
# fitted tree and does not rely on one left behind by another cell.
print("Increasing slope")

plots = list()
for s in [15, 30, 45, 90]:
    dataset = numpy.load(f"{_SYNTHETIC_RUNS_FOLDER}correlation:0.0_slope:{s}_noise:0.0_maxdepth:1_softmargin:1e-06_run:0_type:multivariate.data.npy")
    plots.append(draw_twovariate_dataset(dataset, width=900, height=480))

show(row(plots))
export_png(row(plots), filename="row_slopes.png")

Increasing slope


'/mnt/disk1/mattiasetzu/trees_analysis/trees/notebooks/row_slopes.png'

# Performance on increasing correlation: single split

In [None]:
from plots import performance_by_correlation
from bokeh.io import show, output_notebook, export_png, export_svg
from bokeh.layouts import row, gridplot, column
# Range1d is used below to pin the x axis; without this import the cell raises
# a NameError on a fresh kernel.
from bokeh.models import Range1d


output_notebook()

metrics = ["average_precision", "roc_auc", "f1-score", "accuracy"]
metric_names = ["AP", "AUC", "F1", "Acc"]
# Noise levels shown in the figures: one grid column per level.
noises = [0., .1, .25]

is_single_split = analysis.max_depth == 1
is_univariate = analysis.split == "univariate"
is_multivariate = analysis.split == "multivariate"
# One exported grid per slope: rows are metrics, columns are noise levels.
for degree in sorted(analysis.degrees_slope.unique()):
    has_degrees = analysis.degrees_slope == degree
    degree_plots = list()
    for metric, metric_name in zip(metrics, metric_names):
        metric_plts = list()
        for eps in noises:
            has_noise_eps = analysis.noise == eps
            univariate_runs = analysis[is_single_split & is_univariate & has_noise_eps & has_degrees]
            multivariate_runs = analysis[is_single_split & is_multivariate & has_noise_eps & has_degrees]

            # Mean and standard deviation of the metric at each correlation.
            univariate_metric = univariate_runs.groupby("correlation")[metric]
            univariate_mean, univariate_std = univariate_metric.mean(), univariate_metric.std()
            multivariate_metric = multivariate_runs.groupby("correlation")[metric]
            multivariate_mean, multivariate_std = multivariate_metric.mean(), multivariate_metric.std()
            # The x values are taken from the multivariate runs only.
            correlations = numpy.unique(multivariate_runs.correlation)

            plt = performance_by_correlation(correlations, univariate_mean, multivariate_mean, univariate_std, multivariate_std, metric_name, eps, width=1800, height=1200)
            # Small margin around the [0, 1] correlation range.
            plt.x_range = Range1d(-0.05, 1.05)
            metric_plts.append(plt)
        degree_plots.append(metric_plts)
    export_png(gridplot(degree_plots), filename=f"single_split_on_{degree}.png")

# Performance on increasing correlation: full tree

In [None]:
from plots import performance_by_correlation
from bokeh.io import show, output_notebook, export_png, export_svg
from bokeh.layouts import row, gridplot, column
# Range1d is used below to pin the x axis; without this import the cell raises
# a NameError on a fresh kernel.
from bokeh.models import Range1d


# output_notebook()

metrics = ["average_precision", "roc_auc", "f1-score", "accuracy"]
metric_names = ["AP", "AUC", "F1", "Acc"]
# Noise levels shown in the figures: one grid column per level.
noises = [0., .1, .25]

# Runs with max_depth > 2 are the ones treated as full trees here.
is_full_tree = analysis.max_depth > 2
is_univariate = analysis.split == "univariate"
is_multivariate = analysis.split == "multivariate"
# One exported grid per slope: rows are metrics, columns are noise levels.
for degree in sorted(analysis.degrees_slope.unique()):
    has_degrees = analysis.degrees_slope == degree
    degree_plots = list()
    for metric, metric_name in zip(metrics, metric_names):
        metric_plts = list()
        for eps in noises:
            has_noise_eps = analysis.noise == eps
            univariate_runs = analysis[is_full_tree & is_univariate & has_noise_eps & has_degrees]
            multivariate_runs = analysis[is_full_tree & is_multivariate & has_noise_eps & has_degrees]

            # Mean and standard deviation of the metric at each correlation.
            univariate_metric = univariate_runs.groupby("correlation")[metric]
            univariate_mean, univariate_std = univariate_metric.mean(), univariate_metric.std()
            multivariate_metric = multivariate_runs.groupby("correlation")[metric]
            multivariate_mean, multivariate_std = multivariate_metric.mean(), multivariate_metric.std()

            # The x values are taken from the multivariate runs only.
            correlations = numpy.unique(multivariate_runs.correlation)

            plt = performance_by_correlation(correlations, univariate_mean, multivariate_mean, univariate_std, multivariate_std,
                                             metric_name, eps, width=1800, height=1200)
            # Small margin around the [0, 1] correlation range.
            plt.x_range = Range1d(-0.05, 1.05)

            metric_plts.append(plt)
        degree_plots.append(metric_plts)
    export_png(gridplot(degree_plots), filename=f"tree_on_{degree}.png")

# Size: how much more complex are univariate trees?

In [None]:
from plots import performance_by_correlation, size_difference_in_multiplier_per_noise, size_difference_absolute_per_noise, sizes_per_noise, mean_nonzero_coefficients_per_correlation
from bokeh.io import show, output_notebook, export_png


# Noise levels compared in each figure.
noises = [0., .1, .25]
correlations = sorted(analysis.correlation.unique().tolist())

# Runs with max_depth > 2 are the ones treated as full trees here.
is_full_tree = analysis.max_depth > 2
is_univariate = analysis.split == "univariate"
is_multivariate = analysis.split == "multivariate"
for degree in sorted(analysis.degrees_slope.unique()):
    has_degrees = analysis.degrees_slope == degree
    # Start from empty lists at every slope: each figure must only contain the
    # sizes measured at its own slope (one entry per noise level), not those
    # accumulated from the slopes processed before it.
    univariate_sizes, multivariate_sizes = list(), list()
    for eps in noises:
        has_noise_eps = analysis.noise == eps

        # Mean tree size at each correlation, ordered by correlation.
        univariate_mean = analysis[is_full_tree & is_univariate & has_noise_eps & has_degrees].groupby("correlation").mean(numeric_only=True).sort_index()["size"].values
        univariate_sizes.append(univariate_mean)

        multivariate_mean = analysis[is_full_tree & is_multivariate & has_noise_eps & has_degrees].groupby("correlation").mean(numeric_only=True).sort_index()["size"].values
        multivariate_sizes.append(multivariate_mean)

    plt1 = sizes_per_noise(correlations, univariate_sizes, multivariate_sizes, width=1800, height=1200)
    plt2 = size_difference_in_multiplier_per_noise(correlations, univariate_sizes, multivariate_sizes, width=1800, height=1200)
    # NOTE(review): this plot is fed tree sizes, like the two above — confirm
    # that it is not expected to receive coefficient counts instead.
    plt3 = mean_nonzero_coefficients_per_correlation(correlations, univariate_sizes, multivariate_sizes, width=1800, height=1200)

    export_png(plt1, filename=f"complexity_on_{degree}.png")
    export_png(plt2, filename=f"size_difference_multiplier_on_{degree}.png")
    export_png(plt3, filename=f"mean_nonzero_coefficients_on_{degree}.png")
#TODO: regenerate figure 3 w/ swapped shapes

In [None]:
from plots import *
from bokeh.io import show, output_notebook, export_png


splits = ["univariate", "multivariate"]
noises = [0, .1, .25]
# Order in which the slopes are laid out on the row.
slopes = [90, 15, 30, 45]
correlations = sorted(analysis.correlation.unique().tolist())

# Runs with max_depth > 2 are the ones treated as full trees here.
is_full_tree = analysis.max_depth > 2
is_univariate = analysis.split == "univariate"
is_multivariate = analysis.split == "multivariate"
# One row of markers / colors per noise level.
markers_by_noise = [
    ["triangle", "square", "triangle"],
    ["triangle", "square", "triangle"],
    ["triangle", "square", "triangle"]
]
colors_by_noise = [
    ["red", "purple"],
    ["orange", "blue"],
    ["yellow", "green"],
]

# full_*_sizes[i][j] holds the mean tree sizes (one value per correlation) at
# the i-th slope and the j-th noise level.
full_univariate_sizes = list()
full_multivariate_sizes = list()
for degree in slopes:
    has_degrees = analysis.degrees_slope == degree

    univariate_sizes, multivariate_sizes = list(), list()
    for eps in noises:
        has_noise_eps = analysis.noise == eps

        univariate_mean = analysis[is_full_tree & is_univariate & has_noise_eps & has_degrees].groupby("correlation").mean(numeric_only=True).sort_index()["size"].values
        univariate_sizes.append(univariate_mean)

        multivariate_mean = analysis[is_full_tree & is_multivariate & has_noise_eps & has_degrees].groupby("correlation").mean(numeric_only=True).sort_index()["size"].values
        multivariate_sizes.append(multivariate_mean)

    full_univariate_sizes.append(univariate_sizes)
    full_multivariate_sizes.append(multivariate_sizes)

plt = sizes_per_noise_by_slope_on_row(correlations, full_univariate_sizes, full_multivariate_sizes, width=900, height=480, markers=markers_by_noise, colors=colors_by_noise, noises=noises)
# export_png(row(plt), filename=f"size.png")
show(plt)

# Univariate VS Multivariate

## Is the univariate significantly less performant than the multivariate?
t-test on difference.

In [None]:
from scipy.stats import ttest_ind as two_sample_t_test


metrics = ["accuracy", "f1-score", "roc_auc", "average_precision"]
# Noise levels on which the test is run; the level is stored with each result.
tested_noises = [0.]

# Runs with max_depth > 2 are the ones treated as full trees here.
is_full_tree = analysis.max_depth > 2
is_univariate = analysis.split == "univariate"
is_multivariate = analysis.split == "multivariate"

difference_analysis = list()
for slope in analysis.degrees_slope.unique():
    has_slope = analysis.degrees_slope == slope
    for m in metrics:
        for eps in tested_noises:
            # Filter on the very noise level that is recorded in the result,
            # so that the stored `noise` always describes the tested samples.
            has_noise_eps = analysis.noise == eps

            univariate_samples = analysis[is_full_tree & is_univariate & has_noise_eps & has_slope][m].values
            multivariate_samples = analysis[is_full_tree & is_multivariate & has_noise_eps & has_slope][m].values

            # One-sided Welch test (unequal variances); the alternative
            # hypothesis is that the univariate mean is lower.
            test_results = two_sample_t_test(univariate_samples, multivariate_samples, equal_var=False, alternative="less")

            difference_analysis.append((slope, float(test_results.statistic), float(test_results.pvalue), m, eps))
difference_analysis = pandas.DataFrame(difference_analysis, columns=["slope", "t_test_outcome", "p_value", "metric", "noise"]).sort_values(by="slope")
# The export folder may not exist yet on a fresh checkout.
os.makedirs(f"{__DATA}experiments", exist_ok=True)
difference_analysis.to_csv(f"{__DATA}experiments/difference_t_test.csv", index=False)
difference_analysis[difference_analysis.slope.isin([90, 30, 15, 45])]

### Table format

In [None]:
df = difference_analysis[difference_analysis.slope.isin([90, 30, 15, 45])]
metrics = ["accuracy", "f1-score", "roc_auc", "average_precision"]
pandas.options.display.float_format = "{:.3f}".format

# Print one LaTeX table row per slope: for every metric, the t statistic
# followed by the p-value, with metrics separated by an empty column ("&&").
for degree in [90, 15, 30, 45]:
    df_filtered = df[df.slope == degree]

    cells = list()
    for metric in metrics:
        metric_rows = df_filtered[df_filtered.metric == metric]
        outcome = metric_rows.t_test_outcome.values[0]
        pvalue = metric_rows.p_value.values[0]
        # Fixed-point formatting stays correct for values whose string form
        # uses scientific notation (e.g. 1.2e-05 is printed as 0.000), which
        # cutting the string three characters after the "." does not.
        cells.append(f"{outcome:.3f} & {pvalue:.3f}")
    row_body = " && ".join(cells)

    # The 90 degree slope is labelled as 0 in the table.
    print(f"{degree if degree != 90 else 0}\\degree & {row_body} \\\\ ")

In [None]:
# Per-slope summary statistics of the t statistic and of the p-value.
difference_analysis.groupby("slope")[["t_test_outcome", "p_value"]].describe()