# How correlated are standard datasets?

In [3]:
%load_ext autoreload
%autoreload 2

import os
import json
import sys

from datasets import load_dataset
from pandas.api.types import is_numeric_dtype
from numpy import triu_indices
import pandas

from tqdm import tqdm
from bokeh.io import show, output_notebook, export_png
from bokeh.models import Row

from plots import *


__CWD = os.getcwd() + "/"
__SRC = __CWD + "../src/"
__DATA = __CWD + "../data/"
__EXPERIMENTS = __CWD + "../experiments/"
__NOTEBOOKS = __CWD

sys.path.append(__SRC)

import pandas

pandas.set_option('display.max_rows', None)

_BENCHMARK_RUNS_FOLDER = __DATA + "benchmark/"


output_notebook()


names = [
    ("mstz/acute_inflammation", "inflammation"),
    ("mstz/adult", "income"),
    ("mstz/arcene", "arcene"),
    ("mstz/arhythmia", "has_arhythmia"),
    ("mstz/australian_credit", "australian_credit"),
    ("mstz/balance_scale", "is_balanced"),
    ("mstz/bank", "subscription"),
    ("mstz/blood", "blood"),
    ("mstz/breast", "cancer"),
    ("mstz/car", "car_binary"),
    ("mstz/contraceptive", "contraceptive"),
    ("mstz/compas", "two-years-recidividity"),
    ("mstz/covertype", "covertype_0"),
    ("mstz/dexter", "dexter"),
    ("mstz/electricity", "electricity"),
    ("mstz/fertility", "fertility"),
    ("mstz/german", "loan"),
    ("mstz/gisette", "gisette"),
    ("mstz/glass", "vehicles"),
    ("mstz/heart_failure", "death"),
    ("mstz/heloc", "risk"),
    ("mstz/higgs", "higgs"),
    ("mstz/hill", "hill"),
    ("mstz/hypo", "has_hypo"),
    ("mstz/ipums", "ipums"),
    ("mstz/lrs", "lrs_0"),
    ("mstz/magic", "magic"),
    ("mstz/madelon", "madelon"),
    ("mstz/house16", "house16"),
    ("mstz/ionosphere", "ionosphere"),
    ("mstz/magic", "magic"),
    ("mstz/musk", "musk"),
    ("mstz/nbfi", "default"),
    ("mstz/ozone", "8hr"),
    ("mstz/page_blocks", "page_blocks_binary"),
    ("mstz/phoneme", "phoneme"),
    ("mstz/pima", "pima"),
    ("mstz/pol", "pol"),
    ("mstz/pums", "pums"),
    ("mstz/planning", "planning"),
    ("mstz/post_operative", "post_operative_binary"),
    ("mstz/seeds", "seeds_0"),
    ("mstz/seeds", "seeds_1"),
    ("mstz/seeds", "seeds_2"),
    ("mstz/segment", "brickface"),
    ("mstz/shuttle", "shuttle_0"),
    ("mstz/sonar", "sonar"),
    ("mstz/spambase", "spambase"),
    ("mstz/spect", "spect"),
    ("mstz/speeddating", "dating"),
    ("mstz/steel_plates", "steel_plates_0"),
    ("mstz/student_performance", "math"),
    ("mstz/sydt", "sydt"),
    ("mstz/toxicity", "toxicity"),
    ("mstz/twonorm", "twonorm"),
    ("mstz/vertebral_column", "abnormal"),
    ("mstz/wall_following", "wall_following_0"),
    ("mstz/wine_origin", "wine_origin_0"),
    ("mstz/wine", "wine")
]

pandas_datasets = {name: load_dataset(name, config)["train"].to_pandas() for name, config in names}

# Dataset infos

In [16]:
import numpy

dataset_sizes = numpy.array([pandas_datasets[d].shape[0] for d, _ in names])
dataset_dimensionalities = numpy.array([pandas_datasets[d].shape[1] for d, _ in names])
dataset_numeric_dimensionalities = numpy.array([len([f for f in pandas_datasets[d].columns if is_numeric_dtype(pandas_datasets[d][f])])
                                                for d, _ in names])

plt = draw_dimensionality_and_size(names, dataset_sizes, dataset_dimensionalities, dataset_numeric_dimensionalities, width=900, height=450)
show(plt)

In [17]:
dataset_infos = pandas.DataFrame({
    "name": [name for name, _ in names],
    "config": [config for _, config in names],
    "dataset_size": dataset_sizes,
    "dimensionality": dataset_dimensionalities,
    "numeric_dimensionality": dataset_numeric_dimensionalities
})
dataset_infos["dimensionality_ratio"] = dataset_infos.dataset_size / dataset_infos.dimensionality
dataset_infos["numeric_dimensionality_ratio"] = dataset_infos.dataset_size / dataset_infos.numeric_dimensionality
dataset_infos

Unnamed: 0,name,config,dataset_size,dimensionality,numeric_dimensionality,dimensionality_ratio,numeric_dimensionality_ratio
0,mstz/acute_inflammation,inflammation,120,9,9,13.333333,13.33333
1,mstz/adult,income,36631,14,8,2616.5,4578.875
2,mstz/arcene,arcene,100,10001,10001,0.009999,0.009999
3,mstz/arhythmia,has_arhythmia,68,280,280,0.242857,0.2428571
4,mstz/australian_credit,australian_credit,690,15,8,46.0,86.25
5,mstz/balance_scale,is_balanced,625,5,5,125.0,125.0
6,mstz/bank,subscription,45211,13,10,3477.769231,4521.1
7,mstz/blood,blood,748,4,4,187.0,187.0
8,mstz/breast,cancer,683,10,10,68.3,68.3
9,mstz/car,car_binary,1728,7,7,246.857143,246.8571


In [None]:
print(dataset_infos.describe().loc[["mean", "std", "min", "max"]].transpose().to_latex())

### Dataset statistics

In [None]:
print("""\\begin{table}
\t\\centering
\t\\begin{tabular}{ @{} l l l l @{} }
\t\\toprule
\t& \\textbf{Mean} & \\textbf{Min} & \\textbf{Max} \\\\
\t\\toprule""")
print(f"\t\t{dataset_sizes.mean()} $\\pm$ {dataset_sizes.std()} & {dataset_sizes.min()} & {dataset_sizes.max()}\\\\")
print(f"\t\t{dataset_dimensionalities.mean()} $\\pm$ {dataset_dimensionalities.std()}  & {dataset_dimensionalities.min()} & {dataset_dimensionalities.max()}\\\\")
print(f"\t\t{dataset_numeric_dimensionalities.mean()} $\\pm$ {dataset_numeric_dimensionalities.std()}  & {dataset_numeric_dimensionalities.min()} & {dataset_numeric_dimensionalities.max()}\\\\")
print("""\t\t\\bottomrule
\t\\end{tabular}
\t\\caption{Summary of the datasets analyzed in the paper.}
\t\\label{tbl:datasets_summary}
\\end{table}""")

In [None]:
for (name, _), size, dimensionality, numeric_dimensionality in zip(names, dataset_sizes, dataset_dimensionalities, dataset_numeric_dimensionalities):
    print(f"{name.replace('mstz/', '').replace('_', ' ')} & {size} & {dimensionality} & {numeric_dimensionality} \\\\")

# Dataset correlation

In [19]:
from sklearn.preprocessing import StandardScaler


correlation_metrics = ["pearson", "kendall", "spearman"]
correlations = dict()
for metric in correlation_metrics:
    correlations[metric] = list()
    for name, config in tqdm(names):
        if os.path.exists(f"../data/experiments/correlations/{name.replace('mstz/', '')}.{config}.correlation.{metric}.csv"):
            correlation = pandas.read_csv(f"../data/experiments/correlations/{name.replace('mstz/', '')}.{config}.correlation.{metric}.csv")
        else:
            print(f"{name}...")
            d = pandas_datasets[name].copy()

            # normalize dataset
            scaler = StandardScaler()
            numeric_columns = [f for f in d.columns[:-1]
                               if d.dtypes[f].name.startswith("int")
                                   or d.dtypes[f].name.startswith("float")
                                   or d.dtypes[f].name.startswith("bool")]
            d = d[numeric_columns + [d.columns[-1]]]
            # select non-boolean columns
            non_boolean_columns = [i for i in d.columns[:-1] if set(d[i]) != {0, 1}]
            if len(non_boolean_columns) > 0:
                d.loc[:, non_boolean_columns] = scaler.fit_transform(d.loc[:, non_boolean_columns].values)

            correlation = d.corr(method=metric, numeric_only=True)
            correlation.to_csv(f"../data/experiments/correlations/{name.replace('mstz/', '')}.{config}.correlation.{metric}.csv", index=False)

        correlations[metric].append(correlation)
    
linearized_correlations = {
    metric: numpy.hstack([dataset_correlations.values[triu_indices(dataset_correlations.shape[0], 1)]
                          for dataset_correlations in tqdm(datasets_correlations)])
    for metric, datasets_correlations in correlations.items()
}

100%|██████████████████████████████████████████████████████████| 59/59 [01:52<00:00,  1.90s/it]
100%|██████████████████████████████████████████████████████████| 59/59 [03:43<00:00,  3.79s/it]
100%|██████████████████████████████████████████████████████████| 59/59 [03:44<00:00,  3.81s/it]
100%|██████████████████████████████████████████████████████████| 59/59 [00:01<00:00, 30.97it/s]
100%|██████████████████████████████████████████████████████████| 59/59 [00:05<00:00,  9.84it/s]
100%|██████████████████████████████████████████████████████████| 59/59 [00:05<00:00, 10.06it/s]


In [20]:
correlations_per_dataset = dict()
for metric in correlation_metrics:
    mean_correlations = numpy.array([numpy.nanmean(abs(dataset_correlations.values[triu_indices(dataset_correlations.shape[0], 1)])) for dataset_correlations in correlations[metric]])
    std_correlations = numpy.array([numpy.nanstd(abs(dataset_correlations.values[triu_indices(dataset_correlations.shape[0], 1)])) for dataset_correlations in correlations[metric]])

    correlations_per_dataset[metric] = (mean_correlations, std_correlations)

In [21]:
dataset_infos["pearson_mean_correlation"] = correlations_per_dataset["pearson"][0]
dataset_infos["pearson_std_correlation"] = correlations_per_dataset["pearson"][1]
dataset_infos["spearman_mean_correlation"] = correlations_per_dataset["pearson"][0]
dataset_infos["spearman_std_correlation"] = correlations_per_dataset["pearson"][1]
dataset_infos["kendall_mean_correlation"] = correlations_per_dataset["kendall"][0]
dataset_infos["kendall_std_correlation"] = correlations_per_dataset["kendall"][1]

# Figure 4

In [None]:
from plots import draw_mean_correlations

output_notebook()

for metric in correlations_per_dataset:
    metric_in_latex = {
        "kendall": "\\tau",
        "spearman": "S",
        "pearson": "P"
    }
    plt = draw_correlations_cdf(linearized_correlations[metric])
    plt.toolbar_location = None
    export_png(plt, filename=f"datasets.correlations.{metric}.png")
    print(metric)
#     show(plt)

In [None]:
from plots import draw_correlations_cdf

for metric in correlations_per_dataset:
    metric_in_latex = {
        "kendall": "\\tau",
        "spearman": "S",
        "pearson": "P"
    }
    plt = draw_correlations_cdf(correlations_per_dataset[metric][0], label=f"$$ \mid \\rho^{metric_in_latex[metric]} \mid $$")
    plt.toolbar_location = None
#     export_png(plt, filename=f"datasets.correlations.{metric}.png")
    show(plt)

In [None]:
for metric in correlation_metrics:
    ps = numpy.nanpercentile(abs(linearized_correlations[metric]), [0, 25
                                                                    , 50, 75, 90, 95, 97.5, 99])
    print("\\textbf{" + metric.capitalize() + "} & " + " & ".join([str(p)[:5] for p in ps]) + "\\\\")

# Dataset slope: what's the approximated angular coefficient of the datasets?
**Warning** Use the `experiments/parallel_slopes.py` script for parallel computation.
You can run one as follows:
```shell
for i in {0..300}; do
    python parallel_slopes.py extract --name=$dataset --config=$config --batch=$i --batch_size=200000 &
done
```

Note that this launches **300** different processes. Adjust according to your machine capabilities.

In [7]:
import pandas

slopes = pandas.read_csv(__DATA + "experiments/slopes/slopes.csv", header=None)
slopes.columns = ["dataset", "config", "pair_index", "coefficient_0", "coefficient_1", "precomputed_slope", "precomputed_cleaned_slope"]
slopes.head()

Unnamed: 0,dataset,config,pair_index,coefficient_0,coefficient_1,precomputed_slope,precomputed_cleaned_slope
0,acute_inflammation,inflammation,0,1.993327,0.0,,0.0
1,acute_inflammation,inflammation,1,1.597236,1.944293,-0.821499,-0.821499
2,acute_inflammation,inflammation,2,2.048083,3.505073,-0.58432,-0.58432
3,acute_inflammation,inflammation,3,2.478072,2.29254,-1.080928,-1.080928
4,acute_inflammation,inflammation,4,1.987816,0.611009,-3.253332,-3.253332


In [8]:
nonzero_beta_1 = slopes.coefficient_0 != 0
nonzero_beta_2 = slopes.coefficient_1 != 0
zero_beta_1 = slopes.coefficient_0 == 0
zero_beta_2 = slopes.coefficient_1 == 0
# Different conditions
base_slope = nonzero_beta_2
corrected_slope = nonzero_beta_1 & zero_beta_2
null_slope = zero_beta_1 & zero_beta_2

slopes["slope"] = numpy.repeat(numpy.nan, slopes.shape[0])
slopes.loc[base_slope, "slope"] = abs(numpy.arctan(- slopes[base_slope].coefficient_0.values / slopes[base_slope].coefficient_1.values) * 180 / numpy.pi)
slopes.loc[corrected_slope, "slope"] = 90
slopes["slope_min"] = slopes[["slope"]].applymap(lambda x: min(x, 90 - x))

print(f"Both nonzero coefficients: {100 * slopes[base_slope].shape[0] / slopes.shape[0]}")
print(f"Beta 2 zero coefficient: {100 * slopes[corrected_slope].shape[0] / slopes.shape[0]}")
print(f"Both zero coefficients: {100 * slopes[null_slope].shape[0] / slopes.shape[0]}")

values, counts = numpy.unique(slopes[~null_slope].slope_min.values, return_counts=True)
normalized_counts = counts / sum(counts)
cdf = numpy.cumsum(normalized_counts)

Both nonzero coefficients: 96.36556599894259
Beta 2 zero coefficient: 3.150368566956275
Both zero coefficients: 0.48406543410113934


In [None]:
abs_slope_frequencies = numpy.repeat(numpy.nan, slopes.shape[0])
cond = (slopes.coefficient_0.values != 0) & (slopes.coefficient_1.values != 0)
# x1_neq_0 = slopes.coefficient_0.values != 0
# x2_neq_0 = slopes.coefficient_1.values != 0
abs_slope_frequencies[cond] = abs(numpy.arctan(- slopes[cond].coefficient_0 / slopes[cond].coefficient_1) * 180 / numpy.pi)
abs_slope_frequencies[(slopes.coefficient_0.values != 0) & (slopes.coefficient_1.values == 0)] = 90

print(f"Both != 0 coefficients (percentage):  {100 * numpy.mean(cond)}")
print(f"Both 0 coefficients (percentage):  {100 * numpy.mean(abs_slope_frequencies.isna())}")
abs_slope_frequencies = abs_slope_frequencies[abs_slope_frequencies.isna()]

# slope_frequencies = slopes["slope"].values
# slope_frequencies = numpy.arctan(slope_frequencies) * 180 / numpy.pi
# abs_slope_frequencies = abs(numpy.arctan(- slopes.coefficient_0 / slopes.coefficient_1) * 180 / numpy.pi)
# abs_slope_frequencies[beta1 > 0 and beta2 == 0] = +pi/2
# abs_slope_frequencies[beta1 < 0 and beta2 == 0] = -pi/2

values, counts = numpy.unique(abs_slope_frequencies, return_counts=True)
normalized_counts = counts / sum(counts)

cdf = numpy.cumsum(normalized_counts)

# Figure 5

In [9]:
from bokeh.models import NumeralTickFormatter
from bokeh.models import Range1d


x = values[::8]
y = cdf[::8]
plt = figure(x_axis_label="$$ \\mid \\hat{\\theta} \\mid $$",
             y_axis_label="CDF",
             y_axis_type="log",
             frame_width=900, frame_height=900,
             toolbar_location=None)
plt.line(x, y, line_width=5, color="red")

df = ColumnDataSource(
    data={
        "x": x,
        "y": y
    }
)

plt.xaxis.axis_label_text_font = "CMU"
plt.yaxis.axis_label_text_font = "CMU"
plt.title.text_font = "CMU"
plt.xgrid.grid_line_color = None
plt.ygrid.grid_line_color = None
plt.xaxis.minor_tick_line_color = None  # turn off x-axis minor ticks
plt.yaxis.minor_tick_line_color = None  # turn off x-axis minor ticks

plt.title.text_font_size = "0pt"
plt.xaxis.major_label_text_font_size = "30pt"
plt.xaxis.axis_label_text_font_size = "30pt"
plt.xaxis.axis_label_text_font_size = "30pt"
plt.yaxis.major_label_text_font_size = "30pt"
plt.yaxis.axis_label_text_font_size = "30pt"
plt.yaxis[0].ticker.desired_num_ticks = 8
plt.xaxis[0].ticker.desired_num_ticks = 10

min_x, max_x = x.min(), x.max()
plt.line([min_x, max_x], [1., 1.], line_width=3, line_dash="dashed", color="black")

export_png(plt, filename="frequency_abs_cut_without_reflection_no_log.png")

'/mnt/disk1/mattiasetzu/trees_analysis/trees/notebooks/frequency_abs_cut_without_reflection_no_log.png'

# Trees validation
To run the tree induction algorithms, see `experiments/README.md`.

In [22]:
validation_files = os.listdir(_BENCHMARK_RUNS_FOLDER)
validation_files = [f for f in validation_files if f.endswith(".validation.json") and "p53" not in f and "liver" not in f]

validation_df = list()
for f in tqdm(validation_files):
    with open(f"{_BENCHMARK_RUNS_FOLDER}{f}", "r") as log:
        report = json.load(log)

    full_name = "mstz/" + report["configuration"]["name"]
    dataset = pandas_datasets[full_name]
    numeric_columns = [f for f in dataset.columns[:-1]
                       if dataset.dtypes[f].name.startswith("int")
                           or dataset.dtypes[f].name.startswith("float")
                           or dataset.dtypes[f].name.startswith("bool")]
    dataset_index = [i for i, (name, _) in enumerate(names) if name == full_name][0]
    dataset_mean_correlations = [correlations_per_dataset[metric][0][dataset_index] for metric in correlation_metrics]
    dataset_std_correlations = [correlations_per_dataset[metric][1][dataset_index] for metric in correlation_metrics]
    validation_df.append((report["configuration"]["name"],
                          report["configuration"]["config"],
                          len(numeric_columns),
                          dataset.shape[0],
                          dataset_mean_correlations[0],
                          dataset_mean_correlations[1],
                          dataset_mean_correlations[2],
                          dataset_std_correlations[0],
                          dataset_std_correlations[1],
                          dataset_std_correlations[2],
                          report["configuration"]["soft_margin"],
                          report["configuration"]["split"],
                          report["training"]["f1-score"],
                          report["training"]["accuracy"],
                          report["training"]["roc_auc"],
                          report["training"]["average_precision"],
                          report["test"]["f1-score"],
                          report["test"]["accuracy"],
                          report["test"]["roc_auc"],
                          report["test"]["average_precision"],
                          report["tree"]["size"],
                          report["tree"]["mean_node_size"],
                          report["tree"]["std_node_size"],
                          report["tree"]["mean_node_size"] * report["tree"]["size"] if "_univariate" not in f else report["tree"]["size"]
                         ))
validation_df = pandas.DataFrame(validation_df)
validation_df.columns = ["name", "config",
                         "numeric_dimensionality",
                         "dataset_size",
                         "mean_correlation_pearson", "mean_correlation_kendall", "mean_correlation_spearman",
                         "std_correlation_pearson", "std_correlation_kendall", "std_correlation_spearman",
                         "soft_margin", "split",
                         "f1_score_TR", "accuracy_TR", "roc_auc_TR", "average_precision_TR",
                         "f1_score_TS", "accuracy_TS", "roc_auc_TS", "average_precision_TS", "tree_size",
                         "mean_node_size", "std_node_size", "nonzero_coefficients"]
validation_df.head()

100%|████████████████████████████████████████████████████████| 116/116 [00:21<00:00,  5.38it/s]


Unnamed: 0,name,config,numeric_dimensionality,dataset_size,mean_correlation_pearson,mean_correlation_kendall,mean_correlation_spearman,std_correlation_pearson,std_correlation_kendall,std_correlation_spearman,...,roc_auc_TR,average_precision_TR,f1_score_TS,accuracy_TS,roc_auc_TS,average_precision_TS,tree_size,mean_node_size,std_node_size,nonzero_coefficients
0,seeds,seeds_1,7,210,0.616342,0.484179,0.612591,0.289386,0.251921,0.278639,...,1.0,1.0,0.975958,0.97619,0.964286,0.952381,7,0.714286,0.404061,5.0
1,house16,house16,16,22784,0.146567,0.169684,0.228568,0.16932,0.141044,0.177177,...,0.869722,0.905152,0.853205,0.85429,0.818112,0.871315,413,0.0625,0.0,413.0
2,steel_plates,steel_plates_0,27,1941,0.235965,0.218711,0.282402,0.2211,0.224434,0.265465,...,0.856425,0.553378,0.917942,0.915167,0.754639,0.309638,65,0.306713,0.431151,19.936343
3,madelon,madelon,500,2000,0.018221,0.012523,0.01822,0.020984,0.015766,0.020894,...,1.0,1.0,0.527142,0.5275,0.5275,0.51455,27,0.973231,0.08265,26.277231
4,compas,two-years-recidividity,12,4534,0.178455,0.182163,0.204907,0.195926,0.180507,0.193283,...,0.692453,0.466724,0.863518,0.886439,0.671213,0.438101,37,0.185185,0.244627,6.851852


In [None]:
print(validation_df[validation_df.split == "multivariate"].sort_values(by="name")[["name", "dataset_size", "numeric_dimensionality", "f1_score_TS", "accuracy_TS", "roc_auc_TS", "average_precision_TS", "tree_size"]].to_latex())

In [None]:
pandas.options.display.float_format = "{:.3f}".format

for name, config in names:   
    df = validation_df[(validation_df.name == name.replace("mstz/", "")) & (validation_df.config == config)]
    if name == "mstz/spect":
        continue
    
    size = df["dataset_size"].values[0]
    features = df["numeric_dimensionality"].values[0]
    f1_u = str(df[df.split == "univariate"]["f1_score_TS"].values[0])
    f1_m = str(df[df.split == "multivariate"]["f1_score_TS"].values[0])
    acc_u = str(df[df.split == "univariate"]["accuracy_TS"].values[0])
    acc_m = str(df[df.split == "multivariate"]["accuracy_TS"].values[0])
    auc_u = str(df[df.split == "univariate"]["roc_auc_TS"].values[0])
    auc_m = str(df[df.split == "multivariate"]["roc_auc_TS"].values[0])
    ap_u = str(df[df.split == "univariate"]["average_precision_TS"].values[0])
    ap_m = str(df[df.split == "multivariate"]["average_precision_TS"].values[0])
    
    f1_u = f1_u[:f1_u.index(".") + 4]
    f1_m = f1_m[:f1_m.index(".") + 4]
    acc_u = acc_u[:acc_u.index(".") + 4]
    acc_m = acc_m[:acc_m.index(".") + 4]
    auc_u = auc_u[:auc_u.index(".") + 4]
    auc_m = auc_m[:auc_m.index(".") + 4]
    ap_u = ap_u[:ap_u.index(".") + 4]
    ap_m = ap_m[:ap_m.index(".") + 4]
    
    print(f"{name.replace('mstz/', '')} & {size} & {features} & {acc_u} & {acc_m} && {f1_u} & {f1_m} && {auc_u} & {auc_m} && {ap_u} & {ap_m} \\\\".replace("_", "\\_"))

In [None]:
validation_df.groupby("split").mean(numeric_only=True)["tree_size"]

# Table 3

## Aggregate multivariate

In [None]:
validation_df[validation_df.split == "multivariate"].describe().loc[["mean", "std"]][["f1-score_TS", "average_precision_TS", "roc_auc_TS", "accuracy_TS"]] * 100

## Aggregate univariate

In [None]:
validation_df[validation_df.split == "univariate"].describe().loc[["mean", "std"]][["f1-score_TS", "average_precision_TS", "roc_auc_TS", "accuracy_TS"]] * 100

## Do the two families have different levels of overfit?

In [None]:
train = validation_df[validation_df.split == "multivariate"].describe().loc[["mean", "std"]][["f1-score_TR", "average_precision_TR", "roc_auc_TR"]] * 100
test = validation_df[validation_df.split == "multivariate"].describe().loc[["mean", "std"]][["f1-score_TS", "average_precision_TS", "roc_auc_TS"]] * 100
train.columns = ["f1_score", "average_precision", "roc_auc"]
test.columns = ["f1_score", "average_precision", "roc_auc"]
train - test

In [None]:
train = validation_df[validation_df.split == "univariate"].describe().loc[["mean", "std"]][["f1-score_TR", "average_precision_TR", "roc_auc_TR"]] * 100
test = validation_df[validation_df.split == "univariate"].describe().loc[["mean", "std"]][["f1-score_TS", "average_precision_TS", "roc_auc_TS"]] * 100
train.columns = ["f1_score", "average_precision", "roc_auc"]
test.columns = ["f1_score", "average_precision", "roc_auc"]
train - test

# Performance difference in Univariate VS Multivariate

In [23]:
by_dataset_diffs = list()
validated_datasets = list(validation_df.groupby(["name", "config"]).groups.keys())
for name, config in validated_datasets:
    if name in ["mstz/covertype", "mstz/sydt"]:
        continue
    uni = validation_df[(validation_df.name == name) & (validation_df.config == config) & (validation_df.split == "univariate")]
    multi = validation_df[(validation_df.name == name) & (validation_df.config == config) & (validation_df.split == "multivariate")]
    uni = uni[["f1_score_TR", "accuracy_TR", "roc_auc_TR", "average_precision_TR", "f1_score_TS", "accuracy_TS", "roc_auc_TS", "average_precision_TS"]].values
    multi = multi[["f1_score_TR", "accuracy_TR", "roc_auc_TR", "average_precision_TR", "f1_score_TS", "accuracy_TS", "roc_auc_TS", "average_precision_TS"]].values.mean(axis=0)
    
    by_dataset_diffs.append([name, config] + (100 * (uni - multi).squeeze()).tolist())
by_dataset_diffs = pandas.DataFrame(numpy.array(by_dataset_diffs),
                                    columns=["name", "config",
                                              "f1_score_TR", "accuracy_TR", "roc_auc_TR", "average_precision_TR", "f1_score_TS", "accuracy_TS", "roc_auc_TS", "average_precision_TS"])
by_dataset_diffs = by_dataset_diffs.astype({
    "f1_score_TR": float,
    "accuracy_TR": float,
    "roc_auc_TR": float,
    "average_precision_TR": float,
    "f1_score_TS": float,
    "accuracy_TS": float,
    "roc_auc_TS": float,
    "average_precision_TS": float
})
by_dataset_diffs = by_dataset_diffs.join(dataset_infos[["name",
                                                        "dataset_size",
                                                        "dimensionality", "numeric_dimensionality",
                                                        "pearson_mean_correlation", "spearman_mean_correlation", "kendall_mean_correlation"
                                                       ]], rsuffix="_joined")
by_dataset_diffs.drop("name_joined", axis="columns", inplace=True)
by_dataset_diffs = by_dataset_diffs[[
                                    "name", "config",
                                    "dataset_size",
                                    "dimensionality", "numeric_dimensionality",
                                    "pearson_mean_correlation", "spearman_mean_correlation", "kendall_mean_correlation",
                                    "f1_score_TS", "accuracy_TS", "roc_auc_TS", "average_precision_TS",
                                    "f1_score_TR", "accuracy_TR", "roc_auc_TR", "average_precision_TR"
                                    ]]
by_dataset_diffs

Unnamed: 0,name,config,dataset_size,dimensionality,numeric_dimensionality,pearson_mean_correlation,spearman_mean_correlation,kendall_mean_correlation,f1_score_TS,accuracy_TS,roc_auc_TS,average_precision_TS,f1_score_TR,accuracy_TR,roc_auc_TR,average_precision_TR
0,acute_inflammation,inflammation,120,9,9,0.332817,0.332817,0.321514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,adult,income,36631,14,8,0.096898,0.096898,0.097072,44.491655,47.379555,26.965696,27.194079,45.388401,48.259623,27.418745,28.090292
2,arcene,arcene,100,10001,10001,0.186017,0.186017,0.157184,30.967742,15.0,20.707071,14.494949,0.0,0.0,0.0,0.0
3,arhythmia,has_arhythmia,68,280,280,0.152894,0.152894,0.125609,13.769841,14.285714,10.0,5.297619,0.0,0.0,0.0,0.0
4,australian_credit,australian_credit,690,15,8,0.146747,0.146747,0.129316,11.470685,11.594203,10.900575,10.113788,11.350766,11.413043,10.931771,18.42927
5,balance_scale,is_balanced,625,5,5,0.005051,0.005051,0.004518,-0.963165,1.6,-12.826087,-4.028571,6.290118,8.6,5.837366,24.715336
6,bank,subscription,45211,13,10,0.056306,0.056306,0.073659,27.138356,39.245826,5.947014,5.260416,27.808639,39.59301,7.297504,8.870056
7,blood,blood,748,4,4,0.347104,0.347104,0.32007,20.341349,28.0,17.47076,8.834656,13.319598,19.397993,6.657709,13.531632
8,breast,cancer,683,10,10,0.625502,0.625502,0.569206,2.951443,2.919708,3.686798,5.715668,1.279776,1.282051,1.227786,3.231152
9,car,car_binary,1728,7,7,0.070001,0.070001,0.0636,12.213854,11.849711,16.695804,28.180012,8.053525,7.88712,10.952409,19.82504


## How are differences distributed?

In [27]:
from bokeh.layouts import row

output_notebook()


differences_by_metric = by_dataset_diffs[["f1_score_TS", "average_precision_TS", "roc_auc_TS", "accuracy_TS"]].values.transpose()
differences_means, differences_stds = list(), list()

plts = list()
for values, name, marker in zip(differences_by_metric, ["F1", "AP", "ROC", "Accuracy"], ["circle"] * 4):
    differences = numpy.array(values)
    p = trees_performance_differences(differences, name, marker=marker, width=1800, height=900)
    differences_means.append(differences.mean())
    differences_stds.append(differences.std())
    plts.append(p)
    export_png(p, filename=f"{name}.gaps.png")

# show(row(plts))

# Which datasets favor which family?
Some dataset largely favor one family more than another:
- `MDT >> UDT` strongly favor multivariate trees,
- `UDT >> MDT` strongly favor univariate trees,
- `MDT ~ UDT` show comparable (within one standard deviation) performance differences.

## Are there any outlier datasets?

In [29]:
outlier_differences = by_dataset_diffs[(by_dataset_diffs.f1_score_TS > differences_means[0] + differences_stds[0]) | (by_dataset_diffs.f1_score_TS < differences_means[0] - differences_stds[0])][[
                                                                                                                                       "name",
                                                                                                                                       "dataset_size",
                                                                                                                                       "numeric_dimensionality",
                                                                                                                                       "f1_score_TS", "average_precision_TS", "roc_auc_TS", "accuracy_TS",
                                                                                                                                       "pearson_mean_correlation", "spearman_mean_correlation", "kendall_mean_correlation"
                                                                                                                                      ]].sort_values(by="f1_score_TS")
outlier_differences["dimensionality_ratio"] = outlier_differences.dataset_size / outlier_differences.numeric_dimensionality
outlier_differences = outlier_differences[["name",
                                           "dataset_size",
                                           "numeric_dimensionality",
                                           "dimensionality_ratio",
                                           "pearson_mean_correlation", "spearman_mean_correlation", "kendall_mean_correlation",
                                           "f1_score_TS", "average_precision_TS", "roc_auc_TS", "accuracy_TS"]]
outlier_differences

Unnamed: 0,name,dataset_size,numeric_dimensionality,dimensionality_ratio,pearson_mean_correlation,spearman_mean_correlation,kendall_mean_correlation,f1_score_TS,average_precision_TS,roc_auc_TS,accuracy_TS
22,hill,606,101,6.0,0.975442,0.975442,0.966204,-21.201202,-6.393443,-10.655738,-10.655738
40,seeds,87,9,9.666667,0.092906,0.092906,0.087391,-15.391042,-30.15873,-19.642857,-14.285714
18,glass,214,10,21.4,0.188817,0.188817,0.152439,-6.322136,-14.341085,-19.166667,-6.976744
30,musk,19020,11,1729.090909,0.2575,0.2575,0.221049,-5.396783,-6.321053,-5.952381,-5.208333
46,spambase,208,61,3.409836,0.227444,0.227444,0.145703,29.395114,34.457448,30.233419,23.452769
12,covertype,581012,55,10563.854545,0.042413,0.042413,0.03674,30.663015,29.162703,33.624073,30.146382
2,arcene,100,10001,0.009999,0.186017,0.186017,0.157184,30.967742,14.494949,20.707071,15.0
35,pima,5404,6,900.666667,0.169798,0.169798,0.125806,33.347336,16.112712,24.796296,34.415584
1,adult,36631,8,4578.875,0.096898,0.096898,0.097072,44.491655,27.194079,26.965696,47.379555
34,phoneme,5473,10,547.3,0.26558,0.26558,0.365964,45.179071,35.949935,37.997754,45.883441


## Do datasets have any inherent features that correlates with tree performance?

In [30]:
res = list()
multivariate_best = outlier_differences[outlier_differences.f1_score_TS < 0].name.values.tolist()
univariate_best = outlier_differences[outlier_differences.f1_score_TS > 0].name.values.tolist()
other_datasets = [name.replace("mstz/", "") for name, _ in names if name not in univariate_best and name not in multivariate_best]

print(multivariate_best)
print(univariate_best)
print(other_datasets)

['hill', 'seeds', 'glass', 'musk']
['spambase', 'covertype', 'arcene', 'pima', 'adult', 'phoneme', 'pol', 'wine']
['acute_inflammation', 'adult', 'arcene', 'arhythmia', 'australian_credit', 'balance_scale', 'bank', 'blood', 'breast', 'car', 'contraceptive', 'compas', 'covertype', 'dexter', 'electricity', 'fertility', 'german', 'gisette', 'glass', 'heart_failure', 'heloc', 'higgs', 'hill', 'hypo', 'ipums', 'lrs', 'magic', 'madelon', 'house16', 'ionosphere', 'magic', 'musk', 'nbfi', 'ozone', 'page_blocks', 'phoneme', 'pima', 'pol', 'pums', 'planning', 'post_operative', 'seeds', 'seeds', 'seeds', 'segment', 'shuttle', 'sonar', 'spambase', 'spect', 'speeddating', 'steel_plates', 'student_performance', 'sydt', 'toxicity', 'twonorm', 'vertebral_column', 'wall_following', 'wine_origin', 'wine']


### Correlation between dataset correlation and favored group (Table 6 correlation)

In [31]:
#TODO: add approximate dataset slope per group (table 6 last column)
res = list()
for metric in ["accuracy_TS", "roc_auc_TS", "f1_score_TS", "average_precision_TS"]:
    for candidate_datasets, name in zip([multivariate_best, univariate_best, other_datasets], ["M", "U", "O"]):
        for c in ["pearson", "spearman", "kendall"]:
            target_datasets = by_dataset_diffs[by_dataset_diffs.name.isin(candidate_datasets)]
            target_datasets = target_datasets[[metric] + [f"{c}_mean_correlation"]]
            vals = target_datasets.corr(method=c, numeric_only=True)
            vals = vals.loc[metric][[f"{c}_mean_correlation"]].values.tolist()
            res.append([metric, name, c] + vals)
df = pandas.DataFrame(res, columns=["metric", "favoring", "correlation_measured", "correlation"])
df

Unnamed: 0,metric,favoring,correlation_measured,correlation
0,accuracy_TS,M,pearson,0.253391
1,accuracy_TS,M,spearman,0.40584
2,accuracy_TS,M,kendall,0.414039
3,accuracy_TS,U,pearson,0.346422
4,accuracy_TS,U,spearman,0.142857
5,accuracy_TS,U,kendall,0.0
6,accuracy_TS,O,pearson,-0.153725
7,accuracy_TS,O,spearman,-0.073032
8,accuracy_TS,O,kendall,-0.062402
9,roc_auc_TS,M,pearson,0.53262


In [None]:
df.groupby(["favoring", "correlation_measured"]).describe()["correlation"][["mean", "std"]]

In [None]:
print(df.groupby(["favoring", "correlation_measured"]).describe()["correlation"][["mean", "std"]].to_latex().replace("kendall", "\\rho^\\tau")
                                                                                                            .replace("pearson", "\\rho^\\P")
                                                                                                            .replace("spearman", "\\rho^\\S"))

# Slopes by group (Table 6)

In [10]:
multivariate_best = ['hill', 'seeds', 'glass', 'musk']
univariate_best = ['spambase', 'covertype', 'arcene', 'pima', 'adult', 'phoneme', 'pol', 'wine']
other_datasets = ['acute_inflammation', 'adult', 'arcene', 'arhythmia', 'australian_credit', 'balance_scale', 'bank', 'blood', 'breast', 'car', 'contraceptive', 'compas', 'covertype', 'dexter', 'electricity', 'fertility', 'german', 'gisette', 'glass', 'heart_failure', 'heloc', 'higgs', 'hill', 'hypo', 'ipums', 'lrs', 'magic', 'madelon', 'house16', 'ionosphere', 'magic', 'musk', 'nbfi', 'ozone', 'page_blocks', 'phoneme', 'pima', 'pol', 'pums', 'planning', 'post_operative', 'seeds', 'seeds', 'seeds', 'segment', 'shuttle', 'sonar', 'spambase', 'spect', 'speeddating', 'steel_plates', 'student_performance', 'sydt', 'toxicity', 'twonorm', 'vertebral_column', 'wall_following', 'wine_origin', 'wine']

for g in (multivariate_best, univariate_best, other_datasets):
    vals = slopes[slopes.dataset.isin(g)].slope_min
    print(f"{numpy.nanmean(vals), numpy.nanstd(vals), vals.size}")

(27.288757331373002, 13.619674269974425, 18744)
(20.21776405452933, 13.024665686718928, 49204280)
(17.86437496063892, 13.632164962561381, 82622301)


## Wins and Losses

In [None]:
print("Univariate wins")
print((by_dataset_diffs[["f1_score_TS", "accuracy_TS", "roc_auc_TS", "average_precision_TS"]] >= 0).sum(axis="rows"))
print("\n\n")
print("Multivariate wins")
print((by_dataset_diffs[["f1_score_TS", "accuracy_TS", "roc_auc_TS", "average_precision_TS"]] < 0).sum(axis="rows"))