# Compute metrics on toy data

In [None]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

from pathlib import Path

import pandas as pd
import dataframe_image as dfi

import src.visualization.visualize as visualize
from src.dependence_measures.compare import (compute_bivariate_scores,
                                             compute_bivariate_scores_on_file_generator)

In [5]:
# Folder holding the processed datasets, relative to the notebook's directory.
PROCESSED_DATA_FOLDER_PATH = Path("..") / "data" / "processed"

# Column labels for the input feature(s) and the output target.
INPUTS_COLUMNS = ["x_0"]
OUTPUTS_COLUMNS = ["y"]

## Load noiseless toy data

In [None]:
# Load the noiseless "cubic_500.csv" toy dataset; index_col=False keeps the
# first CSV column as data rather than using it as the index.
cubic_csv_path = PROCESSED_DATA_FOLDER_PATH / "toy_data" / "noiseless" / "cubic_500.csv"
data_df = pd.read_csv(cubic_csv_path, index_col=False)

# Preview the first rows (rendered as the cell output).
data_df.head()

## Compute bivariate metrics

In [None]:
# Compute the bivariate scores on the loaded frame for column "0" against
# column "1" (presumably input columns first, then output columns — confirm
# against compute_bivariate_scores' signature).
# NOTE(review): INPUTS_COLUMNS / OUTPUTS_COLUMNS (["x_0"] / ["y"]) are defined
# in an earlier cell but not used here — verify which labels the CSV really has.
compute_bivariate_scores(data_df, ["0"], ["1"])

## Noiseless data

In [None]:
# Score the noiseless toy datasets in three batches, selected by file-name
# suffix (200, 500, 1000); rglob searches the folder recursively.
noiseless_dir = PROCESSED_DATA_FOLDER_PATH / "toy_data" / "noiseless"

compute_bivariate_scores_on_file_generator(noiseless_dir.rglob("*200.csv"))
compute_bivariate_scores_on_file_generator(noiseless_dir.rglob("*500.csv"))
compute_bivariate_scores_on_file_generator(noiseless_dir.rglob("*1000.csv"))

In [None]:
# Hand-picked noiseless datasets used for the report table and figure.
files_to_demo = [
    "linear_1000.csv",
    "two_lines_1000.csv",
    "line_and_parabola_up_1000.csv",
    "non_coexistence_1000.csv",
    "ellipse_1000.csv",
]
demo_dir = PROCESSED_DATA_FOLDER_PATH / "toy_data" / "noiseless"
demo_paths = [demo_dir / file_name for file_name in files_to_demo]

# The scorer returns a styled table plus the underlying scores frame.
df_styled, df = compute_bivariate_scores_on_file_generator(demo_paths)

# Export the styled table as an image for the report.
dfi.export(df_styled, "../reports/tables/toy_data/example_noiseless_1000.png", dpi=300)

# Plot only the selected metrics, ordered by MIC, and save the figure.
plotted_metrics = ["pearson", "spearman", "mutual information (sklearn)", "MIC"]
fig = visualize.plot_metric_values_and_rank(
    scores_df=df[plotted_metrics],
    sort_values_by="MIC",
)
fig.savefig("../reports/figures/toy_data/metrics_values_and_rank_noisless_1.png")

## Noisy data

In [None]:
# Score the noisy toy datasets in three batches, selected by file-name
# suffix (200, 500, 1000); rglob searches the folder recursively.
noisy_dir = PROCESSED_DATA_FOLDER_PATH / "toy_data" / "noisy"

compute_bivariate_scores_on_file_generator(noisy_dir.rglob("*200.csv"))
compute_bivariate_scores_on_file_generator(noisy_dir.rglob("*500.csv"))
compute_bivariate_scores_on_file_generator(noisy_dir.rglob("*1000.csv"))

### Examples

In [None]:
# Hand-picked noisy datasets: an "n1" and an "n2" file for each shape.
files_to_demo = [
    "linear_n1_heteroscedastic_1000.csv",
    "linear_n2_heteroscedastic_1000.csv",
    "two_lines_n1_heteroscedastic_1000.csv",
    "two_lines_n2_heteroscedastic_1000.csv",
    "line_and_parabola_down_n1_heteroscedastic_1000.csv",
    "line_and_parabola_down_n2_heteroscedastic_1000.csv",
    "line_and_parabola_up_n1_heteroscedastic_1000.csv",
    "line_and_parabola_up_n2_heteroscedastic_1000.csv",
    "non_coexistence_n1_1000.csv",
    "non_coexistence_n2_1000.csv",
    "ellipse_n1_1000.csv",
    "ellipse_n2_1000.csv",
]
noisy_demo_dir = PROCESSED_DATA_FOLDER_PATH / "toy_data" / "noisy"
noisy_demo_paths = [noisy_demo_dir / file_name for file_name in files_to_demo]

# Left as the cell's last expression so its result is rendered as output.
compute_bivariate_scores_on_file_generator(noisy_demo_paths)

In [None]:
# Compare the heteroscedastic "line_and_parabola_down" files (found
# recursively under toy_data) with the noiseless file of the same shape.
toy_data_dir = PROCESSED_DATA_FOLDER_PATH / "toy_data"
heteroscedastic_paths = toy_data_dir.rglob("*line_and_parabola_down*heteroscedastic_1000*")
noiseless_path = toy_data_dir / "noiseless" / "line_and_parabola_down_1000.csv"

# Matched files come first and the noiseless file last.
df_styled, df = compute_bivariate_scores_on_file_generator(
    [*heteroscedastic_paths, noiseless_path]
)

# Export the styled table as an image for the report.
dfi.export(df_styled, "../reports/tables/toy_data/example_line_and_parabola_down_1000.png", dpi=300)

# Plot only the selected metrics, ordered by MIC, and save the figure.
plotted_metrics = ["pearson", "spearman", "mutual information (sklearn)", "MIC"]
fig = visualize.plot_metric_values_and_rank(
    scores_df=df[plotted_metrics],
    sort_values_by="MIC",
)
fig.savefig("../reports/figures/toy_data/metrics_values_and_rank_noisy_1.png")

In [None]:
# Compare the heteroscedastic "line_and_parabola_up" files (found
# recursively under toy_data) with the noiseless file of the same shape.
toy_data_dir = PROCESSED_DATA_FOLDER_PATH / "toy_data"
heteroscedastic_paths = toy_data_dir.rglob("*line_and_parabola_up*heteroscedastic_1000*")
noiseless_path = toy_data_dir / "noiseless" / "line_and_parabola_up_1000.csv"

# Matched files come first and the noiseless file last.
df_styled, df = compute_bivariate_scores_on_file_generator(
    [*heteroscedastic_paths, noiseless_path]
)

# Export the styled table as an image for the report.
dfi.export(df_styled, "../reports/tables/toy_data/example_line_and_parabola_up_1000.png", dpi=300)

# Plot only the selected metrics, ordered by MIC, and save the figure.
plotted_metrics = ["pearson", "spearman", "mutual information (sklearn)", "MIC"]
fig = visualize.plot_metric_values_and_rank(
    scores_df=df[plotted_metrics],
    sort_values_by="MIC",
)
fig.savefig("../reports/figures/toy_data/metrics_values_and_rank_noisy_2.png")

In [None]:
# Compare the heteroscedastic "two_lines" files (found recursively under
# toy_data) with the noiseless file of the same shape.
toy_data_dir = PROCESSED_DATA_FOLDER_PATH / "toy_data"
heteroscedastic_paths = toy_data_dir.rglob("*two_lines*heteroscedastic_1000*")
noiseless_path = toy_data_dir / "noiseless" / "two_lines_1000.csv"

# Matched files come first and the noiseless file last.
df_styled, df = compute_bivariate_scores_on_file_generator(
    [*heteroscedastic_paths, noiseless_path]
)

# Export the styled table as an image for the report.
dfi.export(df_styled, "../reports/tables/toy_data/example_two_lines_1000.png", dpi=300)

# Plot only the selected metrics, ordered by MIC, and save the figure.
plotted_metrics = ["pearson", "spearman", "mutual information (sklearn)", "MIC"]
fig = visualize.plot_metric_values_and_rank(
    scores_df=df[plotted_metrics],
    sort_values_by="MIC",
)
fig.savefig("../reports/figures/toy_data/metrics_values_and_rank_noisy_3.png")