# Compute metrics on toy data

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

from pathlib import Path

import pandas as pd

In [2]:
PROCESSED_DATA_FOLDER_PATH = Path("../data/processed")

INPUTS_COLUMNS = ["x_0"]
OUTPUTS_COLUMNS = ["y"]

## Load toy data

In [3]:
data_df = pd.read_csv(PROCESSED_DATA_FOLDER_PATH.joinpath("toy_data", "toy_data_linear_0_high_snr.csv"), index_col=False)
data_df.head()

Unnamed: 0,x_0,y
0,0.466675,35.203296
1,0.534236,36.823796
2,-0.976394,-49.645449
3,-0.991247,-67.236426
4,-0.525419,-20.516783


## 1 to 1 metrics

In [13]:
from typing import List
from itertools import product

from scipy import stats
from sklearn.feature_selection import mutual_info_regression
from minepy import MINE


def compute_one_to_one_scores(df: pd.DataFrame,
                              input_cols: List[str],
                              output_cols: List[str]) -> pd.DataFrame:
    """_summary_

    Args:
        df (pd.DataFrame): _description_
        input_cols (List[str]): _description_
        output_cols (List[str]): _description_

    Returns:
        pd.DataFrame: The different scores for the different combinations of inputs-outputs
    """

    records = []
    combinations = product(input_cols, output_cols)

    for input_col, output_col in combinations:

        # pearson
        pearson_score = stats.pearsonr(df[input_col], df[output_col]).statistic

        # spearman
        spearman_score = stats.spearmanr(df[input_col], df[output_col]).statistic

        # mutual information
        mi_score = mutual_info_regression(df[input_col].values.reshape(-1, 1),
                                          df[output_col].values)[0]

        # maximal information coefficent
        mine = MINE()
        mine.compute_score(df[input_col], df[output_col])
        mic_score = mine.mic()

        records.append({"input": input_col,
                        "output": output_col,
                        "pearson": pearson_score,
                        "spearman": spearman_score,
                        "mi": mi_score,
                        "mic": mic_score})
    
    return pd.DataFrame.from_records(records)

In [14]:
compute_one_to_one_scores(data_df, ["x_0"], ["y"])

Unnamed: 0,input,output,pearson,spearman,mi,mic
0,x_0,y,0.954541,0.84644,0.779214,1.0


In [19]:
scores_list = []

for file in sorted(PROCESSED_DATA_FOLDER_PATH.joinpath("toy_data").glob("*.csv")):

    data_df = pd.read_csv(file, index_col=False)

    scores_df = compute_one_to_one_scores(data_df, ["x_0"], ["y"])
    scores_df["file"] = file.name

    scores_list.append(scores_df)

scores_df = pd.concat(scores_list)

In [22]:
scores_df

Unnamed: 0,input,output,pearson,spearman,mi,mic,file
0,x_0,y,0.980963,0.979199,1.657329,0.849864,toy_data_linear_0_high_snr.csv
0,x_0,y,0.859695,0.849856,0.667716,0.536843,toy_data_linear_0_low_snr.csv
0,x_0,y,0.761491,0.974723,1.555288,0.835708,toy_data_log_0_high_snr.csv
0,x_0,y,0.416182,0.494823,0.170443,0.192702,toy_data_log_0_low_snr.csv
0,x_0,y,0.04685,0.121833,1.156415,0.874138,toy_data_sin_0_high_snr.csv
0,x_0,y,0.041902,0.090484,0.181685,0.234796,toy_data_sin_0_low_snr.csv
0,x_0,y,0.972313,0.975774,1.568793,1.0,toy_data_step_0_high_snr.csv
0,x_0,y,0.954541,0.84644,0.779214,1.0,toy_data_step_0_low_snr.csv
