#### Include source package

In [None]:
# Switch to the project directory so that the relative paths used by the
# following cells ('src', 'experiments/...', 'eda_results/...') resolve.
%cd ../..
# After this cell the working directory should be ../pdi

In [None]:
import sys
import os

# Put the local 'src' directory on the import path (once) so that the
# `pdi` imports below can be resolved from it.
module_path = os.path.abspath('src')
if module_path not in sys.path:
    sys.path.append(module_path)

# Plotting helpers used throughout this notebook.
from pdi.data.data_exploration import (
    plot_feature_distributions_by_condition,
    plot_group_ratio,
    generate_figure_thumbnails_from_iterator,
    plot_cor_matrix,
    plot_feature_combinations,
    plot_feature_histogram2d_combinations,
)
from pdi.constants import TARGET_CODES

#### Load preprocessed data
We load the data through `DataPreparation`, because it prepares the inputs for the proposed architecture; this makes it the most important data to analyze.

In [None]:
from pdi.data.data_preparation import DataPreparation
from pdi.data.types import Split
from pdi.config import Config
import json

# Experiment configuration that drives the data preparation.
CONFIG_FILE_PATH = "experiments/engines_test/attention_config.json"

# JSON text is UTF-8; state the encoding explicitly so that reading the
# config does not depend on the platform's default locale encoding.
with open(CONFIG_FILE_PATH, 'r', encoding="utf-8") as f:
    config_data = json.load(f)
config = Config.from_dict(config_data)

data_prep = DataPreparation(config.data, config.sim_dataset_paths, config.seed)

# Only the TRAIN split is requested, and the first returned element is kept.
# NOTE(review): the meaning of the two `{Split.TRAIN: 1}` mappings is defined
# by `create_dataloaders` and is not visible here -- confirm before changing.
train_df = data_prep.create_dataloaders(
    {Split.TRAIN: 1},
    {Split.TRAIN: 1},
    config.training.undersample_missing_detectors,
    config.training.undersample_pions,
)[0]

# `unwrap()` yields the table-like object that every later cell indexes by
# column name.
data = train_df.unwrap()
print(data.shape)

In [None]:
# Root output directory for every figure produced by this notebook.
# NOTE(review): this reads the private attribute `_inputs_checksum` of the
# data preparation object; presumably it identifies the prepared inputs so
# results of different inputs end up in different folders -- confirm.
main_dir = f"eda_results/preprocessed/{data_prep._inputs_checksum}/train"

In [None]:
# Columns of `data` that are analysed in the cells below.
features = [
    'fTPCSignal', 'fTRDPattern', 'fTOFSignal', 'fBeta',
    'fP', 'fPx', 'fPy', 'fPz',
    'fX', 'fY', 'fZ', 'fAlpha',
    'fTPCNClsShared', 'fDcaXY', 'fDcaZ',
]

# Missing detector combinations

In [None]:
# Row masks telling where each of the two detector signals is absent.
tof_missing = data["fTOFSignal"].isna()
trd_missing = data["fTRDPattern"].isna()

# Four mutually exclusive groups, listed in the same order as `missing_labels`.
missing_conditions = [
    tof_missing & ~trd_missing,
    trd_missing & ~tof_missing,
    tof_missing & trd_missing,
    ~trd_missing & ~tof_missing,
]
missing_labels = ["Missing TOF", "Missing TRD", "Missing TOF+TRD", "Nothing missing"]

# Leave out the columns whose values are absent in some of the groups above
# (the two signals that define the groups, plus fBeta): their distributions
# cannot be compared across the groups.
excluded_features = {'fTRDPattern', 'fTOFSignal', 'fBeta'}
filtered_features = [name for name in features if name not in excluded_features]

# Output folder for the figures of this section.
save_directory = f"{main_dir}/diff_missing_det"
os.makedirs(save_directory, exist_ok=True)

### Ratio

In [None]:
# Share of rows that falls into each missing-detector group.
pie_miss_dect = plot_group_ratio(
    missing_labels,
    missing_conditions,
    title="Distribution of missing particle detector combinations",
)
# Store the returned figure next to the other figures of this section.
pie_miss_dect.savefig(
    f"{save_directory}/missing_det_ratio.png",
    dpi=300,
    bbox_inches='tight',
)

### Histogram per feature

In [None]:
# Per-feature histograms (log-scaled y axis requested), one series per
# missing-detector group.
hist_gen = plot_feature_distributions_by_condition(
    data,
    filtered_features,
    missing_labels,
    missing_conditions,
    title_template="Distribution of {feature} by Missing Detector",
    plot_type="hist",
    log_y=True,
)
# Hand the figure iterator over together with this section's output folder.
generate_figure_thumbnails_from_iterator(hist_gen, save_directory)

### Boxplots per feature

In [None]:
# Per-feature boxplots, one box per missing-detector group.
boxplot_gen = plot_feature_distributions_by_condition(
    data,
    filtered_features,
    missing_labels,
    missing_conditions,
    title_template="Distribution of {feature} by Missing Detector",
    plot_type="boxplot",
)
# Hand the figure iterator over together with this section's output folder.
generate_figure_thumbnails_from_iterator(boxplot_gen, save_directory)

## Particle types of PIDML interest

In [None]:
from pdi.constants import TARGET_CODE_TO_PART_NAME

# One boolean row mask per target code; codes are cast to float before the
# comparison with the "targets" column.
particle_conditions = [data["targets"] == float(code) for code in TARGET_CODES]

# Number of rows per target code. The masks' own vectorized `.sum()` is used
# instead of the builtin `sum()`, which would walk every mask element by
# element in Python; `int(...)` keeps the printed values plain integers.
print([int(p_cond.sum()) for p_cond in particle_conditions])
print(TARGET_CODES)

# Human-readable names, in the same order as `particle_conditions`.
particle_labels = [TARGET_CODE_TO_PART_NAME[code] for code in TARGET_CODES]

# Output folder for the figures of this section.
save_directory = f"{main_dir}/dist_by_particle"
os.makedirs(save_directory, exist_ok=True)

### Ratio

In [None]:
# Share of rows that belongs to each particle type.
pie_part_dist = plot_group_ratio(
    particle_labels,
    particle_conditions,
    title="Distribution of particle types",
)
# Store the returned figure in this section's output folder.
pie_part_dist.savefig(
    f"{save_directory}/part_dist_ratio.png",
    dpi=300,
    bbox_inches='tight',
)

### Correlation matrices

In [None]:
# Output folder for the correlation matrices.
save_directory_cor_mat = f"{main_dir}/cor_mat"
os.makedirs(save_directory_cor_mat, exist_ok=True)

# Restrict the data to the analysed features plus the target column.
filtered_data = data[features + ["targets"]]

# One subset per particle type; the last entry ("other particles") is excluded.
particle_types_cor_mat = [filtered_data[condition] for condition in particle_conditions[:-1]]
# The labels must stay aligned with the subsets above, so the last entry is
# dropped here as well. With the full label list, `zip` below would pair the
# whole-dataset frame with the last particle's name and silently discard the
# "all particles" label.
particle_labels_cor_mat = [TARGET_CODE_TO_PART_NAME[code] for code in TARGET_CODES][:-1]

# Finally add the complete data set as its own entry.
particle_types_cor_mat.append(filtered_data)  # Add "all particles"
particle_labels_cor_mat.append("all particles")

# Lazily build (plot_cor_matrix result, label) pairs and hand them over
# together with the output folder.
plots = (
    (plot_cor_matrix(particle_data, label), label)
    for particle_data, label in zip(particle_types_cor_mat, particle_labels_cor_mat)
)
generate_figure_thumbnails_from_iterator(plots, save_directory_cor_mat)

### Distributions per feature

In [None]:
# Per-feature distributions (plot_type="kde"), one curve per particle type;
# the last particle entry is left out of both the labels and the masks.
hist_gen = plot_feature_distributions_by_condition(
    data,
    features,
    particle_labels[:-1],
    particle_conditions[:-1],
    title_template="Distribution of {feature} by Particle",
    plot_type="kde",
)
# Hand the figure iterator over together with the current output folder.
generate_figure_thumbnails_from_iterator(hist_gen, save_directory)

## Comparison on the basis of "fSign"

In [None]:
# Row masks for the two values of "fSign", in the same order as `sign_labels`.
sign_conditions = [data["fSign"] == sign_value for sign_value in (1, -1)]
sign_labels = ["fSign = 1", "fSign = -1"]

# Output folder for the figures of this section.
save_directory = f"{main_dir}/dist_by_sign"
os.makedirs(save_directory, exist_ok=True)

# Per-feature distributions (plot_type="kde"), one curve per sign.
hist_gen = plot_feature_distributions_by_condition(
    data,
    features,
    sign_labels,
    sign_conditions,
    title_template="Distribution of {feature} by Particle Sign",
    plot_type="kde",
)
generate_figure_thumbnails_from_iterator(hist_gen, save_directory)

## Feature combinations

In [None]:
# Output folder for the feature-combination figures.
save_directory = f"{main_dir}/feature_combinations"
os.makedirs(save_directory, exist_ok=True)

# Features that are combined with each other in the plots below.
features_to_plot = [
    "fTPCSignal",
    "fTRDPattern",
    "fBeta",
    "fP",
]

In [None]:
# Scatter plots for the combinations of the selected features.
generate_figure_thumbnails_from_iterator(
    plot_feature_combinations(data, features_to_plot),
    save_directory,
)

In [None]:
# 2D histograms for the combinations of the selected features.
generate_figure_thumbnails_from_iterator(
    plot_feature_histogram2d_combinations(data, features_to_plot),
    save_directory,
)