#### Include source package

In [None]:
# switch to the project directory
%cd ..
# working directory should be ../pdi

In [None]:
import sys
import os
# Absolute location of the local ``src`` directory, resolved against the
# current working directory.
module_path = os.path.abspath('src')

# Register it on the import search path once; re-running the cell must not
# add a duplicate entry.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

#### Extract file name from INPUT_PATH for creating folders

In [None]:
from pdi.data.config import INPUT_PATH

# Strip the directory part and the final extension from INPUT_PATH; the bare
# file name is used further down to name the per-input output folders.
csv_name = os.path.splitext(os.path.basename(INPUT_PATH))[0]
print(csv_name)

In [None]:
# Root output directory; the figure folders used by the plotting cells are
# created beneath it.
main_dir = "reports/figures/pr_test"

#### Load data

In [None]:
from pdi.data.preparation import FeatureSetPreparation
from pdi.data.types import Split

# Only the training split is loaded and explored in this notebook.
splits = [Split.TRAIN]
prep = FeatureSetPreparation()
# NOTE(review): this calls an underscore-prefixed (non-public) loader directly;
# confirm whether FeatureSetPreparation exposes a public entry point for it.
prep._load_preprocessed_data(splits)

In [None]:
# Materialise the selected splits as one table; the cells below use it as a
# pandas DataFrame (shape, isnull, column selection).
ungrouped_data = prep.data_to_ungrouped_df(splits)

#### Data shape

In [None]:
# Table dimensions as (number of rows, number of columns).
print(ungrouped_data.shape)

#### Class distribution

In [None]:
from pdi.data.constants import TARGET_COLUMN

# Number of rows per distinct value of the target column; value_counts orders
# the result from the most to the least frequent value.
classes = ungrouped_data[TARGET_COLUMN].value_counts()
print(classes)

#### Fraction of examples belonging to the classes used in experiments

In [None]:
# Target codes kept for the experiments (each code together with its negated
# counterpart).
chosen_classes = [-211, 211, 321, -321, 2212, -2212]

# reindex(..., fill_value=0) counts a code that does not occur in the loaded
# data as zero rows; plain list indexing (classes[chosen_classes]) raises
# KeyError as soon as one of the codes is absent.
num_chosen = classes.reindex(chosen_classes, fill_value=0).sum()

# Share of all rows that belongs to the chosen classes (a fraction in [0, 1]).
print(num_chosen / ungrouped_data.shape[0])

#### Missing values

In [None]:
# Per-column count of missing entries.
nulls = ungrouped_data.isnull().sum()
print(nulls)

#### Percent of incomplete examples

In [None]:
# Number of rows that have at least one missing value ...
all_nulls = ungrouped_data.isnull().any(axis=1).sum()
print(all_nulls)
# ... and the same count as a fraction of all rows (not multiplied by 100).
print(all_nulls/ungrouped_data.shape[0])

## Missing distribution pie chart

In [None]:
from matplotlib import pyplot as plt
from pdi.data.detector_helpers import columns_to_detectors, detector_unmask

# Every distinct row-wise pattern of missing columns and the number of rows
# showing it. Each index entry is a tuple of booleans, one per column
# (True = the value is missing).
null_rows = ungrouped_data.isnull().value_counts()

columns = ungrouped_data.columns
# Use each boolean pattern as a mask to get the names of the missing columns.
missing_values = [columns[list(index)] for index in null_rows.index]

# Translate the missing columns of every pattern into detector names via
# columns_to_detectors.
missing_detectors = [
    [det.name for det in columns_to_detectors(mv)] for mv in missing_values
]
print(missing_detectors, null_rows.values)

plt.pie(null_rows)

labels = ["Missing detectors: " + ", ".join(names) for names in missing_detectors]
print(labels)

# Pair labels with counts by position. The previous ``null_rows[i]`` lookup
# with an integer ``i`` depended on pandas' deprecated positional fallback for
# Series whose index is not integer-typed.
pattern_counts = null_rows.to_numpy()
# Computed once instead of once per legend entry.
total_rows = pattern_counts.sum()
plt.legend(
    [
        f"{label}: {100 * count / total_rows:.3f}%"
        for label, count in zip(labels, pattern_counts)
    ],
    loc="lower right",
    bbox_to_anchor=(2.2, -0.5),
    prop={"size": 20},
)

# NOTE: save_dir is reused by the class-distribution cell that follows.
save_dir = f"{main_dir}/part_and_det_dist/{csv_name}"
os.makedirs(save_dir, exist_ok=True)

plt.savefig(os.path.join(save_dir, "missing_dets.png"), bbox_inches="tight")

## Class distribution pie chart

In [None]:
from pdi.constants import PARTICLES_DICT
# Codes present in the data that also have a display name in PARTICLES_DICT,
# in the order of ``classes``.
known_codes = [code for code in classes.index if code in PARTICLES_DICT]
particles = [classes[code] for code in known_codes]

# NOTE(review): built here but not referenced by any cell visible in this file.
polish_labels_plural = {
    key: name.replace("anti", "anty").replace("electron", "elektron") + "y"
    for key, name in PARTICLES_DICT.items()
}

# Percentages are relative to ALL rows, including classes without a display
# name, so the legend entries need not add up to 100%.
total_count = sum(classes)
labels_percent = [
    f"{PARTICLES_DICT[code]}: {100 * classes[code] / total_count:.3f}%"
    for code in known_codes
]

plt.pie(particles)
plt.legend(
    labels_percent,
    loc="lower right",
    bbox_to_anchor=(2.2, -0.5),
    prop={'size': 20},
)

# Saved into the save_dir set by the preceding missing-detectors cell.
particles_png = os.path.join(save_dir, "particles.png")
plt.savefig(particles_png, bbox_inches="tight")

## Particles distribution vs pt

In [None]:
from pdi.data.data_exploration import plot_particle_distribution
from pdi.constants import TARGET_CODES, PARTICLES_DICT

# Output folder for the distribution plots of this input file.
save_dir = f"{main_dir}/distribution_vs_pt/{csv_name}"
os.makedirs(save_dir, exist_ok=True)
# One plot per target code. "fPt" is presumably the column plotted against and
# the particle name the plot title — TODO confirm against
# plot_particle_distribution's signature.
for target_code in TARGET_CODES:
    plot_particle_distribution(target_code, prep, splits, "fPt", f"{PARTICLES_DICT[target_code]}", save_dir)

## Correlation matrix

In [None]:
# Table used by the correlation plots; this is the same call that produced
# ungrouped_data earlier in the notebook.
data = prep.data_to_ungrouped_df(splits)

In [None]:
# Output folder for the correlation-matrix figures of this input file.
save_dir = f"{main_dir}/cor_matrix/{csv_name}"

In [None]:
from pdi.data.data_exploration import plot_cor_matrix

# Create the output folder if needed; existing folders are left untouched.
os.makedirs(save_dir, exist_ok=True)

# Correlation matrix over the whole table, without filtering by particle.
title = "all particles"
plot_cor_matrix(data, title, save_dir)

In [None]:
from pdi.constants import PARTICLES_DICT
from pdi.data.constants import TARGET_COLUMN

# Codes for which a separate correlation matrix is drawn.
target_codes = [211, 2212, 321]
for target_code in target_codes:
    # Restrict the table to the rows whose target equals the current code.
    is_target = data[TARGET_COLUMN] == target_code
    one_particle = data.loc[is_target]

    # The particle's display name doubles as the plot title.
    title = PARTICLES_DICT[target_code]
    plot_cor_matrix(one_particle, title, save_dir)