Include source package

In [None]:
# Switch to the project directory: move one level up so that the relative
# paths used further down (`src`, `reports/figures/...`) resolve.
# NOTE(review): this is not idempotent — re-running the cell without
# restarting the kernel moves up one more directory level each time.
%cd ..
# The working directory should now be ../pdi

In [None]:
import sys
import os
# Absolute location of the `src` folder, resolved against the current working
# directory (presumably where the project's own packages live — confirm).
module_path = os.path.abspath('src')

# Register the folder on the import path only when it is not listed yet, so
# re-running the cell never adds a duplicate entry.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

Load data

In [None]:
import pandas as pd
import numpy as np

from pdi.data.constants import INPUT_PATH, CSV_DELIMITER, MISSING_VALUES
# Echo the input location so the notebook output records which file was read.
print(INPUT_PATH)

# Read the dataset; the first CSV column becomes the row index.
data = pd.read_csv(INPUT_PATH, sep=CSV_DELIMITER, index_col=0)

# MISSING_VALUES maps a column name to the sentinel value that stands for
# "no measurement" in that column. Replace every sentinel with NaN so that
# pandas' null handling (isnull, value_counts on null masks, ...) sees it.
for column, val in MISSING_VALUES.items():
    # Use `np.nan`: the `np.NaN` alias was removed in NumPy 2.0 and raises
    # AttributeError there.
    data.loc[data[column] == val, column] = np.nan

Data shape

In [None]:
# Dimensions of the loaded DataFrame as a (rows, columns) tuple.
print(data.shape)

Class distribution

In [None]:
# Number of rows for every distinct `fPdgCode` value; `value_counts` orders
# the result from the most to the least frequent code.
classes = data.loc[:, "fPdgCode"].value_counts()
print(classes)

Fraction of examples belonging to the classes used in experiments

In [None]:
# `fPdgCode` values of the classes kept for the experiments.
chosen_classes = [-211, 211, 321, -321, 2212, -2212]

# Total number of rows in the chosen classes, then its share of the whole
# dataset (printed as a fraction in [0, 1], not multiplied by 100).
num_chosen = classes.loc[chosen_classes].sum()
print(num_chosen / len(data))

Missing values

In [None]:
# Per-column count of missing (NaN) entries.
nulls = data.isna().sum()
print(nulls)

Missing-value distribution pie chart

In [None]:
from matplotlib import pyplot as plt

# Count how often each missing-value pattern occurs: every distinct combination
# of per-column null flags becomes one entry, most frequent pattern first.
null_rows = data.isnull().value_counts()

columns = data.columns
# For each pattern, select the columns whose flag is True, i.e. the columns
# that are missing in that pattern.
missing_values = [columns[list(index)] for index in null_rows.index]
print(missing_values, null_rows.values)

plt.pie(null_rows)

# Polish legend texts (runtime strings, kept verbatim). In English:
# "All values observed", "Values missing from the TRD and TOF detectors",
# "Values missing from the TOF detector", "Values missing from the TRD detector".
# NOTE(review): labels are matched to the patterns purely by position, so they
# assume the frequency order printed above — re-check that output whenever the
# input data changes.
labels = ["Zaobserwowane wszystkie wartości",
    "Brakujące wartości z detektorów TRD i TOF",
    "Brakujące wartości z detektora TOF",
    "Brakujące wartości z detektora TRD"]

# Total is loop-invariant, so compute it once.
total_rows = null_rows.sum()
plt.legend(
    [
        # Pair labels with counts positionally via zip instead of `null_rows[i]`:
        # integer keys on a non-integer index rely on a deprecated positional
        # fallback, and zip also tolerates fewer patterns than labels.
        f"{label}: {100 * count / total_rows:.3f}%"
        for label, count in zip(labels, null_rows.values)
    ],
    loc="lower right", bbox_to_anchor=(2.2, -0.5), prop={'size': 20}
)

plt.savefig("reports/figures/miss_val_distr.eps", bbox_inches="tight")

Number and fraction of incomplete examples

In [None]:
# Rows that contain at least one missing value: first the absolute count,
# then that count as a fraction of all rows.
all_nulls = data.isna().any(axis="columns").sum()
print(all_nulls)
print(all_nulls / len(data))

Class distribution pie chart

In [None]:
from pdi.constants import PARTICLES_DICT
# Row counts of the classes that have an entry in PARTICLES_DICT, kept in the
# order in which the codes appear in `classes`.
particles = [classes[code] for code in classes.index if code in PARTICLES_DICT]

# Polish plural display names, derived from the PARTICLES_DICT names by plain
# string substitution followed by a trailing "y".
polish_labels_plural = {
    code: label.replace("anti", "anty").replace("electron", "elektron") + "y"
    for code, label in PARTICLES_DICT.items()
}

# Legend entries: name plus the class's share of ALL counted rows (the
# denominator includes classes that are not drawn in the pie).
total_count = sum(classes)
labels_percent = [
    f"{polish_labels_plural[code]}: {100 * classes[code] / total_count:.3f}%"
    for code in classes.index
    if code in polish_labels_plural
]

plt.pie(particles)
plt.legend(
    labels_percent,
    loc="lower right",
    bbox_to_anchor=(2.2, -0.5),
    prop={'size': 20},
)

plt.savefig("reports/figures/part_type_distr.eps", bbox_inches="tight")