In [1]:
# Annotation columns found in every targets.csv, grouped by name prefix.
ANNOTATION_NAMES = [
    # a_*
    "a_ascend", "a_descend", "a_jump", "a_loadwalk", "a_walk",
    # p_*
    "p_bent", "p_kneel", "p_lie", "p_sit", "p_squat", "p_stand",
    # t_*
    "t_bend", "t_kneel_stand", "t_lie_sit", "t_sit_lie", "t_sit_stand",
    "t_stand_kneel", "t_stand_sit", "t_straighten", "t_turn",
]

# Dataset locations, relative to the notebook's working directory.
TRAIN_PATH = "data/train"
TEST_PATH = "data/test"

In [2]:
import os
import pandas as pd

def get_all_targets(path):
    """Load every sample's ``targets.csv`` under ``path`` into one DataFrame.

    Each sub-directory of ``path`` is treated as one sample. Its
    ``targets.csv`` is read and two columns are prepended: ``sample`` (the
    directory name) and ``sample_index`` (the row's index within that
    sample's file).

    Parameters
    ----------
    path : str or os.PathLike
        Directory containing one sub-directory per sample.

    Returns
    -------
    pandas.DataFrame
        All samples stacked in sorted directory-name order, re-indexed
        from 0.

    Raises
    ------
    ValueError
        If ``path`` contains no sample directories.
    """
    # Only directories can be samples; stray files in the data folder
    # (README, .DS_Store, ...) would otherwise break read_csv below.
    sample_dirs = sorted(
        entry for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    )
    if not sample_dirs:
        # pd.concat([]) raises ValueError too, but with an opaque message.
        raise ValueError(f"No sample directories found in {path!r}")

    dfs = []
    for sample in sample_dirs:
        df = pd.read_csv(os.path.join(path, sample, "targets.csv"))
        df.insert(0, 'sample', sample)
        # Keep the per-sample row index, since concat resets the global one.
        df.insert(1, 'sample_index', df.index)
        dfs.append(df)

    # Concatenate all data into one DataFrame
    return pd.concat(dfs, ignore_index=True)

In [3]:
# Load the targets of every training sample into a single frame and preview it.
targets = get_all_targets(path=TRAIN_PATH)
targets

Unnamed: 0,sample,sample_index,start,end,a_ascend,a_descend,a_jump,a_loadwalk,a_walk,p_bent,...,p_stand,t_bend,t_kneel_stand,t_lie_sit,t_sit_lie,t_sit_stand,t_stand_kneel,t_stand_sit,t_straighten,t_turn
0,00001,0,0.0,1.0,,,,,,,...,,,,,,,,,,
1,00001,1,1.0,2.0,,,,,,,...,,,,,,,,,,
2,00001,2,2.0,3.0,,,,,,,...,,,,,,,,,,
3,00001,3,3.0,4.0,,,,,,,...,,,,,,,,,,
4,00001,4,4.0,5.0,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16606,00010,1794,1794.0,1795.0,,,,,,,...,,,,,,,,,,
16607,00010,1795,1795.0,1796.0,,,,,,,...,,,,,,,,,,
16608,00010,1796,1796.0,1797.0,,,,,,,...,,,,,,,,,,
16609,00010,1797,1797.0,1798.0,,,,,,,...,,,,,,,,,,


In [4]:
# keep only rows that carry at least one non-NaN annotation value
has_annotation = targets[ANNOTATION_NAMES].notna().any(axis=1)
targets = targets.loc[has_annotation]

In [5]:
# count the remaining NaNs per column (every count should be 0 after the drop above)
nan_counts = targets.isnull().sum()
print(nan_counts)

sample           0
sample_index     0
start            0
end              0
a_ascend         0
a_descend        0
a_jump           0
a_loadwalk       0
a_walk           0
p_bent           0
p_kneel          0
p_lie            0
p_sit            0
p_squat          0
p_stand          0
t_bend           0
t_kneel_stand    0
t_lie_sit        0
t_sit_lie        0
t_sit_stand      0
t_stand_kneel    0
t_stand_sit      0
t_straighten     0
t_turn           0
dtype: int64


In [6]:
# find all rows where the probabilities do not sum to 1 (precision of 10 decimal places);
# `!=` flags sums above 1 as well as below it, which a `<` comparison would miss
prob_sums = targets[ANNOTATION_NAMES].sum(axis=1).round(decimals=10)
invalid_prob_sums = targets.loc[prob_sums != 1.0]

invalid_prob_sums

Unnamed: 0,sample,sample_index,start,end,a_ascend,a_descend,a_jump,a_loadwalk,a_walk,p_bent,...,p_stand,t_bend,t_kneel_stand,t_lie_sit,t_sit_lie,t_sit_stand,t_stand_kneel,t_stand_sit,t_straighten,t_turn


# Notes

* Comparing floats requires rounding to the desired precision first: a series of floating-point operations accumulates minuscule bit-level errors that might otherwise be hard to detect.