# Data analysis

In [1]:
import os
import sys
import pandas as pd

# Used to find the datasets.radartrack module
sys.path.append('..')

from datasets.radartrack import RadarTrack, get_sequence_paths, parse_sequence_info

In [2]:
data_dir = '../data/dataset/RDTrack'
dataset = 'rdtrack'
splits = ['train', 'val', 'test']

In [3]:
sequences: list[tuple[str, str]] = []
for split in splits:
    seq_paths = get_sequence_paths(data_dir, dataset, split)
    sequences.extend(map(lambda y: (split, *y), seq_paths))

gt_files = [os.path.join(seq_path, 'gt', 'gt.txt') for _, _, seq_path in sequences]

dfs: list[pd.DataFrame] = []
seq_infos: list[dict[str]] = []
for i, ((split, seq_id, seq_path), gt) in enumerate(zip(sequences, gt_files)):
    tdf = pd.read_csv(gt, names=RadarTrack.MOT_GT_COLUMNS, dtype=RadarTrack.MOT_GT_DTYPES)
    tdf['split'] = split
    tdf['sequence'] = seq_id
    
    seqinfo = parse_sequence_info(seq_path)
    seqinfo['split'] = split
    tdf['radar'] = seqinfo['radar']
    tdf['measurement'] = seqinfo['measurement']
    tdf['img_width'] = seqinfo['width']
    tdf['img_height'] = seqinfo['height']
    
    seq_infos.append(seqinfo)
    dfs.append(tdf)

df = pd.concat(dfs, axis=0, ignore_index=True)
df.drop_duplicates(inplace=True)
df_seq = pd.DataFrame(seq_infos)

df

Unnamed: 0,frame,id,bb_left,bb_top,bb_width,bb_height,consider_entry,class,visibility,split,sequence,radar,measurement,img_width,img_height
0,1,1,16.5,131.5,38.0,47.0,1,1,1.0,train,rdtrack0001,-1,,0,0
1,1,2,33.5,150.5,4.0,6.0,1,1,1.0,train,rdtrack0001,-1,,0,0
2,1,3,39.5,46.5,7.0,9.0,1,1,1.0,train,rdtrack0001,-1,,0,0
3,2,1,29.5,134.5,7.0,44.0,1,1,1.0,train,rdtrack0001,-1,,0,0
4,2,2,33.5,151.5,5.0,4.0,1,1,1.0,train,rdtrack0001,-1,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28395,114,7,34.5,102.5,3.0,3.0,1,1,1.0,test,rdtrack0024,-1,,0,0
28396,115,7,31.5,114.5,7.0,4.0,1,1,1.0,test,rdtrack0024,-1,,0,0
28397,116,7,33.5,123.5,3.0,4.0,1,1,1.0,test,rdtrack0024,-1,,0,0
28398,117,7,34.5,126.5,4.0,4.0,1,1,1.0,test,rdtrack0024,-1,,0,0


In [4]:
df_seq

Unnamed: 0,id,name,measurement,radar,len,fps,image_dir,image_ext,width,height,min,max,minQ,split
0,rdtrack0001,,,-1,379,1,img1,.png,0,0,255,0,255,train
1,rdtrack0002,,,-1,103,1,img1,.png,0,0,255,0,255,train
2,rdtrack0003,,,-1,166,1,img1,.png,0,0,255,0,255,train
3,rdtrack0004,,,-1,807,1,img1,.png,0,0,255,0,255,train
4,rdtrack0006,,,-1,599,1,img1,.png,0,0,255,0,255,train
5,rdtrack0008,,,-1,399,1,img1,.png,0,0,255,0,255,train
6,rdtrack0009,,,-1,209,1,img1,.png,0,0,255,0,255,train
7,rdtrack0010,,,-1,420,1,img1,.png,0,0,255,0,255,train
8,rdtrack0011,,,-1,258,1,img1,.png,0,0,255,0,255,train
9,rdtrack0013,,,-1,604,1,img1,.png,0,0,255,0,255,train


In [5]:
df.dtypes

frame               int64
id                  int64
bb_left           float32
bb_top            float32
bb_width          float32
bb_height         float32
consider_entry      int32
class               int32
visibility        float32
split              object
sequence           object
radar               int64
measurement        object
img_width           int64
img_height          int64
dtype: object

In [6]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
frame,28400.0,304.0525,274.239853,1.0,116.0,231.0,387.0,1438.0
id,28400.0,43.413556,42.926627,1.0,13.0,31.0,58.0,233.0
bb_left,28400.0,22.968662,17.156755,-32.5,6.5,27.5,33.5,62.5
bb_top,28400.0,124.909088,57.052555,4.5,81.5,128.5,171.5,244.5
bb_width,28400.0,16.023134,17.714516,1.0,4.0,9.0,21.0,81.0
bb_height,28400.0,14.234824,9.280062,1.0,6.0,13.0,21.0,76.0
consider_entry,28400.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
class,28400.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
visibility,28400.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
radar,28400.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0


Number of frames for each split

In [7]:
df_seq.groupby(['split'])['len'].sum()

split
test      585
train    7528
val      2045
Name: len, dtype: int64

Number of objects for each sequence

In [56]:
df[['sequence', 'split', 'id']].groupby(['split', 'sequence']).max()

Unnamed: 0_level_0,Unnamed: 1_level_0,id
split,sequence,Unnamed: 2_level_1
test,rdtrack0007,61
test,rdtrack0024,7
train,rdtrack0001,40
train,rdtrack0002,87
train,rdtrack0003,18
train,rdtrack0004,107
train,rdtrack0006,88
train,rdtrack0008,75
train,rdtrack0009,17
train,rdtrack0010,63


## Multi-object sequence count

Sequence with more than one object in a given split

In [32]:
df_object_count = df[df['split'] == 'train'].groupby(['sequence'])['id'].nunique()
df_object_count[df_object_count > 1]

sequence
rdtrack0001     40
rdtrack0002     87
rdtrack0003     18
rdtrack0004    107
rdtrack0006     88
rdtrack0008     75
rdtrack0009     17
rdtrack0010     63
rdtrack0011     63
rdtrack0013     54
rdtrack0014    124
rdtrack0016    233
rdtrack0017     28
rdtrack0018     89
rdtrack0019     21
rdtrack0020     49
rdtrack0023     91
Name: id, dtype: int64

Number of frames where only one object is visible

In [67]:
# df_object_count_frames = df[df['split'] == 'test'].groupby(['sequence', 'frame'])['id'].nunique()
df_object_count_frames = df.groupby(['sequence', 'frame'])['id'].nunique()
# df_object_count_frames[df_object_count_frames == 1].count()
df_object_count_frames.groupby(['sequence']).mean()

sequence
rdtrack0001    2.862797
rdtrack0002    3.766990
rdtrack0003    3.295181
rdtrack0004    3.543990
rdtrack0005    2.723971
rdtrack0006    2.921536
rdtrack0007    3.359743
rdtrack0008    4.684211
rdtrack0009    2.636364
rdtrack0010    3.092857
rdtrack0011    2.728682
rdtrack0012    2.443959
rdtrack0013    2.622517
rdtrack0014    2.463696
rdtrack0015    2.128492
rdtrack0016    2.757302
rdtrack0017    1.658065
rdtrack0018    1.912037
rdtrack0019    2.531496
rdtrack0020    2.996296
rdtrack0021    2.172043
rdtrack0022    2.285714
rdtrack0023    2.736597
rdtrack0024    1.279661
Name: id, dtype: float64

Number of frames where more than one object is visible

In [11]:
df_object_count_frames[df_object_count_frames > 1].count()

8282

## Ignored entries

In [12]:
df[df['consider_entry'] <= 0]

Unnamed: 0,frame,id,bb_left,bb_top,bb_width,bb_height,consider_entry,class,visibility,split,sequence,radar,measurement,img_width,img_height


## Invalid bounding boxes

### Subzero values

In [13]:
df[(df['bb_left'] < 0) | (df['bb_top'] < 0) | (df['bb_width'] < 0) | (df['bb_height'] < 0)]

Unnamed: 0,frame,id,bb_left,bb_top,bb_width,bb_height,consider_entry,class,visibility,split,sequence,radar,measurement,img_width,img_height
21,8,1,-0.5,139.5,64.0,42.0,1,1,1.0,train,rdtrack0001,-1,,0,0
23,8,2,-0.5,213.5,62.0,6.0,1,1,1.0,train,rdtrack0001,-1,,0,0
27,10,2,-0.5,219.5,64.0,10.0,1,1,1.0,train,rdtrack0001,-1,,0,0
31,11,4,-0.5,185.5,64.0,23.0,1,1,1.0,train,rdtrack0001,-1,,0,0
33,12,4,-0.5,181.5,64.0,20.0,1,1,1.0,train,rdtrack0001,-1,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28219,452,61,-0.5,157.5,64.0,22.0,1,1,1.0,test,rdtrack0007,-1,,0,0
28222,453,61,-0.5,163.5,64.0,20.0,1,1,1.0,test,rdtrack0007,-1,,0,0
28228,456,61,-0.5,180.5,64.0,21.0,1,1,1.0,test,rdtrack0007,-1,,0,0
28373,100,6,-0.5,144.5,64.0,21.0,1,1,1.0,test,rdtrack0024,-1,,0,0


### Box outside image

Boxes wider than the image

In [14]:
df[(df['bb_left'] + df['bb_width']) > df['img_width']]

Unnamed: 0,frame,id,bb_left,bb_top,bb_width,bb_height,consider_entry,class,visibility,split,sequence,radar,measurement,img_width,img_height
0,1,1,16.5,131.5,38.0,47.0,1,1,1.0,train,rdtrack0001,-1,,0,0
1,1,2,33.5,150.5,4.0,6.0,1,1,1.0,train,rdtrack0001,-1,,0,0
2,1,3,39.5,46.5,7.0,9.0,1,1,1.0,train,rdtrack0001,-1,,0,0
3,2,1,29.5,134.5,7.0,44.0,1,1,1.0,train,rdtrack0001,-1,,0,0
4,2,2,33.5,151.5,5.0,4.0,1,1,1.0,train,rdtrack0001,-1,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28395,114,7,34.5,102.5,3.0,3.0,1,1,1.0,test,rdtrack0024,-1,,0,0
28396,115,7,31.5,114.5,7.0,4.0,1,1,1.0,test,rdtrack0024,-1,,0,0
28397,116,7,33.5,123.5,3.0,4.0,1,1,1.0,test,rdtrack0024,-1,,0,0
28398,117,7,34.5,126.5,4.0,4.0,1,1,1.0,test,rdtrack0024,-1,,0,0


Boxes higher than the image

In [15]:
df[(df['bb_top'] + df['bb_height']) > df['img_height']]

Unnamed: 0,frame,id,bb_left,bb_top,bb_width,bb_height,consider_entry,class,visibility,split,sequence,radar,measurement,img_width,img_height
0,1,1,16.5,131.5,38.0,47.0,1,1,1.0,train,rdtrack0001,-1,,0,0
1,1,2,33.5,150.5,4.0,6.0,1,1,1.0,train,rdtrack0001,-1,,0,0
2,1,3,39.5,46.5,7.0,9.0,1,1,1.0,train,rdtrack0001,-1,,0,0
3,2,1,29.5,134.5,7.0,44.0,1,1,1.0,train,rdtrack0001,-1,,0,0
4,2,2,33.5,151.5,5.0,4.0,1,1,1.0,train,rdtrack0001,-1,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28395,114,7,34.5,102.5,3.0,3.0,1,1,1.0,test,rdtrack0024,-1,,0,0
28396,115,7,31.5,114.5,7.0,4.0,1,1,1.0,test,rdtrack0024,-1,,0,0
28397,116,7,33.5,123.5,3.0,4.0,1,1,1.0,test,rdtrack0024,-1,,0,0
28398,117,7,34.5,126.5,4.0,4.0,1,1,1.0,test,rdtrack0024,-1,,0,0


## Invalid track ids

In [16]:
df[df['id'] <= 0]

Unnamed: 0,frame,id,bb_left,bb_top,bb_width,bb_height,consider_entry,class,visibility,split,sequence,radar,measurement,img_width,img_height


## Invalid classes

In [17]:
df[(df['class'] <= 0) | (df['class'] > 13)]

Unnamed: 0,frame,id,bb_left,bb_top,bb_width,bb_height,consider_entry,class,visibility,split,sequence,radar,measurement,img_width,img_height


## Image resolutions

In [18]:
df_seq.groupby(['width', 'height'])['len'].sum()

width  height
0      0         10158
Name: len, dtype: int64

In [19]:
df_seq.groupby(['width', 'height'])['id'].count()

width  height
0      0         24
Name: id, dtype: int64

In [20]:
for (w, h), count in df_seq.groupby(['width', 'height'])['id'].count().items():
    if count < 10:
        print(f'Resolution: ({w}, {h})')
        print(df_seq[(df_seq['width'] == w) & (df_seq['height'] == h)][['split', 'id', 'measurement', 'name']])
        print()

## Finding best hyperparameter

### Miss Tolerance

We want to find the min, avg and max time an object is absence between frames.


In [21]:
def missing_frames(present: list[int]) -> list[int]:
    # Frames in which obj is absent
    # Diff of list where obj would be present in every frame from first appearance until last disappearance 
    # and actual frames where obj is present
    absence = set(range(present[0], present[-1] + 1)).difference(present)
    return sorted(absence)

absence = {
    'sequence': [],
    'split': [],
    'id': [],
    'absence': [],
}
# absence: dict[str, dict[int, list[int]]] = {}
for tdf in dfs:
    if len(tdf['split'].values) == 0:
        continue
    
    # Do not include test dataset in analysis
    split = tdf['split'].values[0]
    if split == 'test':
        continue
    
    seq_id = tdf['sequence'].values[0]
    tdf_seq = tdf.sort_values(by=['id', 'frame'], ascending=True)
    tdf_seq = tdf_seq.reset_index(drop=True)
    
    for id, present_frames in tdf_seq.groupby(['id'])['frame'].apply(list).items():
        absence_frames = missing_frames(present_frames)
        
        if len(absence_frames) == 0:
            continue
        
        count = 1
        for i in range(0, len(absence_frames) - 1):
            if absence_frames[i] + 1 == absence_frames[i + 1]:
                count += 1
            else:
                # absence.setdefault(seq_id, {}).setdefault(id, []).append(count)
                
                absence['sequence'].append(seq_id)
                absence['split'].append(split)
                absence['id'].append(id)
                absence['absence'].append(count)
                
                count = 1

# absence
df_absence = pd.DataFrame(absence)
df_absence

Unnamed: 0,sequence,split,id,absence
0,rdtrack0001,train,8,7
1,rdtrack0001,train,22,1
2,rdtrack0001,train,22,1
3,rdtrack0001,train,22,2
4,rdtrack0001,train,22,1
...,...,...,...,...
266,rdtrack0021,val,10,1
267,rdtrack0021,val,38,2
268,rdtrack0021,val,42,3
269,rdtrack0021,val,42,7


In [22]:
df_absence['absence'].describe()

count    271.000000
mean       8.675277
std       20.202074
min        1.000000
25%        1.000000
50%        3.000000
75%        5.500000
max      157.000000
Name: absence, dtype: float64

Number of entries where object is absent for one frame.

In [23]:
df_absence[df_absence['absence'] == 1]['absence'].count()

76

Top 10 sequences with max number of frames where object is absent in the validation dataset.

In [24]:
df_absence[df_absence['split'] == 'val'].nlargest(columns=['absence'], n=10)

Unnamed: 0,sequence,split,id,absence
234,rdtrack0005,val,21,22
254,rdtrack0012,val,53,9
255,rdtrack0012,val,54,9
257,rdtrack0012,val,73,8
236,rdtrack0005,val,29,7
240,rdtrack0012,val,8,7
246,rdtrack0012,val,15,7
269,rdtrack0021,val,42,7
241,rdtrack0012,val,11,6
237,rdtrack0005,val,34,5
