In [17]:
import os
import json
import pandas as pd
from collections import Counter

# Dataset locations (everything is resolved relative to base_path)
base_path = '.'  # Replace with your actual path
rgb_path = os.path.join(base_path, 'rgb')
train_json, test_json, vocab_json = (
    os.path.join(base_path, filename)
    for filename in ('Diving48_train.json', 'Diving48_test.json', 'Diving48_vocab.json')
)


def _read_json(path):
    """Open the file at `path` in text mode and return its decoded JSON content."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Annotation files: train split, test split, and the class vocabulary
train_data = _read_json(train_json)
test_data = _read_json(test_json)
vocab_data = _read_json(vocab_json)

# Tabular views of the two splits
df_train = pd.DataFrame(data=train_data)
df_test = pd.DataFrame(data=test_data)

# First rows of each split
for banner, frame in (("Train Data Head:", df_train), ("\nTest Data Head:", df_test)):
    print(banner)
    print(frame.head())

# Distinct labels present in each split
train_classes = df_train['label'].unique()
test_classes = df_test['label'].unique()
print(f"\nUnique Classes in Train: {len(train_classes)}")
print(f"Unique Classes in Test: {len(test_classes)}")

# Samples per label in each split
for banner, frame in (
    ("\nTrain Class Distribution:", df_train),
    ("\nTest Class Distribution:", df_test),
):
    print(banner)
    print(frame['label'].value_counts())

# Number of entries in the rgb folder whose name ends with '.mp4'
mp4_files = [entry for entry in os.listdir(rgb_path) if entry.endswith('.mp4')]
print(f"\nTotal video files in rgb/: {len(mp4_files)}")

Train Data Head:
            vid_name  end_frame  start_frame  label
0  VNvb5oLOpLg_00000        156            0     19
1  VNvb5oLOpLg_00001        202            0     19
2  VNvb5oLOpLg_00002        231            0     19
3  VNvb5oLOpLg_00003        129            0     15
4  VNvb5oLOpLg_00004        227            0     15

Test Data Head:
            vid_name  end_frame  start_frame  label
0  rRw7peH60Yw_00000        105            0     26
1  rRw7peH60Yw_00001         45            0     33
2  rRw7peH60Yw_00002         81            0     27
3  rRw7peH60Yw_00003         52            0     33
4  rRw7peH60Yw_00004        163            0     26

Unique Classes in Train: 48
Unique Classes in Test: 48

Train Class Distribution:
label
26    1159
35    1066
7      960
46     792
31     755
21     688
15     668
33     622
19     611
34     595
45     591
44     552
24     525
5      509
12     453
43     451
8      410
22     403
28     361
17     335
29     321
36     304
27     256


In [20]:
# Turn the loaded vocab JSON into a table, then show its column labels
df_vocab = pd.DataFrame(data=vocab_data)
df_vocab.columns

RangeIndex(start=0, stop=4, step=1)

In [11]:
# Number of training rows per label value
df_train.loc[:, 'label'].value_counts()

label
26    1053
35     992
7      901
46     816
21     689
31     683
19     604
15     591
34     571
33     568
45     559
44     518
24     484
5      467
8      424
12     421
43     406
22     355
17     349
36     346
28     326
3      259
27     250
29     231
0      222
9      212
41     160
6      160
20     148
11     144
42     130
1      109
14     102
38      91
10      74
37      69
18      68
32      61
23      57
2       49
40      48
16      48
25      46
39      45
13      44
47      44
4       33
Name: count, dtype: int64

In [16]:
# Number of training rows per vid_name value (a count above 1 would mean a repeated name)
df_train.loc[:, 'vid_name'].value_counts()

vid_name
-mmq0PT-u8k_00155    1
_lmT4WlK7G0_00123    1
_lmT4WlK7G0_00058    1
_lmT4WlK7G0_00059    1
_lmT4WlK7G0_00060    1
                    ..
nOlRwoxsDJ0_00027    1
nOlRwoxsDJ0_00028    1
nOlRwoxsDJ0_00029    1
nOlRwoxsDJ0_00030    1
zbAC7t15q3k_00079    1
Name: count, Length: 15027, dtype: int64

In [21]:
# Sorted distinct values of vocab columns 0..3, one list per column
unique_action, unique_somersault, unique_twist, unique_position = (
    sorted(df_vocab[column].unique()) for column in range(4)
)

for heading, tokens in (
    ("Unique Actions:", unique_action),
    ("Unique Somersaults:", unique_somersault),
    ("Unique Twists:", unique_twist),
    ("Unique Positions:", unique_position),
):
    print(heading, tokens)

Unique Actions: ['Back', 'Forward', 'Inward', 'Reverse']
Unique Somersaults: ['15som', '1som', '25som', '2som', '35som', '3som', '45som', 'Dive']
Unique Twists: ['05Twis', '15Twis', '1Twis', '25Twis', '2Twis', '35Twis', '3Twis', 'NoTwis']
Unique Positions: ['FREE', 'PIKE', 'STR', 'TUCK']


In [23]:
# Pool the tokens of all four lists, drop duplicates, and keep them in sorted order
unique_all = sorted(
    {*unique_action, *unique_somersault, *unique_twist, *unique_position}
)

# Report how many distinct tokens there are, followed by the tokens themselves
print("✅ Unique values across all 4 columns (Total: {}):".format(len(unique_all)))
print(unique_all)

✅ Unique values across all 4 columns (Total: 24):
['05Twis', '15Twis', '15som', '1Twis', '1som', '25Twis', '25som', '2Twis', '2som', '35Twis', '35som', '3Twis', '3som', '45som', 'Back', 'Dive', 'FREE', 'Forward', 'Inward', 'NoTwis', 'PIKE', 'Reverse', 'STR', 'TUCK']


In [24]:
# Bare expression: the notebook renders the current value of unique_all as this cell's output.
unique_all

['05Twis',
 '15Twis',
 '15som',
 '1Twis',
 '1som',
 '25Twis',
 '25som',
 '2Twis',
 '2som',
 '35Twis',
 '35som',
 '3Twis',
 '3som',
 '45som',
 'Back',
 'Dive',
 'FREE',
 'Forward',
 'Inward',
 'NoTwis',
 'PIKE',
 'Reverse',
 'STR',
 'TUCK']

In [35]:
# Rows of the vocab table whose column 3 equals 'TUCK'
df_vocab[df_vocab[3].eq('TUCK')]

Unnamed: 0,0,1,2,3
4,Back,15som,NoTwis,TUCK
8,Back,25som,NoTwis,TUCK
12,Back,35som,NoTwis,TUCK
14,Back,3som,NoTwis,TUCK
16,Back,Dive,NoTwis,TUCK
25,Forward,25som,NoTwis,TUCK
27,Forward,35som,NoTwis,TUCK
28,Forward,45som,NoTwis,TUCK
32,Inward,15som,NoTwis,TUCK
34,Inward,25som,NoTwis,TUCK


In [41]:
# Print how often each token occurs in vocab columns 0 through 3
for col_idx in range(4):
    token_counts = df_vocab[col_idx].value_counts()
    print(token_counts)

0
Back       17
Forward    14
Reverse    11
Inward      6
Name: count, dtype: int64
1
15som    15
25som    14
Dive      7
35som     6
2som      2
3som      2
1som      1
45som     1
Name: count, dtype: int64
2
NoTwis    31
15Twis     5
25Twis     4
05Twis     2
1Twis      2
2Twis      2
3Twis      1
35Twis     1
Name: count, dtype: int64
3
PIKE    22
TUCK    14
FREE    11
STR      1
Name: count, dtype: int64


In [42]:
# Bare expression: the notebook renders the df_vocab table as this cell's output.
df_vocab

Unnamed: 0,0,1,2,3
0,Back,15som,05Twis,FREE
1,Back,15som,15Twis,FREE
2,Back,15som,25Twis,FREE
3,Back,15som,NoTwis,PIKE
4,Back,15som,NoTwis,TUCK
5,Back,25som,15Twis,PIKE
6,Back,25som,25Twis,PIKE
7,Back,25som,NoTwis,PIKE
8,Back,25som,NoTwis,TUCK
9,Back,2som,15Twis,FREE


In [43]:
def add_class_counts(df_vocab, df_train):
    """
    Return a copy of df_vocab with a 'count' column holding per-class sample counts.

    Row i of df_vocab is treated as class i, so the number of classes is taken
    from len(df_vocab) instead of being fixed at 48. For a 48-row vocab the
    result is the same as before; other vocab sizes no longer fail with a
    length-mismatch error when the column is assigned.

    Parameters:
    - df_vocab: DataFrame with one row per class, ordered by class id (0..N-1)
    - df_train: DataFrame with training data (must include a 'label' column whose
      values are matched against the integers 0..N-1)

    Returns:
    - A new DataFrame (df_vocab itself is not modified) with an added 'count'
      column. Classes with no training sample get 0; labels outside 0..N-1 are
      not counted.

    Raises:
    - KeyError: if df_train has no 'label' column
    """
    num_classes = len(df_vocab)

    # Count occurrences of each label, then align the counts to class ids
    # 0..N-1. reindex both orders the counts by class id and fills classes
    # that never occur with 0, so no separate sort is needed.
    label_counts = (
        df_train['label']
        .value_counts()
        .reindex(range(num_classes), fill_value=0)
    )

    # Work on a copy so the caller's vocab table stays untouched.
    df_vocab = df_vocab.copy()
    # Assign positionally (plain array): row i receives the count of class i.
    df_vocab['count'] = label_counts.to_numpy()

    return df_vocab

In [44]:
# Vocab table extended with the per-class sample counts of the training split
df_vocab_updated = add_class_counts(df_vocab=df_vocab, df_train=df_train)

In [47]:
# Write df_vocab_updated to "df_vocab.csv" (path relative to the working directory).
# to_csv is called with its defaults, so the DataFrame index is written as well.
df_vocab_updated.to_csv("df_vocab.csv")

In [46]:
# Token frequencies for vocab columns 0..3, read from df_vocab
for col_idx in (0, 1, 2, 3):
    column_counts = df_vocab[col_idx].value_counts()
    print(column_counts)

0
Back       17
Forward    14
Reverse    11
Inward      6
Name: count, dtype: int64
1
15som    15
25som    14
Dive      7
35som     6
2som      2
3som      2
1som      1
45som     1
Name: count, dtype: int64
2
NoTwis    31
15Twis     5
25Twis     4
05Twis     2
1Twis      2
2Twis      2
3Twis      1
35Twis     1
Name: count, dtype: int64
3
PIKE    22
TUCK    14
FREE    11
STR      1
Name: count, dtype: int64
