In [1]:
import os
import json
import pandas as pd
from collections import Counter

# Paths
base_path = '.'  # Replace with your actual path
rgb_path = os.path.join(base_path, 'rgb')
train_json = os.path.join(base_path, 'Diving48_V2_train.json')
test_json = os.path.join(base_path, 'Diving48_V2_test.json')
vocab_json = os.path.join(base_path, 'Diving48_vocab.json')

# Load JSONs
with open(train_json, 'r') as f:
    train_data = json.load(f)

with open(test_json, 'r') as f:
    test_data = json.load(f)

with open(vocab_json, 'r') as f:
    vocab_data = json.load(f)

# Convert to DataFrame for easy handling
df_train = pd.DataFrame(train_data)
df_test = pd.DataFrame(test_data)

# Show head
print("Train Data Head:")
print(df_train.head())

print("\nTest Data Head:")
print(df_test.head())

# Count unique classes
train_classes = df_train['label'].unique()
test_classes = df_test['label'].unique()
print(f"\nUnique Classes in Train: {len(train_classes)}")
print(f"Unique Classes in Test: {len(test_classes)}")

# Class distribution
print("\nTrain Class Distribution:")
print(df_train['label'].value_counts())

print("\nTest Class Distribution:")
print(df_test['label'].value_counts())

# Check total MP4 files in RGB folder
mp4_files = [f for f in os.listdir(rgb_path) if f.endswith('.mp4')]
print(f"\nTotal video files in rgb/: {len(mp4_files)}")

Train Data Head:
            vid_name  label  start_frame  end_frame
0  -mmq0PT-u8k_00155      0            0         48
1  -mmq0PT-u8k_00156      0            0         70
2  -mmq0PT-u8k_00157      0            0         90
3  3qq031609lA_00002      0            0        123
4  3qq031609lA_00004      0            0        102

Test Data Head:
            vid_name  label  start_frame  end_frame
0  8qRmKunCjtY_00016      0            0         78
1  CVAfPfVFulQ_00038      0            0        100
2  CVAfPfVFulQ_00040      0            0         79
3  CVAfPfVFulQ_00048      0            0         76
4  CVAfPfVFulQ_00049      0            0         83

Unique Classes in Train: 47
Unique Classes in Test: 47

Train Class Distribution:
label
26    1053
35     992
7      901
46     816
21     689
31     683
19     604
15     591
34     571
33     568
45     559
44     518
24     484
5      467
8      424
12     421
43     406
22     355
17     349
36     346
28     326
3      259
27     250


In [5]:
df_train.shape, df_train["vid_name"].nunique()

((15027, 4), 15027)

In [6]:
df_test.shape, df_test["vid_name"].nunique()

((1970, 4), 1970)

In [7]:
df_test.shape[0]+ df_train.shape[0]

16997

In [9]:
len(os.listdir("rgb/"))

18404

In [11]:
videoSet = df_train["vid_name"].to_list()
videoSet.extend(df_test["vid_name"].to_list())
len(videoSet)

16997

In [15]:
count = 0
for video in os.listdir("rgb/"):
    if video.split(".mp")[0] not in videoSet:
        print(f"Video not found {video}")
        count = count + 1

Video not found jMEYIEYkpY0_00112.mp4
Video not found sexZ6VnZ9yc_00030.mp4
Video not found c7FHotPDZIw_00111.mp4
Video not found c7FHotPDZIw_00105.mp4
Video not found sexZ6VnZ9yc_00024.mp4
Video not found jMEYIEYkpY0_00106.mp4
Video not found sexZ6VnZ9yc_00018.mp4
Video not found cYkUl8MrXgA_00388.mp4
Video not found bSsVWVfYU4w_00033.mp4
Video not found vlfy4cny75s_00181.mp4
Video not found 5hXbclLNSNA_00003.mp4
Video not found cYkUl8MrXgA_00411.mp4
Video not found k1F4LHeYhBs_00262.mp4
Video not found vlfy4cny75s_00195.mp4
Video not found Le6xdQ2OO8w_00052.mp4
Video not found OFxuiqI5G44_00104.mp4
Video not found VNvb5oLOpLg_01059.mp4
Video not found _8Vy3dlHg2w_00057.mp4
Video not found k71Cc-Sm-Mg_00006.mp4
Video not found JzOshOJgofw_00301.mp4
Video not found roFeEJPgJD8_00009.mp4
Video not found rRw7peH60Yw_00024.mp4
Video not found ddV_Cpszpls_00100.mp4
Video not found D8YKHC5hmUs_00116.mp4
Video not found MbzIpx8kAD0_00061.mp4
Video not found roFeEJPgJD8_00035.mp4
Video not fo

In [16]:
print(count)

1407


In [17]:
df = pd.concat([df_train, df_test], ignore_index=True)
print(df.shape)
print(df.head())

(16997, 4)
            vid_name  label  start_frame  end_frame
0  -mmq0PT-u8k_00155      0            0         48
1  -mmq0PT-u8k_00156      0            0         70
2  -mmq0PT-u8k_00157      0            0         90
3  3qq031609lA_00002      0            0        123
4  3qq031609lA_00004      0            0        102


In [23]:
count = 0
df["exists"] = False
for video in os.listdir("FullAnnotated1000/"):
    base = os.path.splitext(video)[0] 
    vid_name = base.removeprefix("queries_")
    if vid_name in videoSet:
        df.loc[df["vid_name"] == vid_name, "exists"] = True
        count += 1

In [24]:
count

7946

In [25]:
df.loc[df["exists"]==True,"label"].value_counts()

label
7     562
35    541
26    510
46    382
45    370
33    335
15    330
21    329
34    303
31    292
24    246
19    240
22    237
43    236
12    231
44    223
8     217
5     209
36    203
28    189
0     159
3     150
29    117
27    107
6     107
17    106
11     89
42     85
1      79
32     73
41     71
9      68
16     59
47     57
14     44
4      44
25     43
18     39
23     36
20     35
10     34
40     33
37     32
38     27
39     23
13     23
2      21
Name: count, dtype: int64

In [22]:
df["label"].value_counts()

label
26    1122
35    1087
46     979
7      964
31     771
21     741
15     724
19     721
34     670
45     616
33     607
44     585
24     564
5      508
8      478
12     449
36     427
43     415
22     407
28     377
17     370
0      295
3      290
27     289
29     251
9      214
6      181
20     179
41     170
11     161
1      156
42     133
14     104
32     104
38      93
10      89
16      87
18      82
47      76
37      73
23      64
25      62
13      56
4       54
2       52
40      50
39      50
Name: count, dtype: int64

In [26]:
total_counts = df["label"].value_counts().sort_index()

exists_counts = df.loc[df["exists"] == True, "label"].value_counts().sort_index()

loss_df = pd.DataFrame({
    "total_count": total_counts,
    "exists_count": exists_counts
})

loss_df["exists_count"] = loss_df["exists_count"].fillna(0).astype(int)

loss_df["lost_count"] = loss_df["total_count"] - loss_df["exists_count"]
loss_df["percent_lost"] = 100 * loss_df["lost_count"] / loss_df["total_count"]

loss_df["percent_lost"] = loss_df["percent_lost"].round(2)

loss_df

Unnamed: 0_level_0,total_count,exists_count,lost_count,percent_lost
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,295,159,136,46.1
1,156,79,77,49.36
2,52,21,31,59.62
3,290,150,140,48.28
4,54,44,10,18.52
5,508,209,299,58.86
6,181,107,74,40.88
7,964,562,402,41.7
8,478,217,261,54.6
9,214,68,146,68.22


In [27]:
loss_df["percent_lost"].mean()

np.float64(51.04106382978724)

In [45]:
NotProperlyProcessed = pd.read_csv("NotProperlyProcessed.csv")
NotProperlyProcessed

Unnamed: 0,Names
0,queries_iv0Gu1VXAgc_00025
1,queries_VNvb5oLOpLg_00376
2,queries_j4gvbjifw14_00091
3,queries_Y7QZcr24ye0_00392
4,queries_-mmq0PT-u8k_00103
...,...
1492,queries_zYHstCxnAPA_00244.mp4
1493,queries_k71Cc-Sm-Mg_00159.mp4
1494,queries_cYkUl8MrXgA_00214.mp4
1495,queries_6wVdnLa3Tes_00134.mp4


In [46]:
NotProperlyProcessed["Names"].value_counts().index.shape

(1378,)

In [42]:
def clean_queries_prefix(df, column_name):
    """
    Removes 'queries_' prefix from a specific column in the dataframe and 
    adds a new boolean column 'Not well Processed' indicating whether it was removed.
    
    Parameters:
    - df: pandas.DataFrame
    - column_name: str, name of the column to process (e.g., 'vid_name')
    
    Returns:
    - df: updated DataFrame with cleaned column and 'Not well Processed' flag
    """
    # Create boolean mask for entries that have 'queries_' prefix
    mask = df[column_name].str.startswith("queries_")

    # Remove prefix if present
    df[column_name] = df[column_name].str.removeprefix("queries_")

    # Add new column indicating if prefix was originally present
    df["Not well Processed"] = mask

    return df

In [47]:
NotProperlyProcessed["vid_name_cleaned"] = NotProperlyProcessed["Names"].str.removeprefix("queries_")

# Step 2: Merge with df to get label
bad_processed_with_labels = NotProperlyProcessed.merge(
    df[["vid_name", "label"]],
    left_on="vid_name_cleaned",
    right_on="vid_name",
    how="inner"
)

# Step 3: Count how many not well processed per label
not_well_processed_counts = bad_processed_with_labels["label"].value_counts().sort_index()

# Step 4: Merge into your existing loss_df
loss_df["not_well_processed_count"] = not_well_processed_counts
loss_df["not_well_processed_count"] = loss_df["not_well_processed_count"].fillna(0).astype(int)

# Display updated table
loss_df

Unnamed: 0_level_0,total_count,exists_count,lost_count,percent_lost,not_well_processed_count,percent_exist_actual,percent_lost_actual,lost_count_int2
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,295,159,136,46.1,15,53.9,46.1,130
1,156,79,77,49.36,5,50.64,49.36,72
2,52,21,31,59.62,5,40.38,59.62,28
3,290,150,140,48.28,39,51.72,48.28,113
4,54,44,10,18.52,0,81.48,18.52,10
5,508,209,299,58.86,20,41.14,58.86,288
6,181,107,74,40.88,8,59.12,40.88,70
7,964,562,402,41.7,72,58.3,41.7,344
8,478,217,261,54.6,18,45.4,54.6,249
9,214,68,146,68.22,21,31.78,68.22,132


In [57]:
not_proper_names = NotProperlyProcessed["vid_name_cleaned"].astype(str).tolist()

In [48]:
loss_df["not_well_processed_count"].sum()

np.int64(979)

In [59]:
len(not_proper_names)

1497

In [50]:
loss_df["total_count"] = loss_df["total_count"].astype(int)
loss_df["exists_count"] = loss_df["exists_count"].astype(int)
loss_df["not_well_processed_count"] = loss_df["not_well_processed_count"].astype(int)

loss_df["percent_exist_actual"] = (
   (100 * (( loss_df["total_count"] - loss_df["lost_count"] - loss_df["not_well_processed_count"] ) / loss_df["total_count"])).round(2)
)

loss_df["percent_lost_actual"] = (
   (100 * (( loss_df["lost_count"] + loss_df["not_well_processed_count"] ) / loss_df["total_count"] )).round(2)
)

loss_df["lost_countIteration2"] = (
    loss_df["total_count"] - loss_df["lost_count"] - loss_df["not_well_processed_count"]
).astype(int)

loss_df

Unnamed: 0_level_0,total_count,exists_count,lost_count,percent_lost,not_well_processed_count,percent_exist_actual,percent_lost_actual,lost_count_int2,lost_countIteration2
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,295,159,136,46.1,15,48.81,51.19,121,144
1,156,79,77,49.36,5,47.44,52.56,72,74
2,52,21,31,59.62,5,30.77,69.23,26,16
3,290,150,140,48.28,39,38.28,61.72,101,111
4,54,44,10,18.52,0,81.48,18.52,10,44
5,508,209,299,58.86,20,37.2,62.8,279,189
6,181,107,74,40.88,8,54.7,45.3,66,99
7,964,562,402,41.7,72,50.83,49.17,330,490
8,478,217,261,54.6,18,41.63,58.37,243,199
9,214,68,146,68.22,21,21.96,78.04,125,47


In [53]:
loss_df.to_csv("ActualClassPercenetPerClass.csv")

In [55]:
df

Unnamed: 0,vid_name,label,start_frame,end_frame,exists
0,-mmq0PT-u8k_00155,0,0,48,True
1,-mmq0PT-u8k_00156,0,0,70,False
2,-mmq0PT-u8k_00157,0,0,90,False
3,3qq031609lA_00002,0,0,123,True
4,3qq031609lA_00004,0,0,102,True
...,...,...,...,...,...
16992,mGRum47TLX0_00040,47,0,56,True
16993,mGRum47TLX0_00044,47,0,112,True
16994,ovWCmIMMkRI_00050,47,0,108,True
16995,ovWCmIMMkRI_00051,47,0,125,True


In [56]:
df["exists"].value_counts()

exists
False    9051
True     7946
Name: count, dtype: int64

In [61]:
df.loc[df["vid_name"].isin(not_proper_names), "exists"] = False

In [63]:
df["exists"].value_counts()

exists
False    9925
True     7072
Name: count, dtype: int64

In [64]:
df.to_csv("ActualPreprocessedWithClass.csv")

In [65]:
df_exists = df[df["exists"] == True].copy()

from sklearn.utils.class_weight import compute_class_weight
import numpy as np

labels = df_exists["label"].astype(int)

classes = np.unique(labels)

weights = compute_class_weight(class_weight="balanced", classes=classes, y=labels)

class_weights = dict(zip(classes, weights))


In [66]:
print("Filtered DataFrame shape:", df_exists.shape)
print("Class Weights:", class_weights)

Filtered DataFrame shape: (7072, 5)
Class Weights: {np.int64(0): np.float64(1.0306033226464588), np.int64(1): np.float64(2.0333525014376077), np.int64(2): np.float64(9.404255319148936), np.int64(3): np.float64(1.308418131359852), np.int64(4): np.float64(3.4197292069632494), np.int64(5): np.float64(0.7836879432624113), np.int64(6): np.float64(1.489783020855277), np.int64(7): np.float64(0.3045912653975364), np.int64(8): np.float64(0.7523404255319149), np.int64(9): np.float64(3.0707772470690404), np.int64(10): np.float64(5.787234042553192), np.int64(11): np.float64(1.8808510638297873), np.int64(12): np.float64(0.7165146909827761), np.int64(13): np.float64(10.031205673758866), np.int64(14): np.float64(4.559638942617666), np.int64(15): np.float64(0.5316893466656643), np.int64(16): np.float64(2.893617021276596), np.int64(17): np.float64(1.6718676122931442), np.int64(18): np.float64(4.559638942617666), np.int64(19): np.float64(0.7097551184263348), np.int64(20): np.float64(4.702127659574468), 

In [69]:
for i in class_weights:
    print(i, class_weights[i])

0 1.0306033226464588
1 2.0333525014376077
2 9.404255319148936
3 1.308418131359852
4 3.4197292069632494
5 0.7836879432624113
6 1.489783020855277
7 0.3045912653975364
8 0.7523404255319149
9 3.0707772470690404
10 5.787234042553192
11 1.8808510638297873
12 0.7165146909827761
13 10.031205673758866
14 4.559638942617666
15 0.5316893466656643
16 2.893617021276596
17 1.6718676122931442
18 4.559638942617666
19 0.7097551184263348
20 4.702127659574468
21 0.5393121329977885
22 0.7097551184263348
23 4.559638942617666
24 0.7375886524822695
25 3.5825734549138804
26 0.3215130023640662
27 1.5046808510638299
28 0.8647591098067987
29 1.475177304964539
31 0.5699548678272083
32 2.1192688043152534
33 0.5135429525815118
34 0.5373860182370821
35 0.3096051133876193
36 0.8222299732589233
37 5.015602836879433
38 6.839458413926499
39 7.523404255319149
40 4.853809196980096
41 2.7864460204885737
42 1.9798432250839866
43 0.6934013138542995
44 0.7448915104276385
45 0.44649283414356966
46 0.4425531914893617
47 2.735783

In [74]:
df_exists.reset_index().drop(columns=["index","exists"]).to_csv("df_exsists.csv")

In [75]:
df_exists

Unnamed: 0,vid_name,label,start_frame,end_frame,exists
3,3qq031609lA_00002,0,0,123,True
4,3qq031609lA_00004,0,0,102,True
5,5V-dKBtmKLI_00018,0,0,55,True
7,5V-dKBtmKLI_00085,0,0,80,True
8,5V-dKBtmKLI_00086,0,0,89,True
...,...,...,...,...,...
16992,mGRum47TLX0_00040,47,0,56,True
16993,mGRum47TLX0_00044,47,0,112,True
16994,ovWCmIMMkRI_00050,47,0,108,True
16995,ovWCmIMMkRI_00051,47,0,125,True


In [76]:
df_exists["label"].value_counts()

label
7     494
35    486
26    468
46    340
45    337
33    293
15    283
34    280
21    279
31    264
43    217
22    212
19    212
12    210
24    204
44    202
8     200
5     192
36    183
28    174
0     146
3     115
29    102
6     101
27    100
17     90
11     80
42     76
1      74
32     71
47     55
41     54
16     52
9      49
4      44
25     42
18     33
14     33
23     33
20     32
40     31
37     30
10     26
38     22
39     20
2      16
13     15
Name: count, dtype: int64

In [81]:
np.sort(df_exists["label"].value_counts().index)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47])