# Check if `data/annotations.csv` is a subset of `annotations_filenames.csv`

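Loads both CSVs and verifies that every `(Emitter, File Name)` pair in `data/annotations.csv` also appears in `annotations_filenames.csv`. The check is performed twice: first with a left merge, then with a set comparison as a cross-check.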

In [None]:
import pandas as pd

# Read the two annotation tables in one pass: `small` is the 10k subset,
# `full` is the complete annotation set it is expected to be drawn from.
small, full = (
    pd.read_csv(csv_path)
    for csv_path in ("data/annotations.csv", "annotations_filenames.csv")
)

# Report (rows, columns) for each table as a quick sanity check on the load.
print("small shape:", small.shape)
print("full shape :", full.shape)

small shape: (10000, 2)
full shape : (91080, 11)


In [3]:
# Preview the first five rows of the subset table to confirm its columns.
small.head(n=5)

Unnamed: 0,Emitter,File Name
0,216,69809.wav
1,215,71889.wav
2,216,46690.wav
3,230,85411.wav
4,215,45609.wav


In [4]:
# Preview the first five rows of the full annotation table to confirm its columns.
full.head(n=5)

Unnamed: 0,FileID,Emitter,Addressee,Context,Emitter pre-vocalization action,Addressee pre-vocalization action,Emitter post-vocalization action,Addressee post-vocalization action,Start sample,End sample,File Name
0,7,118,0,9,2,2,3,3,1,336720,0.wav
1,11,0,0,11,0,0,0,0,1,787280,1.wav
2,12,118,0,12,2,2,3,3,1,566096,2.wav
3,15,0,0,12,0,0,0,0,1,402256,3.wav
4,20,0,0,12,0,0,0,0,1,394064,4.wav


In [6]:
# Subset check via a left join: every (Emitter, File Name) row of `small` is
# matched against the distinct key pairs of `full`. Rows without a partner are
# flagged "left_only" in the `_merge` indicator column.
pair_cols = ["Emitter", "File Name"]
small_pairs = small.loc[:, pair_cols].copy()
full_pairs = full.loc[:, pair_cols].copy()

# De-duplicate the right-hand side so the join cannot multiply rows of `small`.
unique_full_pairs = full_pairs.drop_duplicates()

merged = pd.merge(
    small_pairs,
    unique_full_pairs,
    how="left",
    on=pair_cols,
    indicator=True,
)

# Keep only the rows of `small` that had no match in `full`.
is_unmatched = merged["_merge"] == "left_only"
missing = merged.loc[is_unmatched]

print(f"Rows in small: {len(small_pairs)}")
print(f"Rows in full : {len(full_pairs)}")
print(f"Rows from small NOT found in full: {len(missing)}")

if missing.empty:
    print("\nSUCCESS: All (Emitter, File Name) pairs in `data/annotations.csv` are present in `annotations_filenames.csv`.")
else:
    # Show a handful of offending rows to help diagnose the mismatch.
    print("\nExamples of missing rows:")
    display(missing.head())


Rows in small: 10000
Rows in full : 91080
Rows from small NOT found in full: 0

SUCCESS: All (Emitter, File Name) pairs in `data/annotations.csv` are present in `annotations_filenames.csv`.


In [7]:
# Independent cross-check of the merge result: build plain Python sets of
# (Emitter, File Name) tuples and test for subset membership directly.
small_set = {
    (emitter, file_name)
    for emitter, file_name in zip(small_pairs["Emitter"], small_pairs["File Name"])
}
full_set = {
    (emitter, file_name)
    for emitter, file_name in zip(full_pairs["Emitter"], full_pairs["File Name"])
}

# Pairs that occur in the subset but nowhere in the full table.
extra_pairs = small_set - full_set

print("Unique pairs in small:", len(small_set))
print("Unique pairs in full :", len(full_set))
print("Pairs in small but not in full:", len(extra_pairs))

if not small_set <= full_set:
    print("\nFAILED: There exist pairs in `data/annotations.csv` that are not in `annotations_filenames.csv`.")
else:
    print("\nSUCCESS: `data/annotations.csv` is a subset of `annotations_filenames.csv` by (Emitter, File Name).")


Unique pairs in small: 10000
Unique pairs in full : 91080
Pairs in small but not in full: 0

SUCCESS: `data/annotations.csv` is a subset of `annotations_filenames.csv` by (Emitter, File Name).
