In [5]:
import os
import shutil
import zipfile

#  CONFIGURATION
SOURCE_DIR = "/kaggle/input/fire-smoke-human-detector"       # Change if needed
OUTPUT_DIR = "filtered_human"
ZIP_PATH = OUTPUT_DIR + ".zip"                  # Archive is written next to OUTPUT_DIR
splits = ["train", "valid", "test"]
CLASS_NAMES = ['fire', 'human', 'smoke']        # Must match your actual order
# The class ID is the position of 'human' in CLASS_NAMES, kept as a string
# because it is compared against the leading token of each label line.
# Deriving it (instead of hard-coding "1") keeps the two from drifting apart
# and fails fast with ValueError if 'human' is ever removed from the list.
HUMAN_CLASS_ID = str(CLASS_NAMES.index("human"))

#  TRACKING TOTALS
total_human_instances = 0
total_files_copied = 0

print(" Filtering dataset for images containing humans...")

# For every split, copy each label file that contains at least one human
# annotation, together with its image, into the mirrored OUTPUT_DIR layout.
for split in splits:
    img_src = os.path.join(SOURCE_DIR, split, "images")
    lbl_src = os.path.join(SOURCE_DIR, split, "labels")

    img_dst = os.path.join(OUTPUT_DIR, split, "images")
    lbl_dst = os.path.join(OUTPUT_DIR, split, "labels")

    os.makedirs(img_dst, exist_ok=True)
    os.makedirs(lbl_dst, exist_ok=True)

    # Per-split stats
    split_human_instances = 0
    split_human_images = 0
    split_total_labels = 0
    split_copied = 0

    for label_file in os.listdir(lbl_src):
        if not label_file.endswith(".txt"):
            continue

        # Strip only the trailing ".txt". str.replace(".txt", ext) would also
        # rewrite a ".txt" occurring in the middle of the name and then look
        # for an image that does not exist.
        stem = label_file[:-len(".txt")]

        split_total_labels += 1
        label_path = os.path.join(lbl_src, label_file)

        with open(label_path, "r") as f:
            # Compare the whole first token rather than a "1 " prefix, so that
            # leading whitespace or a tab separator cannot hide an annotation.
            # Blank lines split to [] and never match.
            human_count = sum(
                1 for line in f if line.split(None, 1)[:1] == [HUMAN_CLASS_ID]
            )

        if not human_count:
            continue

        split_human_images += 1
        split_human_instances += human_count

        # Copy label (unfiltered: annotations of other classes are kept)
        shutil.copy(label_path, os.path.join(lbl_dst, label_file))

        # Copy image: the first extension that exists wins
        for ext in (".jpg", ".jpeg", ".png"):
            img_file = stem + ext
            img_path = os.path.join(img_src, img_file)
            if os.path.exists(img_path):
                shutil.copy(img_path, os.path.join(img_dst, img_file))
                split_copied += 1
                break
        else:
            # The label was copied but no image matched; report it instead of
            # silently leaving an orphan label in the output.
            print(f"   ! No image found for label: {label_file}")

    # Per-split report
    print(f"\n Split: {split}")
    print(f"   → Human instances     : {split_human_instances}")
    print(f"   → Label files with humans: {split_human_images} / {split_total_labels}")
    print(f"   → Files copied        : {split_copied}")

    total_human_instances += split_human_instances
    total_files_copied += split_copied

#  Create data.yaml in filtered dataset root
yaml_path = os.path.join(OUTPUT_DIR, "data.yaml")

# Split locations are written relative to the yaml file; the class list is
# emitted using Python's list repr.
yaml_lines = [
    "train: train/images\n",
    "val: valid/images\n",
    "test: test/images\n\n",
    f"nc: {len(CLASS_NAMES)}\n",
    f"names: {CLASS_NAMES}\n",
]
with open(yaml_path, "w") as yaml_file:
    yaml_file.writelines(yaml_lines)

print(f"\n data.yaml created in: {yaml_path}")

#  Zip the folder
print("\n Zipping dataset...")
with zipfile.ZipFile(ZIP_PATH, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
    # Entry names are stored relative to OUTPUT_DIR, so the archive unpacks
    # straight into train/, valid/, test/ and data.yaml.
    for folder, _subdirs, filenames in os.walk(OUTPUT_DIR):
        for filename in filenames:
            src = os.path.join(folder, filename)
            archive.write(src, arcname=os.path.relpath(src, start=OUTPUT_DIR))

# Final summary: one line per fact, emitted in a single print call
summary_lines = (
    "\n Done!",
    f"Total human instances copied: {total_human_instances}",
    f"Total image-label pairs saved: {total_files_copied}",
    f"Filtered dataset directory: {OUTPUT_DIR}/",
    f"Zipped dataset saved as: {ZIP_PATH}",
)
print(*summary_lines, sep="\n")


 Filtering dataset for images containing humans...

 Split: train
   → Human instances     : 4436
   → Label files with humans: 2161 / 8001
   → Files copied        : 2161

 Split: valid
   → Human instances     : 455
   → Label files with humans: 220 / 1017
   → Files copied        : 220

 Split: test
   → Human instances     : 370
   → Label files with humans: 182 / 731
   → Files copied        : 182

 data.yaml created in: filtered_human/data.yaml

 Zipping dataset...

 Done!
Total human instances copied: 5261
Total image-label pairs saved: 2563
Filtered dataset directory: filtered_human/
Zipped dataset saved as: filtered_human.zip
