In [1]:
import os
import shutil

import warnings
# Silence every warning category for the rest of this kernel session so the
# cell outputs stay uncluttered.
# NOTE(review): this is a blanket filter - it also hides deprecation and
# runtime warnings from any library used later in the notebook.
warnings.filterwarnings("ignore")

In [2]:
# Root folder of the training split.
# The default is the original hard-coded Windows location; set the
# ALGAENET_TRAIN_DIR environment variable before starting the kernel to run
# the notebook against a dataset stored elsewhere without editing this cell.
base_dir = os.environ.get(
    "ALGAENET_TRAIN_DIR",
    r"D:\desktop\python\algaenet\dataset\train",
)

# Images and their label files are kept in sibling sub-folders of the root.
image_dir = os.path.join(base_dir, "images")
label_dir = os.path.join(base_dir, "labels")

In [3]:
# Integer class id -> class name. The id is the position in the tuple below,
# so the order of the names must not be changed. Each name is also used as
# the name of that class's sub-folder under the image directory.
class_names = dict(
    enumerate(
        (
            "Platymonas",
            "Chlorella",
            "Dunaliella_salina",
            "Effrenium",
            "Porphyridium",
            "Haematococcus",
        )
    )
)

In [4]:
# Ensure there is one sub-folder per class inside the image directory.
# exist_ok=True makes the cell safe to re-run: folders that are already
# present are left untouched and simply reported again.
class_subdirs = [os.path.join(image_dir, name) for name in class_names.values()]
for class_subdir in class_subdirs:
    os.makedirs(class_subdir, exist_ok=True)
    print(f"created or found directory- {class_subdir}")

created or found directory- D:\desktop\python\algaenet\dataset\train\images\Platymonas
created or found directory- D:\desktop\python\algaenet\dataset\train\images\Chlorella
created or found directory- D:\desktop\python\algaenet\dataset\train\images\Dunaliella_salina
created or found directory- D:\desktop\python\algaenet\dataset\train\images\Effrenium
created or found directory- D:\desktop\python\algaenet\dataset\train\images\Porphyridium
created or found directory- D:\desktop\python\algaenet\dataset\train\images\Haematococcus


In [5]:
# Sort images into per-class sub-folders, driven by their label files.
#
# For every "<stem>.txt" in label_dir the matching "<stem>.jpg" is looked up
# directly inside image_dir. The first whitespace-separated token on the first
# line of the label file is parsed as an integer class id, and the image is
# moved to image_dir/<class name>/.
#
# Counters reported at the end:
#   moved_count   - images moved into a class folder
#   skipped_count - no image next to the label, blank/empty first label line,
#                   or an image of that name already in the class folder
#   error_count   - unknown class id, unparsable class id, or an I/O failure
moved_count = 0
skipped_count = 0
error_count = 0

# Labels and images are paired by file stem; only the suffix differs.
label_ext = ".txt"
image_ext = ".jpg"

print(f"\nprocessing labels in- {label_dir}")
# isdir (not exists) so that a stray file at that path is reported here
# instead of making os.listdir raise.
if not os.path.isdir(label_dir):
    print(f"error- label directory not found at {label_dir}")
else:
    label_files = [f for f in os.listdir(label_dir) if f.endswith(label_ext)]
    print(f"found {len(label_files)} label files")

    for label_file in label_files:
        label_path = os.path.join(label_dir, label_file)
        # Swap only the trailing extension. str.replace would also rewrite a
        # ".txt" that appears in the middle of the file name.
        image_name = label_file[: -len(label_ext)] + image_ext
        image_path = os.path.join(image_dir, image_name)

        # Nothing to do when the image is not (or no longer) at the top level
        # of image_dir, e.g. because an earlier run already moved it.
        if not os.path.exists(image_path):
            skipped_count += 1
            continue

        try:
            # Only the first line decides the class; read just that line and
            # close the label file before touching any image on disk.
            with open(label_path, "r") as f:
                first_line = f.readline().strip()

            # Covers both an empty file and a blank first line.
            if not first_line:
                skipped_count += 1
                continue

            # int() raises ValueError for a non-integer token; handled below.
            class_id = int(first_line.split()[0])
            class_name = class_names.get(class_id)

            if class_name is None:
                print(f"warning- unknown class ID {class_id} in {label_file}")
                error_count += 1
                continue

            destination_path = os.path.join(image_dir, class_name, image_name)

            if not os.path.exists(destination_path):
                shutil.move(image_path, destination_path)
                moved_count += 1
            else:
                # An image with this name is already in the class folder, so
                # the leftover top-level copy is deleted.
                # NOTE(review): the two files are not compared before the
                # source copy is removed.
                if os.path.exists(image_path):
                    os.remove(image_path)
                skipped_count += 1

        except (OSError, ValueError) as e:
            # OSError: unreadable label or failed move/remove (shutil.Error is
            # a subclass). ValueError: class id is not an integer, or the
            # label file could not be decoded.
            print(f"error processing file {label_file}- {e}")
            error_count += 1

print("\nfinished processing")
print(f"moved {moved_count} images")
print(f"skipped {skipped_count} images (not found, empty label, already moved, or empty lines)")
print(f"encountered {error_count} errors (unknown class ID or file processing issues)")


processing labels in- D:\desktop\python\algaenet\dataset\train\labels
found 700 label files

finished processing
moved 0 images
skipped 700 images (not found, empty label, already moved, or empty lines)
encountered 0 errors (unknown class ID or file processing issues)


In [6]:
# Sanity check: report how many entries each sub-folder of the image
# directory holds. Plain files sitting directly in image_dir are ignored.
print("\nverifying directory structure")
for entry in os.listdir(image_dir):
    entry_path = os.path.join(image_dir, entry)
    if not os.path.isdir(entry_path):
        continue
    # The count includes every directory entry, not only image files.
    entry_total = len(os.listdir(entry_path))
    print(f" - directory- {entry} contains {entry_total} files")


verifying directory structure
 - directory- Chlorella contains 148 files
 - directory- Dunaliella_salina contains 160 files
 - directory- Effrenium contains 78 files
 - directory- Haematococcus contains 101 files
 - directory- Platymonas contains 125 files
 - directory- Porphyridium contains 88 files
