In [1]:
# Import libraries

import os
import shutil

from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

import cv2

from tqdm.auto import tqdm

from bs4 import BeautifulSoup


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pip install bs4




In [3]:
# Train directories
# Source images and XML annotations of the PolypsSet training split.
# Written as raw Windows paths, matching the validation/test directory cells
# (the previous "c://...//" form relied on Windows tolerating doubled slashes).

train_image_harvard_dir = r"C:\Users\MyBri\OneDrive\Desktop\Masters\Dissertation\polyp detection\datasets\POLYP-DETECTION-YOLO-NASH\base-datasets\PolypsSet\train2019\Image"
train_annotations_harvard_dir = r"C:\Users\MyBri\OneDrive\Desktop\Masters\Dissertation\polyp detection\datasets\POLYP-DETECTION-YOLO-NASH\base-datasets\PolypsSet\train2019\Annotation"


In [4]:
# Validation directories
# Source images and XML annotations of the PolypsSet validation split.

valid_image_harvard_dir = (
    r"C:\Users\MyBri\OneDrive\Desktop\Masters\Dissertation\polyp detection"
    r"\datasets\POLYP-DETECTION-YOLO-NASH\base-datasets\PolypsSet"
    r"\val2019\Image"
)
valid_annotations_harvard_dir = (
    r"C:\Users\MyBri\OneDrive\Desktop\Masters\Dissertation\polyp detection"
    r"\datasets\POLYP-DETECTION-YOLO-NASH\base-datasets\PolypsSet"
    r"\val2019\Annotation"
)


In [5]:
# Test directories
# Source images and XML annotations of the PolypsSet test split.

test_image_harvard_dir = (
    r"C:\Users\MyBri\OneDrive\Desktop\Masters\Dissertation\polyp detection"
    r"\datasets\POLYP-DETECTION-YOLO-NASH\base-datasets\PolypsSet"
    r"\test2019\Image"
)
test_annotations_harvard_dir = (
    r"C:\Users\MyBri\OneDrive\Desktop\Masters\Dissertation\polyp detection"
    r"\datasets\POLYP-DETECTION-YOLO-NASH\base-datasets\PolypsSet"
    r"\test2019\Annotation"
)


In [6]:
# List the raw training images and annotations in a deterministic (sorted) order.
# The training folders are flat: every entry is treated as a file by the
# conversion loop that consumes `train_images`.
train_images = sorted(os.listdir(train_image_harvard_dir))
train_annotations = sorted(os.listdir(train_annotations_harvard_dir))


# Verify that the images and annotations match.
# os.path.splitext removes only the final extension, so a name with extra dots
# (e.g. "case.1.jpg") is not truncated to "case" and falsely reported as matched.
train_image_names = {os.path.splitext(x)[0] for x in train_images}
train_annotation_names = {os.path.splitext(x)[0] for x in train_annotations}

# Cell output: True when every image has an annotation and vice versa.
train_image_names == train_annotation_names


True

In [8]:
# Top-level entries of the validation folders. The conversion loop for this
# split treats these entries as per-sequence subfolders, not image files.
valid_images = os.listdir(valid_image_harvard_dir)
valid_annotations = os.listdir(valid_annotations_harvard_dir)

# Sorted only for a stable, readable listing.
valid_images.sort()
valid_annotations.sort()


# Verify that the images and annotations match.
# Comparing the top-level entries alone would only compare subfolder names, so
# walk both trees and compare "<relative folder>/<file name without extension>".
valid_image_names = {
    os.path.join(
        os.path.relpath(dirpath, valid_image_harvard_dir),
        os.path.splitext(filename)[0],
    )
    for dirpath, _dirnames, filenames in os.walk(valid_image_harvard_dir)
    for filename in filenames
}
valid_annotation_names = {
    os.path.join(
        os.path.relpath(dirpath, valid_annotations_harvard_dir),
        os.path.splitext(filename)[0],
    )
    for dirpath, _dirnames, filenames in os.walk(valid_annotations_harvard_dir)
    for filename in filenames
}

# Cell output: True when every image file has an annotation file and vice versa.
valid_image_names == valid_annotation_names


True

In [7]:
# Top-level entries of the test folders. The conversion loop for this split
# treats these entries as per-sequence subfolders, not image files.
test_images = os.listdir(test_image_harvard_dir)
test_annotations = os.listdir(test_annotations_harvard_dir)

# Sorted only for a stable, readable listing.
test_images.sort()
test_annotations.sort()


# Verify that the images and annotations match.
# Comparing the top-level entries alone would only compare subfolder names, so
# walk both trees and compare "<relative folder>/<file name without extension>".
test_image_names = {
    os.path.join(
        os.path.relpath(dirpath, test_image_harvard_dir),
        os.path.splitext(filename)[0],
    )
    for dirpath, _dirnames, filenames in os.walk(test_image_harvard_dir)
    for filename in filenames
}
test_annotation_names = {
    os.path.join(
        os.path.relpath(dirpath, test_annotations_harvard_dir),
        os.path.splitext(filename)[0],
    )
    for dirpath, _dirnames, filenames in os.walk(test_annotations_harvard_dir)
    for filename in filenames
}

# Cell output: True when every image file has an annotation file and vice versa.
test_image_names == test_annotation_names


True

In [8]:
# Output layout of the converted dataset:
#   <base_target_dir>/<split>/images  and  <base_target_dir>/<split>/labels

base_target_dir = "./PolypDetectionHarv/"

train_images_dir, train_annots_dir = (
    os.path.join(base_target_dir, "train", _leaf) for _leaf in ("images", "labels")
)
val_images_dir, val_annots_dir = (
    os.path.join(base_target_dir, "valid", _leaf) for _leaf in ("images", "labels")
)
test_images_dir, test_annots_dir = (
    os.path.join(base_target_dir, "test", _leaf) for _leaf in ("images", "labels")
)

# Create every output folder; existing folders are left as they are.
for _target_dir in (
    train_images_dir,
    train_annots_dir,
    val_images_dir,
    val_annots_dir,
    test_images_dir,
    test_annots_dir,
):
    os.makedirs(_target_dir, exist_ok=True)


In [9]:
# Convert the training split: copy each image into train/images, write its
# YOLO label file into train/labels and record the copied image path in
# <base_target_dir>/train.txt.
status = "train"

train_val_file_path = os.path.join(base_target_dir, f"{status}.txt")

# The list file is opened once, in append mode as before, instead of once per
# image. NOTE: because of append mode, re-running this cell adds the paths again.
with open(train_val_file_path, "a") as split_file:
    for train_image in tqdm(train_images):
        # File name without its extension; used to derive the .xml and .txt names
        # regardless of the image extension or its letter case.
        image_stem = os.path.splitext(train_image)[0]

        train_image_path = os.path.join(train_image_harvard_dir, train_image)

        # Only the size is needed; the context manager releases the file handle.
        with Image.open(train_image_path) as image:
            img_width, img_height = image.size

        train_image_annotation = os.path.join(
            train_annotations_harvard_dir, f"{image_stem}.xml"
        )

        # Load the annotation. Binary mode leaves encoding detection to the parser
        # instead of decoding with the platform's default text encoding.
        with open(train_image_annotation, "rb") as f:
            annotation = BeautifulSoup(f, "xml")

        # One row per annotated box: [class, center_x, center_y, width, height],
        # with coordinates divided by the image width/height.
        # The list stays empty when the annotation contains no <xmin> element.
        boxes = []
        for xmin_tag in annotation.find_all("xmin"):
            # The sibling coordinates live under the same parent element as <xmin>.
            bbox = xmin_tag.parent
            xmin = int(xmin_tag.text)
            ymin = int(bbox.find("ymin").text)
            xmax = int(bbox.find("xmax").text)
            ymax = int(bbox.find("ymax").text)

            center_x = (xmin + xmax) / 2 / img_width
            center_y = (ymin + ymax) / 2 / img_height
            width = (xmax - xmin) / img_width
            height = (ymax - ymin) / img_height

            # Single-class dataset: every box gets class id 0.
            boxes.append([0, center_x, center_y, width, height])

        # 1. copy the image to the train/images folder
        new_image_name = f"harvard__{train_image}"
        new_image_path = os.path.join(
            base_target_dir, "train", "images", new_image_name
        )
        shutil.copy(train_image_path, new_image_path)

        # 2. create the label file in the train/labels folder
        #    (an empty file is written when the image has no boxes)
        annotation_file_name = f"harvard__{image_stem}.txt"
        annotation_file_path = os.path.join(
            base_target_dir, "train", "labels", annotation_file_name
        )
        with open(annotation_file_path, "w") as f:
            for box in boxes:
                f.write(" ".join([str(x) for x in box]) + "\n")

        # 3. record the copied image path in train.txt
        split_file.write(new_image_path + "\n")

  0%|          | 0/28773 [00:00<?, ?it/s]

100%|██████████| 28773/28773 [06:41<00:00, 71.74it/s]


In [10]:
# Convert the validation split: copy each image into valid/images, write its
# YOLO label file into valid/labels and record the copied image path in
# <base_target_dir>/valid.txt.
status = "valid"

valid_val_file_path = os.path.join(base_target_dir, f"{status}.txt")

# Images that have no matching annotation file are skipped; they are collected
# here so the skip is reported instead of happening silently.
skipped_valid_images = []

# The list file is opened once, in append mode as before, instead of once per
# image. NOTE: because of append mode, re-running this cell adds the paths again.
with open(valid_val_file_path, "a") as split_file:
    # Iterate through subfolders (different file structure from the train split).
    # Listings are sorted so the order of lines in valid.txt is reproducible.
    for subfolder in tqdm(sorted(os.listdir(valid_image_harvard_dir)), desc=status):
        subfolder_path = os.path.join(valid_image_harvard_dir, subfolder)
        if not os.path.isdir(subfolder_path):
            continue

        for image_filename in sorted(os.listdir(subfolder_path)):
            if not image_filename.lower().endswith(".jpg"):
                continue

            image_stem = os.path.splitext(image_filename)[0]

            # Form the paths for the image and annotation
            image_path = os.path.join(subfolder_path, image_filename)
            annotation_path = os.path.join(
                valid_annotations_harvard_dir, subfolder, image_stem + ".xml"
            )

            # Skip (but remember) images without a corresponding annotation file
            if not os.path.exists(annotation_path):
                skipped_valid_images.append(image_path)
                continue

            # Only the size is needed; the context manager releases the file handle.
            with Image.open(image_path) as image:
                img_width, img_height = image.size

            # Load the annotation. Binary mode leaves encoding detection to the
            # parser instead of decoding with the platform's default encoding.
            with open(annotation_path, "rb") as f:
                annotation = BeautifulSoup(f, "xml")

            # One row per annotated box: [class, center_x, center_y, width, height],
            # with coordinates divided by the image width/height.
            # The list stays empty when the annotation contains no <xmin> element.
            boxes = []
            for xmin_tag in annotation.find_all("xmin"):
                # The sibling coordinates live under the same parent as <xmin>.
                bbox = xmin_tag.parent
                xmin = int(xmin_tag.text)
                ymin = int(bbox.find("ymin").text)
                xmax = int(bbox.find("xmax").text)
                ymax = int(bbox.find("ymax").text)

                center_x = (xmin + xmax) / 2 / img_width
                center_y = (ymin + ymax) / 2 / img_height
                width = (xmax - xmin) / img_width
                height = (ymax - ymin) / img_height

                # Single-class dataset: every box gets class id 0.
                boxes.append([0, center_x, center_y, width, height])

            # 1. Copy the image to the valid/images folder; the subfolder name is
            #    part of the new file name so files from different subfolders
            #    cannot overwrite each other.
            new_image_name = f"harvard__{subfolder}_{image_stem}.jpg"
            new_image_path = os.path.join(
                base_target_dir, "valid", "images", new_image_name
            )
            shutil.copy(image_path, new_image_path)

            # 2. Create the label file in the valid/labels folder
            #    (an empty file is written when the image has no boxes)
            annotation_file_name = f"harvard__{subfolder}_{image_stem}.txt"
            annotation_file_path = os.path.join(
                base_target_dir, "valid", "labels", annotation_file_name
            )
            with open(annotation_file_path, "w") as f:
                for box in boxes:
                    f.write(" ".join([str(x) for x in box]) + "\n")

            # 3. Record the copied image path in valid.txt
            split_file.write(new_image_path + "\n")

if skipped_valid_images:
    print(f"Skipped {len(skipped_valid_images)} image(s) without an annotation file")


In [11]:
# Convert the test split: copy each image into test/images, write its YOLO
# label file into test/labels and record the copied image path in
# <base_target_dir>/test.txt.
status = "test"

test_val_file_path = os.path.join(base_target_dir, f"{status}.txt")

# Images that have no matching annotation file are skipped; they are collected
# here so the skip is reported instead of happening silently.
skipped_test_images = []

# The list file is opened once, in append mode as before, instead of once per
# image. NOTE: because of append mode, re-running this cell adds the paths again.
with open(test_val_file_path, "a") as split_file:
    # Iterate through subfolders (same structure as the validation split).
    # Listings are sorted so the order of lines in test.txt is reproducible.
    for subfolder in tqdm(sorted(os.listdir(test_image_harvard_dir)), desc=status):
        subfolder_path = os.path.join(test_image_harvard_dir, subfolder)
        if not os.path.isdir(subfolder_path):
            continue

        # Iterate through image files inside each subfolder
        for image_filename in sorted(os.listdir(subfolder_path)):
            if not image_filename.lower().endswith(".jpg"):
                continue

            image_stem = os.path.splitext(image_filename)[0]

            # Form the paths for the image and annotation
            image_path = os.path.join(subfolder_path, image_filename)
            annotation_path = os.path.join(
                test_annotations_harvard_dir, subfolder, image_stem + ".xml"
            )

            # Skip (but remember) images without a corresponding annotation file
            if not os.path.exists(annotation_path):
                skipped_test_images.append(image_path)
                continue

            # Only the size is needed; the context manager releases the file handle.
            with Image.open(image_path) as image:
                img_width, img_height = image.size

            # Load the annotation. Binary mode leaves encoding detection to the
            # parser instead of decoding with the platform's default encoding.
            with open(annotation_path, "rb") as f:
                annotation = BeautifulSoup(f, "xml")

            # One row per annotated box: [class, center_x, center_y, width, height],
            # with coordinates divided by the image width/height.
            # The list stays empty when the annotation contains no <xmin> element.
            boxes = []
            for xmin_tag in annotation.find_all("xmin"):
                # The sibling coordinates live under the same parent as <xmin>.
                bbox = xmin_tag.parent
                xmin = int(xmin_tag.text)
                ymin = int(bbox.find("ymin").text)
                xmax = int(bbox.find("xmax").text)
                ymax = int(bbox.find("ymax").text)

                center_x = (xmin + xmax) / 2 / img_width
                center_y = (ymin + ymax) / 2 / img_height
                width = (xmax - xmin) / img_width
                height = (ymax - ymin) / img_height

                # Single-class dataset: every box gets class id 0.
                boxes.append([0, center_x, center_y, width, height])

            # 1. Copy the image to the test/images folder; the subfolder name is
            #    part of the new file name so files from different subfolders
            #    cannot overwrite each other.
            new_image_name = f"harvard__{subfolder}_{image_stem}.jpg"
            new_image_path = os.path.join(
                base_target_dir, "test", "images", new_image_name
            )
            shutil.copy(image_path, new_image_path)

            # 2. Create the label file in the test/labels folder
            #    (an empty file is written when the image has no boxes)
            annotation_file_name = f"harvard__{subfolder}_{image_stem}.txt"
            annotation_file_path = os.path.join(
                base_target_dir, "test", "labels", annotation_file_name
            )
            with open(annotation_file_path, "w") as f:
                for box in boxes:
                    f.write(" ".join([str(x) for x in box]) + "\n")

            # 3. Record the copied image path in test.txt
            split_file.write(new_image_path + "\n")

if skipped_test_images:
    print(f"Skipped {len(skipped_test_images)} image(s) without an annotation file")
