### Preprocessor for the [Tuberculosis (TB) Chest X-ray Dataset](https://www.kaggle.com/datasets/tawsifurrahman/tuberculosis-tb-chest-xray-dataset)

Before you can use this preprocessor, or any of the downstream models that are trained on this dataset, you need to download the dataset from [https://www.kaggle.com/datasets/tawsifurrahman/tuberculosis-tb-chest-xray-dataset](https://www.kaggle.com/datasets/tawsifurrahman/tuberculosis-tb-chest-xray-dataset) and extract it so that the code below can read it from `../TB_Chest_Radiography_Database`.

In [4]:
import h5py
import numpy as np
import os
from PIL import Image

In [5]:
def load_and_preprocess_data(data_dir, image_size):
    """Load every image under ``data_dir`` as a normalized grayscale array.

    ``data_dir`` is expected to contain one sub-directory per class; the
    sub-directory name is used as the label of every image inside it.
    Top-level entries that are not directories are ignored, as are entries
    inside a class directory that are not regular files.

    Args:
        data_dir: Path of the dataset root directory.
        image_size: Size tuple passed to ``PIL.Image.Image.resize``, which
            interprets it as ``(width, height)``.

    Returns:
        A tuple ``(images, labels)`` of numpy arrays. ``images`` holds one
        float32 array per image with pixel values scaled to [0, 1];
        ``labels`` holds the class directory name of each image, in the
        same order.
    """
    images = []
    labels = []

    # os.listdir returns entries in arbitrary order; sort so the resulting
    # arrays are reproducible across runs and machines.
    for class_name in sorted(os.listdir(data_dir)):
        class_dir = os.path.join(data_dir, class_name)

        # Only directories are treated as classes.
        if not os.path.isdir(class_dir):
            continue

        for image_name in sorted(os.listdir(class_dir)):
            image_path = os.path.join(class_dir, image_name)

            # Nested directories (e.g. .ipynb_checkpoints) cannot be opened
            # as images, so skip anything that is not a regular file.
            if not os.path.isfile(image_path):
                continue

            # The context manager closes the underlying file promptly;
            # convert() and resize() return new, fully loaded images.
            with Image.open(image_path) as raw:
                gray = raw.convert('L').resize(image_size)

            # 8-bit grayscale -> float32 in [0, 1]
            pixels = np.array(gray).astype('float32') / 255.0

            images.append(pixels)
            labels.append(class_name)

    return np.array(images), np.array(labels)

In [6]:
# Location of the extracted Kaggle dataset, relative to the working directory
dataset_dir = "../TB_Chest_Radiography_Database"

# Target size every image is resized to before being stored
image_size = (128, 128)

# Read all class folders into in-memory arrays
images, labels = load_and_preprocess_data(
    data_dir=dataset_dir,
    image_size=image_size,
)

In [8]:
# Variable-length string dtype, used so the class-name labels can be stored
dt = h5py.special_dtype(vlen=str)

# Create the HDF5 file inside the dataset directory (mode "w" replaces any
# existing file) and write both arrays into it
with h5py.File(f"{dataset_dir}/tb-xraydb.h5py", "w") as h5f:
    h5f.create_dataset("images", data=images)
    h5f.create_dataset("labels", data=labels.astype(dt))