# MARCO IMPORT
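This notebook reads the MARCO TFRecord files, converts each record to a JPEG image sorted into a folder per class, resizes the images, and copies a random sample of each class to a separate folder.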

In [1]:
import os
import random
import shutil
import tensorflow as tf
from PIL import Image




### Import TFRecords
Download the TFRecord files from the MARCO website and place them in a folder named `MARCO_TFRecords` in the working directory.

In [2]:
# Schema used to parse one serialized record of the TFRecord data.
def _scalar_feature(dtype):
    """Return a fixed-length feature spec with shape ``[]`` holding one ``dtype`` value."""
    return tf.io.FixedLenFeature([], dtype)


feature_description = {
    'image/height': _scalar_feature(tf.int64),  # Image height in pixels
    'image/width': _scalar_feature(tf.int64),  # Image width in pixels
    'image/colorspace': _scalar_feature(tf.string),  # Colorspace, always 'RGB'
    'image/channels': _scalar_feature(tf.int64),  # Number of channels, always 3
    'image/class/label': _scalar_feature(tf.int64),  # Index in a normalized classification layer
    'image/class/raw': _scalar_feature(tf.int64),  # Index in the raw (original) classification layer
    'image/class/source': _scalar_feature(tf.int64),  # Index of the image source creator
    'image/class/text': _scalar_feature(tf.string),  # Human-readable version of the normalized label
    'image/format': _scalar_feature(tf.string),  # Image format, always 'JPEG'
    'image/filename': _scalar_feature(tf.string),  # Basename of the image file
    'image/id': _scalar_feature(tf.int64),  # Unique id for the image
    'image/encoded': _scalar_feature(tf.string),  # JPEG encoded image in RGB colorspace
}

def save_images_by_label(example, output_directory):
    """
    Parse one serialized TFRecord example and save its image as a jpg.

    The image is written to ``<output_directory>/<label>/<filename>``, where
    ``<label>`` is the integer stored under 'image/class/label':
    0 - clear
    1 - crystals
    2 - other
    3 - precipitate

    Args:
        example: one serialized record, e.g. an element obtained by iterating
            a ``tf.data.TFRecordDataset``. ``.numpy()`` is called on the parsed
            tensors, so this has to run with eager tensors.
        output_directory: root folder that receives the per-label sub-folders.
    """
    parsed = tf.io.parse_single_example(example, feature_description)
    # channels=3 forces a three-channel result regardless of how the JPEG is stored.
    image = tf.image.decode_jpeg(parsed['image/encoded'], channels=3)
    label = parsed['image/class/label'].numpy()  # label value as numpy int

    # exist_ok=True avoids the check-then-create race and matches the other cells.
    label_dir = os.path.join(output_directory, str(label))
    os.makedirs(label_dir, exist_ok=True)

    # Keep only the final path component so a filename carrying directory parts
    # can never place the image outside of label_dir.
    image_filename = os.path.basename(parsed['image/filename'].numpy().decode('utf-8'))
    image_path = os.path.join(label_dir, image_filename)

    # NOTE(review): the image is decoded and re-encoded here, which is lossy.
    # Writing parsed['image/encoded'] directly would keep the original bytes —
    # confirm whether the forced 3-channel re-encode is required downstream.
    tf.io.write_file(image_path, tf.image.encode_jpeg(image))


tfrecord_directory = 'MARCO_TFRecords'
output_directory = 'MARCO_IMAGES'  # loop-invariant, so defined once up front

# os.listdir returns entries in arbitrary order and also lists sub-folders and
# hidden files (e.g. '.DS_Store'); sort for a reproducible run and keep only
# regular, non-hidden files so nothing else is opened as a TFRecord.
tfrecord_files = sorted(
    name
    for name in os.listdir(tfrecord_directory)
    if not name.startswith('.')
    and os.path.isfile(os.path.join(tfrecord_directory, name))
)

for file_name in tfrecord_files:

    # Read TFRecord file
    tfrecord_file = os.path.join(tfrecord_directory, file_name)
    dataset = tf.data.TFRecordDataset(tfrecord_file)
    print("currently importing:", file_name)

    # Process and save images by label
    for record in dataset:
        save_images_by_label(record, output_directory)

print("task finished")

currently importing: train-00006-of-00407
currently importing: train-00028-of-00407
currently importing: train-00045-of-00407
currently importing: train-00077-of-00407
currently importing: train-00078-of-00407
currently importing: train-00137-of-00407
currently importing: train-00146-of-00407
currently importing: train-00163-of-00407
currently importing: train-00193-of-00407
currently importing: train-00219-of-00407
currently importing: train-00220-of-00407
currently importing: train-00229-of-00407
currently importing: train-00267-of-00407
currently importing: train-00284-of-00407
currently importing: train-00294-of-00407
currently importing: train-00305-of-00407
currently importing: train-00306-of-00407
currently importing: train-00344-of-00407
currently importing: train-00390-of-00407
currently importing: train-00403-of-00407
task finished


### Resize 
Resize the images to 608x608 pixels (the aspect ratio is not preserved) and write them to `MARCO_IMAGES_resized`, keeping one folder per class.

In [3]:
def resize_images_in_folders(root_folder, output_root="MARCO_IMAGES_resized", size=(608, 608)):
    """
    Resize every image below the sub-folders of ``root_folder`` and save it as JPEG.

    For each directory ``root_folder/<name>`` a directory ``output_root/<name>``
    is created. All image files found below ``root_folder/<name>`` (searched
    recursively) are resized to ``size`` and written directly into
    ``output_root/<name>`` under their original file name, so files with the
    same name in different nested folders overwrite each other.

    Args:
        root_folder: folder containing one sub-folder per class.
        output_root: folder that receives the resized images.
        size: target ``(width, height)`` in pixels; the image is stretched to
            this size without preserving its aspect ratio.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.gif')
    # Modes the Pillow JPEG writer accepts; anything else (e.g. RGBA or
    # palette images from PNG/GIF files) has to be converted before saving.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    os.makedirs(output_root, exist_ok=True)

    # Iterate through the numbered folders
    for folder_name in os.listdir(root_folder):
        folder_path = os.path.join(root_folder, folder_name)

        # Stray files in the root folder are neither processed nor reported
        if not os.path.isdir(folder_path):
            continue

        print("currently adapting:", folder_name)

        output_folder = os.path.join(output_root, folder_name)
        os.makedirs(output_folder, exist_ok=True)

        # Loop through images in subfolders and resize them
        for subdir, _, files in os.walk(folder_path):
            for filename in files:
                # Check if the file is an image
                if not filename.lower().endswith(image_extensions):
                    continue

                filepath = os.path.join(subdir, filename)
                with Image.open(filepath) as img:
                    # Convert first so unsupported modes can be written as
                    # JPEG and are resampled in RGB rather than palette space.
                    if img.mode not in jpeg_modes:
                        img = img.convert("RGB")
                    img_resized = img.resize(size)

                    # Save the resized image in the corresponding output folder
                    output_filename = os.path.join(output_folder, filename)
                    img_resized.save(output_filename, "JPEG")

# Run the resize step on the folder whose subfolders (0, 1, 2, 3) hold the images
root_folder_path = "MARCO_IMAGES"
resize_images_in_folders(root_folder=root_folder_path)

print("task finished")

currently adapting: 0
currently adapting: 1
currently adapting: 2
currently adapting: 3
task finished


### Randomly select 1000 images from each class

In [4]:
# Folder the sample is drawn from, and the folder the sample is copied to.
source_directory = 'MARCO_IMAGES_resized'
destination_directory = 'CVAT_IMAGES'

# exist_ok=True replaces the check-then-create pattern (no race between the
# check and the creation) and matches how the resize step creates its folders.
os.makedirs(destination_directory, exist_ok=True)

def select_and_copy_images(source_folder, destination_folder, num_images=1000, seed=None):
    """
    Copy a random sample of ``num_images`` images from every sub-folder.

    For each directory ``source_folder/<name>`` a directory
    ``destination_folder/<name>`` is created and ``num_images`` distinct image
    files are copied into it. A sub-folder holding fewer than ``num_images``
    images is reported and nothing is copied for it.

    Files already present in the destination are not removed, so repeated runs
    with different samples accumulate files there.

    Args:
        source_folder: folder containing one sub-folder per class.
        destination_folder: folder that receives the sampled images.
        num_images: number of images to copy per sub-folder.
        seed: optional seed for a reproducible sample. With ``None`` the
            module-level ``random`` generator is used, as before.
    """
    # Same extensions, matched case-insensitively and with the leading dot,
    # so 'X.JPG' is accepted and a name merely ending in 'jpg' is not.
    image_extensions = ('.jpeg', '.png', '.jpg', '.gif')
    rng = random if seed is None else random.Random(seed)

    os.makedirs(destination_folder, exist_ok=True)

    for subdir in sorted(os.listdir(source_folder)):
        subdir_path = os.path.join(source_folder, subdir)

        # Stray files in the source folder are neither sampled nor reported
        if not os.path.isdir(subdir_path):
            continue

        print("currently sampling:", subdir)

        destination_subdir = os.path.join(destination_folder, subdir)
        os.makedirs(destination_subdir, exist_ok=True)

        # os.listdir order is arbitrary; sorting makes a seeded sample
        # depend only on the folder contents.
        images = sorted(
            name for name in os.listdir(subdir_path)
            if name.lower().endswith(image_extensions)
        )

        # Check if there are enough images to select
        if len(images) < num_images:
            print(f"Not enough images in '{subdir}' directory.")
            continue

        # sample() draws without replacement, so the selection is unique
        for image in rng.sample(images, num_images):
            source_path = os.path.join(subdir_path, image)
            destination_path = os.path.join(destination_subdir, image)
            shutil.copyfile(source_path, destination_path)


# Draw the per-class sample using the default number of images
select_and_copy_images(
    source_folder=source_directory,
    destination_folder=destination_directory,
)
print("task finished")

currently adapting: 0
currently adapting: 1
currently adapting: 2
currently adapting: 3
task finished
