In [2]:
# Quantities to acquire:
# - the threshold at which a mask is considered gray-empty
# - the percentage of masks that are gray-empty
# - the percentage of masks that are empty

In [1]:
import os
import cv2
import shutil
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image
import matplotlib.pyplot as plt
from skimage.metrics import structural_similarity as ssim

In [2]:
# Subject identifiers used for cross-validation: the analysis loop iterates
# over this list and treats each entry in turn as the held-out subject.
CV_SUBJECTS = [
    'FD-027',
    'FD-029',
    'FD-030',
    'FD-031',
    'FD-032',
]

In [3]:
# Output root: one sub-folder per held-out subject is created under it.
AUGMENTED_DATASET_PATH = 'augmented_dataset'

# Single place to edit when the dataset lives somewhere else on disk.
DATASET_ROOT = '/home/miguel/GI/0 - Data Exploration & Analysis/GI-Roberta/gi-roberta-dataset'

# Folder the images and masks are read from.
FULL_DATASET_FOLDER_PATH = f'{DATASET_ROOT}/full_dataset'
# Folder whose listing is used to enumerate the image file names.
FULL_DATASET_RGBA_FOLDER_PATH = f'{DATASET_ROOT}/full_dataset_rgba'

# Backward-compatible alias: the original (misspelled) name is still
# referenced by other cells, so keep it pointing at the same path.
FULL_DATRASET_RGBA_FOLDER_PATH = FULL_DATASET_RGBA_FOLDER_PATH

In [29]:
# Display the training-sample frame.
# NOTE(review): in the visible cells `training_samples` is only assigned inside
# the per-subject loop cell, so this cell depends on that cell (or other kernel
# state) having been executed first; it will fail on a fresh kernel if run in
# the order shown here.
training_samples

Unnamed: 0,file_name,model_name
0,FD-032-slice-04-image.png,FD-032
4,FD-031-slice-16-image.png,FD-031
5,FD-031-slice-27-image.png,FD-031
8,FD-031-slice-07-image.png,FD-031
11,FD-032-slice-11-image.png,FD-032
...,...,...
355,FD-030-slice-14-image.png,FD-030
356,FD-029-slice-58-image.png,FD-029
357,FD-030-slice-24-image.png,FD-030
358,FD-032-slice-16-image.png,FD-032


In [11]:
def _count_blank_masks(image_file_names):
    """Count all-zero masks among the masks paired with the given image files.

    Each image file name is mapped to its mask file name by replacing
    '-image.png' with '-mask.png'. The mask is read in grayscale from
    FULL_DATASET_FOLDER_PATH and counted as blank when its maximum pixel
    value is 0. A tqdm progress bar is shown while iterating.

    Args:
        image_file_names: Iterable of image file names (a list or a pandas
            Series both work).

    Returns:
        Tuple ``(num_blank, num_total)``.

    Raises:
        OSError: If a mask file is missing or cannot be decoded.
    """
    image_file_names = list(image_file_names)
    num_blank = 0
    for image_file_name in tqdm(image_file_names):
        mask_file_name = image_file_name.replace('-image.png', '-mask.png')
        mask_path = os.path.join(FULL_DATASET_FOLDER_PATH, mask_file_name)
        mask_image = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
        # cv2.imread signals failure by returning None instead of raising;
        # fail loudly here with the offending path rather than with an
        # AttributeError on the next line.
        if mask_image is None:
            raise OSError(f'Mask is missing or unreadable: {mask_path}')
        if mask_image.max() == 0:
            num_blank += 1
    return num_blank, len(image_file_names)


def _print_blank_mask_summary(cv_subject, split_name, num_blank, num_total):
    """Print the blank-mask count and percentage for one split of one fold.

    Args:
        cv_subject: Identifier of the held-out subject of the fold.
        split_name: Label that tells the two summaries of a fold apart.
        num_blank: Number of blank masks in the split.
        num_total: Number of masks inspected in the split.
    """
    # Guard the percentage so an empty split is reported instead of raising
    # ZeroDivisionError.
    percent = f'{num_blank / num_total * 100:.2f}%' if num_total else 'n/a'
    print(f'Number of blank masks for {cv_subject} ({split_name}): {num_blank}, {num_total} ({percent})')


# The file listing does not depend on the held-out subject, so build it once.
images = os.listdir(FULL_DATRASET_RGBA_FOLDER_PATH)
image_datas = pd.DataFrame({
    'file_name': images,
    # The subject identifier is everything before '-slice' in the file name.
    'model_name': [f.split('-slice')[0] for f in images]
})

for cv_subject in CV_SUBJECTS:
    # Prepare the output folders for the current held-out subject.
    cv_subject_path = os.path.join(AUGMENTED_DATASET_PATH, cv_subject)
    train_folder_path = os.path.join(cv_subject_path, 'train')
    val_folder_path = os.path.join(cv_subject_path, 'val')
    # makedirs creates the intermediate subject folder as well.
    os.makedirs(train_folder_path, exist_ok=True)
    os.makedirs(val_folder_path, exist_ok=True)

    training_samples = image_datas[image_datas['model_name'] != cv_subject]
    cv_val_samples = image_datas[image_datas['model_name'] == cv_subject]

    # Blank-mask statistics for the held-out (validation) subject.
    num_blank_masks, num_total_masks = _count_blank_masks(cv_val_samples['file_name'])
    _print_blank_mask_summary(cv_subject, 'validation', num_blank_masks, num_total_masks)

    # Blank-mask statistics accumulated over every other subject listed in
    # CV_SUBJECTS (one progress bar per subject).
    num_blank_masks = 0
    num_total_masks = 0
    for subject in CV_SUBJECTS:
        if subject == cv_subject:
            continue

        subject_samples = image_datas[image_datas['model_name'] == subject].sort_values(by='file_name')
        subject_blank, subject_total = _count_blank_masks(subject_samples['file_name'])
        num_blank_masks += subject_blank
        num_total_masks += subject_total
    _print_blank_mask_summary(cv_subject, 'training', num_blank_masks, num_total_masks)

100%|██████████| 72/72 [00:00<00:00, 1382.90it/s]


Number of blank masks for FD-027: 25, 72 (34.72%)


100%|██████████| 72/72 [00:00<00:00, 1445.94it/s]
100%|██████████| 72/72 [00:00<00:00, 1486.71it/s]
100%|██████████| 72/72 [00:00<00:00, 1372.66it/s]
100%|██████████| 72/72 [00:00<00:00, 1367.29it/s]


Number of blank masks for FD-027: 105, 288 (36.46%)


100%|██████████| 72/72 [00:00<00:00, 1363.93it/s]


Number of blank masks for FD-029: 27, 72 (37.50%)


100%|██████████| 72/72 [00:00<00:00, 1369.14it/s]
100%|██████████| 72/72 [00:00<00:00, 1481.18it/s]
100%|██████████| 72/72 [00:00<00:00, 1433.74it/s]
100%|██████████| 72/72 [00:00<00:00, 1451.01it/s]


Number of blank masks for FD-029: 103, 288 (35.76%)


100%|██████████| 72/72 [00:00<00:00, 1492.16it/s]


Number of blank masks for FD-030: 31, 72 (43.06%)


100%|██████████| 72/72 [00:00<00:00, 1403.46it/s]
100%|██████████| 72/72 [00:00<00:00, 1446.55it/s]
100%|██████████| 72/72 [00:00<00:00, 1430.40it/s]
100%|██████████| 72/72 [00:00<00:00, 1448.41it/s]


Number of blank masks for FD-030: 99, 288 (34.38%)


100%|██████████| 72/72 [00:00<00:00, 1437.88it/s]


Number of blank masks for FD-031: 22, 72 (30.56%)


100%|██████████| 72/72 [00:00<00:00, 1405.04it/s]
100%|██████████| 72/72 [00:00<00:00, 1458.80it/s]
100%|██████████| 72/72 [00:00<00:00, 1492.75it/s]
100%|██████████| 72/72 [00:00<00:00, 1447.84it/s]


Number of blank masks for FD-031: 108, 288 (37.50%)


100%|██████████| 72/72 [00:00<00:00, 1451.64it/s]


Number of blank masks for FD-032: 25, 72 (34.72%)


100%|██████████| 72/72 [00:00<00:00, 1399.48it/s]
100%|██████████| 72/72 [00:00<00:00, 1444.22it/s]
100%|██████████| 72/72 [00:00<00:00, 1482.17it/s]
100%|██████████| 72/72 [00:00<00:00, 1429.89it/s]

Number of blank masks for FD-032: 105, 288 (36.46%)





In [22]:
# List the file names of the current `subject_samples` frame in sorted order,
# one per line.
sorted_file_names = subject_samples['file_name'].sort_values().tolist()
for file_name in sorted_file_names:
    print(file_name)

FD-027-slice-01-image.png
FD-027-slice-02-image.png
FD-027-slice-03-image.png
FD-027-slice-04-image.png
FD-027-slice-05-image.png
FD-027-slice-06-image.png
FD-027-slice-07-image.png
FD-027-slice-08-image.png
FD-027-slice-09-image.png
FD-027-slice-10-image.png
FD-027-slice-11-image.png
FD-027-slice-12-image.png
FD-027-slice-13-image.png
FD-027-slice-14-image.png
FD-027-slice-15-image.png
FD-027-slice-16-image.png
FD-027-slice-17-image.png
FD-027-slice-18-image.png
FD-027-slice-19-image.png
FD-027-slice-20-image.png
FD-027-slice-21-image.png
FD-027-slice-22-image.png
FD-027-slice-23-image.png
FD-027-slice-24-image.png
FD-027-slice-25-image.png
FD-027-slice-26-image.png
FD-027-slice-27-image.png
FD-027-slice-28-image.png
FD-027-slice-29-image.png
FD-027-slice-30-image.png
FD-027-slice-31-image.png
FD-027-slice-32-image.png
FD-027-slice-33-image.png
FD-027-slice-34-image.png
FD-027-slice-35-image.png
FD-027-slice-36-image.png
FD-027-slice-37-image.png
FD-027-slice-38-image.png
FD-027-slice