In [1]:
import glob
import json
import os
import shutil

from collections import defaultdict

import numpy as np
import pandas as pd
import torch
import torchvision
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


# Data & Annotations Management

The following notebook is used to manage the [SUN Dataset](http://sundatabase.org/) data and annotations. 

## Requirements

In order to use this, you need:
* Positive cases bounding box annotations. These are contained in the `.txt` files in the [`01_raw/annotation_txt`](../data/01_raw/annotation_txt/) folder for each positive case.
* Positive cases polyp pathological diagnosis. The above `.txt` files do not contain information regarding the specific polyp pathological diagnoses, which instead are contained in [`02_intermediate/positive_cases_description.txt`](../data/02_intermediate/positive_cases_description.txt).
* An annotation template of your choice to save each **image**'s annotation (the `txt` files are one per positive case). We will use a custom template, somewhat similar to the COCO Annotation template. You can find it in the [`annotation_template.json` template](../data/02_intermediate/annotation_template.json).
* A list of the possible polyp pathological diagnoses present in the SUN Dataset. Each polyp class will be associated with a specific ID and a color for visualization purposes. You can find it in the [`02_intermediate/polyp_classes.json` file](../data/02_intermediate/polyp_classes.json).
* A list of the cases the negative frames in the SUN Dataset come from. You can find it in the [`02_intermediate/negative_cases_description.txt` file](../data/02_intermediate/negative_cases_description.txt).

In [2]:
# Base working directory.
# Can be overridden with the MS_THESIS_BASE_DIR environment variable so the notebook
# runs on other machines without editing this cell; the default is the original path.
BASE_DIR = os.environ.get('MS_THESIS_BASE_DIR', '/home/thuynh/ms-thesis')

# Data folders (numbered by processing stage)
DATA_DIR = os.path.join(BASE_DIR, "data")
RAW_DIR = os.path.join(DATA_DIR, "01_raw")
INTERMEDIATE_DIR = os.path.join(DATA_DIR, "02_intermediate")
PRIMARY_DIR = os.path.join(DATA_DIR, '03_primary')
MODEL_INPUT_DIR = os.path.join(DATA_DIR, '04_model_input')

# Required folders/files
ANNOTATIONS_DIR = os.path.join(RAW_DIR, "annotation_txt") # Raw case-by-case annotations folder
ANNOTATION_TEMPLATE = os.path.join(INTERMEDIATE_DIR, 'annotation_template.json') # Custom annotation template
POLYP_CLASSES = os.path.join(INTERMEDIATE_DIR, 'polyp_classes.json') # SUN dataset polyp classes
POSITIVE_CASES_DESCRIPTION = os.path.join(INTERMEDIATE_DIR, 'positive_cases_description.txt') # Polyp classes case-by-case
NEGATIVE_CASES_DESCRIPTION = os.path.join(INTERMEDIATE_DIR, 'negative_cases_description.txt') # Negative frames and cases

## Workflow

In the `annotation_txt` folder we have the bounding box annotations for each case. However, these annotations lack the information on the specific polyp pathological diagnosis. We need to complement them with the information coming from `positive_cases_description.txt` and `negative_cases_description.txt`. For each case, we only have one polyp diagnosis. 

What we will do is:
#### 1. Read each case annotation file (in `annotation_txt`) and save all cases' annotations in a `pd.DataFrame`.
#### 2. Complement each case annotation with information on the specific polyp pathological diagnosis/class (from `positive_cases_description.txt`).
#### 3. Add negative images information and class (from `negative_cases_description.txt`). 
#### 4. Save a separate annotation file for each image (following the `annotation_template.json` template structure) in `03_primary/labels` (negative images too).
#### 5. Extract and move all images (positive and negative) to `03_primary/images`.
#### 6. Split and save training and validation sets in `04_model_input/train` and `04_model_input/val`.

### 1. Read bounding box annotations

**N.B.**: cases #58 and #97 have multiple bbox coordinates for certain images. We selected only one out of the ones specified, based on the coordinates of the bounding box in the images before and after the images with multiple bbox coordinates. The original annotation files were saved as `_case58` and `_case97`.

In [3]:
columns = ['Filename', 'Min_X', 'Min_Y', 'Max_X', 'Max_Y', 'Frame_Class']
bbox_columns = ['Min_X', 'Min_Y', 'Max_X', 'Max_Y']


def _case_number(filename):
    """Return the integer formed by all the digits found in `filename` (e.g. 'case58.txt' -> 58)."""
    return int(''.join(filter(str.isdigit, filename)))


# Extract list of annotations, sorted by ascending case number
annotation_files = sorted(
    (f for f in os.listdir(ANNOTATIONS_DIR) if f.startswith('case')),
    key=_case_number,
)

# Store annotations case by case separately
positive_annotations_dict = {}

for annotation in tqdm.tqdm(annotation_files, desc='Loading annotations for each case...', total=len(annotation_files)):
    # Take the case ID from the filename itself rather than from the position in the list,
    # so that a missing annotation file cannot silently shift the IDs of all following cases.
    case_id = _case_number(annotation)

    with open(os.path.join(ANNOTATIONS_DIR, annotation), 'r') as f:
        # Use a regex for the sep argument (filename is space-separated, coordinates are comma-separated)
        # When using regex, need to specify the "python" engine
        case_annotations = pd.read_csv(f, sep=' |,', names=columns, engine='python')

    # Combine bbox coordinates into a single column (one [Min_X, Min_Y, Max_X, Max_Y] list per row)
    case_annotations['XYXY'] = case_annotations[bbox_columns].to_numpy().tolist()

    # Clean-up: the individual coordinate columns are now redundant
    case_annotations = case_annotations.drop(columns=bbox_columns)
    case_annotations['ID'] = case_id

    positive_annotations_dict[f'case{case_id}'] = case_annotations

# Concatenate everything in a single pd.DataFrame
positive_annotations = pd.concat(list(positive_annotations_dict.values()), ignore_index=True)

# Boxes are in XYXY format; we also store them as (center_x, center_y, width, height).
# NB: the column keeps its historical name 'XYWH', but the conversion target is 'cxcywh'
# (center-based), which is NOT the COCO convention (top-left x, top-left y, width, height).
# All boxes are converted in a single batched call instead of one call per row.
positive_annotations['XYWH'] = torchvision.ops.box_convert(
    torch.tensor(positive_annotations['XYXY'].tolist()), 'xyxy', 'cxcywh'
).tolist()

positive_annotations

Loading annotations for each case...: 100%|██████████| 100/100 [00:00<00:00, 204.68it/s]


Unnamed: 0,Filename,Frame_Class,XYXY,ID,XYWH
0,case_M_20181001100941_0U62372100109341_1_005_0...,0,"[408, 545, 568, 722]",1,"[488.0, 633.5, 160.0, 177.0]"
1,case_M_20181001100941_0U62372100109341_1_005_0...,0,"[388, 549, 561, 729]",1,"[474.5, 639.0, 173.0, 180.0]"
2,case_M_20181001100941_0U62372100109341_1_005_0...,0,"[390, 552, 561, 726]",1,"[475.5, 639.0, 171.0, 174.0]"
3,case_M_20181001100941_0U62372100109341_1_005_0...,0,"[390, 552, 561, 730]",1,"[475.5, 641.0, 171.0, 178.0]"
4,case_M_20181001100941_0U62372100109341_1_005_0...,0,"[389, 558, 557, 748]",1,"[473.0, 653.0, 168.0, 190.0]"
...,...,...,...,...,...
49131,case_M_20181106093315_0U62372110682814_1_007_0...,0,"[540, 562, 728, 750]",100,"[634.0, 656.0, 188.0, 188.0]"
49132,case_M_20181106093315_0U62372110682814_1_007_0...,0,"[538, 572, 760, 782]",100,"[649.0, 677.0, 222.0, 210.0]"
49133,case_M_20181106093315_0U62372110682814_1_007_0...,0,"[538, 572, 761, 780]",100,"[649.5, 676.0, 223.0, 208.0]"
49134,case_M_20181106093315_0U62372110682814_1_007_0...,0,"[537, 569, 763, 783]",100,"[650.0, 676.0, 226.0, 214.0]"


### 2. Complement annotations with specific polyp diagnoses.

The polyp pathological diagnoses were extracted from the SUN Dataset homepage (_Table 2_), and saved in `positive_cases_description.txt`.

In [4]:
# Load the per-case polyp description table (tab-separated file with a header row)
with open(POSITIVE_CASES_DESCRIPTION, 'r') as description_file:
    raw_positive_desc = pd.read_csv(description_file, sep='\t', header=0, thousands=',')

# The frame count is not needed here; also give the diagnosis column an identifier-friendly name
positive_cases_desc = raw_positive_desc.drop(columns=['Number of frames'])
positive_cases_desc = positive_cases_desc.rename(columns={'Pathological diagnosis': 'Pathological_Diagnosis'})

positive_cases_desc

Unnamed: 0,ID,Shape,Size,Location,Pathological_Diagnosis
0,1,Is,6mm,C,Low-grade adenoma
1,2,Is,18mm,R,High-grade adenoma
2,3,IIa,3mm,A,Low-grade adenoma
3,4,Is,4mm,S,Low-grade adenoma
4,5,IIa,3mm,T,Low-grade adenoma
...,...,...,...,...,...
95,96,Is,5mm,S,Hyperplastic polyp
96,97,IIa,15mm-,C,Sessile serrated lesion
97,98,IIa,4mm,T,Low-grade adenoma
98,99,Is,5mm,S,Low-grade adenoma


In [5]:
# Merge the pathological diagnoses with the rest of the information.
# Description columns already present (i.e. this cell has been run before) are dropped first,
# so re-running the cell does not create duplicated "_x"/"_y" columns.
description_columns = [col for col in positive_cases_desc.columns if col != 'ID']

positive_annotations = (
    positive_annotations
    .drop(columns=description_columns, errors='ignore')
    # validate: every frame must match at most one description row; a duplicated case ID
    # in the description file raises instead of silently duplicating frames.
    .merge(positive_cases_desc, on='ID', how='left', validate='many_to_one')
)

positive_annotations

Unnamed: 0,Filename,Frame_Class,XYXY,ID,XYWH,Shape,Size,Location,Pathological_Diagnosis
0,case_M_20181001100941_0U62372100109341_1_005_0...,0,"[408, 545, 568, 722]",1,"[488.0, 633.5, 160.0, 177.0]",Is,6mm,C,Low-grade adenoma
1,case_M_20181001100941_0U62372100109341_1_005_0...,0,"[388, 549, 561, 729]",1,"[474.5, 639.0, 173.0, 180.0]",Is,6mm,C,Low-grade adenoma
2,case_M_20181001100941_0U62372100109341_1_005_0...,0,"[390, 552, 561, 726]",1,"[475.5, 639.0, 171.0, 174.0]",Is,6mm,C,Low-grade adenoma
3,case_M_20181001100941_0U62372100109341_1_005_0...,0,"[390, 552, 561, 730]",1,"[475.5, 641.0, 171.0, 178.0]",Is,6mm,C,Low-grade adenoma
4,case_M_20181001100941_0U62372100109341_1_005_0...,0,"[389, 558, 557, 748]",1,"[473.0, 653.0, 168.0, 190.0]",Is,6mm,C,Low-grade adenoma
...,...,...,...,...,...,...,...,...,...
49131,case_M_20181106093315_0U62372110682814_1_007_0...,0,"[540, 562, 728, 750]",100,"[634.0, 656.0, 188.0, 188.0]",IIa,3mm,R,Hyperplastic polyp
49132,case_M_20181106093315_0U62372110682814_1_007_0...,0,"[538, 572, 760, 782]",100,"[649.0, 677.0, 222.0, 210.0]",IIa,3mm,R,Hyperplastic polyp
49133,case_M_20181106093315_0U62372110682814_1_007_0...,0,"[538, 572, 761, 780]",100,"[649.5, 676.0, 223.0, 208.0]",IIa,3mm,R,Hyperplastic polyp
49134,case_M_20181106093315_0U62372110682814_1_007_0...,0,"[537, 569, 763, 783]",100,"[650.0, 676.0, 226.0, 214.0]",IIa,3mm,R,Hyperplastic polyp


### 3. Add also the negative frames information.

As stated in [_Misawa et al._](https://www.giejournal.org/article/S0016-5107(20)34655-1/fulltext), a small number of negative images (1024 out of 56668) can help with the training and performances of the object detection models. Therefore, we add to our database also the negative frames and possibly experiment with different numbers of negative frames included in the training/validation sets. 

The information for the negative frames was extracted from the SUN Dataset homepage (_Table 3_), and saved in `negative_cases_description.txt`.

In [6]:
# Load the negative cases description table (tab-separated file with a header row;
# "," is treated as a thousands separator in numeric columns)
with open(NEGATIVE_CASES_DESCRIPTION, 'r') as description_file:
    negative_cases_desc = pd.read_csv(
        description_file,
        sep='\t',
        header=0,
        thousands=',',
    )

negative_cases_desc

Unnamed: 0,ID,Number of frames,Lenth of each video (seconds)
0,1,9961,332.0
1,2,10073,335.8
2,3,7152,238.4
3,4,14635,487.8
4,5,7916,263.9
5,6,17046,511.4
6,7,5636,169.1
7,8,2568,85.6
8,9,9522,317.4
9,10,7086,236.2


Negative frames (with no polyp in them) have `"Negative"` as part of their filename. We will use this information to filter them out from positive frames. Also, as seen in the above table, we only have negative frames for the first 13 cases.

In [7]:
# Retrieve negative image filenames for each case listed in the negative cases description.
# Filenames are relative to the case folder (root_dir) and are sorted: glob returns entries
# in an arbitrary, filesystem-dependent order, which would make the seeded shuffle used for
# the train/val split non-reproducible across machines.
negative_images = {
    case_id: sorted(
        image_path
        for image_path in glob.iglob('*.jpg', root_dir=os.path.join(RAW_DIR, f'case{case_id}'))
        if 'Negative' in image_path
    )
    for case_id in negative_cases_desc['ID']
}

# Build one DataFrame per case, dropping cases with no negative frames
negative_cases = {
    case_id: pd.DataFrame({'Filename': images, 'ID': case_id, 'Frame_Class': 1}) # Frame_Class = 1 means no polyp
    for case_id, images in negative_images.items()
    if images
}

# A missing or wrong raw data folder makes glob return nothing without any error: fail loudly instead
if not negative_cases:
    raise FileNotFoundError(f'No negative frames ("Negative" *.jpg files) found in the case folders under {RAW_DIR}')

# Concatenate everything in a single pd.DataFrame
negative_annotations = pd.concat(list(negative_cases.values()), ignore_index=True)

negative_annotations

Unnamed: 0,Filename,ID,Frame_Class
0,case_M_20181109094641_0U62372110931241_1_003_0...,1,1
1,case_M_20181109094641_0U62372110931241_1_003_0...,1,1
2,case_M_20181109094641_0U62372110931241_1_003_0...,1,1
3,case_M_20181109094641_0U62372110931241_1_003_0...,1,1
4,case_M_20181109094641_0U62372110931241_1_003_0...,1,1
...,...,...,...
109549,case_M_20181211095338_0U62372121159337_1_008_0...,13,1
109550,case_M_20181211095338_0U62372121159337_1_008_0...,13,1
109551,case_M_20181211095338_0U62372121159337_1_008_0...,13,1
109552,case_M_20181211095338_0U62372121159337_1_008_0...,13,1


In [8]:
# Combine positive and negative frames in a single DataFrame.
# Positive and negative rows never share the same (ID, Filename, Frame_Class) key - Frame_Class is
# 0 for the former and 1 for the latter - so an outer merge is just a union of rows. A plain
# concatenation gives the same rows with a row order (positives first, then negatives) that does not
# depend on the pandas version; the row index is later used to build the label IDs.
annotations = pd.concat([positive_annotations, negative_annotations], ignore_index=True)

# Clean-up: use empty lists instead of NaNs for frames without a bounding box
for bbox_column in ('XYXY', 'XYWH'):
    annotations[bbox_column] = annotations[bbox_column].apply(lambda box: list(box) if isinstance(box, list) else [])

# Fill the case description of negative frames with the one of the positive frames sharing the same ID.
# NOTE(review): this assumes a negative case ID refers to the same case as the positive case with that
# ID - confirm against the SUN Dataset description.
# Group-wise ffill/bfill keeps the flat row index, unlike groupby.apply whose output index depends on
# the pandas version.
fill_columns = [col for col in annotations.columns if col not in ('Filename', 'ID', 'Frame_Class', 'XYXY', 'XYWH')]

if fill_columns:
    filled = annotations.groupby('ID')[fill_columns].ffill()
    filled = filled.groupby(annotations['ID']).bfill()
    annotations[fill_columns] = filled

annotations

Unnamed: 0,Filename,Frame_Class,XYXY,ID,XYWH,Shape,Size,Location,Pathological_Diagnosis
0,case_M_20181001100941_0U62372100109341_1_005_0...,0,"[408, 545, 568, 722]",1,"[488.0, 633.5, 160.0, 177.0]",Is,6mm,C,Low-grade adenoma
1,case_M_20181001100941_0U62372100109341_1_005_0...,0,"[388, 549, 561, 729]",1,"[474.5, 639.0, 173.0, 180.0]",Is,6mm,C,Low-grade adenoma
2,case_M_20181001100941_0U62372100109341_1_005_0...,0,"[390, 552, 561, 726]",1,"[475.5, 639.0, 171.0, 174.0]",Is,6mm,C,Low-grade adenoma
3,case_M_20181001100941_0U62372100109341_1_005_0...,0,"[390, 552, 561, 730]",1,"[475.5, 641.0, 171.0, 178.0]",Is,6mm,C,Low-grade adenoma
4,case_M_20181001100941_0U62372100109341_1_005_0...,0,"[389, 558, 557, 748]",1,"[473.0, 653.0, 168.0, 190.0]",Is,6mm,C,Low-grade adenoma
...,...,...,...,...,...,...,...,...,...
158685,case_M_20181211095338_0U62372121159337_1_008_0...,1,[],13,[],Is,5mm,T,Low-grade adenoma
158686,case_M_20181211095338_0U62372121159337_1_008_0...,1,[],13,[],Is,5mm,T,Low-grade adenoma
158687,case_M_20181211095338_0U62372121159337_1_008_0...,1,[],13,[],Is,5mm,T,Low-grade adenoma
158688,case_M_20181211095338_0U62372121159337_1_008_0...,1,[],13,[],Is,5mm,T,Low-grade adenoma


The above DataFrame contains all the information regarding the SUN Dataset: bounding box annotations (including polyp pathological diagnoses) for positive frames and negative frames.

### 4. Save separate annotation files for each frame in `03_primary/labels`.

The `03_primary` folder will be our main source of data from which we will sample out our training and validation sets to `04_model_input` folder.

In [12]:
# Folder holding one annotation file per image, and the extension of those files
LABELS_DIR = os.path.join(PRIMARY_DIR, 'labels')
LABELS_EXT = '.json'

# Create the "labels" dir if needed.
# makedirs also creates missing parent folders (os.mkdir fails if "03_primary" does not exist yet)
# and exist_ok makes the cell safe to re-run.
os.makedirs(LABELS_DIR, exist_ok=True)

Let's load the custom annotation template we will be using for the SUN Dataset labels. 

In [10]:
# Read the custom per-image annotation template (JSON) into a dict
with open(ANNOTATION_TEMPLATE, 'r') as template_file:
    annotation_template = json.load(template_file)

annotation_template

{'image_id': '',
 'class': 0,
 'labels': [{'label_id': 0,
   'category_id': 0,
   'category_name': '',
   'segmentation': [],
   'bbox': []}],
 'other': []}

For each image, we will have a homonymous label `.json` file which will contain the following information:
* `image_id`: name of the image.
* `class`: either `0` or `1`, indicating a positive/polyp (`0`) or negative/non-polyp (`1`) frame.
* `labels`: if `class=0`, then `labels` contains the ground-truth annotations for that frame. Annotations will include a unique identifier of the label (`label_id`), the ID of the pathological diagnosis (`category_id`; see `polyp_classes.json`), the segmentation mask polygonal coordinates, if any (`segmentation`), and the bounding box coordinates (`bbox`).
* `other`: any other additional note, comment, information regarding the particular image.

Let's now also load the pathological diagnosis classes present in the SUN Dataset.

In [11]:
# Read the SUN Dataset polyp classes file (path stored in POLYP_CLASSES) into a dict
with open(POLYP_CLASSES, 'r') as classes_file:
    polyp_classes = json.load(classes_file)

polyp_classes

{'polyp_classes': [{'id': 1,
   'name': 'Hyperplastic polyp',
   'color': '(255,255,155,50)',
   'outline': '(255,222,0)'},
  {'id': 2,
   'name': 'Sessile serrated lesion',
   'color': '(255,102,102,50)',
   'outline': '(255,60,60)'},
  {'id': 3,
   'name': 'Low-grade adenoma',
   'color': '(51,51,255,50)',
   'outline': '(18,18,184)'},
  {'id': 4,
   'name': 'Traditional serrated adenoma',
   'color': '(0,200,10,50)',
   'outline': '(0,120,10)'},
  {'id': 5,
   'name': 'High-grade adenoma',
   'color': '(255,40,40,50)',
   'outline': '(160,0,0)'},
  {'id': 6,
   'name': 'Invasive carcinoma',
   'color': '(188,0,255,50)',
   'outline': '(140,0,190)'}]}

For each polyp class, we assign a unique ID (`id`), the corresponding pathological diagnosis (`name`), and different colors for visualization purposes (`color` and `outline`. `color` should be used for the segmentation masks "inside", as the color is semi-transparent. `outline` should be used for the contour of segmentation masks or the bounding box itself).

Now, we can populate each annotation template for each image using the information from each annotation.

**N.B.**: for the SUN Dataset we only have **one** bounding box annotation for each image. This might not be the case for images coming from other datasets or for possible future extensions of the SUN Dataset itself. As a matter of fact, there may be multiple objects detected in the same frame (be they other polyps or other objects of interest, such as image artefacts, etc.). 

Also, there might be multiple annotations associated with the same frame (bounding boxes, segmentation masks, etc.), therefore we keep our `labels` field as a **list of labels** (even though for the SUN Dataset we will only have one).

This might come in handy in the future when extending the framework to other datasets or for more refined annotations for the SUN Dataset itself.

In [13]:
# Store all annotations in a single file for tidiness
all_annotations = {'annotations': []}

# Polyp class name -> class ID lookup, built once instead of scanning the class list for every row.
# setdefault keeps the first ID if a name were listed more than once.
polyp_class_ids = {}
for polyp in polyp_classes['polyp_classes']:
    polyp_class_ids.setdefault(polyp['name'], polyp['id'])

# Populate annotation template for each image
# NB itertuples() returns a namedtuple for each row, whose fields/columns can be accessed as class attributes (via "somenamedtuple.<fieldname>")
for image_row in tqdm.tqdm(annotations.itertuples(), desc="Populating and saving each image's annotations...", total=len(annotations)):
    label_id = image_row.Index+1 # Unique, sequential label ID

    # Build a NEW dict from the template for every image. Assigning the template itself would make
    # every entry of all_annotations point to the same object, i.e. to the last image processed.
    image_annot = {
        **annotation_template,
        'image_id': image_row.Filename,
        'class': image_row.Frame_Class,
        'labels': [], # Stays empty for negative frames
        # Other useful information
        'other': [
            {
                'LabelID': label_id,
                'CaseID': image_row.ID,
                'Polyp Shape': image_row.Shape,
                'Polyp Size': image_row.Size,
                'Polyp Location': image_row.Location
            }
        ]
    }

    # Add annotations
    if image_row.Frame_Class == 0: # NB: Frame_Class=0 means POSITIVE (polyp) frame
        if image_row.Pathological_Diagnosis not in polyp_class_ids:
            raise KeyError(f'Unknown pathological diagnosis {image_row.Pathological_Diagnosis!r} for image {image_row.Filename}')

        # NOTE(review): the template's label entry uses 'segmentation' and 'bbox' keys, whereas the
        # labels written here use 'xyxy' and 'xywh' - confirm which schema downstream code expects.
        image_annot['labels'] = [
            {
                'label_id': label_id,
                'category_id': polyp_class_ids[image_row.Pathological_Diagnosis], # Polyp ID from the polyp classes file
                'category_name': image_row.Pathological_Diagnosis,
                'xyxy': image_row.XYXY,
                'xywh': image_row.XYWH
            }
        ]

    all_annotations['annotations'].append(image_annot)


################################### RUN ONLY ONCE OR IF NEEDED ###################################
# # Save annotation file as "filename.json" in "03_primary/labels"
#     with open(os.path.join(LABELS_DIR, image_row.Filename[:-4]+LABELS_EXT), 'w') as f:
#         json.dump(image_annot, f)


# # Save annotations in a single json file located in "03_primary/labels.json"
# with open(os.path.join(PRIMARY_DIR, 'labels.json'), 'w') as f:
#     json.dump(all_annotations, f)

###################################################################################################

Populating and saving each image's annotations...: 100%|██████████| 158690/158690 [00:10<00:00, 14560.93it/s]


### 5. Move all images to `03_primary/images` folder.

Finally, we copy all of the images (both positive AND negative frames) to the above-mentioned folder, from which we will sample out the training/validation sets (just like for the labels). 

In [14]:
# Folder all raw images (positive and negative frames) get copied to
IMAGES_DIR = os.path.join(PRIMARY_DIR, 'images')

# Make sure the folder exists.
# makedirs also creates missing parent folders (os.mkdir fails if "03_primary" does not exist yet)
# and exist_ok makes the cell safe to re-run.
os.makedirs(IMAGES_DIR, exist_ok=True)

In [15]:
################################### RUN ONLY ONCE OR IF NEEDED ###################################
# # Copy images to selected folder
# for image_path in tqdm.tqdm(glob.iglob(os.path.join(RAW_DIR, 'case*/*.jpg')), desc='Copying images from raw to primary folder...', total=len(annotations)):
#     # Get the image name only
#     image_name = os.path.basename(image_path)

#     # Copy image
#     shutil.copyfile(image_path, os.path.join(IMAGES_DIR, image_name))
###################################################################################################

Copying images from raw to primary folder...: 100%|██████████| 158690/158690 [00:41<00:00, 3857.72it/s]


### 6. Split and save training and validation sets in `04_model_input/train` and in `04_model_input/val`.

To save storage space, we will use `os.symlink` to create symbolic links to the files in the `03_primary` folder instead of copying them. 

First, we need to set the **RNG seed** for reproducible results. `np.random.seed()` is not the recommended way to fix the seed anymore. Instead, we use `np.random.default_rng()`. See the [documentation](https://numpy.org/doc/stable/reference/random/generator.html?highlight=default_rng#) and [this blogpost](https://towardsdatascience.com/stop-using-numpy-random-seed-581a9972805f) for more information.

In [16]:
# Single seeded generator: every shuffle below draws from it, so results are repeatable
SEED = 42
RNG = np.random.default_rng(SEED)

We will only split into training and validation sets to exploit the SUN Dataset the most. For testing purposes, we can use any of the publicly available colonoscopy datasets, which would also provide generalizability tests for our models.

We perform train/val split on a 80/20 and **per-class** basis. 

A list of public datasets for automatic polyp detection can be found [here](https://github.com/sing-group/deep-learning-colonoscopy#public-datasets).

Also, we will not include all negative frames in our dataset, but only some of them (as detailed in [_Misawa et al._](https://www.giejournal.org/article/S0016-5107(20)34655-1/fulltext)). However, we will use the number of negative frames as a hyperparameter and evaluate our models against varying number of negative samples (therefore, we will implement a method in our dataset to only take `N` negative frames).

For now, we include all of them in the training and validation sets (but we make sure to include an even split of positive and negative frames).

In [17]:
# Fraction of the frames assigned to the training set (the rest goes to validation)
SPLIT = 0.8

# NOTE(review): the split below shuffles individual frame filenames, so frames of a same case
# can end up in both the training and the validation set.

# --- Negative frames: shuffled (in-place) file list and split index ---
negative_list = negative_annotations['Filename'].to_list()
RNG.shuffle(negative_list)
NEG_SPLIT = int(SPLIT * len(negative_list))

# --- Positive frames: one shuffled file list and one split index per polyp class ---
classes_list = {}
classes_split = {}

for polyp in polyp_classes['polyp_classes']:
    class_name = polyp['name']
    class_files = positive_annotations.loc[positive_annotations['Pathological_Diagnosis']==class_name, 'Filename'].to_list()
    RNG.shuffle(class_files) # shuffle in place
    classes_list[class_name] = class_files
    classes_split[class_name] = int(SPLIT * len(class_files))

# Cut every class list at its own split index and chain the pieces into flat train/val lists
train_positive = []
val_positive = []

for class_name, class_files in classes_list.items():
    cut = classes_split[class_name]
    train_positive.extend(class_files[:cut])
    val_positive.extend(class_files[cut:])

# Negative train/val split
train_negative = negative_list[:NEG_SPLIT]
val_negative = negative_list[NEG_SPLIT:]


def _images_and_labels(images):
    """Pair a list of image filenames with the matching label filenames (4-character extension swapped for LABELS_EXT)."""
    return {
        'images': images,
        'labels': [fname[:-4]+LABELS_EXT for fname in images]
    }


# Save neatly in a dict: dataset[<split>][<frame type>][<'images' | 'labels'>]
dataset = {
    'train': {
        'positive': _images_and_labels(train_positive),
        'negative': _images_and_labels(train_negative),
    },
    'val': {
        'positive': _images_and_labels(val_positive),
        'negative': _images_and_labels(val_negative),
    },
}

# Sanity check
print('---POSITIVE FRAMES---')
print(f"Trainset: {len(dataset['train']['positive']['images'])} | Valset: {len(dataset['val']['positive']['images'])} | Total: {len(dataset['train']['positive']['images']) + len(dataset['val']['positive']['images'])}")

print('---NEGATIVE FRAMES---')
print(f"Trainset: {len(dataset['train']['negative']['images'])} | Valset: {len(dataset['val']['negative']['images'])} | Total: {len(dataset['train']['negative']['images']) + len(dataset['val']['negative']['images'])}")

---POSITIVE FRAMES---
Trainset: 39306 | Valset: 9830 | Total: 49136
---NEGATIVE FRAMES---
Trainset: 87643 | Valset: 21911 | Total: 109554


In [18]:
# Names used both as keys of the `dataset` dict and as folder names under "04_model_input"
DATASPLITS = ["train", "val"]            # dataset splits
FRAME_TYPES = ["positive", "negative"]   # polyp / non-polyp frames
XY = ["images", "labels"]                # model inputs and targets

In [19]:
# Create the "<split>/images" and "<split>/labels" folders under MODEL_INPUT_DIR.
# makedirs is used so that a missing parent folder ("04_model_input") is created as well,
# and each "Created ..." message is printed only once the folder actually exists.
for datasplit in DATASPLITS:
    INPUT_DIR = os.path.join(MODEL_INPUT_DIR, datasplit)
    # Check if "train" or "val" folders already exist
    if not os.path.isdir(INPUT_DIR):
        os.makedirs(INPUT_DIR, exist_ok=True)
        print(f'Created {datasplit} folder.')

    # Create also images and labels folders
    for xy in XY:
        XY_DIR = os.path.join(INPUT_DIR, xy)
        # Check if "images" or "labels" folders already exist
        if not os.path.isdir(XY_DIR):
            os.makedirs(XY_DIR, exist_ok=True)
            print(f'Created {datasplit}/{xy} folder.')

Created train folder.
Created train/images folder.
Created train/labels folder.
Created val folder.
Created val/images folder.
Created val/labels folder.


In [21]:
################################### RUN ONLY ONCE OR IF NEEDED ###################################
# # Create symlink of the corresponding image and label in "04_model_input/*/images" and "04_model_input/*/labels"
# for datasplit in DATASPLITS:
#     for frame_type in FRAME_TYPES:
#         for xy in XY:
#             # Pair filepaths in a dict {src: dst}
#             filepairs = {os.path.join(PRIMARY_DIR, xy, fname): os.path.join(MODEL_INPUT_DIR, datasplit, xy, fname) for fname in dataset[datasplit][frame_type][xy]}

#             for src, dst in tqdm.tqdm(filepairs.items(), desc=f'Creating symlinks for {datasplit} set for {frame_type} {xy}...', total=len(dataset[datasplit][frame_type][xy])):
#                 os.symlink(src, dst)
###################################################################################################

Creating symlinks for train set for positive images...: 100%|██████████| 39306/39306 [00:00<00:00, 71544.48it/s]
Creating symlinks for train set for positive labels...: 100%|██████████| 39306/39306 [00:00<00:00, 68438.42it/s]
Creating symlinks for train set for negative images...: 100%|██████████| 87643/87643 [00:01<00:00, 66432.20it/s]
Creating symlinks for train set for negative labels...: 100%|██████████| 87643/87643 [00:01<00:00, 65353.66it/s]
Creating symlinks for val set for positive images...: 100%|██████████| 9830/9830 [00:00<00:00, 76079.67it/s]
Creating symlinks for val set for positive labels...: 100%|██████████| 9830/9830 [00:00<00:00, 74508.20it/s]
Creating symlinks for val set for negative images...: 100%|██████████| 21911/21911 [00:00<00:00, 74269.70it/s]
Creating symlinks for val set for negative labels...: 100%|██████████| 21911/21911 [00:00<00:00, 71345.48it/s]


In [22]:
# Sanity check: within each split, the image names and the label names must match one-to-one
# once the extension is stripped (last 4 characters for images, last 5 for labels)
for datasplit in DATASPLITS:
    split_dir = os.path.join(MODEL_INPUT_DIR, datasplit)

    x = sorted(img[:-4] for img in os.listdir(os.path.join(split_dir, 'images')))
    y = sorted(lbl[:-5] for lbl in os.listdir(os.path.join(split_dir, 'labels')))

    assert x == y, f'There are some mismatches or missing files between images and labels for the {datasplit} set.'