In [8]:
import sys
sys.path.insert(1, "/home/odyssey/mmk_smoke_detection/")

%matplotlib inline
import albumentations as A
from PIL import Image, ImageDraw
import numpy as np
import glob
import os
import random
import matplotlib
from matplotlib import pyplot as plt
from IPython.display import display, clear_output, Image as IImage
import cv2

# matplotlib.use('TkAgg')

from dataset_preparator.preparator import DatasetDirectoryController

In [9]:
import os
import random
import shutil
from typing import Callable, List, Optional

import tqdm

from dataset_preparator.preparator import BINARY_LABELS
from dataset_preparator.preparator import LABELS

class TrainTestSplitter:
    """Randomly copy a labelled image directory into ``train`` and ``test`` trees.

    The source directory is accessed through a ``DatasetDirectoryController``.
    Two further controllers manage ``<dst_dir>/train`` and ``<dst_dir>/test``;
    both directories are deleted and recreated when the splitter is built.
    ``split`` then copies (never moves) source images into one of the two trees.
    """

    # Quoted forward references: the class body does not need the project type
    # to be resolvable at definition time.
    _train_con: 'DatasetDirectoryController'
    _test_con: 'DatasetDirectoryController'
    _src_con: 'DatasetDirectoryController'
    _prob_border: float
    _prob_to_move: Callable

    def __init__(
            self,
            src_dir: str,
            dst_dir: str,
            labels: List[str],
            prob_to_move: Callable,
            prob_border: float = 0.8,
            mapper: Callable = lambda x: x,
            seed: Optional[int] = None,
    ):
        """Prepare the source controller and fresh train/test destinations.

        Args:
            src_dir: Directory holding the source dataset.
            dst_dir: Parent directory; ``train`` and ``test`` are (re)created in it.
            labels: Labels handed to both destination controllers.
            prob_to_move: Called as ``prob_to_move(label_idx=...)``; the returned
                value is the probability that an image of that label is copied.
            prob_border: An image goes to ``train`` when a uniform draw from
                [0, 1) is ``<= prob_border``, otherwise to ``test``. Any value
                ``>= 1`` therefore sends every image to ``train``.
            mapper: Forwarded to the destination controllers as ``label_mapper``
                (presumably maps a source label index to a destination one -
                confirm against ``DatasetDirectoryController``).
            seed: Optional seed for a private random generator so that a split
                can be reproduced. ``None`` keeps using the global ``random``
                module state.

        Raises:
            ValueError: If ``src_dir`` is, or lies inside, one of the destination
                directories that are about to be deleted.
        """
        train_dir = os.path.join(dst_dir, 'train')
        test_dir = os.path.join(dst_dir, 'test')

        # Validate before touching the filesystem: the destination trees are
        # wiped with rmtree, which must never take the source data with it.
        for split_dir in (train_dir, test_dir):
            if self._is_inside(src_dir, split_dir):
                raise ValueError(
                    f'src_dir {src_dir!r} is inside {split_dir!r}, '
                    'which would be deleted before splitting'
                )

        self._prob_to_move = prob_to_move
        self._prob_border = prob_border
        # Fall back to the module itself so unseeded behaviour is unchanged.
        self._rng = random.Random(seed) if seed is not None else random

        self._src_con = DatasetDirectoryController(src_dir)
        self._src_con.prepare_directories()
        self._train_con = self._reset_split_dir(train_dir, labels, mapper)
        self._test_con = self._reset_split_dir(test_dir, labels, mapper)

    @staticmethod
    def _is_inside(path: str, parent: str) -> bool:
        """Return True when ``path`` equals ``parent`` or is located beneath it."""
        path = os.path.realpath(path)
        parent = os.path.realpath(parent)
        try:
            return os.path.commonpath([path, parent]) == parent
        except ValueError:
            # commonpath rejects paths on different drives: cannot be nested.
            return False

    @staticmethod
    def _reset_split_dir(
            split_dir: str,
            labels: List[str],
            mapper: Callable,
    ) -> 'DatasetDirectoryController':
        """Recreate ``split_dir`` empty and return a prepared controller for it."""
        if os.path.exists(split_dir):
            shutil.rmtree(split_dir)
        os.makedirs(split_dir)
        controller = DatasetDirectoryController(
            split_dir,
            labels=labels,
            label_mapper=mapper
        )
        controller.prepare_directories()
        return controller

    def _split_label_dir(
            self,
            label_idx: int
    ):
        """Distribute every file of one source label directory.

        For each file, the first random draw picks train or test and the second
        decides whether the file is copied at all.
        """
        src_label_dir = self._src_con.get_directory_for_label_idx(label_idx)
        # Sorted so that a fixed seed gives the same assignment on every run,
        # independent of the order in which the filesystem lists files.
        image_paths = sorted(glob.glob(os.path.join(src_label_dir, '*')))
        for image_path in tqdm.tqdm(image_paths):
            goes_to_train = self._rng.random() <= self._prob_border
            target_con = self._train_con if goes_to_train else self._test_con
            dst_label_dir = target_con.get_directory_for_label_idx(label_idx)
            if self._rng.random() <= self._prob_to_move(label_idx=label_idx):
                shutil.copy2(image_path, dst_label_dir)

    def split(self):
        """Split every source label in turn, printing both controllers' stats."""
        for label_idx in range(len(self._src_con.labels)):
            self._split_label_dir(label_idx=label_idx)
            print('train:', LABELS[label_idx], self._train_con.stats)
            print('test:', LABELS[label_idx], self._test_con.stats)

In [12]:
# Split 'line_fire_binary_dataset/' using the first three labels; an image
# lands in train when its random draw is <= 0.95.
PATH_TO_RESULT_DATASET = 'splitted_line_fire_dataset'

splitter = TrainTestSplitter(
    src_dir='line_fire_binary_dataset/',
    dst_dir=PATH_TO_RESULT_DATASET,
    labels=list(LABELS[:3]),
    prob_border=0.95,
    prob_to_move=lambda label_idx: 1,  # probability 1: every image is copied
)
splitter.split()

('background', 'emission', 'fire', 'machine')
['background', 'emission', 'fire']
['background', 'emission', 'fire']


100%|██████████| 6170/6170 [00:00<00:00, 17653.57it/s]


train: background {'background': 5881, 'emission': 0, 'fire': 0}
test: background {'background': 289, 'emission': 0, 'fire': 0}


100%|██████████| 5509/5509 [00:00<00:00, 16325.51it/s]


train: emission {'background': 5881, 'emission': 5218, 'fire': 0}
test: emission {'background': 289, 'emission': 291, 'fire': 0}


100%|██████████| 13816/13816 [00:00<00:00, 17324.16it/s]


train: fire {'background': 5881, 'emission': 5218, 'fire': 13139}
test: fire {'background': 289, 'emission': 291, 'fire': 677}


0it [00:00, ?it/s]

train: machine {'background': 5881, 'emission': 5218, 'fire': 13139}
test: machine {'background': 289, 'emission': 291, 'fire': 677}





In [6]:
# Split 'line_three_dataset_2/' over the first three labels with a 0.95
# train threshold; no label mapper is supplied.
PATH_TO_RESULT_DATASET = 'splitted_line_three_dataset'

splitter = TrainTestSplitter(
    prob_border=0.95,
    prob_to_move=lambda label_idx: 1,  # copy every image
    labels=LABELS[:3],
    src_dir='line_three_dataset_2/',
    dst_dir=PATH_TO_RESULT_DATASET,
)
splitter.split()

('background', 'emission', 'fire', 'machine')
('background', 'emission', 'fire')
('background', 'emission', 'fire')


100%|██████████| 6170/6170 [00:00<00:00, 18965.66it/s]


train: background {'background': 5851, 'emission': 0, 'fire': 0}
test: background {'background': 319, 'emission': 0, 'fire': 0}


100%|██████████| 5509/5509 [00:00<00:00, 19214.33it/s]


train: emission {'background': 5851, 'emission': 5240, 'fire': 0}
test: emission {'background': 319, 'emission': 269, 'fire': 0}


100%|██████████| 5956/5956 [00:00<00:00, 19419.09it/s]


train: fire {'background': 5851, 'emission': 5240, 'fire': 5663}
test: fire {'background': 319, 'emission': 269, 'fire': 293}


0it [00:00, ?it/s]

train: machine {'background': 5851, 'emission': 5240, 'fire': 5663}
test: machine {'background': 319, 'emission': 269, 'fire': 293}





In [4]:
# Two-label split of 'line_binary_dataset_2/' (first two entries of LABELS),
# train threshold 0.95.
PATH_TO_RESULT_DATASET = 'splitted_line_binary_dataset_2/'

splitter = TrainTestSplitter(
    src_dir='line_binary_dataset_2/',
    dst_dir=PATH_TO_RESULT_DATASET,
    labels=LABELS[:2],
    prob_border=0.95,
    prob_to_move=lambda label_idx: 1,  # copy every image
)
splitter.split()

('background', 'emission', 'fire', 'machine')
('background', 'emission')
('background', 'emission')


100%|██████████| 10717/10717 [00:00<00:00, 19520.95it/s]


train: background {'background': 10163, 'emission': 0}
test: background {'background': 554, 'emission': 0}


100%|██████████| 5509/5509 [00:00<00:00, 18920.60it/s]


train: emission {'background': 10163, 'emission': 5248}
test: emission {'background': 554, 'emission': 261}


0it [00:00, ?it/s]


train: fire {'background': 10163, 'emission': 5248}
test: fire {'background': 554, 'emission': 261}


0it [00:00, ?it/s]

train: machine {'background': 10163, 'emission': 5248}
test: machine {'background': 554, 'emission': 261}





In [33]:
# Two-label split of 'line_binary_dataset/' with a 0.95 train threshold.
PATH_TO_RESULT_DATASET = 'splitted_line_binary_dataset/'

splitter = TrainTestSplitter(
    labels=LABELS[:2],
    src_dir='line_binary_dataset/',
    dst_dir=PATH_TO_RESULT_DATASET,
    prob_to_move=lambda label_idx: 1,  # copy every image
    prob_border=0.95,
)
splitter.split()

100%|██████████| 11225/11225 [00:00<00:00, 19722.91it/s]


train: background {'background': 10661, 'emission': 0}
test: background {'background': 564, 'emission': 0}


100%|██████████| 4510/4510 [00:00<00:00, 19201.74it/s]


train: emission {'background': 10661, 'emission': 4288}
test: emission {'background': 564, 'emission': 222}


0it [00:00, ?it/s]


train: fire {'background': 10661, 'emission': 4288}
test: fire {'background': 564, 'emission': 222}


0it [00:00, ?it/s]


train: machine {'background': 10661, 'emission': 4288}
test: machine {'background': 564, 'emission': 222}


In [31]:
PATH_TO_RESULT_DATASET = 'splitted_line_dataset/'


def prob_to_move(label_idx: int) -> float:
    """Return the copy probability for a label: 0.1 for index 3, otherwise 1."""
    # A 1/3 rate for label index 2 was tried here and is currently disabled.
    return 0.1 if label_idx == 3 else 1

# NOTE(review): the `prob_to_move` function defined earlier in this cell is
# not passed below; the inline lambda (probability 1 for every label) is used.
splitter = TrainTestSplitter(
    src_dir='line_dataset/',
    dst_dir=PATH_TO_RESULT_DATASET,
    labels=LABELS[:-1],
    prob_border=0.90,
    prob_to_move=lambda label_idx: 1,
)
splitter.split()

100%|██████████| 6172/6172 [00:01<00:00, 5235.55it/s]


train: background {'background': 5529, 'emission': 0, 'fire': 0}
test: background {'background': 643, 'emission': 0, 'fire': 0}


100%|██████████| 4510/4510 [00:00<00:00, 4919.88it/s]


train: emission {'background': 5529, 'emission': 4084, 'fire': 0}
test: emission {'background': 643, 'emission': 426, 'fire': 0}


100%|██████████| 5074/5074 [00:00<00:00, 5144.69it/s]


train: fire {'background': 5529, 'emission': 4084, 'fire': 4584}
test: fire {'background': 643, 'emission': 426, 'fire': 490}


0it [00:00, ?it/s]

train: machine {'background': 5529, 'emission': 4084, 'fire': 4584}
test: machine {'background': 643, 'emission': 426, 'fire': 490}





In [29]:
PATH_TO_RESULT_DATASET = 'splitted_line_small_aug_dataset/'


def prob_to_move(label_idx: int) -> float:
    """Copy probability per label index: 1 for everything except index 3 (0.1)."""
    if label_idx != 3:
        return 1
    return 0.1

# NOTE(review): `prob_to_move` from this cell is defined but not used here;
# the lambda below gives every label a copy probability of 1.
splitter = TrainTestSplitter(
    labels=LABELS[:-1],
    src_dir='line_small_aug_dataset/',
    dst_dir=PATH_TO_RESULT_DATASET,
    prob_to_move=lambda label_idx: 1,
    prob_border=0.90,
)
splitter.split()

100%|██████████| 24688/24688 [00:01<00:00, 19027.70it/s]


train: background {'background': 22303, 'emission': 0, 'fire': 0}
test: background {'background': 2385, 'emission': 0, 'fire': 0}


100%|██████████| 27060/27060 [00:01<00:00, 18917.61it/s]


train: emission {'background': 22303, 'emission': 24394, 'fire': 0}
test: emission {'background': 2385, 'emission': 2666, 'fire': 0}


100%|██████████| 25370/25370 [00:01<00:00, 18463.24it/s]


train: fire {'background': 22303, 'emission': 24394, 'fire': 22753}
test: fire {'background': 2385, 'emission': 2666, 'fire': 2617}


0it [00:00, ?it/s]


train: machine {'background': 22303, 'emission': 24394, 'fire': 22753}
test: machine {'background': 2385, 'emission': 2666, 'fire': 2617}


In [None]:
PATH_TO_RESULT_DATASET = 'merged_ready_dataset_copy'


def prob_to_move(label_idx: int) -> float:
    """Copy probability per label index: 0.1 for 3, 1/3 for 2, otherwise 1."""
    return 0.1 if label_idx == 3 else (1/3 if label_idx == 2 else 1)

# Source is PATH_TO_RESULT_DATASET itself; output goes to a 'three_'-prefixed
# sibling. Label indices are passed through `% 3` by the mapper.
splitter = TrainTestSplitter(
    src_dir=PATH_TO_RESULT_DATASET,
    dst_dir=f'three_{PATH_TO_RESULT_DATASET}',
    labels=LABELS[:-1],
    prob_border=0.95,
    prob_to_move=prob_to_move,
    mapper=lambda label_idx: label_idx % 3,
)
splitter.split()

In [None]:
PATH_TO_RESULT_DATASET = 'small_aug_dataset_2'


def prob_to_move(label_idx: int) -> float:
    """Copy probability per label index: 1/3 for index 0, otherwise 1."""
    return 1/3 if label_idx == 0 else 1


def mapper(label_idx: int) -> int:
    """Collapse label indices to two classes: index 1 stays 1, the rest become 0."""
    return 1 if not label_idx != 1 else 0

# Binary split: two destination labels, with `mapper` and `prob_to_move`
# taken from the definitions earlier in this cell.
splitter = TrainTestSplitter(
    src_dir=PATH_TO_RESULT_DATASET,
    dst_dir=f'binary_splitted_{PATH_TO_RESULT_DATASET}_2',
    labels=LABELS[:2],
    prob_border=0.95,
    prob_to_move=prob_to_move,
    mapper=mapper,
)
splitter.split()

In [None]:
PATH_TO_RESULT_DATASET = 'small_aug_dataset_2'


def prob_to_move(label_idx: int) -> float:
    """Copy probability per label index: 0.1 for index 3, otherwise 1."""
    is_subsampled = label_idx == 3
    return 0.1 if is_subsampled else 1

# Output goes to a 'merged_'-prefixed directory; label indices are passed
# through `% 3` by the mapper.
splitter = TrainTestSplitter(
    src_dir=PATH_TO_RESULT_DATASET,
    dst_dir=f'merged_{PATH_TO_RESULT_DATASET}',
    labels=LABELS[:-1],
    prob_border=0.95,
    prob_to_move=prob_to_move,
    mapper=lambda label_idx: label_idx % 3,
)
splitter.split()

In [None]:
PATH_TO_RESULT_DATASET = 'small_aug_dataset'


def prob_to_move(label_idx: int) -> float:
    """Copy probability per label index: 1/2 for indices 0 and 3, otherwise 1."""
    return 1/2 if label_idx in (0, 3) else 1

# No prob_border is given, so the constructor default applies.
splitter = TrainTestSplitter(
    src_dir=PATH_TO_RESULT_DATASET,
    dst_dir=f'three_{PATH_TO_RESULT_DATASET}',
    labels=LABELS[:-1],
    prob_to_move=prob_to_move,
    mapper=lambda label_idx: label_idx % 3,
)
splitter.split()

In [8]:
PATH_TO_RESULT_DATASET = "/home/odyssey/mmk_smoke_detection/augmentation/line_val/"


def prob_to_move(label_idx: int) -> float:
    """Return a copy probability of 1 for every label index."""
    keep_all = 1
    return keep_all


# prob_border=1.1 exceeds any draw from random.random(), so every image is
# routed to the train side and test stays empty.
splitter = TrainTestSplitter(
    src_dir=PATH_TO_RESULT_DATASET,
    dst_dir='line_val',
    labels=LABELS[:-1],
    prob_border=1.1,
    prob_to_move=prob_to_move,
    mapper=lambda label_idx: label_idx % 3,
)
splitter.split()

100%|██████████| 160/160 [00:00<00:00, 16616.04it/s]


train: background {'background': 160, 'emission': 0, 'fire': 0}
test: background {'background': 0, 'emission': 0, 'fire': 0}


100%|██████████| 178/178 [00:00<00:00, 18094.67it/s]


train: emission {'background': 160, 'emission': 178, 'fire': 0}
test: emission {'background': 0, 'emission': 0, 'fire': 0}


100%|██████████| 290/290 [00:00<00:00, 18847.59it/s]


train: fire {'background': 160, 'emission': 178, 'fire': 290}
test: fire {'background': 0, 'emission': 0, 'fire': 0}


100%|██████████| 27/27 [00:00<00:00, 11244.78it/s]

train: machine {'background': 187, 'emission': 178, 'fire': 290}
test: machine {'background': 0, 'emission': 0, 'fire': 0}





In [9]:
# Recursively delete the four label directories directly under line_val/.
! rm -r line_val/emission/ line_val/fire/ line_val/background/ line_val/machine/ 

In [11]:
# Move everything inside line_val/train/ one level up into line_val/.
! mv line_val/train/* line_val/

In [15]:
# Remove the line_val/train/ directory itself.
! rm -r line_val/train/

In [19]:
# List the files now present in line_val/fire/.
!ls line_val/fire/

00e2c5b0-cd71-4ca8-b86d-227826ff866e_0_0.jpg
0155e9ae-be15-4739-bbbb-fafb4da83439_0_0.jpg
01911661-adca-4bba-8f72-f693e3100c9d_0_0.jpg
01ab4e83-0e3c-40d9-9c45-ea36d62611ce_0_0.jpg
04cf2d9c-4f64-4f79-bd5c-f93453c4d9bd_0_0.jpg
05560b9c-489f-42fd-aff3-8053058eb136_0_0.jpg
061fe915-9002-4ce0-a3a5-b395873d55b4_0_0.jpg
08c3f8f3-e695-44f5-a652-03216bd4d7a7_0_0.jpg
0c1a057f-b70d-427c-9b8b-a92356c57b07_0_0.jpg
0c8ae284-063c-454c-98ae-862c3a1eb970_0_0.jpg
0cb39f5a-a69e-4a89-a1f6-1fdea936f490_0_0.jpg
0d07bba1-13b6-4202-9c0c-a3c42e41ef50_0_0.jpg
0d4bddf8-f862-4ab6-be6d-589d76d2fc3b_0_0.jpg
0dbfed79-0901-4195-9001-429a48838538_0_0.jpg
0eb8ac6a-884f-49a5-86ea-27f74207f515_0_0.jpg
0ebbbd13-758a-4a5f-aa51-3dddd3495d68_0_0.jpg
0f868684-5934-4ce7-851b-4cfc2cb35869_0_0.jpg
101b51b1-94c9-451f-9f7f-15cfd97cac03_0_0.jpg
10c7b88d-30ef-4db7-a53a-67867c7f83b2_0_0.jpg
10f5b3f7-2044-4436-b53d-9c63f3316977_0_0.jpg
1150dafa-4114-4381-bcb4-76e3aed06954_0_0.jpg
12b5609d-0748-4906-a998-0b23a6a55b

In [None]:
PATH_TO_RESULT_DATASET = "/home/odyssey/mmk_smoke_detection/validation/expanded_val/"


def prob_to_move(label_idx: int) -> float:
    """Copy probability is 1 regardless of the label index."""
    always = 1
    return always


# prob_border=1.1 is above any value random.random() can return, so all
# images end up on the train side of 'three_validation'.
splitter = TrainTestSplitter(
    src_dir=PATH_TO_RESULT_DATASET,
    dst_dir='three_validation',
    labels=LABELS[:-1],
    prob_border=1.1,
    prob_to_move=prob_to_move,
    mapper=lambda label_idx: label_idx % 3,
)
splitter.split()

In [None]:
PATH_TO_RESULT_DATASET = "/home/odyssey/mmk_smoke_detection/validation/expanded_val/"


def prob_to_move(label_idx: int) -> float:
    """Copy probability is 1 for every label index."""
    keep_all = 1
    return keep_all


def mapper(label_idx: int) -> int:
    """Map label index 1 to 1 and every other index to 0."""
    return 0 if label_idx != 1 else 1

# Binary variant: LABELS minus its last two entries, every image sent to
# the train side because prob_border is above 1.
splitter = TrainTestSplitter(
    src_dir=PATH_TO_RESULT_DATASET,
    dst_dir='binary_validation',
    labels=LABELS[:-2],
    prob_border=1.1,
    prob_to_move=prob_to_move,
    mapper=mapper,
)
splitter.split()

In [None]:
PATH_TO_RESULT_DATASET = 'small_aug_dataset'

# `labels` and `prob_to_move` are required arguments of TrainTestSplitter;
# leaving them out raises TypeError. Keep every label and copy every image.
# NOTE(review): assumes the full LABELS set is wanted here - confirm.
splitter = TrainTestSplitter(
    src_dir=PATH_TO_RESULT_DATASET,
    dst_dir=f'splitted_{PATH_TO_RESULT_DATASET}',
    labels=list(LABELS),
    prob_to_move=lambda label_idx: 1,
)
splitter.split()

In [None]:
PATH_TO_RESULT_DATASET = 'small_aug_dataset'

# TrainTestSplitter has no `is_binary` parameter (passing it raises TypeError),
# and `labels` / `prob_to_move` are required. The binary behaviour is spelled
# out explicitly instead: label index 1 is kept as class 1 and always copied,
# every other index is folded into class 0 and subsampled.
# NOTE(review): the 1 / (len(LABELS) - 1) rate is a reconstruction of the
# intended binary balancing - confirm before relying on it.
splitter = TrainTestSplitter(
    src_dir=PATH_TO_RESULT_DATASET,
    dst_dir=f'binary_splitted_{PATH_TO_RESULT_DATASET}',
    labels=LABELS[:2],
    mapper=lambda label_idx: 1 if label_idx == 1 else 0,
    prob_to_move=lambda label_idx: 1.0 if label_idx == 1 else 1 / (len(LABELS) - 1),
)
splitter.split()

In [None]:
PATH_TO_RESULT_MEDIUM_DATASET = 'medium_aug_dataset'

# `labels` and `prob_to_move` are required arguments of TrainTestSplitter;
# leaving them out raises TypeError. Keep every label and copy every image.
# NOTE(review): assumes the full LABELS set is wanted here - confirm.
splitter = TrainTestSplitter(
    src_dir=PATH_TO_RESULT_MEDIUM_DATASET,
    dst_dir=f'splitted_{PATH_TO_RESULT_MEDIUM_DATASET}',
    labels=list(LABELS),
    prob_to_move=lambda label_idx: 1,
)
splitter.split()

In [None]:
PATH_TO_RESULT_MEDIUM_DATASET = 'medium_aug_dataset'

# TrainTestSplitter has no `is_binary` parameter (passing it raises TypeError),
# and `labels` / `prob_to_move` are required. The binary behaviour is spelled
# out explicitly instead: label index 1 is kept as class 1 and always copied,
# every other index is folded into class 0 and subsampled.
# NOTE(review): the 1 / (len(LABELS) - 1) rate is a reconstruction of the
# intended binary balancing - confirm before relying on it.
splitter = TrainTestSplitter(
    src_dir=PATH_TO_RESULT_MEDIUM_DATASET,
    dst_dir=f'binary_splitted_{PATH_TO_RESULT_MEDIUM_DATASET}',
    labels=LABELS[:2],
    mapper=lambda label_idx: 1 if label_idx == 1 else 0,
    prob_to_move=lambda label_idx: 1.0 if label_idx == 1 else 1 / (len(LABELS) - 1),
)
splitter.split()