In [95]:
### IMPORTS

import os
import glob
import random
import shutil
import sys
from concurrent.futures.process import ProcessPoolExecutor
from pathlib import Path
import cv2
import numpy as np
import yaml
from matplotlib import pyplot as plt
import matplotlib.patches as patches
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

In [96]:
### SCRIPT VARIABLES
# get settings

with open('settings.yaml', 'r') as f:
    sets = yaml.safe_load(f)

## IMAGE DATASET PATH
dataset_path = sets['path_main_db'] #'/db/input'

## AMOUNT OF DESIRED IMAGES
im_amount = sets['dataset_size'] # there is 749 images in the database


## DATABASE FOLDER NAME
dataset_destination_folder_name = sets['path_prepreprocessed_db']
desired_size = sets['img_size']

In [97]:
# augument dataset - create more images from single image



def draw_centered_rects(img, centers, crop_w, crop_h):
    fig, ax = plt.subplots()
    ax.imshow(img, cmap='gray')

    for (cy, cx) in centers:
        x = cx - crop_w // 2
        y = cy - crop_h // 2
        rect = patches.Rectangle((y, x), crop_w, crop_h,
                                 linewidth=1.5, edgecolor='red', facecolor='none')
        #ax.plot(cy, cx, 'g*')
        ax.add_patch(rect)

def augument_image(image_path, desired_size, num_img_multiplicator, remove_old=False, debug=False):
    image = cv2.imread(image_path)
    image_name = os.path.split(image_path)[-1]
    image_name = image_name.split('.')[0]


    y_im, x_im = image.shape[:-1]
    image_size = random.randint(desired_size[0], desired_size[1])
    x_shape, y_shape = image_size, image_size
    filter = 1/(image_size**2) * np.ones([image_size, image_size])

    how_many_fit = int((x_im // x_shape) * (y_im // y_shape) * num_img_multiplicator)
    number_images = 1 if how_many_fit == 0 else how_many_fit



    laplaced = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    laplaced = cv2.Laplacian(laplaced, cv2.CV_8U) / 255
    laplaced = cv2.filter2D(laplaced, -1, filter)
    laplaced = (laplaced - laplaced.min()) / (laplaced.max() - laplaced.min())

    mask = np.zeros_like(laplaced, dtype=np.float32)
    mask[y_shape//2:y_im-(y_shape//2), x_shape//2:x_im-(x_shape//2)] = 1
    laplaced = laplaced * mask

    out_indexes = []
    out_images = []
    for i in range(number_images):
        raveled_laplacian = laplaced.ravel()
        norm_raveled_laplacian = raveled_laplacian / np.sum(raveled_laplacian)
        if np.isnan(norm_raveled_laplacian).any():
            print('Warning: NaN found in file {}'.format(image_path))
            continue
        point = np.random.choice((x_im * y_im), 1, p=norm_raveled_laplacian)
        y, x = np.unravel_index(point, (y_im, x_im))
        y_cnt, x_cnt = y[0], x[0]


        min_y = y_cnt - y_shape
        max_y = y_cnt + y_shape
        min_x = x_cnt - x_shape
        max_x = x_cnt + x_shape

        min_y = 0 if min_y < 0 else min_y
        min_x = 0 if min_x < 0 else min_x

        max_y = y_im if max_y > y_im else max_y
        max_x = x_im if max_x > x_im else max_x

        laplaced[min_y:max_y, min_x:max_x] = 0

        new_image = image[y_cnt - (y_shape // 2): y_cnt + (y_shape // 2), x_cnt - (x_shape // 2): x_cnt + (x_shape // 2)]
        out_images.append(new_image)
        out_indexes.append((x_cnt, y_cnt))

        save_name = '{}_{}.bmp'.format(image_name, i)
        save_name = os.path.join(dataset_destination_folder_name, save_name)


        success = cv2.imwrite(save_name, new_image)

    if remove_old:
        os.remove(image_path)
    if debug:
        draw_centered_rects(laplaced, out_indexes, y_shape, x_shape)
        plt.plot(out_indexes, 'g*')

        for img in out_images:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            plt.figure()
            plt.imshow(img)
            plt.show()


In [98]:


def augment_image_wrapper(args):
    image_path, desired_size, num_img_multiplicator = args
    augument_image(image_path, desired_size, num_img_multiplicator, remove_old=True, debug=False)

image_multiplicator = .25
images_paths = glob.glob(os.path.join(dataset_destination_folder_name, '*.*'))
image_path = images_paths[1]
desired_size = [256, 512]


args_list = [(p, desired_size, image_multiplicator) for p in images_paths]
#for i, image_path in enumerate(tqdm(images_paths), decs="Processing", unit="img/s"):
    #pass
    #augument_image(image_path, desired_size, num_img_multiplicator=image_multiplicator, remove_old=True, debug=False)
with Pool(processes=cpu_count()) as pool:
    list(tqdm(pool.imap_unordered(augment_image_wrapper, args_list),
              total=len(args_list),
              desc="Augmenting",
              unit="img"))


  norm_raveled_laplacian = raveled_laplacian / np.sum(raveled_laplacian)




  norm_raveled_laplacian = raveled_laplacian / np.sum(raveled_laplacian)




  norm_raveled_laplacian = raveled_laplacian / np.sum(raveled_laplacian)




Augmenting:   7%|▋         | 124/1767 [00:05<01:26, 18.96img/s]



  norm_raveled_laplacian = raveled_laplacian / np.sum(raveled_laplacian)




  norm_raveled_laplacian = raveled_laplacian / np.sum(raveled_laplacian)




  norm_raveled_laplacian = raveled_laplacian / np.sum(raveled_laplacian)




Augmenting:  22%|██▏       | 387/1767 [00:46<07:22,  3.12img/s]



Augmenting:  23%|██▎       | 404/1767 [00:50<04:44,  4.80img/s]



Augmenting:  23%|██▎       | 405/1767 [00:50<05:06,  4.45img/s]




  norm_raveled_laplacian = raveled_laplacian / np.sum(raveled_laplacian)




  norm_raveled_laplacian = raveled_laplacian / np.sum(raveled_laplacian)
Augmenting:  28%|██▊       | 491/1767 [01:15<07:26,  2.86img/s]



Augmenting:  28%|██▊       | 499/1767 [01:17<06:21,  3.32img/s]



  norm_raveled_laplacian = raveled_laplacian / np.sum(raveled_laplacian)




Augmenting:  40%|████      | 709/1767 [02:11<03:31,  5.01img/s]



Augmenting:  40%|████      | 711/1767 [02:12<03:56,  4.46img/s]



Augmenting:  41%|████      | 722/1767 [02:14<03:50,  4.54img/s]



  norm_raveled_laplacian = raveled_laplacian / np.sum(raveled_laplacian)




Augmenting:  43%|████▎     | 760/1767 [02:19<01:59,  8.41img/s]




Augmenting:  43%|████▎     | 768/1767 [02:20<01:49,  9.14img/s]



Augmenting:  44%|████▎     | 772/1767 [02:21<02:54,  5.70img/s]



Augmenting:  45%|████▍     | 788/1767 [02:24<02:36,  6.27img/s]



Augmenting:  50%|█████     | 884/1767 [02:37<01:47,  8.21img/s]



  norm_raveled_laplacian = raveled_laplacian / np.sum(raveled_laplacian)




Augmenting:  52%|█████▏    | 915/1767 [02:42<02:10,  6.54img/s]




  norm_raveled_laplacian = raveled_laplacian / np.sum(raveled_laplacian)




Augmenting:  52%|█████▏    | 919/1767 [02:42<01:40,  8.46img/s]



Augmenting:  52%|█████▏    | 921/1767 [02:43<01:58,  7.12img/s]




Augmenting:  52%|█████▏    | 923/1767 [02:43<02:39,  5.29img/s]



Augmenting:  52%|█████▏    | 924/1767 [02:44<03:12,  4.38img/s]




  norm_raveled_laplacian = raveled_laplacian / np.sum(raveled_laplacian)




Augmenting:  56%|█████▋    | 994/1767 [02:54<01:10, 10.91img/s]



Augmenting:  58%|█████▊    | 1019/1767 [03:01<06:23,  1.95img/s]



Augmenting:  59%|█████▊    | 1037/1767 [03:04<02:05,  5.83img/s]



Augmenting:  60%|█████▉    | 1053/1767 [03:06<01:12,  9.81img/s]



Augmenting:  60%|█████▉    | 1055/1767 [03:06<01:11,  9.90img/s]



Augmenting:  62%|██████▏   | 1088/1767 [03:09<00:49, 13.86img/s]



Augmenting:  63%|██████▎   | 1112/1767 [03:11<01:04, 10.16img/s]



Augmenting:  64%|██████▍   | 1138/1767 [03:14<00:41, 15.33img/s]



  norm_raveled_laplacian = raveled_laplacian / np.sum(raveled_laplacian)




Augmenting:  66%|██████▋   | 1172/1767 [03:18<01:12,  8.17img/s]



Augmenting:  67%|██████▋   | 1191/1767 [03:23<03:06,  3.09img/s]



Augmenting:  69%|██████▉   | 1217/1767 [03:32<05:00,  1.83img/s]



Augmenting:  73%|███████▎  | 1292/1767 [03:53<01:18,  6.03img/s]



Augmenting:  75%|███████▍  | 1319/1767 [03:59<01:11,  6.29img/s]



Augmenting:  75%|███████▌  | 1334/1767 [04:00<00:43, 10.00img/s]



Augmenting:  76%|███████▋  | 1350/1767 [04:02<01:02,  6.71img/s]



Augmenting:  77%|███████▋  | 1368/1767 [04:07<01:17,  5.12img/s]



Augmenting:  82%|████████▏ | 1443/1767 [04:24<00:57,  5.66img/s]



Augmenting:  85%|████████▍ | 1498/1767 [04:36<00:30,  8.78img/s]



Augmenting:  88%|████████▊ | 1548/1767 [04:44<00:27,  7.85img/s]



Augmenting:  89%|████████▊ | 1565/1767 [04:45<00:19, 10.31img/s]



Augmenting:  90%|█████████ | 1594/1767 [04:49<00:26,  6.63img/s]



Augmenting:  94%|█████████▍| 1668/1767 [05:01<00:09, 10.53img/s]



Augmenting:  96%|█████████▌| 1690/1767 [05:04<00:13,  5.58img/s]



Augmenting:  97%|█████████▋| 1706/1767 [05:06<00:06, 10.01img/s]



Augmenting:  98%|█████████▊| 1738/1767 [05:10<00:04,  6.28img/s]



Augmenting:  99%|█████████▊| 1744/1767 [05:11<00:02,  7.70img/s]



Augmenting:  99%|█████████▉| 1747/1767 [05:11<00:01, 10.18img/s]



Augmenting: 100%|██████████| 1767/1767 [07:17<00:00,  4.04img/s]


In [99]:
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
threshold_entropy = .2
threshold_laplacian = .2
images_paths = glob.glob(os.path.join(dataset_destination_folder_name, '*.*'))

In [100]:

def calculate_entropy(image):
    # Convert image to grayscale if it's not already
    if len(image.shape) == 3:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Compute histogram (256 bins for 8-bit images)
    hist = cv2.calcHist([image], [0], None, [256], [0,256])
    hist_norm = hist.ravel() / hist.sum()  # normalize to get probabilities

    # Remove zero entries to avoid log(0)
    hist_norm = hist_norm[hist_norm > 0]

    # Compute entropy
    entropy = -np.sum(hist_norm * np.log2(hist_norm))
    return entropy

def calculate_variances(path):
    img = cv2.imread(path)
    laplacian_var = cv2.Laplacian(img, -1).var()
    ent = calculate_entropy(img)
    return path, ent, laplacian_var

last_check = {
    'path': [],
    'entropy': [],
    'laplacian_var': []
}

with ProcessPoolExecutor() as exe:
    for path, entr, laplacian_var in tqdm(
            exe.map(calculate_variances, images_paths),
            total=len(images_paths),
            desc='Computing variances'):
        last_check['path'].append(path)
        last_check['entropy'].append(entr)
        last_check['laplacian_var'].append(laplacian_var)



Computing variances: 100%|██████████| 7276/7276 [00:02<00:00, 2464.37it/s]


In [101]:
df = pd.DataFrame(last_check)
len_df = len(df)


entropy_cutoff = df['entropy'].quantile(threshold_entropy)
laplacian_cutoff = df['laplacian_var'].quantile(threshold_laplacian)

print(entropy_cutoff)
print(laplacian_cutoff)

drop_df = df[
    (df['entropy'] < entropy_cutoff) |
    (df['laplacian_var'] < laplacian_cutoff)
]


drop_idx = set(drop_df.index)
filtered_df = df.drop(drop_idx)

remove_fcn = lambda x: os.remove(x)
drop_df['path'].apply(remove_fcn)

threshold_entropy = 0
threshold_laplacian = 0
#filtered_df.sort_values(by='entropy', ascending=True, inplace=True)
#filtered_df.reset_index(drop=True, inplace=True)


6.1703386306762695
31.151807966947175


In [102]:
drop_df
os.path.exists('/home/maciejka/Desktop/school/S8/labwork-project/db/dataset/427_6.bmp')

print(len(filtered_df))

5244
