## Filter image patches to handle class imbalance

#### Remove a portion of the image patches that contain only 0s and no 1s (no signal)

In [1]:
from dotenv import load_dotenv
from PIL import Image

import numpy as np
import matplotlib.pyplot as plt
import math
import os
import glob
import random

%matplotlib inline

In [2]:
# Load environment variables (python-dotenv; by default it reads a .env file)
# so IMAGE_DIR and MASK_DIR can be looked up below.
load_dotenv()

# Suffixes appended to the base directory names: inputs are read from the
# '_sliced' directories and results are written to the '_filtered' ones.
in_postfix = '_sliced'
out_postfix = '_filtered'

# Base image / mask directories come from the environment.
im_dir = os.environ.get('IMAGE_DIR')
mask_dir = os.environ.get('MASK_DIR')

# Input (sliced) directories.
in_im_dir = im_dir + in_postfix
in_mask_dir = mask_dir + in_postfix

# Output (filtered) directories.
out_im_dir = im_dir + out_postfix
out_mask_dir = mask_dir + out_postfix

# NOTE(review): not referenced in the visible cells; presumably the patch
# side length in pixels -- confirm.
n = 128

In [3]:
def keep_sample(mask, keep_proba):
    """Randomly decide whether a training example should be kept.

    Draws one value from ``random.uniform(0, 1)`` and keeps the sample
    unless that draw is strictly greater than ``keep_proba``.

    Args:
        mask: The sample's mask. It is accepted for the caller's convenience
            but is not inspected here.
        keep_proba: Probability threshold for keeping the sample.

    Returns:
        True if the sample should be kept, False otherwise.
    """
    draw = random.uniform(0, 1)
    return not (draw > keep_proba)

In [None]:
# Filter the sliced patches: every patch whose mask contains at least one
# non-zero value is copied to the output directories, while patches whose mask
# is entirely zero are kept only with a small probability. This reduces the
# imbalance between "no signal" and "signal" samples.
im_filenames = glob.glob(in_im_dir+'/*.npy', recursive=True)
mask_filenames = glob.glob(in_mask_dir+'/*.npy', recursive=True)

num_samples = len(im_filenames)

# Create output directories; exist_ok makes re-running the cell safe and
# avoids a separate existence check.
os.makedirs(out_im_dir, exist_ok=True)
os.makedirs(out_mask_dir, exist_ok=True)

# Probability of keeping a patch whose mask is all zeros.
empty_keep_proba = 0.03

samples_with_zeros = 0
samples_with_ones = 0

for i, im_filename in enumerate(im_filenames):
    im = np.load(im_filename)
    # basename handles whichever path separator glob produced on this
    # platform; splitting on '/' only works for POSIX-style paths.
    filename = os.path.basename(im_filename)
    # The mask is looked up under the same file name as the image.
    mask = np.load(os.path.join(in_mask_dir, filename))
    if i % 1000 == 0:
        print('processing sample',i,'/',num_samples)
        print('0:',samples_with_zeros,'1:',samples_with_ones)
    if np.count_nonzero(mask) == 0:  # sample contains all 0s
        # Randomly subsample the all-zero patches.
        keep = keep_sample(mask, empty_keep_proba)
        if not keep:
            continue
        samples_with_zeros += 1
    else:
        samples_with_ones += 1
    # Kept samples are written under the same file name in both outputs.
    np.save(os.path.join(out_im_dir, filename), im)
    np.save(os.path.join(out_mask_dir, filename), mask)

processing sample 0 / 234624
0: 0 1: 0
processing sample 1000 / 234624
0: 26 1: 25
processing sample 2000 / 234624
0: 51 1: 50
processing sample 3000 / 234624
0: 72 1: 70
processing sample 4000 / 234624
0: 95 1: 95
processing sample 5000 / 234624
0: 118 1: 125
processing sample 6000 / 234624
0: 144 1: 158
processing sample 7000 / 234624
0: 173 1: 187
processing sample 8000 / 234624
0: 190 1: 217
processing sample 9000 / 234624
0: 215 1: 248
processing sample 10000 / 234624
0: 243 1: 260
processing sample 11000 / 234624
0: 276 1: 296
processing sample 12000 / 234624
0: 307 1: 331
processing sample 13000 / 234624
0: 339 1: 352
processing sample 14000 / 234624
0: 365 1: 371
processing sample 15000 / 234624
0: 400 1: 393
processing sample 16000 / 234624
0: 429 1: 424
processing sample 17000 / 234624
0: 446 1: 447
processing sample 18000 / 234624
0: 473 1: 483
processing sample 19000 / 234624
0: 504 1: 504
processing sample 20000 / 234624
0: 529 1: 532
processing sample 21000 / 234624
0: 56