In [12]:
import pandas as pd
from torch.utils.data import Dataset
from typing import Optional, List, TypeVar
from pathlib import Path
import rasterio
from tqdm import tqdm
import numpy as np
import cv2
import warnings
from segmentation.config import Configs as CFG
from segmentation.scr.rle_coding import *

# Annotation alias for pandas DataFrames. A plain alias is used instead of a
# TypeVar: a TypeVar declares a generic type parameter (and must be named after
# the variable it is assigned to), so it gave type checkers no DataFrame info.
PandasDataFrame = pd.DataFrame

In [14]:
# Load the tile table from the CSV path configured in CFG.
# NOTE(review): the frame displayed further down has an "Unnamed: 0" column,
# which suggests the CSV was saved together with its index - consider
# index_col=0 here after confirming nothing relies on that column.
df = pd.read_csv(CFG.path_df_kidney_1_til)

In [17]:
def random_sub_df(df:PandasDataFrame, sample_limit: Optional[int] = None, empty_tile_pct :int = 10,
                  random_state: Optional[int] = None):
    """Randomly sub-sample rows so that about ``empty_tile_pct`` % of them are empty tiles.

    The requested split is computed from ``sample_limit`` first. If either class
    (empty / non-empty) has fewer rows than requested, that class is capped at
    what is available and the other class is shrunk so the target ratio is kept.
    Consequently the result can be smaller than ``sample_limit`` - down to zero
    rows when a class that must be represented has no rows at all.

    Args:
        df PandasDataFrame: dataframe with an ``is_empty`` column holding True/False.
        sample_limit int: upper bound on the number of returned rows.
            Defaults to None, meaning the number of rows in ``df``.
        empty_tile_pct int: target percentage (0-100) of empty masks. Defaults to 10.
        random_state int: seed forwarded to ``DataFrame.sample`` for reproducible
            draws. Defaults to None (pandas' default random behaviour).

    Returns:
        PandasDataFrame: sampled rows of ``df``, sorted by index.

    Raises:
        ValueError: if ``empty_tile_pct`` is not within [0, 100].
    """
    if not 0 <= empty_tile_pct <= 100:
        raise ValueError(f"empty_tile_pct must be between 0 and 100, got {empty_tile_pct}")

    if sample_limit is None:
        sample_limit = df.shape[0]
    print(sample_limit)

    # Count through boolean masks: indexing value_counts() with True/False
    # raises KeyError as soon as one of the two classes is missing from df.
    empty_mask = df['is_empty'].eq(True)
    no_empty_mask = df['is_empty'].eq(False)
    count_empty = int(empty_mask.sum())
    count_no_empty = int(no_empty_mask.sum())
    print(f"Dataset contains {count_empty} empty and {count_no_empty} non-empty tiles.")

    # Work in whole percents with floor division. The float form
    # int(limit * (1 - pct / 100)) loses a row to rounding for some inputs
    # (e.g. 100 * (1 - 90 / 100) evaluates to 9.999... and truncates to 9).
    no_empty_pct = 100 - empty_tile_pct
    num_empty_tiles_to_sample = int(sample_limit * empty_tile_pct // 100)
    num_pos_tiles_to_sample = int(sample_limit * no_empty_pct // 100)

    # Not enough empty tiles: take all of them and shrink the total accordingly.
    # (empty_tile_pct cannot be 0 here, because then nothing was requested.)
    if num_empty_tiles_to_sample > count_empty:
        num_empty_tiles_to_sample = count_empty
        sample_limit = int(count_empty * 100 // empty_tile_pct)
        num_pos_tiles_to_sample = int(sample_limit * no_empty_pct // 100)

    # Not enough non-empty tiles: same adjustment from the other side.
    # (no_empty_pct cannot be 0 here, for the same reason.)
    if num_pos_tiles_to_sample > count_no_empty:
        num_pos_tiles_to_sample = count_no_empty
        sample_limit = int(count_no_empty * 100 // no_empty_pct)
        num_empty_tiles_to_sample = int(sample_limit * empty_tile_pct // 100)

    print(f"Sample {num_empty_tiles_to_sample} empty and {num_pos_tiles_to_sample} non-empty tiles.")

    df_empty = df[empty_mask].sample(num_empty_tiles_to_sample, random_state=random_state)
    df_no_empty = df[no_empty_mask].sample(num_pos_tiles_to_sample, random_state=random_state)
    return pd.concat([df_empty, df_no_empty]).sort_index()
    
  
    
    
        

# Build a subset targeting 4 % empty tiles; sample_limit is left at its
# default, so the full frame is the starting budget.
new_df = random_sub_df(df, empty_tile_pct=4)

13674
Dataset contains 1528 empty and 12146 non-empty tiles.
Sample 506 empty and 12146 non-empty tiles.


In [16]:
# Check the achieved empty / non-empty proportions of the sampled subset.
new_df['is_empty'].value_counts(normalize=True)

is_empty
False    0.960006
True     0.039994
Name: proportion, dtype: float64

In [105]:
# Hand-sized draw used while prototyping: 1000 empty plus 6000 non-empty rows,
# stacked and put back into index order for display.
df_empty = df.loc[df['is_empty'].eq(True)].sample(n=1000)
df_no_empty = df.loc[df['is_empty'].eq(False)].sample(n=6000)
frames = [df_empty, df_no_empty]
pd.concat(frames, axis=0).sort_index()

Unnamed: 0,path_img,path_lb,is_empty,bbx,px_stats,size
2,data\kidney_1_tilling\images\0000_0_395_512_51...,data\kidney_1_tilling\labels\0000_0_395_512_51...,True,"(0, 395, 512, 512)","[18515, 36640]","(1303, 912)"
4,data\kidney_1_tilling\images\0000_0_791_512_51...,data\kidney_1_tilling\labels\0000_0_791_512_51...,True,"(0, 791, 512, 512)","[18515, 36640]","(1303, 912)"
5,data\kidney_1_tilling\images\0000_400_791_512_...,data\kidney_1_tilling\labels\0000_400_791_512_...,True,"(400, 791, 512, 512)","[18515, 36640]","(1303, 912)"
6,data\kidney_1_tilling\images\0001_0_0_512_512.png,data\kidney_1_tilling\labels\0001_0_0_512_512.png,True,"(0, 0, 512, 512)","[18320, 37358]","(1303, 912)"
8,data\kidney_1_tilling\images\0001_0_395_512_51...,data\kidney_1_tilling\labels\0001_0_395_512_51...,True,"(0, 395, 512, 512)","[18320, 37358]","(1303, 912)"
...,...,...,...,...,...,...
13668,data\kidney_1_tilling\images\2278_0_0_512_512.png,data\kidney_1_tilling\labels\2278_0_0_512_512.png,True,"(0, 0, 512, 512)","[10069, 18379]","(1303, 912)"
13669,data\kidney_1_tilling\images\2278_400_0_512_51...,data\kidney_1_tilling\labels\2278_400_0_512_51...,True,"(400, 0, 512, 512)","[10069, 18379]","(1303, 912)"
13671,data\kidney_1_tilling\images\2278_400_395_512_...,data\kidney_1_tilling\labels\2278_400_395_512_...,True,"(400, 395, 512, 512)","[10069, 18379]","(1303, 912)"
13672,data\kidney_1_tilling\images\2278_0_791_512_51...,data\kidney_1_tilling\labels\2278_0_791_512_51...,True,"(0, 791, 512, 512)","[10069, 18379]","(1303, 912)"


In [89]:
# Scratch ratio check. NOTE(review): the meaning of 800 and 7200 is not
# recorded in the notebook - presumably an empty/total candidate split; confirm or delete.
800 / 7200

0.1111111111111111

In [82]:
# Share of empty tiles if 1528 empty tiles (the empty count printed by
# random_sub_df above) are combined with 3565 non-empty ones: about 30 %.
1528/ (1528+3565)

0.30001963479285293

In [55]:
# Index labels of every row flagged as empty, as a NumPy array.
empty_tiles = df[df['is_empty'] == True].index.values

In [None]:
# NOTE(review): at notebook level this name is only assigned in a cell further
# down, so this cell fails on a fresh top-to-bottom run.
num_empty_tiles_to_sample 

In [None]:
# Draw empty-tile index labels without replacement; min() caps the request at
# the number of empty tiles available so the draw cannot over-ask.
# NOTE(review): num_empty_tiles_to_sample is assigned in a later cell.
np.random.choice(empty_tiles, min(num_empty_tiles_to_sample, len(empty_tiles)), replace=False)

In [54]:
# Inspect the index labels of the empty rows (same expression that is
# assigned to empty_tiles in another cell).
df[df['is_empty'] == True].index.values

array([    0,     1,     2, ..., 13671, 13672, 13673], dtype=int64)

In [37]:
# Same load as the cell near the top of the notebook.
# NOTE(review): duplicate cell - one load is enough for a top-to-bottom run.
df = pd.read_csv(CFG.path_df_kidney_1_til)

In [39]:
# Fraction of rows whose is_empty flag is False.
df['is_empty'].value_counts(normalize=True)[False]

0.8882550826385842

In [16]:
# Total number of rows (one per tile) in the frame.
num_tiles = df.shape[0]
num_tiles 

13674

In [18]:
# Scratch version of the split arithmetic: target 20 % empty tiles.
empty_tile_pct = 20
# int() truncates, so the two counts can add up to one less than num_tiles.
num_empty_tiles_to_sample = int(num_tiles * empty_tile_pct / 100)
num_pos_tiles_to_sample = int(num_tiles * (1 - empty_tile_pct / 100))

In [19]:
# Requested number of empty tiles for the 20 % target.
num_empty_tiles_to_sample

2734

In [20]:
# Requested number of non-empty tiles for the 20 % target.
num_pos_tiles_to_sample

10939

In [21]:
# Sanity check of a truncated split - presumably the 10 % / 90 % counts for
# 13674 tiles; the sum comes out one short of the total because of int().
1367 + 12306

13673

In [31]:
# Toy check of the capped draw: min(5, 4) == 4 distinct values out of range(10).
np.random.choice(10, min(5, 4), replace=False)

array([3, 2, 7, 6])

In [None]:
# NOTE(review): pasted fragment of a method (it reads self.*), kept as a
# reference only. It is not runnable as a cell: the continuation lines carry
# the original method indentation, empty_tiles / populated_tiles / empty are
# not defined here, and the last f-string is cut off before its closing quote.
# Unlike random_sub_df, empty_tile_pct is used here as a fraction (it is
# multiplied directly, without dividing by 100).
num_empty_tiles_to_sample = int(self.sample_limit * self.empty_tile_pct)
            num_pos_tiles_to_sample = int(self.sample_limit * (1 - self.empty_tile_pct))

            empty_idxs_to_sample = np.random.choice(len(empty_tiles), min(num_empty_tiles_to_sample, len(empty_tiles)), replace=False)
            pos_idxs_to_sample = np.random.choice(len(populated_tiles), min(num_pos_tiles_to_sample, len(populated_tiles)), replace=False)

            neg_samples = list(map(empty_tiles.__getitem__, empty_idxs_to_sample))
            pos_samples = list(map(populated_tiles.__getitem__, pos_idxs_to_sample))

            new_samples = pos_samples + neg_samples

            self.samples = new_samples
            if self.empty_tile_pct == 0.0:
                print(f'Dropped {empty} empty tiles.')
            print(f'Dataset contains {len(neg_samples)} empty and {len(pos_samples)} non-empty tile

In [8]:
# Display the tile table. NOTE(review): df.head() would be enough here and
# keeps the saved notebook output small.
df

Unnamed: 0,path_img,path_lb,is_empty,bbx,px_stats,size
0,data\kidney_1_tilling\images\0000_0_0_512_512.png,data\kidney_1_tilling\labels\0000_0_0_512_512.png,True,"(0, 0, 512, 512)","[18515, 36640]","(1303, 912)"
1,data\kidney_1_tilling\images\0000_400_0_512_51...,data\kidney_1_tilling\labels\0000_400_0_512_51...,True,"(400, 0, 512, 512)","[18515, 36640]","(1303, 912)"
2,data\kidney_1_tilling\images\0000_0_395_512_51...,data\kidney_1_tilling\labels\0000_0_395_512_51...,True,"(0, 395, 512, 512)","[18515, 36640]","(1303, 912)"
3,data\kidney_1_tilling\images\0000_400_395_512_...,data\kidney_1_tilling\labels\0000_400_395_512_...,True,"(400, 395, 512, 512)","[18515, 36640]","(1303, 912)"
4,data\kidney_1_tilling\images\0000_0_791_512_51...,data\kidney_1_tilling\labels\0000_0_791_512_51...,True,"(0, 791, 512, 512)","[18515, 36640]","(1303, 912)"
...,...,...,...,...,...,...
13669,data\kidney_1_tilling\images\2278_400_0_512_51...,data\kidney_1_tilling\labels\2278_400_0_512_51...,True,"(400, 0, 512, 512)","[10069, 18379]","(1303, 912)"
13670,data\kidney_1_tilling\images\2278_0_395_512_51...,data\kidney_1_tilling\labels\2278_0_395_512_51...,True,"(0, 395, 512, 512)","[10069, 18379]","(1303, 912)"
13671,data\kidney_1_tilling\images\2278_400_395_512_...,data\kidney_1_tilling\labels\2278_400_395_512_...,True,"(400, 395, 512, 512)","[10069, 18379]","(1303, 912)"
13672,data\kidney_1_tilling\images\2278_0_791_512_51...,data\kidney_1_tilling\labels\2278_0_791_512_51...,True,"(0, 791, 512, 512)","[10069, 18379]","(1303, 912)"
