# Experiments with preprocessing the data

In [1]:
import numpy
import pandas
from matplotlib import pyplot

import hubmap

In [2]:
# Load the training metadata CSV and preview its first rows.
train_df = pandas.read_csv(filepath_or_buffer=hubmap.utils.paths.TRAIN_CSV)
train_df.head(n=5)

Unnamed: 0,id,organ,data_source,img_height,img_width,pixel_size,tissue_thickness,rle,age,sex
0,10044,prostate,HPA,3000,3000,0.4,4,1459676 77 1462675 82 1465674 87 1468673 92 14...,37.0,Male
1,10274,prostate,HPA,3000,3000,0.4,4,715707 2 718705 8 721703 11 724701 18 727692 3...,76.0,Male
2,10392,spleen,HPA,3000,3000,0.4,4,1228631 20 1231629 24 1234624 40 1237623 47 12...,82.0,Male
3,10488,lung,HPA,3000,3000,0.4,4,3446519 15 3449517 17 3452514 20 3455510 24 34...,78.0,Male
4,10610,spleen,HPA,3000,3000,0.4,4,478925 68 481909 87 484893 105 487863 154 4908...,21.0,Female


In [3]:
# Collect the `id` column of the metadata table as a plain Python list.
ids = train_df['id'].tolist()

[10044,
 10274,
 10392,
 10488,
 10610,
 10611,
 10651,
 10666,
 10703,
 10892,
 10912,
 10971,
 10992,
 11064,
 1123,
 11448,
 11497,
 1157,
 11629,
 11645,
 11662,
 1168,
 1184,
 11890,
 12026,
 12174,
 1220,
 12233,
 12244,
 1229,
 12452,
 12466,
 12471,
 12476,
 12483,
 127,
 12784,
 12827,
 13034,
 13189,
 13260,
 13396,
 13483,
 13507,
 13942,
 14183,
 14388,
 14396,
 144,
 14407,
 14674,
 14756,
 1500,
 15005,
 15067,
 15124,
 15192,
 15329,
 15499,
 15551,
 15706,
 15732,
 15787,
 15842,
 15860,
 16149,
 16163,
 16214,
 16216,
 16362,
 164,
 16564,
 16609,
 16659,
 16711,
 16728,
 16890,
 1690,
 17126,
 17143,
 17187,
 1731,
 17422,
 17455,
 17828,
 18121,
 18401,
 18422,
 18426,
 18445,
 18449,
 1850,
 18777,
 1878,
 18792,
 18900,
 19048,
 19084,
 19179,
 19360,
 19377,
 19507,
 19533,
 1955,
 19569,
 19997,
 20247,
 203,
 20302,
 20428,
 20440,
 20478,
 20520,
 20563,
 2079,
 20794,
 20831,
 20955,
 21021,
 21039,
 21086,
 21112,
 21129,
 21155,
 21195,
 21321,
 21358,
 2150

In [4]:
# Expected location of every training image: <TRAIN_IMAGES>/<id>.tiff,
# in the same order as `ids`.
paths = [
    hubmap.utils.paths.TRAIN_IMAGES / f'{image_id}.tiff'
    for image_id in ids
]

In [6]:
# Fail fast if any expected image file is absent. All missing files are
# gathered first so the assertion message names them, instead of stopping at
# the first missing file with a bare, uninformative AssertionError.
missing_paths = [p for p in paths if not p.exists()]
assert not missing_paths, (
    f'{len(missing_paths)} of {len(paths)} training images are missing, '
    f'e.g. {missing_paths[:5]}'
)

In [8]:
# Pick a single sample to experiment with; `index` is a position in the
# parallel lists `ids` and `paths`.
index = 10
i, path = ids[index], paths[index]

(i, path)

(10912,
 PosixPath('/Users/najibishaq/Documents/kaggle/input/hubmap-organ-segmentation/train_images/10912.tiff'))

In [10]:
# Read the sample image and drop any singleton axes.
image = pyplot.imread(path).squeeze()

# Unpack the three remaining axes: the two spatial extents, then the channel count.
x_end, y_end, num_channels = image.shape
(x_end, y_end, num_channels)

(3000, 3000, 3)

In [14]:
# Fetch the `rle` entry of the (last) metadata row whose id matches the sample.
rle = train_df.loc[train_df['id'] == i, 'rle'].iloc[-1]

# Decode it with the project helper, passing the image's two spatial extents.
# NOTE(review): confirm the argument order rle_to_mask expects for non-square images.
mask = hubmap.utils.helpers.rle_to_mask(rle, x_end, y_end)

In [17]:
# Edge length of the square tiles, and the step between consecutive tile origins.
tile_stride = 256

# Destination directory for the per-channel intensity tiles.
train_out_dir = hubmap.utils.paths.WORKING_DIR / 'train' / 'intensity'

In [18]:
# Cut every channel of `image` into tile_stride x tile_stride tiles, rescale
# each channel with that channel's own min/max, and save one float32 .npy file
# per (tile, channel) as '<id>_x<col>_y<row>_c<channel>.npy' in `train_out_dir`.
# Tiles that run past the image edge are zero-padded to the full tile size.
#
# NOTE: `train_out_dir` is rebound to the 'labels' directory by the following
# cell, so re-run the cell that points it at 'intensity' before re-running
# this one.

# numpy.save does not create parent directories; make sure the target exists.
train_out_dir.mkdir(parents=True, exist_ok=True)

tile_template = numpy.zeros((tile_stride, tile_stride), dtype=numpy.float32)

for c in range(num_channels):
    channel = image[..., c]
    min_intensity = numpy.min(channel)
    max_intensity = numpy.max(channel)
    # The divisor is constant for the whole channel, so compute it once.
    # EPSILON is presumably a small positive constant that keeps the divisor
    # non-zero when the channel is flat -- verify in hubmap.utils.constants.
    intensity_range = max_intensity - min_intensity + hubmap.utils.constants.EPSILON

    for x, x_min in enumerate(range(0, x_end, tile_stride)):
        x_max = min(x_end, x_min + tile_stride)

        for y, y_min in enumerate(range(0, y_end, tile_stride)):
            y_max = min(y_end, y_min + tile_stride)

            in_tile = (channel[x_min:x_max, y_min:y_max] - min_intensity) / intensity_range

            # Start from zeros so a short edge tile ends up padded.
            out_tile = numpy.zeros_like(tile_template)
            out_tile[:x_max - x_min, :y_max - y_min] = in_tile

            name = f'{i}_x{x:02d}_y{y:02d}_c{c:1d}.npy'
            out_path = train_out_dir.joinpath(name)
            # `fix_imports` is omitted: it only matters when pickling, which is
            # disabled here, and newer NumPy releases deprecate the argument.
            numpy.save(
                file=str(out_path),
                arr=out_tile,
                allow_pickle=False,
            )

In [19]:
# Cut `mask` into tile_stride x tile_stride tiles and save one uint8 .npy file
# per tile as '<id>_x<col>_y<row>.npy' under <WORKING_DIR>/train/labels.
# Tiles that run past the mask edge are zero-padded to the full tile size.
#
# NOTE: this cell rebinds `train_out_dir` and `tile_template`; any cell that
# relies on their earlier values must have them re-set before it is re-run.
train_out_dir = hubmap.utils.paths.WORKING_DIR.joinpath('train', 'labels')
# numpy.save does not create parent directories; make sure the target exists.
train_out_dir.mkdir(parents=True, exist_ok=True)
tile_template = numpy.zeros((tile_stride, tile_stride), dtype=numpy.uint8)

for x, x_min in enumerate(range(0, x_end, tile_stride)):
    x_max = min(x_end, x_min + tile_stride)

    for y, y_min in enumerate(range(0, y_end, tile_stride)):
        y_max = min(y_end, y_min + tile_stride)

        in_tile = mask[x_min:x_max, y_min:y_max]

        # Start from zeros so a short edge tile ends up padded; the assignment
        # casts the mask values to the template's uint8 dtype.
        out_tile = numpy.zeros_like(tile_template)
        out_tile[:x_max - x_min, :y_max - y_min] = in_tile

        name = f'{i}_x{x:02d}_y{y:02d}.npy'
        out_path = train_out_dir.joinpath(name)
        # `fix_imports` is omitted: it only matters when pickling, which is
        # disabled here, and newer NumPy releases deprecate the argument.
        numpy.save(
            file=str(out_path),
            arr=out_tile,
            allow_pickle=False,
        )