## Compress dataset using HDF5

In [1]:
import h5py

import os
import time

import numpy
import pandas
import skimage
from skimage import io

### Compressing the testset.

#### LZF compression with single dataset per image

In [136]:
df_test = pandas.read_csv("./data/sample_submission.csv")

images_test = []

print("Starting.")
tstart = time.time()
with h5py.File("./data/test.hdf5", "w") as f:
    for i,item in df_test.iterrows():
        imid = item['ID']
        imshape = (item['ImageHeight'], item['ImageWidth'], 4)
        #imroot = os.path.join("./data/test", imid)
        #imstems = ("_red.png", "_blue.png", "_yellow.png", "_green.png")
        #paths = [imroot+imstem for imstem in imstems]
        #image = [skimage.io.imread(path) for path in paths]
        #image = numpy.dstack(image)
        #images_test.append(image)
        f.create_dataset(
                    name=imid,
                    data=images[i],
                    shape=imshape,
                    maxshape=imshape,
                    compression="lzf"
                   )
        if (i+1)%50 == 0:
            ips = (i+1)/(time.time()-tstart)
            eta = len(df_test)/ips - (len(df_test)-i-1)/ips
            print(f"Processed {i+1} images. (ips: {ips:.2f} / ETA: {eta/60:.2f} min)")
            
print(f"Finished. Took {(time.time()-tstart)/60:.2f} minutes.")

Starting.
Processed 50 images. (ips: 1.25 / ETA: 7.43 min)
Processed 100 images. (ips: 1.21 / ETA: 7.68 min)
Processed 150 images. (ips: 1.22 / ETA: 7.64 min)
Processed 200 images. (ips: 1.22 / ETA: 7.65 min)
Processed 250 images. (ips: 1.21 / ETA: 7.72 min)
Processed 300 images. (ips: 1.21 / ETA: 7.68 min)
Processed 350 images. (ips: 1.22 / ETA: 7.66 min)
Processed 400 images. (ips: 1.22 / ETA: 7.64 min)
Processed 450 images. (ips: 1.21 / ETA: 7.70 min)
Processed 500 images. (ips: 1.20 / ETA: 7.78 min)
Processed 550 images. (ips: 1.20 / ETA: 7.76 min)
Finished. Took 7.76 minutes.


### Compressing the Trainset.

#### LZF compression with single dataset per image

In [None]:
df_train = pandas.read_csv("./data/train.csv")
images_train = []

print("Starting.")
tstart = time.time()
with h5py.File("./data/train.hdf5", "w") as f:
    for i,item in df_train.iterrows():
        imid = item['ID']
        imroot = os.path.join("./data/train", imid)
        imstems = ("_red.png", "_blue.png", "_yellow.png", "_green.png")
        paths = [imroot+imstem for imstem in imstems]
        image = [skimage.io.imread(path) for path in paths]
        image = numpy.dstack(image)
        images_train.append(image)
        imshape = image.shape
        f.create_dataset(
                    name=imid,
                    data=image,
                    shape=imshape,
                    maxshape=imshape,
                    compression="lzf"
                   )
        if (i+1)%250 == 0:
            ips = (i+1)/(time.time()-tstart)
            eta = len(df_train)/ips - (len(df_test)-i-1)/ips
            print(f"Processed {i+1} images. (ips: {ips:.2f} / ETA: {eta/60:.2f} min)")
            
print(f"Finished. Took {(time.time()-tstart)/60:.2f} minutes.")

Starting.
Processed 250 images. (ips: 0.71 / ETA: 507.28 min)
Processed 500 images. (ips: 0.65 / ETA: 561.17 min)
Processed 750 images. (ips: 0.63 / ETA: 582.80 min)
Processed 1000 images. (ips: 0.62 / ETA: 601.51 min)
Processed 1250 images. (ips: 0.61 / ETA: 614.65 min)


In [155]:
len(images_train)

21806

#### Not do: LZF compression with ImageDimension pooling

In [141]:
images_1728 = [image for image in images if image.shape[0] == 1728]
images_2048 = [image for image in images if image.shape[0] == 2048]
images_3072 = [image for image in images if image.shape[0] == 3072]

In [144]:
images_1728 = numpy.stack(images_1728)
images_2048 = numpy.stack(images_2048)
images_3072 = numpy.stack(images_3072)

In [150]:
print("Starting.")
tstart = time.time()
with h5py.File("./data/test_pool.hdf5", "w") as f:
    f.create_dataset(
        name='1728px',
        data=images_1728,
        shape=images_1728.shape,
        maxshape=images_1728.shape,
        compression="lzf",
       )
    ips = images_1728.shape[0]/(time.time()-tstart)
    print(f"1728px finished. Took {(time.time()-tstart)/60:.2f} minutes. (ips: {ips:.2f})")
    f.create_dataset(
        name='2048px',
        data=images_2048,
        shape=images_2048.shape,
        maxshape=images_2048.shape,
        compression="lzf"
       )
    ips = (images_1728.shape[0] + images_2048.shape[0])/(time.time()-tstart)
    print(f"2048px finished. Took {(time.time()-tstart)/60:.2f} minutes. (ips: {ips:.2f})")
    f.create_dataset(
        name='3072px',
        data=images_3072,
        shape=images_3072.shape,
        maxshape=images_3072.shape,
        compression="lzf"
       )
    ips = (images_1728.shape[0] + images_2048.shape[0] + images_3072.shape[0])/(time.time()-tstart)
    print(f"3072px finished. Took {(time.time()-tstart)/60:.2f} minutes. (ips: {ips:.2f})")

Starting.
1728px finished. Took 0.49 minutes. (ips: 2.30)
2048px finished. Took 6.76 minutes. (ips: 1.31)
3072px finished. Took 7.60 minutes. (ips: 1.23)


In [None]:
df_train = pandas.read_csv("./data/train.csv")

images_train = []
for i,item in df_train.iterrows():
    imid = item['ID']
    imshape = (item['ImageHeight'], item['ImageWidth'], 4)
    imroot = os.path.join("./data/train", imid)
    imstems = ("_red.png", "_blue.png", "_yellow.png", "_green.png")
    paths = [imroot+imstem for imstem in imstems]
    image = [skimage.io.imread(path) for path in paths]
    image = numpy.dstack(image)
    images_train.append(image)

In [None]:
images_1728 = [image for image in images_train if image.shape[0] == 1728]
images_2048 = [image for image in images_train if image.shape[0] == 2048]
images_3072 = [image for image in images_train if image.shape[0] == 3072]

In [None]:
images_1728 = numpy.stack(images_1728)
images_2048 = numpy.stack(images_2048)
images_3072 = numpy.stack(images_3072)

In [None]:
print("Starting.")
tstart = time.time()
with h5py.File("./data/train_pool.hdf5", "w") as f:
    f.create_dataset(
        name='1728px',
        data=images_1728,
        shape=images_1728.shape,
        maxshape=images_1728.shape,
        compression="lzf",
       )
    ips = images_1728.shape[0]/(time.time()-tstart)
    print(f"1728px finished. Took {(time.time()-tstart)/60:.2f} minutes. (ips: {ips:.2f})")
    f.create_dataset(
        name='2048px',
        data=images_2048,
        shape=images_2048.shape,
        maxshape=images_2048.shape,
        compression="lzf"
       )
    ips = (images_1728.shape[0] + images_2048.shape[0])/(time.time()-tstart)
    print(f"2048px finished. Took {(time.time()-tstart)/60:.2f} minutes. (ips: {ips:.2f})")
    f.create_dataset(
        name='3072px',
        data=images_3072,
        shape=images_3072.shape,
        maxshape=images_3072.shape,
        compression="lzf"
       )
    ips = (images_1728.shape[0] + images_2048.shape[0] + images_3072.shape[0])/(time.time()-tstart)
    print(f"3072px finished. Took {(time.time()-tstart)/60:.2f} minutes. (ips: {ips:.2f})")