In [1]:
import numpy as np
import pandas as pd
import cv2

import time
import os
import gc
from tqdm import tqdm

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Dimensions of the raw images: each data row is reshaped to (HEIGHT, WIDTH).
HEIGHT = 137
WIDTH = 236

# Default side length, in pixels, of the square images produced by crop_image().
CROP_SIZE = 128

In [3]:
from pathlib import Path

# NOTE(review): presumably the directory for generated files; it is not
# referenced in the visible cells — confirm it is still needed.
outdir = Path('.')

* Load images from parquet or feather files, depending on the `submission` flag
* Resize the images to `CROP_SIZE` x `CROP_SIZE` (no actual cropping is done) and save them

In [4]:
def crop_image(img, crop_size=CROP_SIZE, interpolation=cv2.INTER_LINEAR):
    """Resize ``img`` to a square of ``crop_size`` x ``crop_size`` pixels.

    Despite the name, nothing is cropped: the whole image is rescaled, so the
    aspect ratio changes whenever the input is not square.

    Args:
        img: Image array accepted by ``cv2.resize``.
        crop_size: Side length of the output square, in pixels.
        interpolation: OpenCV interpolation flag. Defaults to
            ``cv2.INTER_LINEAR``, which is what ``cv2.resize`` uses when no
            flag is given, so existing callers get unchanged results.
            ``cv2.INTER_AREA`` is usually preferred when shrinking.

    Returns:
        The resized image as returned by ``cv2.resize``.
    """
    return cv2.resize(img, (crop_size, crop_size), interpolation=interpolation)

In [5]:
def prepare_image(data_type='train', submission=False, indices=(0, 1, 2, 3)):
    """Load raw image shards and return them as one inverted uint8 array.

    Args:
        data_type: Either 'train' or 'test'; selects which shard files are read.
        submission: When true, read the parquet shards from the competition
            input directory; otherwise read the feather copies.
        indices: Iterable of shard numbers to load (the ``_{i}`` file suffix).

    Returns:
        Tuple ``(img_names, images)``. ``img_names`` is the concatenation of
        the first column of every shard; ``images`` is a uint8 array of shape
        ``(n_images, HEIGHT, WIDTH)`` holding ``255 - pixel``.

    Raises:
        AssertionError: If ``data_type`` is neither 'train' nor 'test'.
    """
    assert data_type in ['train', 'test']

    datadir = Path('/kaggle/input/bengaliai-cv19')
    featherdir = Path('/kaggle/input/bengaliaicv19feather')

    if submission:
        image_df_list = [pd.read_parquet(datadir / f'{data_type}_image_data_{i}.parquet')
                         for i in indices]
    else:
        image_df_list = [pd.read_feather(featherdir / f'{data_type}_image_data_{i}.feather')
                         for i in indices]

    # The first column of each shard holds the image names.
    img_names = np.concatenate([df.iloc[:, 0] for df in image_df_list])
    print('Read complete')

    def reshape_data(data):
        # Every column after the first is pixel data, one image per row;
        # subtracting from 255 inverts the intensities.
        return 255 - data.iloc[:, 1:].values.reshape(-1, HEIGHT, WIDTH).astype(np.uint8)

    # Convert one shard at a time and drop its DataFrame right away, so the raw
    # frames and the converted arrays are never all held in memory together.
    images = []
    while image_df_list:
        images.append(reshape_data(image_df_list.pop(0)))

    print('Reshape complete')

    del image_df_list
    gc.collect()

    images = np.concatenate(images, axis=0)
    print(f'Concatenate complete. Shape: {images.shape}')
    return img_names, images

In [6]:
# Load train shards 0-3; submission=False makes prepare_image read the feather files.
img_names, images = prepare_image(data_type='train', submission=False, indices=[0, 1, 2, 3])

Read complete
Reshape complete
Concatenate complete. Shape: (200840, 137, 236)


## Run this cell to build the train archive (`train.zip`)

In [7]:
import zipfile

OUT_TRAIN = 'train.zip'

# Per-image statistics of the pixel values divided by 255.
# NOTE: despite its name, images_var collects the mean of the *squared*
# pixels (E[x^2]), not a variance.
images_mean = list()
images_var = list()

with zipfile.ZipFile(OUT_TRAIN, 'w') as img_out:
    for name, raw in tqdm(zip(img_names, images), total=len(images)):
        img = crop_image(raw)

        # Scale once and reuse the result for both statistics.
        scaled = img / 255.0
        images_mean.append(scaled.mean())
        images_var.append((scaled ** 2).mean())

        # imencode signals failure through its first return value; stop rather
        # than silently writing a broken entry into the archive.
        ok, encoded = cv2.imencode('.png', img)
        if not ok:
            raise RuntimeError(f'PNG encoding failed for image {name!r}')
        img_out.writestr(name + '.png', encoded.tobytes())

100%|██████████| 200840/200840 [04:41<00:00, 714.51it/s]


In [8]:
# img = crop_image(images[0])
# name = img_names[0]
# cv2.imwrite(name + '.png', img)

In [9]:
# Overall pixel statistics from the per-image values collected above:
# images_mean holds per-image means, images_var per-image means of squares,
# so std = sqrt(E[x^2] - E[x]^2).
img_avr = np.mean(images_mean)
img_std = np.sqrt(np.mean(images_var) - np.square(img_avr))
print('mean:', img_avr, ', std:', img_std)

mean: 0.05280547235965753 , std: 0.1628750964925304
