In [1]:
import os
from multiprocessing import Pool, Value
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from fractions import math

from skimage import color, exposure, io
from skimage.transform import resize

from sklearn import preprocessing
from sklearn.externals import joblib

In [2]:
in_dir = "../input/images/"
out_dir = "../input/preprocessed-images/"
shape = 256
scaler_filename = "../models/images_StandardScaler.save"

In [3]:
def process_image(image_dir):
    image = io.imread(in_dir+image_dir)
    
    r = math.gcd(image.shape[0], image.shape[1])
    widht_ratio = int(image.shape[1] / r)
    height_ratio = int(image.shape[0] / r)

    # crop
    if widht_ratio > height_ratio*1.5:
        image = image[:,int((image.shape[1]-shape)/2):int((image.shape[1]-shape)/2+shape)]       
    elif height_ratio > widht_ratio*1.5:
        image = image[int((image.shape[0]-shape)/2):int((image.shape[0]-shape)/2+shape),:]

    image = resize(image, (shape, shape), mode='reflect', anti_aliasing=True)
    image = color.rgb2gray(image)
    image = exposure.equalize_hist(image)
    print("preprocessed: "+image_dir)
    print("saved in: "+out_dir+image_dir)
    io.imsave(out_dir+image_dir,image)    
    return image

In [4]:
if not os.path.exists(out_dir):
    os.mkdir(out_dir)

processes = 4

split_n = 8000

# scaler = preprocessing.MinMaxScaler(feature_range=(0,255))
scaler = preprocessing.StandardScaler()

dir_list = np.array(os.listdir(in_dir))

pool = Pool(processes=processes)  # Num of CPUs

for sub_dir_list in np.array_split(dir_list, split_n):
    # crop, resize, rgb to grey and hist equalization.
    images = np.array(pool.map(process_image, sub_dir_list, chunksize = 8))

    # standarization or normalization
    images = np.reshape(images,(len(images),-1))
    scaler.partial_fit(images)

joblib.dump(scaler, scaler_filename)

pool.close()
pool.terminate()

preprocessed: ae095c33b71fba55.jpg
saved in: ../input/preprocessed-images/ae095c33b71fba55.jpg


  .format(dtypeobj_in, dtypeobj_out))


preprocessed: a1c59822cc89ee12.jpg
saved in: ../input/preprocessed-images/a1c59822cc89ee12.jpg


  .format(dtypeobj_in, dtypeobj_out))


preprocessed: 35a28cb435152d2d.jpg
saved in: ../input/preprocessed-images/35a28cb435152d2d.jpg
preprocessed: 0fde08382dea186f.jpg
saved in: ../input/preprocessed-images/0fde08382dea186f.jpg
preprocessed: 3881e78a167964bd.jpg
saved in: ../input/preprocessed-images/3881e78a167964bd.jpg
preprocessed: d0a260e436719d70.jpg
saved in: ../input/preprocessed-images/d0a260e436719d70.jpg
preprocessed: 46d137ecd7c056b1.jpg
saved in: ../input/preprocessed-images/46d137ecd7c056b1.jpg
preprocessed: c216891c5db191f1.jpg
saved in: ../input/preprocessed-images/c216891c5db191f1.jpg
preprocessed: db26d28c00849935.jpg
saved in: ../input/preprocessed-images/db26d28c00849935.jpg
preprocessed: 3f34d057897febaf.jpg
saved in: ../input/preprocessed-images/3f34d057897febaf.jpg


Process ForkPoolWorker-1:
Process ForkPoolWorker-2:
Process ForkPoolWorker-6:
Process ForkPoolWorker-5:
Process ForkPoolWorker-4:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/usr/lib/python3.6/multiproc

KeyboardInterrupt: 