# Data pre-processing steps

This notebook will read the svs files in the data file using openslide python and apply the following pre-processing steps:

-Crop the images in small batches of size 896*896

-Filter out the tiles that contain less than 90% of brain (Tumor??) tissue: Hysteresis thresholding on the grayscale and 8-bit depth complemented image (http://ac.els-cdn.com/S1361841515001838/1-s2.0-S1361841515001838-main.pdf?_tid=f96cb1fa-35ba-11e7-b61d-00000aab0f6b&acdnat=1494446462_d2ee895640e38bd660bc559fc6233d34)

-Optional: Nuclei segmentation using morphometric top-hat filtering and hysteresis thresholding (http://ac.els-cdn.com/S1361841515001838/1-s2.0-S1361841515001838-main.pdf?_tid=f96cb1fa-35ba-11e7-b61d-00000aab0f6b&acdnat=1494446462_d2ee895640e38bd660bc559fc6233d34)

-Further tile to 224*224 (input size of a ResNet or Inception CNN)

input_size $= N*224*224*(1 \ or \ 3)$

label_size $= N$

N = Number_of_images * Number_of_patches

For now, the notebook only treats one svs file : "test.svs". The pre-processing steps are very computationnally expensive so we need to parallelize the code.

In [1]:
from openslide import *
import numpy as np
import skimage
import matplotlib.pyplot as plt
import os

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

#Main functions of openslide
#img.get_thumbnail((1000,1000))
#img.dimensions
#img.properties        
#img.level_count       
#img.level_dimensions  
#img.level_downsamples

# Batch Creation

In [5]:
#Split the image in patches, this process takes ~10min per image and need to be parallelized!! (GPU or multiple CPUs)

data_path = "/home/cedoz/data"
os.chdir(data_path)

img  = OpenSlide("test.svs")
height, width = img.dimensions
batch_size = 896
X_train = np.zeros((height*width/(batch_size**2), batch_size, batch_size, 3))

#pool = Pool()
#result1 = pool.apply_async(solve1, [A])    # evaluate "solve1(A)" asynchronously
#result2 = pool.apply_async(solve2, [B])    # evaluate "solve2(B)" asynchronously
#answer1 = result1.get(timeout=10)
#answer2 = result2.get(timeout=10)

def fill_X_train(i,j):
    batch = img.read_region(location = (i*batch_size,j*batch_size), level = 0, size = (batch_size,batch_size))
    X_train[i*(width/batch_size)+j,:,:,:] = np.array(batch.getdata())[:,0:-1].reshape((batch_size, batch_size, 3))

#args = [A, B]
#results = pool.map(solve1, args)

for i in xrange(height/batch_size):
    print ("iteration %d out of %d"%(i,height/batch_size))
    for j in xrange(width/batch_size):
        fill_X_train(i,j)
    #results = pool.map(fill_X_train, (i,xrange(width/batch_size)))

iteration 0 out of 29
iteration 1 out of 29
iteration 2 out of 29
iteration 3 out of 29
iteration 4 out of 29
iteration 5 out of 29
iteration 6 out of 29
iteration 7 out of 29
iteration 8 out of 29
iteration 9 out of 29
iteration 10 out of 29
iteration 11 out of 29


Process PoolWorker-19:
Process PoolWorker-7:
Process PoolWorker-9:
Process PoolWorker-12:
Process PoolWorker-16:
Process PoolWorker-14:
Process PoolWorker-10:
Process PoolWorker-8:
Process PoolWorker-15:
Process PoolWorker-6:
Process PoolWorker-22:
Process PoolWorker-3:
Process PoolWorker-13:
Process PoolWorker-11:
Process PoolWorker-24:
Process PoolWorker-1:
Process PoolWorker-4:
Process PoolWorker-5:
Process PoolWorker-2:
Process PoolWorker-23:
Process PoolWorker-21:
Process PoolWorker-17:
Process PoolWorker-18:
Process PoolWorker-20:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Tr

KeyboardInterrupt: 

    racquire()
    racquire()
    racquire()
    racquire()
    racquire()
    racquire()
    return recv()
    racquire()
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt


# Batch Visualization

In [None]:

# Visualize batches from the WSI.

for i in xrange(height/batch_size):
    for j in xrange(width/batch_size):
        plt_idx = i*(width/batch_size) + j  + 1
        plt.subplot(height/batch_size, width/batch_size, plt_idx)
        plt.imshow(X_train[plt_idx].astype('uint8'))
        plt.axis('off')
plt.show()


In [45]:
#Converting the image to grayscale
image1_grayscale = image1.convert(mode = 'L')
image1_grayscale_array = np.array(image1_grayscale.getdata()).reshape((image1.size[0], image1.size[1], 1))
image1_grayscale_array.shape

(1000, 1000, 1)

In [4]:
#Experimenting with the multiprocessing package to parallelize on the 8 CPUs
import time
from multiprocessing import Pool

def f(x):
    return x**3

A1 = np.zeros(100)
A2 = np.zeros(100)
start1 = time.time()
for i in xrange(100):   
    A1[i] = f(i)
end1 = time.time()
elapsed1 = end1 - start1

start2 = time.time()
pool = Pool()
results = pool.map(f, xrange(100))
end2 = time.time()
elapsed2 = end2 - start2
  
print("time with unparallelized implementation %f"%elapsed1)    
print("time with parallelized implementation %f"%elapsed2)    

#Using the "map" function, it is weird because the operation time increases!!


time with unparallelized implementation 0.000140
time with parallelized implementation 0.019719


In [45]:
from multiprocessing import Process, Queue
import time

def f(x):
    A[x] = x**3

if __name__ == '__main__':

    #q = Queue()
    #The list of examples that we want to compute
    examples = [i for i in xrange(1000)]

    # normal approach
    A = np.zeros(1000)
    t = time.time()
    for example in examples:
        f(example)
    elapsed1 = time.time()-t

    #Parallel approach
    A = np.zeros(1000)
    t = time.time()
    #Split work into 8 processes (8 cores)
    processes = 8
    #Split the input in 8 chunks for each processes
    chunksize = len(examples)/processes
    examplessplitted = [examples[x:x+chunksize] for x in xrange(0, len(examples), chunksize)]
    for subexample in examplessplitted:
        p = Process(target=f, args=(subexample))
        p.Daemon = True
        p.start()
    for subexample in examplessplitted:
        p.join()
    elapsed2 = time.time()-t
    #print A
        

0.000295162200928
8


Process Process-281:
Traceback (most recent call last):
  File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Process Process-282:
  File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
TypeError: f() takes exactly 1 argument (125 given)
  File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
    self._target(*self._args, **self._kwargs)
TypeError: f() takes exactly 1 argument (125 given)
Process Process-283:
Traceback (most recent call last):
  File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
    self._target(*self._args, **self._kwargs)
TypeError: f() takes exactly 1 argument (125 given)
Process Process-284:
Traceback (mo

0.190186977386


TypeError: f() takes exactly 1 argument (125 given)
Process Process-287:
Process Process-288:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
    self.run()
  File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
  File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
    self._target(*self._args, **self._kwargs)
    self._target(*self._args, **self._kwargs)
TypeError: f() takes exactly 1 argument (125 given)
TypeError: f() takes exactly 1 argument (125 given)
