In [0]:
# Install the OpenSlide C library (apt package `openslide-tools`) and its
# Python bindings (pip package `openslide-python`).
# NOTE(review): neither install is version-pinned — pin if reproducibility matters.
!apt-get install openslide-tools
!pip install openslide-python

Reading package lists... Done
Building dependency tree       
Reading state information... Done
openslide-tools is already the newest version (3.4.1+dfsg-2).
The following package was automatically installed and is no longer required:
  libnvidia-common-410
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 7 not upgraded.


In [0]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from openslide import open_slide, __library_version__ as openslide_version
import os
from PIL import Image
from skimage.color import rgb2gray
from google.colab import drive

In [0]:
# Mount Google Drive under /content/gdrive so the slides and generated patches
# can be read from / written to Drive. force_remount=True is passed so that
# re-running this cell presumably refreshes an existing mount — confirm in the Colab docs.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


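**Parameters to adjust before running: the slide IDs in each split, which split to process, and the patch-extraction settings.**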

In [0]:
# Parameters to adjust before running.

# Slide identifiers (the NNN in tumor_NNN.tif) assigned to each split.
train = [
    '001', '002', '005', '019', '023', '031', '035', '057', '059',
    '064', '075', '078', '081', '084', '091', '094', '096',
]
validate = ['012', '016']

# Name of the split to process; it must match one of the list names above.
analyse = 'train'

# A patch is kept only if more than this percentage of its pixels is tissue.
tissue_percentage = 30

# Base stride between patches; create_data scales it by the slide level.
stride = 5

# Slide levels for which patches are generated.
levels = [1, 3, 4, 5, 6]

In [0]:
# List the Drive contents to confirm the mount is visible. The path is
# relative, so this assumes the working directory is the one containing `gdrive`.
! ls gdrive/'My Drive'
# Base directory for the project's slides and generated data. Other paths are
# built from it by plain string concatenation, so the trailing slash is required.
root = 'gdrive/My Drive/adl_pro/'

 adl_pro			    'Colab Notebooks'
 ADL_Project_Script.gdoc	    'COMS 6998 - Cloud Project Proposal.gdoc'
 Aggarwal_Luv_la2733_Resume_ML.pdf   la2733_observations.gdoc
 Aggarwal_Luv_la2733_Resume.pdf     'MMD CAR POOL  APP.gslides'
 AlaapFinalReport.pdf		     Notability
'AML Midterm.gdoc'		     slides


In [0]:
def read_slide(slide, x, y, level, width, height, as_float=False):
    """Read a rectangular region of a slide into an RGB numpy array.

    Args:
        slide: Object exposing ``read_region((x, y), level, (width, height))``
            whose result supports ``convert('RGB')``.
        x, y: Location forwarded to ``read_region`` as the ``(x, y)`` tuple.
        level: Level forwarded to ``read_region``.
        width, height: Region size, forwarded as ``(width, height)``.
        as_float: When true the array is created with dtype ``float32``;
            otherwise numpy chooses the dtype from the image.

    Returns:
        Array of shape ``(height, width, 3)``.

    Raises:
        AssertionError: If the resulting array does not have that shape.
    """
    # Converting to RGB drops the alpha channel.
    region = slide.read_region((x, y), level, (width, height)).convert('RGB')
    pixels = np.asarray(region, dtype=np.float32) if as_float else np.asarray(region)
    assert pixels.shape == (height, width, 3)
    return pixels

In [0]:
def find_tissue_pixels(image, intensity=0.8):
    """Return the coordinates of pixels whose grayscale value is at most ``intensity``.

    Args:
        image: Image passed to ``rgb2gray``; its first two dimensions must
            match the shape of the grayscale result.
        intensity: Inclusive upper bound on the grayscale value of a pixel
            that is reported.

    Returns:
        List of ``(row, column)`` index pairs.

    Raises:
        AssertionError: If the grayscale image shape differs from
            ``image.shape[:2]``.
    """
    gray = rgb2gray(image)
    assert gray.shape == (image.shape[0], image.shape[1])
    rows, cols = np.where(gray <= intensity)
    return list(zip(rows, cols))

def apply_mask(im, mask, color=(255,0,0)):
    """Return a copy of ``im`` with every position listed in ``mask`` set to ``color``.

    Args:
        im: Array to paint on; it is copied, never modified.
        mask: Iterable of index pairs, e.g. the output of ``find_tissue_pixels``.
        color: Value written at each listed position.

    Returns:
        The painted copy.
    """
    painted = np.copy(im)
    for first, second in mask:
        painted[first][second] = color
    return painted

In [0]:
def get_files(image):
  """Open a slide and its tumor mask by slide identifier.

  Both files are looked up in ``root + 'slides/'`` under the names
  ``tumor_<image>.tif`` and ``tumor_<image>_mask.tif``.

  Args:
    image: Identifier string inserted into the file names.

  Returns:
    Tuple ``(slide, mask)`` holding the two ``open_slide`` results.
  """
  slides_dir = root + 'slides/'
  stem = 'tumor_' + image
  return open_slide(slides_dir + stem + '.tif'), open_slide(slides_dir + stem + '_mask.tif')

In [0]:
import cv2

def pad_files(image, mask, stride, height, width):
  """Zero-pad an image and its mask so both dimensions are multiples of ``stride``.

  The padding is split as evenly as possible between the two sides of each
  axis; when the amount is odd, the extra pixel goes to the bottom / right.
  A dimension that is already a multiple of ``stride`` is left unpadded.

  Args:
    image: Array whose first two axes are (height, width).
    mask: Array whose first two axes are (height, width).
    stride: Positive integer the padded dimensions must be divisible by.
    height: Current height in pixels.
    width: Current width in pixels.

  Returns:
    Tuple ``(padded_image, padded_mask, padded_height, padded_width)``.

  Raises:
    ValueError: If ``stride`` is not positive.
  """
  if stride <= 0:
    raise ValueError('stride must be a positive integer')

  # (-n) % stride is the distance from n up to the next multiple of stride,
  # and 0 when n is already a multiple.
  gap_height = (-height) % stride
  gap_width = (-width) % stride

  # Integer arithmetic only: border sizes must be ints.
  top = gap_height // 2
  bottom = gap_height - top
  left = gap_width // 2
  right = gap_width - left

  def _pad(array):
    # Pad only the two spatial axes; leave any channel axis untouched.
    array = np.asarray(array)
    spec = [(top, bottom), (left, right)] + [(0, 0)] * (array.ndim - 2)
    return np.pad(array, spec, mode='constant', constant_values=0)

  return _pad(image), _pad(mask), height + gap_height, width + gap_width

In [0]:
def prune(mask):
  """Return the bounding box of the non-zero region of a 2-D mask.

  Args:
    mask: 2-D array in which non-zero entries mark the region of interest.

  Returns:
    Tuple ``(top, bottom, left, right)`` of plain ints: the first and last
    row index and the first and last column index that contain a non-zero
    entry (all inclusive).

  Raises:
    ValueError: If the mask contains no non-zero entry.
  """
  mask = np.asarray(mask)
  rows = np.flatnonzero(mask.any(axis=1))
  cols = np.flatnonzero(mask.any(axis=0))
  if rows.size == 0:
    raise ValueError('mask contains no non-zero pixels')
  # flatnonzero returns ascending absolute indices, so the first and last
  # entries are the box edges.
  return int(rows[0]), int(rows[-1]), int(cols[0]), int(cols[-1])

In [0]:
import collections

def create_data(id, level, dim, stride, tissue_percentage, analyse):
  """Cut one slide level into square patches and save the tissue-rich ones as PNGs.

  The slide and its tumor mask are read in full at ``level``. A window of
  ``dim`` x ``dim`` pixels is then moved over the tumor bounding box (widened
  by ``dim`` on each side and clipped to the image). Patches whose tissue
  share exceeds ``tissue_percentage`` are written to
  ``root + 'data/level<level>/<analyse>/{healthy,cancer}/<id>_<n>.png'``;
  a patch is "cancer" when any mask pixel inside it is non-zero. Files that
  already exist are not overwritten. The output directories must exist.

  Args:
    id: Slide identifier string (see ``get_files``).
    level: Slide level to read.
    dim: Side length of a patch in pixels.
    stride: Base stride; the step actually used is
      ``(number_of_levels - level + 1) * stride``.
    tissue_percentage: Minimum tissue share (in percent, exclusive) for a
      patch to be kept.
    analyse: Name of the split sub-directory to write into.

  Returns:
    Number of patches that passed the tissue threshold, including those whose
    file already existed and was therefore not rewritten.
  """
  base = root + 'data/level' + str(level) + '/'
  healthy_counter = 0
  cancer_counter = 0

  slide, tumor_mask = get_files(id)
  try:
    width, height = slide.level_dimensions[level]
    stride = (len(slide.level_dimensions) - level + 1) * stride
    image = read_slide(slide, 0, 0, level, width, height)
    mask = read_slide(tumor_mask, 0, 0, level, width, height)[:, :, 0]
  finally:
    # The pixels now live in numpy arrays, so release both file handles
    # instead of leaking them on every call.
    slide.close()
    tumor_mask.close()

  top, bottom, left, right = prune(mask)

  # Widen the box by one patch on every side. The lower/right limits are
  # capped at size - dim so every window lies fully inside the image.
  top = max(top - dim, 0)
  bottom = min(bottom + dim, height - dim)
  left = max(left - dim, 0)
  right = min(right + dim, width - dim)

  for row in range(top, bottom, stride):
    for col in range(left, right, stride):
      patch = image[row:row + dim, col:col + dim, :]
      tissue = len(find_tissue_pixels(patch)) / float(patch.shape[0] * patch.shape[1]) * 100
      if tissue > tissue_percentage:
        label = np.max(mask[row:row + dim, col:col + dim])
        if label == 0:
          healthy_counter += 1
          path = base + analyse + '/healthy/' + id + '_' + str(healthy_counter)
        else:
          cancer_counter += 1
          path = base + analyse + '/cancer/' + id + '_' + str(cancer_counter)
        # Only build the PIL image when the file actually has to be written.
        if not os.path.exists(path + '.png'):
          Image.fromarray(patch, 'RGB').save(path + '.png')
  return healthy_counter + cancer_counter

In [0]:
import pickle

# Generate patches for every slide of the selected split at every configured level.
# The split is resolved through an explicit lookup rather than eval() on a string.
splits = {'train': train, 'validate': validate}

for img in splits[analyse]:
  for level in levels:
    # Always ensure the leaf directories exist. Creating them only when the
    # level directory is missing breaks as soon as a second split is processed.
    split_dir = root + 'data/level' + str(level) + '/' + analyse
    os.makedirs(split_dir + '/cancer/', exist_ok=True)
    os.makedirs(split_dir + '/healthy/', exist_ok=True)
    count = create_data(img, level = level, dim = 299, stride = stride, tissue_percentage = tissue_percentage, analyse = analyse)
    print('Generated ',count,' images for ', img, ' at level ',level)
      