This code is adapted from the code examples in Ramsundar, Bharath; Eastman, Peter; Walters, Patrick; Pande, Vijay. *Deep Learning for the Life Sciences*, Chapter 7.

# Installing DeepChem

In [1]:
# Installing RDKit: install Miniconda into /usr/local, then use conda to
# install RDKit from the conda-forge channel.
# NOTE(review): "Miniconda3-latest" is not pinned, while a later cell appends the
# hard-coded /usr/local/lib/python3.7/site-packages/ directory to sys.path --
# confirm the installer still ships Python 3.7, otherwise that path will not match.
# wget -c resumes a partial download instead of starting over.
!wget -c https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
# -b: batch mode (no prompts), -f: no error if the prefix already exists,
# -p: installation prefix.
!time bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
# -q: quiet, -y: answer yes to prompts, -c conda-forge: channel providing rdkit.
!time conda install -q -y -c conda-forge rdkit

--2019-08-02 19:29:24--  https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
Resolving repo.continuum.io (repo.continuum.io)... 104.18.200.79, 104.18.201.79, 2606:4700::6812:c84f, ...
Connecting to repo.continuum.io (repo.continuum.io)|104.18.200.79|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 75257002 (72M) [application/x-sh]
Saving to: ‘Miniconda3-latest-Linux-x86_64.sh’


2019-08-02 19:29:29 (162 MB/s) - ‘Miniconda3-latest-Linux-x86_64.sh’ saved [75257002/75257002]

PREFIX=/usr/local
Unpacking payload ...
Collecting package metadata (current_repodata.json): - \ done
Solving environment: / - \ done

## Package Plan ##

  environment location: /usr/local

  added / updated specs:
    - _libgcc_mutex==0.1=main
    - asn1crypto==0.24.0=py37_0
    - bzip2==1.0.8=h7b6447c_0
    - ca-certificates==2019.5.15=0
    - certifi==2019.6.16=py37_0
    - cffi==1.12.3=py37h2e261b9_0
    - chardet==3.0.4=py37_1
    - conda-package-handling==1.

In [0]:
# append rdkit path to current python system path.
%matplotlib inline
import matplotlib.pyplot as plt
import sys
import os
sys.path.append('/usr/local/lib/python3.7/site-packages/')

In [3]:
# Install DeepChem 
!pip install deepchem

Collecting deepchem
[?25l  Downloading https://files.pythonhosted.org/packages/05/03/ccdd048c61c070dca8aa572010c7ae39a46caad162ca7a3ecc62881b5124/deepchem-2.2.1.dev54.tar.gz (3.9MB)
[K     |████████████████████████████████| 3.9MB 3.3MB/s 
[?25hBuilding wheels for collected packages: deepchem
  Building wheel for deepchem (setup.py) ... [?25l[?25hdone
  Stored in directory: /root/.cache/pip/wheels/c7/49/0f/0b4235337998b7eadd19f137bf648515da501ad09fd63d4ba0
Successfully built deepchem
Installing collected packages: deepchem
Successfully installed deepchem-2.2.1.dev54


In [4]:
import deepchem as dc
import deepchem.models.tensorgraph.layers as layers
import numpy as np
import os
import re



# Dataset
To run this example, you will need to download the Broad BBBC005 dataset from https://data.broadinstitute.org/bbbc/BBBC005/. No login or registration is needed to download this dataset, so the raw images can simply be fetched with

    ! wget https://data.broadinstitute.org/bbbc/BBBC005/BBBC005_v1_images.zip
    ! unzip BBBC005_v1_images.zip

The ground-truth segmentation masks can be fetched as follows

    ! wget https://data.broadinstitute.org/bbbc/BBBC005/BBBC005_v1_ground_truth.zip
    ! unzip BBBC005_v1_ground_truth.zip

In [8]:
# First download the BBBC dataset
! wget https://data.broadinstitute.org/bbbc/BBBC005/BBBC005_v1_images.zip

--2019-08-02 19:33:09--  https://data.broadinstitute.org/bbbc/BBBC005/BBBC005_v1_images.zip
Resolving data.broadinstitute.org (data.broadinstitute.org)... 69.173.92.29
Connecting to data.broadinstitute.org (data.broadinstitute.org)|69.173.92.29|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1882973059 (1.8G) [application/zip]
Saving to: ‘BBBC005_v1_images.zip’


2019-08-02 19:36:56 (7.91 MB/s) - ‘BBBC005_v1_images.zip’ saved [1882973059/1882973059]



In [10]:
! unzip BBBC005_v1_images.zip

Archive:  BBBC005_v1_images.zip
   creating: BBBC005_v1_images/
  inflating: BBBC005_v1_images/SIMCEPImages_B13_C53_F4_s11_w1.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_I06_C23_F26_s12_w1.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_A14_C57_F1_s21_w1.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_E07_C27_F14_s05_w2.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_B08_C31_F4_s16_w1.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_K17_C70_F32_s08_w1.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_G04_C14_F20_s09_w1.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_J11_C44_F29_s06_w1.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_D02_C5_F10_s15_w2.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_E16_C66_F14_s07_w1.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_F03_C10_F17_s13_w2.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_G20_C83_F20_s20_w1.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_D12_C48_F10_s12_w2.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages

In [11]:
# The ground-truth segmentation masks can be fetched as follows

! wget https://data.broadinstitute.org/bbbc/BBBC005/BBBC005_v1_ground_truth.zip

--2019-08-02 19:40:02--  https://data.broadinstitute.org/bbbc/BBBC005/BBBC005_v1_ground_truth.zip
Resolving data.broadinstitute.org (data.broadinstitute.org)... 69.173.92.29
Connecting to data.broadinstitute.org (data.broadinstitute.org)|69.173.92.29|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12158428 (12M) [application/zip]
Saving to: ‘BBBC005_v1_ground_truth.zip’


2019-08-02 19:40:04 (6.54 MB/s) - ‘BBBC005_v1_ground_truth.zip’ saved [12158428/12158428]



In [12]:
! unzip BBBC005_v1_ground_truth.zip

Archive:  BBBC005_v1_ground_truth.zip
   creating: synthetic_2_ground_truth/
  inflating: synthetic_2_ground_truth/SIMCEPImages_A14_C57_F1_s21_w1.TIF  
  inflating: synthetic_2_ground_truth/SIMCEPImages_A04_C14_F1_s15_w2.TIF  
  inflating: synthetic_2_ground_truth/SIMCEPImages_A22_C91_F1_s25_w1.TIF  
  inflating: synthetic_2_ground_truth/SIMCEPImages_A13_C53_F1_s14_w2.TIF  
  inflating: synthetic_2_ground_truth/SIMCEPImages_A23_C96_F1_s23_w1.TIF  
  inflating: synthetic_2_ground_truth/SIMCEPImages_A07_C27_F1_s04_w1.TIF  
  inflating: synthetic_2_ground_truth/SIMCEPImages_A05_C18_F1_s05_w2.TIF  
  inflating: synthetic_2_ground_truth/SIMCEPImages_A08_C31_F1_s15_w2.TIF  
  inflating: synthetic_2_ground_truth/SIMCEPImages_A07_C27_F1_s02_w2.TIF  
  inflating: synthetic_2_ground_truth/SIMCEPImages_A21_C87_F1_s06_w1.TIF  
  inflating: synthetic_2_ground_truth/SIMCEPImages_A11_C44_F1_s15_w1.TIF  
  inflating: synthetic_2_ground_truth/SIMCEPImages_A12_C48_F1_s22_w2.TIF  
  inflating: synthetic_

In [13]:
# List the current working directory to confirm that the archives were
# downloaded and extracted.
! ls

BBBC005_v1_ground_truth      Miniconda3-latest-Linux-x86_64.sh
BBBC005_v1_ground_truth.zip  models
BBBC005_v1_images	     sample_data
BBBC005_v1_images.zip	     synthetic_2_ground_truth


In [0]:
# Load the datasets.
image_dir = 'BBBC005_v1_images'
files = []
labels = []

# The label of each image (the number of cells it contains) is embedded in its
# file name and is extracted with a regular expression.
# Example: SIMCEPImages_J10_C40_F29_s06_w2.TIF --> 40 cells (C40)
# Regular expression r'_C(\d+)_':
#     _C     the literal marker that precedes the cell count
#     (\d+)  one or more digits, captured as the count
#     _      the separator that follows the count
#
# os.listdir() returns entries in arbitrary order, so the listing is sorted:
# the seeded random split below then selects the same images on every run and
# on every machine.
for f in sorted(os.listdir(image_dir)):
  if f.endswith('.TIF'):
    files.append(os.path.join(image_dir, f))
    labels.append(int(re.findall(r'_C(\d+)_', f)[0]))

# Featurize the images and create the train, validation, and test sets.
loader = dc.data.ImageLoader()
dataset = loader.featurize(files, np.array(labels))
splitter = dc.splits.RandomSplitter()
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset, seed=123)

In [15]:
# Inspect the training split: get_shape() reports the shapes of the arrays it holds.
train_shapes = train_dataset.get_shape()
print('Size of the training set: ', train_shapes)

Size of the training set:  (array([15360,   520,   696]), (15360,), (15360,), (15360,))


# Building the Model

It will take some time to train a model on this dataset. It might be best if you download the pretrained models instead. 

In [0]:
# create a directory for storing the models

! mkdir models 
! cd models

In [16]:
# Get the pretrained models
! wget https://s3-us-west-1.amazonaws.com/deepchem.io/featurized_datasets/microscopy_models.zip -P ./models

--2019-08-02 19:42:10--  https://s3-us-west-1.amazonaws.com/deepchem.io/featurized_datasets/microscopy_models.zip
Resolving s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)... 52.219.24.33
Connecting to s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)|52.219.24.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 88952487 (85M) [application/zip]
Saving to: ‘./models/microscopy_models.zip’


2019-08-02 19:42:12 (54.9 MB/s) - ‘./models/microscopy_models.zip’ saved [88952487/88952487]



In [17]:
# Unzip
! unzip ./models/microscopy_models.zip

Archive:  ./models/microscopy_models.zip
   creating: model/
  inflating: model/model-6999.data-00000-of-00001  
  inflating: model/model-5999.index  
  inflating: model/model-3999.meta   
  inflating: model/model-4999.index  
  inflating: model/model-7700.data-00000-of-00001  
  inflating: model/model-6999.index  
  inflating: model/model-5999.data-00000-of-00001  
  inflating: model/model-4999.data-00000-of-00001  
  inflating: model/model-5999.meta   
  inflating: model/model-6999.meta   
  inflating: model/model-7700.meta   
  inflating: model/checkpoint        
  inflating: model/model-3999.data-00000-of-00001  
  inflating: model/model-3999.index  
  inflating: model/model-7700.index  
  inflating: model/model-4999.meta   
   creating: segmentation/
 extracting: segmentation/model-5000.data-00000-of-00001  
 extracting: segmentation/model-97.data-00000-of-00001  
  inflating: segmentation/model-500.data-00000-of-00001  
  inflating: segmentation/model-1540.data-00000-of-00001  
 

In [18]:
# List the checkpoint files that were extracted into ./model.
! ls ./model

checkpoint			model-5999.index
model-3999.data-00000-of-00001	model-5999.meta
model-3999.index		model-6999.data-00000-of-00001
model-3999.meta			model-6999.index
model-4999.data-00000-of-00001	model-6999.meta
model-4999.index		model-7700.data-00000-of-00001
model-4999.meta			model-7700.index
model-5999.data-00000-of-00001	model-7700.meta


In [0]:
# RETRAIN is read by the cells below: when True the model is fitted on the
# training set, when False model.restore() is called instead.
RETRAIN = True

# Create the model.
# First, we create a decay rate function as learning rate
# initial_rate= 0.001, is the initial learning rate.
# decay_rate = 0.9, is the base of the exponential
# decay_steps = 250, is the number of training steps over which the rate decreases by decay_rate.
learning_rate = dc.models.tensorgraph.optimizers.ExponentialDecay(0.001, 0.9, 250)
# model_dir is ./model/, the directory the pretrained archive was extracted into.
model = dc.models.TensorGraph(learning_rate=learning_rate, model_dir='./model/')
# Input: a batch of 520x696 images (the per-sample shape reported for the training set).
features = layers.Feature(shape=(None, 520, 696))
# Target: one value per sample (the cell count parsed from the file name).
# Note: this rebinds the name `labels`, which previously held the Python list of
# cell counts used to build the dataset.
labels = layers.Label(shape=(None,))
# Stack five convolutional layers with a growing number of outputs
# (16 -> 256), each with kernel_size=5 and stride=2.
prev_layer = features
for num_outputs in [16, 32, 64, 128, 256]:
  prev_layer = layers.Conv2D(num_outputs, kernel_size=5, stride=2, in_layers=prev_layer)
# Flatten the last convolutional output and map it to a single predicted value.
output = layers.Dense(1, in_layers=layers.Flatten(prev_layer))
model.add_output(output)
# Loss: L2Loss between the prediction and the label, reduced with ReduceSum.
# NOTE(review): `output` comes from Dense(1) while `labels` is declared with
# shape (None,) -- confirm L2Loss does not broadcast the two against each other.
loss = layers.ReduceSum(layers.L2Loss(in_layers=(output, labels)))
model.set_loss(loss)

In [26]:
# Show the layers registered on the model, keyed by layer name.
model.layers

{'Conv2D_3': <deepchem.models.tensorgraph.layers.Conv2D at 0x7f8168fb0518>,
 'Conv2D_4': <deepchem.models.tensorgraph.layers.Conv2D at 0x7f8168fb04e0>,
 'Conv2D_5': <deepchem.models.tensorgraph.layers.Conv2D at 0x7f8168fb04a8>,
 'Conv2D_6': <deepchem.models.tensorgraph.layers.Conv2D at 0x7f8168fb0470>,
 'Conv2D_7': <deepchem.models.tensorgraph.layers.Conv2D at 0x7f8168fb0278>,
 'Dense_1': <deepchem.models.tensorgraph.layers.Dense at 0x7f8168fb0550>,
 'Feature_8': <deepchem.models.tensorgraph.layers.Feature at 0x7f8168fb02b0>,
 'Flatten_2': <deepchem.models.tensorgraph.layers.Flatten at 0x7f8168fb0400>,
 'L2Loss_10': <deepchem.models.tensorgraph.layers.L2Loss at 0x7f8168fb05c0>,
 'Label_11': <deepchem.models.tensorgraph.layers.Label at 0x7f8168fb0438>,
 'ReduceSum_9': <deepchem.models.tensorgraph.layers.ReduceSum at 0x7f8168fb05f8>}

In [0]:
# Make sure the checkpoint directory used by the model exists.
# The pretrained archive is extracted into ./model, which is also the
# model_dir the TensorGraph was created with. The previous check created
# ./models/model instead, a directory that the model never reads or writes.
checkpoint_dir = model.model_dir
if not os.path.exists(checkpoint_dir):
  # makedirs also creates missing parent directories.
  os.makedirs(checkpoint_dir)
  print("created " + checkpoint_dir)

if not RETRAIN:
  # Reuse the pretrained weights stored in checkpoint_dir instead of training.
  model.restore()

# Displaying the Results

In [28]:
# Optionally train the model, then report the root-mean-square error of its
# predictions on the test set.
if RETRAIN:
  print("About to fit model for 50 epochs")
  model.fit(train_dataset, nb_epoch=50)

# The predictions are flattened to 1-D before being compared with test_dataset.y.
y_pred = model.predict(test_dataset).flatten()
squared_errors = (y_pred - test_dataset.y) ** 2
print(np.sqrt(np.mean(squared_errors)))

About to fit model for 50 epochs
1.9332225552647448
