In [1]:
# Download DeepChem's Colab install helper script and save it as conda_installer.py.
!curl -Lo conda_installer.py https://raw.githubusercontent.com/deepchem/deepchem/master/scripts/colab_install.py
import conda_installer
# Run the installer (per its log output: sets up Miniconda under /root/miniconda
# and installs rdkit, openmm and pdbfixer).
conda_installer.install()
# List the conda environments to confirm the installation.
!/root/miniconda/bin/conda info -e

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  3501  100  3501    0     0  16133      0 --:--:-- --:--:-- --:--:-- 16133


add /root/miniconda/lib/python3.7/site-packages to PYTHONPATH
python version: 3.7.10
fetching installer from https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
done
installing miniconda to /root/miniconda
done
installing rdkit, openmm, pdbfixer
added omnia to channels
added conda-forge to channels
done
conda packages installation finished!


# conda environments:
#
base                  *  /root/miniconda



In [2]:
# Install DeepChem from PyPI; --pre lets pip pick pre-release/dev builds.
!pip install --pre deepchem
import deepchem
# Last expression of the cell: displayed as the cell output.
deepchem.__version__

Collecting deepchem
[?25l  Downloading https://files.pythonhosted.org/packages/48/4f/918faea08e6f3e42dd955fd7309912cd01782ee62fc6e3a6047192add238/deepchem-2.6.0.dev20210406183355-py3-none-any.whl (553kB)
[K     |▋                               | 10kB 19.5MB/s eta 0:00:01[K     |█▏                              | 20kB 26.9MB/s eta 0:00:01[K     |█▊                              | 30kB 23.9MB/s eta 0:00:01[K     |██▍                             | 40kB 18.3MB/s eta 0:00:01[K     |███                             | 51kB 14.9MB/s eta 0:00:01[K     |███▌                            | 61kB 13.5MB/s eta 0:00:01[K     |████▏                           | 71kB 15.0MB/s eta 0:00:01[K     |████▊                           | 81kB 15.0MB/s eta 0:00:01[K     |█████▎                          | 92kB 14.9MB/s eta 0:00:01[K     |██████                          | 102kB 16.1MB/s eta 0:00:01[K     |██████▌                         | 112kB 16.1MB/s eta 0:00:01[K     |███████                 

'2.6.0.dev'

In [3]:
# Check that RDKit is importable and display its version as the cell output.
import rdkit
rdkit.__version__

'2020.09.1'

In [4]:
# Report the TensorFlow version and the GPU device name TensorFlow can see.
import tensorflow as tf
# A bare expression is only echoed when it is the last statement of a cell,
# so the version has to be printed explicitly to show up in the output.
print('TensorFlow:', tf.__version__)
print('GPU:', tf.test.gpu_device_name())

GPU: /device:GPU:0


In [7]:
# Send log records of level INFO and above to stdout so they show up in the
# notebook's cell output.
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [9]:
# LOAD THE DATASET WITH MINIMAL PRE-PROCESSING
!mkdir zinc_data

tasks, datasets, _ = deepchem.molnet.load_zinc15(
    featurizer='raw',
    splitter=None,
    transformers=[],
    data_dir='zinc_data', 
    save_dir='zinc_data',
    dataset_size='250K')
print(tasks)

mkdir: cannot create directory ‘zinc_data’: File exists
INFO:deepchem.molnet.load_function.molnet_loader:About to featurize zinc15_250K_2D dataset.
INFO:deepchem.data.data_loader:Loading raw samples now.
INFO:deepchem.data.data_loader:shard_size: 8192
INFO:deepchem.utils.data_utils:About to start loading CSV from zinc_data/zinc15_250K_2D.csv
INFO:deepchem.utils.data_utils:Loading shard 1 of size 8192.
INFO:deepchem.data.data_loader:About to featurize shard.
INFO:deepchem.feat.base_classes:Featurizing datapoint 0
INFO:deepchem.feat.base_classes:Featurizing datapoint 1000
INFO:deepchem.feat.base_classes:Featurizing datapoint 2000
INFO:deepchem.feat.base_classes:Featurizing datapoint 3000
INFO:deepchem.feat.base_classes:Featurizing datapoint 4000
INFO:deepchem.feat.base_classes:Featurizing datapoint 5000
INFO:deepchem.feat.base_classes:Featurizing datapoint 6000
INFO:deepchem.feat.base_classes:Featurizing datapoint 7000
INFO:deepchem.feat.base_classes:Featurizing datapoint 8000
INFO:deepc

In [10]:
# EXTRACT THE SMILES STRINGS FROM THE DATASET
# Each sample yielded by itersamples() is a 4-tuple; only its first element
# (the molecule) is used, and it is converted to a SMILES string.
data = datasets[0]
train_smiles = [
    rdkit.Chem.MolToSmiles(X)
    for X, _, _, _ in data.itersamples()
]
print(len(train_smiles))

# Preview the first five SMILES strings.
for smile in train_smiles[:5]:
    print(smile)

250000
CCN(CCSC)C(=O)N[C@@](C)(CC)C(F)(F)F
CC1(C)CN(C(=O)Nc2cc3ccccc3nn2)C[C@@]2(CCOC2)O1
CC[C@H](NC(C)=O)C(=O)NCC1(NC(=O)Cc2nonc2C)CC1
O=C(N[C@@H]1CC[C@H](F)C1)[C@H]1C[C@@H]1c1ccc2c(c1)OCCO2
COCC(=O)N(C)CC(=O)NCC1(Nc2nccn3nnnc23)CC1


In [11]:
# DEFINE THE SMILES TOKENS AND MAX_LENGTH
# tokens: sorted list of every distinct character occurring in any training SMILES.
tokens = sorted({ch for s in train_smiles for ch in s})
# max_length: character length of the longest training SMILES.
max_length = max(map(len, train_smiles))

In [22]:
# DEFINE THE MODEL
from deepchem.models.optimizers import Adam, ExponentialDecay
from deepchem.models.seqtoseq import AspuruGuzikAutoEncoder

def get_model():
    """Build the AspuruGuzikAutoEncoder used in this notebook.

    Reads the notebook-level ``tokens``, ``max_length`` and ``train_smiles``.
    The model is created with batch size 64, dropout 0.25, model directory
    ``'vae'`` and an ``ExponentialDecay`` learning-rate schedule constructed
    from (0.0001, 0.90, number of training SMILES / batch size).

    Returns:
        The newly constructed (untrained) model.
    """
    batch_size = 64
    # Number of batches needed to see every training string once (float division).
    batches_per_epoch = len(train_smiles)/batch_size
    schedule = ExponentialDecay(0.0001, 0.90, batches_per_epoch)
    return AspuruGuzikAutoEncoder(
        tokens,
        max_length,
        model_dir='vae',
        batch_size=batch_size,
        learning_rate=schedule,
        dropout=0.25,
    )

In [23]:
# TRAINING
from random import shuffle

def generate_sequences(epochs, smiles=None):
    """Yield (input, target) pairs for autoencoder training.

    Every string is paired with itself, because the autoencoder is trained to
    reproduce its own input. The strings are visited in a fresh random order
    on each epoch.

    Args:
        epochs: Number of full passes over the data to generate.
        smiles: Optional sequence of SMILES strings to iterate over. Defaults
            to the notebook-level ``train_smiles``.

    Yields:
        ``(s, s)`` tuples, ``epochs * len(smiles)`` in total.
    """
    source = train_smiles if smiles is None else smiles
    for i in range(epochs):
        print('epoch:', i+1)
        # Shuffle a copy so the caller's list (by default the global
        # train_smiles shown in earlier cells) is not reordered as a side effect.
        order = list(source)
        shuffle(order)
        for s in order:
            yield (s, s)


# DeepChem sequence-to-sequence models are trained through fit_sequences()
# rather than a generic fit().
def train(model, epochs=1):
    """Train ``model`` on the sequence pairs produced by ``generate_sequences``.

    Args:
        model: Object exposing ``fit_sequences(iterable_of_pairs)``.
        epochs: Number of passes over the training data (default 1).
    """
    sequence_pairs = generate_sequences(epochs)
    model.fit_sequences(sequence_pairs)

In [24]:
# GENERATE MOLECULES AND TEST IF THEY ARE VALID
import numpy as np

def generate_molecules(model, n_molecules=1000, latent_dim=196):
    """Sample molecules from the model and keep those RDKit can parse.

    Draws ``n_molecules`` random embeddings from a standard normal
    distribution, decodes them with ``model.predict_from_embeddings``, joins
    each decoded token sequence into a SMILES string and keeps the strings
    for which ``rdkit.Chem.MolFromSmiles`` returns a molecule.

    The first nine decoded strings are printed (valid or not), followed by
    the fraction of valid molecules.

    Args:
        model: Object exposing ``predict_from_embeddings(array)``.
        n_molecules: Number of embeddings to sample.
        latent_dim: Width of each sampled embedding. 196 is the value this
            notebook originally hard-coded; it presumably has to match the
            model's embedding dimension -- confirm when changing the model.

    Returns:
        List of the valid SMILES strings, in generation order.
    """
    embeddings = np.random.normal(size=(n_molecules, latent_dim))
    predictions = model.predict_from_embeddings(embeddings)
    valid = []

    for index, p in enumerate(predictions):
        smiles = ''.join(p)
        # Echo a small sample of raw decoder output for eyeballing.
        if index < 9:
            print(smiles)
        # RDKit returns None for strings it cannot parse into a molecule.
        if rdkit.Chem.MolFromSmiles(smiles) is not None:
            valid.append(smiles)

    # Guard against division by zero when no molecules were requested.
    fraction_valid = len(valid) / n_molecules if n_molecules else 0.0
    print(fraction_valid)
    return valid

    

In [None]:
# Build the model, train it for 30 epochs and sample 5000 candidate molecules,
# all under the first GPU device scope.
with tf.device('/device:GPU:0'):
    model = get_model()
    train(model, 30)
    valid = generate_molecules(model, 5000)

    print(len(valid), 'valid molecules')
    # Show at most the first 20 valid SMILES strings.
    for v in valid[:20]:
        print(v)

epoch: 1
INFO:deepchem.models.keras_model:Ending global_step 100: Average loss 180.417
INFO:deepchem.models.keras_model:Ending global_step 200: Average loss 121.836
INFO:deepchem.models.keras_model:Ending global_step 300: Average loss 110.598
INFO:deepchem.models.keras_model:Ending global_step 400: Average loss 108.013
INFO:deepchem.models.keras_model:Ending global_step 500: Average loss 107.608
INFO:deepchem.models.keras_model:Ending global_step 600: Average loss 106.655
INFO:deepchem.models.keras_model:Ending global_step 700: Average loss 106.237
INFO:deepchem.models.keras_model:Ending global_step 800: Average loss 106.018
INFO:deepchem.models.keras_model:Ending global_step 900: Average loss 105.791
INFO:deepchem.models.keras_model:Ending global_step 1000: Average loss 105.416
INFO:deepchem.models.keras_model:Ending global_step 1100: Average loss 104.756
INFO:deepchem.models.keras_model:Ending global_step 1200: Average loss 104.768
INFO:deepchem.models.keras_model:Ending global_step 