# Installation
Takes about 1 minute.

In [None]:
%%capture installation_log
# The cell magic above captures this cell's output in `installation_log`
# instead of displaying it; inspect that variable if the installation fails.

import sys

# Install Miniconda into the `/usr/local` prefix, then delete the installer
!wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
!bash Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
!rm Miniconda3-latest-Linux-x86_64.sh

# Install RDKit
# NOTE(review): no version is pinned, so the installed RDKit may differ
# between runs.
!conda install -y -c conda-forge rdkit

# Install PyPI packages
# NOTE(review): unpinned as well.
!pip install wget

# Clone smiles repository (creates a `smiles` directory in the current
# working directory)
!git clone https://github.com/mrezler/smiles.git

# Extend `sys.path`
# NOTE(review): this path hard-codes `python3.7`, while the installer above is
# `Miniconda3-latest`; if that ships a different Python version this directory
# will not exist and the conda-installed packages will not be importable —
# confirm.
sys.path.append('/usr/local/lib/python3.7/site-packages/')

In [None]:
# Set working directory to `/content/smiles`, i.e. the repository cloned by
# the installation cell, so that relative paths used later resolve inside it.
# NOTE: the path is relative, so re-running this cell without restarting the
# kernel will look for `smiles/smiles`.
cd smiles

# Imports

In [None]:
from chainer import datasets, serializers, training, Variable
from chainer.training import extensions
from feature import *
from os import makedirs
from rdkit import Chem
from sklearn import metrics
from time import time
import chainer
import chainer.functions as F
import cupy as cp
import numpy as np
import SCFPfunctions as Mf
import SCFPmodel as Mm

# Constants

In [None]:
# "Default" in the comments below refers to the documented default of each
# setting; the assigned value is what this notebook actually uses.

# --- Dataset ---------------------------------------------------------------
# Input SMILES dataset directory. Default = 'TOX21'
DATA_DIR = 'TOX21'
# Name of protein (subdataset). Default = 'NR-AR'
PROTEIN = 'NR-AR'
# Suffix of training data files. Default = 'wholetraining'
TRAIN_SX = 'wholetraining'
# Suffix of test data files. Default = 'test'
TEST_SX = 'test'
# Suffix of final scoring data files. Default = 'score'
SCORE_SX = 'score'
# Directory holding the model to evaluate. Default = 'OUT'
MODEL_DIR = 'MODEL'

# --- Featurization ---------------------------------------------------------
# Size of atom feature vector. Default = 21
ATOM_INFO = 21
# Size of structural feature vector. Default = 21
STRUCT_INFO = 21
# Max length of SMILES. Default = 400
ATOM_SIZE = 400
# Augmentation rate (-1 indicates OFF). Default = -1
BOOST = -1

# --- Training / evaluation schedule ----------------------------------------
# GPU ID (negative value indicates CPU). Default = -1
GPU = 0
# Number of molecules in each mini-batch. Default = 32
BATCH_SIZE = 32
# Number of max iterations to evaluate. Default = 500
EPOCH = 500
# Epoch frequency for evaluation. Default = 1
FREQUENCY = 1

# --- Network architecture --------------------------------------------------
# First convolution layer: window size, stride step, number of filters.
# Defaults = 11, 1, 128
K1 = 11
S1 = 1
F1 = 128
# First pooling layer: window size, stride step. Defaults = 5, 1
K2 = 5
S2 = 1
# Second convolution layer: window size, stride step, number of filters.
# Defaults = 11, 1, 64
K3 = 11
S3 = 1
F3 = 64
# Second pooling layer: window size, stride step. Defaults = 5, 1
K4 = 5
S4 = 1
# Number of hidden perceptrons. Default = 96
N_HID = 96
# Number of output perceptrons (classes). Default = 1
N_OUT = 1

# Variables

In [None]:
# Combined width of the atom and structural feature vectors; used as the last
# dimension of the feature arrays and passed to the model constructor.
lensize = ATOM_INFO + STRUCT_INFO

# Functions

In [None]:
def make_dataset(name, suffix, sep):
    """Load one SMILES file and turn it into feature and label arrays.

    The file read is ``DATA_DIR/PROTEIN_<suffix>.smiles``. Molecules that
    fail to parse, or whose kekulized SMILES string is longer than
    ``ATOM_SIZE``, are left out. The label of each molecule is taken from
    its ``_Name`` property and converted to ``int32``.

    Args:
        name: Label used only in the progress messages (e.g. 'training').
        suffix: File-name suffix selecting which split to load.
        sep: Column delimiter of the SMILES file.

    Returns:
        Tuple ``(data_f, data_t)``: a ``float32`` array reshaped to
        ``(-1, N_OUT, ATOM_SIZE, lensize)`` and an ``int32`` array reshaped
        to ``(-1, N_OUT)``.
    """
    print(f'Making {name} dataset...')
    smiles_path = f'{DATA_DIR}/{PROTEIN}_{suffix}.smiles'
    print(f'Loading smiles: {smiles_path}')
    supplier = Chem.SmilesMolSupplier(smiles_path, delimiter=sep,
                                      titleLine=False)
    # Materialize the parsed molecules first, dropping unparsable entries.
    parsed = list(filter(lambda entry: entry is not None, supplier))

    features, labels = [], []
    for mol in parsed:
        kekulized = Chem.MolToSmiles(mol, kekuleSmiles=True,
                                     isomericSmiles=True)
        if len(kekulized) > ATOM_SIZE:
            print('WARNING: Too long mol was ignored.')
            continue
        features.append(mol_to_feature(mol, -1, ATOM_SIZE))
        labels.append(mol.GetProp('_Name'))

    # NOTE(review): features and labels are passed to `Mf.random_list` in two
    # separate calls; they stay paired only if that helper applies the same
    # permutation on every call — confirm against its implementation.
    Mf.random_list(features)
    Mf.random_list(labels)

    label_shape = (-1, N_OUT)
    feature_shape = (-1, N_OUT, ATOM_SIZE, lensize)
    data_t = np.asarray(labels, dtype=np.int32).reshape(label_shape)
    data_f = np.asarray(features, dtype=np.float32).reshape(feature_shape)
    return data_f, data_t

# `trainer-challenge.py`

In [None]:
# Train the CNN on the training split, validating on the test split after
# every epoch and writing one model snapshot per `FREQUENCY` epochs into
# `MODEL_DIR/PROTEIN`.

# Show training settings at start
settings = f"""
TRAINING SETTINGS:

GPU ID: {GPU}
Minibatch-size: {BATCH_SIZE}
Epoch: {EPOCH}
\t\t\tWindow-size\tStride-step\tNo. of filters
1st convolution:\t{K1}\t\t{S1}\t\t{F1}
Max-pooling:\t\t{K2}\t\t{S2}\t\t---
2nd convolution:\t{K3}\t\t{S3}\t\t{F3}
Max-pooling:\t\t{K4}\t\t{S4}\t\t---
"""
print(settings)
# Turn timer on
start = time()
# Choose CPU or GPU (`xp` is the array module matching the selected device)
xp = np
if GPU >= 0:
    print('GPU mode')
    xp = cp
# Load training dataset
X_train, y_train = make_dataset('training', TRAIN_SX, sep=' ')
train_dataset = datasets.TupleDataset(X_train, y_train)
# Load testing dataset
X_test, y_test = make_dataset('testing', TEST_SX, sep='\t')
test_dataset = datasets.TupleDataset(X_test, y_test)
# Define model
model = Mm.CNN(ATOM_SIZE, lensize, K1, S1, F1, K2, S2, K3, S3, F3, K4, S4,
               N_HID, N_OUT)
# Transfer model to GPU if GPU mode is set
if GPU >= 0:
    chainer.cuda.get_device_from_id(GPU).use()
    model.to_gpu()
# Define optimizer
optimizer = chainer.optimizers.Adam()
optimizer.setup(model)
# Trainer settings
print('Trainer is setting up...')
output_dir = MODEL_DIR + '/' + PROTEIN
# `exist_ok=True` keeps the cell re-runnable: without it a second run fails
# with FileExistsError because the directory was created by the first run.
makedirs(output_dir, exist_ok=True)
train_iter = chainer.iterators.SerialIterator(train_dataset,
                                              batch_size=BATCH_SIZE,
                                              shuffle=True)
# `repeat=False` makes the evaluator stop after one pass over the test set.
test_iter = chainer.iterators.SerialIterator(test_dataset,
                                             batch_size=BATCH_SIZE,
                                             repeat=False, shuffle=True)
updater = training.StandardUpdater(train_iter, optimizer, device=GPU)
trainer = training.Trainer(updater, (EPOCH, 'epoch'), out=output_dir)
trainer.extend(extensions.Evaluator(test_iter, model, device=GPU))
# Snapshot file names (`model_snapshot_<epoch>`) are what the evaluation cell
# loads back, so keep the pattern and `FREQUENCY` in sync with it.
trainer.extend(extensions.snapshot_object(model,
                                          'model_snapshot_{.updater.epoch}'),
               trigger=(FREQUENCY, 'epoch'))
trainer.extend(extensions.LogReport(trigger=(1, 'epoch'),
                                    log_name='log_epoch'))
trainer.extend(extensions.LogReport(trigger=(10, 'iteration'),
                                    log_name='log_iteration'))
trainer.extend(extensions.PrintReport(['epoch', 'elapsed_time', 'main/loss',
                                       'validation/main/loss', 'main/accuracy',
                                       'validation/main/accuracy']))
# Run trainer
print('Trainer is running...')
trainer.run()
# Turn timer off and print summary of training
end = time()
print(f'Training is done. Total time: {int((end-start)//60)} minutes．')

# `evaluate-challenge.py`

In [None]:
# Evaluate every saved snapshot on the scoring split and write one row of
# classification metrics per epoch to `MODEL_DIR/PROTEIN/evaluation_epoch.csv`.
# NOTE(review): this cell uses CuPy devices unconditionally, so it presumably
# requires `GPU >= 0` — confirm before running in CPU mode.

# Load scoring dataset
X_score, y_score = make_dataset('scoring', SCORE_SX, sep='\t')
# Evaluator settings: index boundaries that split the scoring set into 30
# contiguous chunks, so that it is pushed through the model piecewise.
borders = [len(y_score) * i // 30 for i in range(30+1)]
with cp.cuda.Device(GPU):
    X_score_gpu = cp.array(X_score)
    y_score_gpu = cp.array(y_score)
# Load model (same architecture as in training; weights are loaded per epoch)
model = Mm.CNN(ATOM_SIZE, lensize, K1, S1, F1, K2, S2, K3, S3, F3, K4, S4,
               N_HID, N_OUT)
model.compute_accuracy = False
model.to_gpu(GPU)
# Run evaluator
print('Evaluator is  running...')
# The context manager guarantees the CSV is flushed and closed even when an
# epoch fails (e.g. a missing snapshot file).
with open(MODEL_DIR + '/' + PROTEIN + '/evaluation_epoch.csv', 'w') as f:
    print('epoch', 'TP', 'FN', 'FP', 'TN', 'Loss', 'Accuracy', 'B_accuracy',
          'Sepecificity', 'Precision', 'Recall', 'F-measure', 'AUC', sep='\t')
    f.write('epoch,TP,FN,FP,TN,Loss,Accuracy,B_accuracy,Sepecificity,'
            'Precision,Recall,F-measure,AUC\n')
    for epoch in range(FREQUENCY, EPOCH+1, FREQUENCY):
        pred_score, loss = [], []
        # Restore the weights saved at the end of this epoch
        with cp.cuda.Device(GPU):
            npz = MODEL_DIR + '/' + PROTEIN + '/model_snapshot_' + str(epoch)
            serializers.load_npz(npz, model)
        for i in range(30):
            with cp.cuda.Device(GPU):
                x_gpu = X_score_gpu[borders[i]:borders[i+1]]
                y_gpu = y_score_gpu[borders[i]:borders[i+1]]
                pred_tmp_gpu, sr = model.predict(Variable(x_gpu))
                # Sigmoid maps the model output into (0, 1) for thresholding
                pred_tmp_gpu = F.sigmoid(pred_tmp_gpu)
                pred_tmp = pred_tmp_gpu.data.get()
                loss_tmp = model(Variable(x_gpu), Variable(y_gpu)).data.get()
            pred_score.extend(pred_tmp.reshape(-1).tolist())
            loss.append(loss_tmp.tolist())
        # Unweighted mean of the per-chunk losses
        loss = np.mean(loss)
        pred_score = np.array(pred_score).reshape(-1, 1)
        # Binarize the scores at 0.5
        pred = 1*(pred_score >= 0.5)
        # Confusion-matrix counts
        count_TP = np.sum(np.logical_and(y_score == pred, pred == 1)*1)
        count_FP = np.sum(np.logical_and(y_score != pred, pred == 1)*1)
        count_FN = np.sum(np.logical_and(y_score != pred, pred == 0)*1)
        count_TN = np.sum(np.logical_and(y_score == pred, pred == 0)*1)
        # Derived metrics. NOTE(review): denominators can be zero (e.g. no
        # positive predictions in an early epoch); no guard is applied here.
        Accuracy = (count_TP+count_TN)/(count_TP+count_FP+count_FN+count_TN)
        Sepecificity = count_TN/(count_TN+count_FP)
        Precision = count_TP/(count_TP+count_FP)
        Recall = count_TP/(count_TP+count_FN)
        Fmeasure = 2*Recall*Precision/(Recall+Precision)
        B_accuracy = (Sepecificity+Recall)/2
        AUC = metrics.roc_auc_score(y_score, pred_score, average='weighted')
        params = (epoch, count_TP, count_FN, count_FP, count_TN, loss,
                  Accuracy, B_accuracy, Sepecificity, Precision, Recall,
                  Fmeasure, AUC)
        print(*params, sep='\t')
        text = '{},{},{},{},{},{},{},{},{},{},{},{},{}\n'.format(*params)
        f.write(text)