<a href="https://colab.research.google.com/github/naototachibana/colab_transplantation/blob/master/chainer_chemistry_colab_transplantation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 2
# Install RDKit.
# Download the installer script to rdkit_installer.py (-L follows redirects,
# -o writes the response to the named file), then import and run it.
!curl -Lo rdkit_installer.py https://git.io/fxiPZ
import rdkit_installer
%time rdkit_installer.install()

# NOTE(review): this install is unpinned; the recorded output of this cell
# shows chainer-chemistry 0.5.0 being installed.
!pip install chainer-chemistry

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  2415  100  2415    0     0   4480      0 --:--:-- --:--:-- --:--:--  4480


add /root/miniconda/lib/python3.6/site-packages to PYTHONPATH
rdkit is already installed


CPU times: user 2 ms, sys: 0 ns, total: 2 ms
Wall time: 2.07 ms
Collecting chainer-chemistry
[?25l  Downloading https://files.pythonhosted.org/packages/df/47/99562193229fe60940ef28ef4366d5c16308f76db953350a4ae6f602d32a/chainer-chemistry-0.5.0.tar.gz (77kB)
[K     |████████████████████████████████| 81kB 4.3MB/s 
Building wheels for collected packages: chainer-chemistry
  Building wheel for chainer-chemistry (setup.py) ... [?25l[?25hdone
  Created wheel for chainer-chemistry: filename=chainer_chemistry-0.5.0-cp36-none-any.whl size=132520 sha256=616728127f4bf244a7e73b3699dfc3622a207146c85530bee1133a68e4f57f31
  Stored in directory: /root/.cache/pip/wheels/2e/32/e0/15d059a9218ee5d6306e124aa82b41a63c5fc61885fca277ae
Successfully built chainer-chemistry
Installing collected packages: chainer-chemistry
Successfully installed chainer-chemistry-0.5.0


In [0]:
#!/usr/bin/env python

from __future__ import print_function

import chainer
import numpy
import os
import pickle

from argparse import ArgumentParser
from chainer.datasets import split_dataset_random
from chainer.datasets import SubDataset
from chainer import cuda
from chainer import functions as F
from chainer import optimizers
from chainer import training
from chainer import Variable
from chainer.iterators import SerialIterator
from chainer.training import extensions as E
from chainer.training import triggers
from sklearn.preprocessing import StandardScaler


from chainer_chemistry.dataset.converters import concat_mols
from chainer_chemistry.dataset.parsers import CSVFileParser
from chainer_chemistry.dataset.preprocessors import preprocess_method_dict
from chainer_chemistry.datasets import NumpyTupleDataset
from chainer_chemistry.models import MLP, NFP, GGNN, SchNet, WeaveNet, RSGCN, Regressor  # NOQA
from datetime import datetime
# Timestamp string captured when this cell is executed.
# NOTE(review): not referenced anywhere else in the visible notebook.
starting_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

import requests
import json
# Slack incoming-webhook URL that post_slack() sends its payload to.
# NOTE(review): this is a credential committed in source. It should be rotated
# and loaded from the environment (or another secret store) instead.
post_url = "https://hooks.slack.com/services/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"

In [0]:
def post_slack(name, text):
    """Send a best-effort notification to a Slack incoming webhook.

    The webhook URL is taken from the ``SLACK_WEBHOOK_URL`` environment
    variable when it is set and non-empty; otherwise the module-level
    ``post_url`` is used. This lets the secret be supplied without keeping
    it in the notebook source.

    Notification is auxiliary to training, so failures are reported on
    stdout instead of being raised.

    Args:
        name: Value sent as the ``username`` field of the Slack payload.
        text: Value sent as the ``text`` field of the Slack payload.

    Returns:
        None.
    """
    webhook_url = os.environ.get("SLACK_WEBHOOK_URL") or post_url
    if not webhook_url:
        print("[post_slack] no webhook URL configured; skipping notification")
        return

    payload = json.dumps(
        {"text": text,
         "username": name,
         "icon_emoji": ":python:"})
    try:
        # Without a timeout, requests may block indefinitely on a stalled
        # connection, which would hang the notebook after training finished.
        response = requests.post(webhook_url, data=payload, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        print("[post_slack] notification failed: {}".format(exc))

def best_model_save_extension(model_path_best, regressor, protocol):
    """Build a trainer extension that pickles ``regressor`` to a fixed path.

    Args:
        model_path_best: Destination path handed to ``regressor.save_pickle``.
        regressor: Object exposing ``save_pickle(path, protocol=...)``.
        protocol: Pickle protocol version forwarded to ``save_pickle``.

    Returns:
        The callable produced by ``training.make_extension()``; when invoked
        it saves ``regressor`` to ``model_path_best``.
    """
    def _save_pickle(trainer):
        # The trainer argument is part of the extension call signature but is
        # not needed to write the model.
        regressor.save_pickle(model_path_best, protocol=protocol)

    as_extension = training.make_extension()
    return as_extension(_save_pickle)

class GraphConvPredictor(chainer.Chain):
    """Graph convolution network optionally followed by an MLP."""

    def __init__(self, graph_conv, mlp=None):
        """Initializes the graph convolution predictor.

        Args:
            graph_conv: The graph convolution network that produces the
                        molecule feature representation.
            mlp: Multi layer perceptron applied to the output of
                 `graph_conv`. Pass `None` when nothing should be applied
                 after the graph convolution.
        """
        super(GraphConvPredictor, self).__init__()
        mlp_is_link = isinstance(mlp, chainer.Link)
        with self.init_scope():
            self.graph_conv = graph_conv
            if mlp_is_link:
                # Only chainer.Link instances are assigned inside init_scope.
                self.mlp = mlp
        if not mlp_is_link:
            # Anything else (e.g. None) is stored as a plain attribute.
            self.mlp = mlp

    def __call__(self, atoms, adjs):
        """Applies the graph convolution, then the MLP when one is set."""
        features = self.graph_conv(atoms, adjs)
        return self.mlp(features) if self.mlp else features


class MeanAbsError(object):
    """Mean absolute error metric that also dumps its inputs to CSV files."""

    def __init__(self, scaler=None, out_dir="result"):
        """Initializes the (scaled) mean absolute error metric object.

        Args:
            scaler: Standard label scaler; when given, both inputs are passed
                    through its `inverse_transform` before comparison.
            out_dir: Directory that receives `pred.csv` and `meas.csv`.
        """
        self.out_dir = out_dir
        self.scaler = scaler

    def __call__(self, x0, x1):
        """Returns the mean absolute error of the first output column.

        Every call overwrites `<out_dir>/pred.csv` (from `x0`) and
        `<out_dir>/meas.csv` (from `x1`) with the compared values.
        """
        # Unwrap chainer Variables to their underlying arrays.
        pred = cuda.to_cpu(x0.data) if isinstance(x0, Variable) else x0
        meas = cuda.to_cpu(x1.data) if isinstance(x1, Variable) else x1

        if self.scaler is not None:
            pred = self.scaler.inverse_transform(cuda.to_cpu(pred))
            meas = self.scaler.inverse_transform(cuda.to_cpu(meas))
        else:
            pred = cuda.to_cpu(pred)
            meas = cuda.to_cpu(meas)

        residual = pred - meas
        numpy.savetxt(self.out_dir + "/pred.csv", pred)
        numpy.savetxt(self.out_dir + "/meas.csv", meas)

        # Average over the batch axis and report only column 0.
        return numpy.mean(numpy.absolute(residual), axis=0)[0]


class RootMeanSqrError(object):
    """Root mean square error metric with optional label un-scaling."""

    def __init__(self, scaler=None):
        """Initializes the (scaled) root mean square error metric object.

        Args:
            scaler: Standard label scaler; when given, both inputs are passed
                    through its `inverse_transform` before comparison.
        """
        self.scaler = scaler

    def __call__(self, x0, x1):
        """Returns the root mean square error of the first output column."""
        # Unwrap chainer Variables to their underlying arrays.
        lhs = cuda.to_cpu(x0.data) if isinstance(x0, Variable) else x0
        rhs = cuda.to_cpu(x1.data) if isinstance(x1, Variable) else x1

        if self.scaler is None:
            residual = cuda.to_cpu(lhs) - cuda.to_cpu(rhs)
        else:
            lhs_unscaled = self.scaler.inverse_transform(cuda.to_cpu(lhs))
            rhs_unscaled = self.scaler.inverse_transform(cuda.to_cpu(rhs))
            residual = lhs_unscaled - rhs_unscaled

        # Mean of squares over the batch axis; only column 0 is reported.
        mean_square = numpy.mean(numpy.power(residual, 2), axis=0)[0]
        return numpy.sqrt(mean_square)


class ScaledAbsError(object):
    """Mean absolute error metric with optional label un-scaling."""

    def __init__(self, scaler=None):
        """Stores the optional scaler.

        Args:
            scaler: Standard label scaler; when given, both inputs are passed
                    through its `inverse_transform` before comparison.
        """
        self.scaler = scaler

    def __call__(self, x0, x1):
        """Returns the mean absolute error of the first output column."""
        # Unwrap chainer Variables to their underlying arrays.
        if isinstance(x0, Variable):
            x0 = cuda.to_cpu(x0.data)
        if isinstance(x1, Variable):
            x1 = cuda.to_cpu(x1.data)

        left, right = cuda.to_cpu(x0), cuda.to_cpu(x1)
        if self.scaler is not None:
            left = self.scaler.inverse_transform(left)
            right = self.scaler.inverse_transform(right)

        abs_residual = numpy.absolute(left - right)
        # Average over the batch axis and report only column 0.
        return numpy.mean(abs_residual, axis=0)[0]


def set_up_predictor(method, n_unit, conv_layers, class_num):
    """Builds the graph convolution predictor for the requested method.

    Args:
        method: Method name. One of `nfp`, `ggnn`, `schnet`, `weavenet`
                or `rsgcn`.
        n_unit: Number of hidden units.
        conv_layers: Number of convolutional layers for the graph convolution
                     network.
        class_num: Number of output classes.

    Returns:
        A `GraphConvPredictor` wrapping the selected network.

    Raises:
        ValueError: If `method` is not one of the supported names.
    """
    # The MLP is created up front; the `schnet` branch does not use it
    # because SchNet is built with `out_dim=class_num` directly.
    readout = MLP(out_dim=class_num, hidden_dim=n_unit)

    if method == 'nfp':
        print('Training an NFP predictor...')
        network = NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers)
        return GraphConvPredictor(network, readout)

    if method == 'ggnn':
        print('Training a GGNN predictor...')
        network = GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers)
        return GraphConvPredictor(network, readout)

    if method == 'schnet':
        print('Training an SchNet predictor...')
        network = SchNet(out_dim=class_num, hidden_dim=n_unit,
                         n_layers=conv_layers)
        return GraphConvPredictor(network, None)

    if method == 'weavenet':
        print('Training a WeaveNet predictor...')
        n_atom = 30  # adjust with config.py
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        network = WeaveNet(weave_channels=weave_channels, hidden_dim=n_unit,
                           n_sub_layer=n_sub_layer, n_atom=n_atom)
        return GraphConvPredictor(network, readout)

    if method == 'rsgcn':
        print('Training an RSGCN predictor...')
        network = RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers)
        return GraphConvPredictor(network, readout)

    raise ValueError('[ERROR] Invalid method: {}'.format(method))


def parse_arguments():
    """Builds the argument parser and returns the default configuration.

    The parser is always evaluated against an empty argument list, so the
    returned namespace holds the defaults declared below; edit those defaults
    to reconfigure a run.

    Returns:
        argparse.Namespace with the attributes `datafile`, `datafile_test`,
        `method`, `label`, `label_t`, `scale`, `conv_layers`, `batchsize`,
        `gpu`, `out`, `epoch`, `unit_num`, `seed`, `train_data_ratio`,
        `protocol` and `model_filename`.
    """
    # Lists of supported preprocessing methods/models.
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    scale_list = ['standardize', 'none']

    # Set up the argument parser.
    parser = ArgumentParser(description='Regression on own dataset')
    parser.add_argument('--datafile', '-d', type=str,
                        default='dataset_train.csv',
                        help='csv file containing the training dataset')
    parser.add_argument('--datafile_test', '-dt', type=str,
                        default='dataset_test.csv',
                        help='csv file containing the test dataset')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        help='method name', default='nfp')

    parser.add_argument('--label', '-l', nargs='+',
                        default=['value1', 'value2'],
                        help='target label for regression')

    parser.add_argument('--label_t', '-lt', nargs='+',
                        default=['value1', 'value2'],
                        help='target label for regression (test dataset)')

    parser.add_argument('--scale', type=str, choices=scale_list,
                        help='label scaling method', default='standardize')
    parser.add_argument('--conv-layers', '-c', type=int, default=4,
                        help='number of convolution layers')
    parser.add_argument('--batchsize', '-b', type=int, default=32,
                        help='batch size')
    # The two adjacent literals are concatenated, so the first one must end
    # with a space to keep the words apart in the rendered help.
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='id of gpu to use; negative value means running '
                        'the code on cpu')
    parser.add_argument('--out', '-o', type=str, default='result',
                        help='path to save the computed model to')
    parser.add_argument('--epoch', '-e', type=int, default=10,
                        help='number of epochs')
    parser.add_argument('--unit-num', '-u', type=int, default=16,
                        help='number of units in one layer of the model')
    parser.add_argument('--seed', '-s', type=int, default=777,
                        help='random seed value')
    parser.add_argument('--train-data-ratio', '-r', type=float, default=0.8,
                        help='ratio of training data w.r.t the dataset')
    parser.add_argument('--protocol', type=int, default=2,
                        help='pickle protocol version')
    parser.add_argument('--model-filename', type=str, default='regressor.pkl',
                        help='saved model filename')

    # Parse an explicit empty list instead of sys.argv: inside a notebook
    # kernel sys.argv carries the launcher's own `-f <kernel.json>` option,
    # which this parser would reject as an unrecognized argument.
    return parser.parse_args(args=[])

In [3]:
def main():
    """Trains a graph-convolution regressor and saves the results.

    Steps, in order:
      1. Read the configuration from `parse_arguments()`.
      2. Parse the training CSV (`args.datafile`) and the test CSV
         (`args.datafile_test`) with the preprocessor for `args.method`.
      3. Optionally standardize the labels. The scaler is fitted on the
         training labels only and then applied to the test labels.
      4. Train for `args.epoch` epochs, using the whole test CSV as the
         validation set, and save the model with the lowest validation RMSE.
      5. Save the final model and the scaler under `args.out`, then post a
         Slack notification.

    `args.seed` and `args.train_data_ratio` are not used by this function.

    Raises:
        ValueError: If no target label is configured.
    """
    # Parse the arguments.
    args = parse_arguments()
    print ("attributes of argument parser\n")
    print(vars(args))

    if not args.label:
        raise ValueError('No target label was specified.')
    labels = args.label
    class_num = len(labels) if isinstance(labels, list) else 1
    # NOTE(review): `--label_t` is parsed but not consumed here. The single
    # parser below (configured with `--label`) reads both CSV files, so the
    # test file must use the same label column names. Confirm whether a
    # separate set of test label columns was intended.

    # Dataset preparation. Postprocessing is required for the regression task.
    def postprocess_label(label_list):
        return numpy.asarray(label_list, dtype=numpy.float32)

    # Apply a preprocessor to the dataset.
    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[args.method]()
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES')
    dataset = parser.parse(args.datafile)['dataset']
    dataset_test = parser.parse(args.datafile_test)['dataset']

    # Scale the label values, if necessary.
    if args.scale == 'standardize':
        scaler = StandardScaler()
        # Fit on the training labels only, then reuse the same statistics for
        # the test labels. Refitting on the test labels would standardize the
        # two sets differently and would leave the scaler (used by the metrics
        # and saved to scaler.pkl) describing the test set, not the training
        # set.
        scaled_labels = scaler.fit_transform(dataset.get_datasets()[-1])
        scaled_labels_test = scaler.transform(dataset_test.get_datasets()[-1])
        dataset = NumpyTupleDataset(
            *(dataset.get_datasets()[:-1] + (scaled_labels,)))
        dataset_test = NumpyTupleDataset(
            *(dataset_test.get_datasets()[:-1] + (scaled_labels_test,)))
    else:
        scaler = None

    # Use the full training file for training and the full test file for
    # validation (no random split).
    train = SubDataset(dataset, 0, int(len(dataset)))
    print(len(train))
    val = SubDataset(dataset_test, 0, int(len(dataset_test)))
    print(len(val))

    # Set up the predictor.
    predictor = set_up_predictor(args.method, args.unit_num,
                                 args.conv_layers, class_num)

    # Set up the iterator.
    train_iter = SerialIterator(train, args.batchsize)
    val_iter = SerialIterator(val, args.batchsize, repeat=False, shuffle=False)

    # Set up the regressor.
    metrics_fun = {'mean_abs_error': MeanAbsError(scaler=scaler, out_dir=args.out),
                   'root_mean_sqr_error': RootMeanSqrError(scaler=scaler)}

    regressor = Regressor(predictor, lossfun=F.mean_squared_error,
                          metrics_fun=metrics_fun, device=args.gpu)

    # Set up the optimizer.
    optimizer = optimizers.Adam()
    optimizer.setup(regressor)

    # Set up the updater.
    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)

    # Set up the trainer: stop after args.epoch epochs and write the related
    # files under args.out.
    print('Training...')
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Evaluate on the validation iterator.
    trainer.extend(E.Evaluator(val_iter, regressor, device=args.gpu,
                               converter=concat_mols))

    # Extension that saves the best model: checked once per epoch and fired
    # when the validation RMSE reaches a new minimum.
    trigger_best_model = triggers.MinValueTrigger(
        'validation/main/root_mean_sqr_error', trigger=(1, 'epoch'))
    model_path_best = os.path.join(args.out, "best" + args.model_filename)
    trainer.extend(
        best_model_save_extension(model_path_best, regressor, args.protocol),
        trigger=trigger_best_model)
    trainer.extend(E.LogReport(log_name='log_report'))
    trainer.extend(E.PrintReport(['epoch', 'main/loss', 'main/mean_abs_error',
                                  'main/root_mean_sqr_error', 'validation/main/loss',
                                  'validation/main/mean_abs_error',
                                  'validation/main/root_mean_sqr_error',
                                  'elapsed_time']))
    trainer.extend(E.ProgressBar())
    trainer.extend(E.dump_graph(root_name="main/loss", out_name="cg.dot"))
    trainer.run()

    # Save the regressor's parameters.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))
    regressor.save_pickle(model_path, protocol=args.protocol)

    # Save the standard scaler's parameters.
    if scaler is not None:
        with open(os.path.join(args.out, 'scaler.pkl'), mode='wb') as f:
            pickle.dump(scaler, f, protocol=args.protocol)

    # Notify Slack that training has finished.
    slack_message = "method: " + str(args.method) + "\n" + "targets:\n" + str(args.label)
    post_slack ("best_loss_epoch_list", "学習が終了しました\n" + slack_message)

# Entry point: run the training pipeline when this code is executed as the
# main module.
if __name__ == '__main__':
    main()

usage: ipykernel_launcher.py [-h] [--datafile DATAFILE]
                             [--datafile_test DATAFILE_TEST]
                             [--method {nfp,ggnn,schnet,weavenet,rsgcn}]
                             [--label LABEL [LABEL ...]]
                             [--label_t LABEL_T [LABEL_T ...]]
                             [--scale {standardize,none}]
                             [--conv-layers CONV_LAYERS]
                             [--batchsize BATCHSIZE] [--gpu GPU] [--out OUT]
                             [--epoch EPOCH] [--unit-num UNIT_NUM]
                             [--seed SEED]
                             [--train-data-ratio TRAIN_DATA_RATIO]
                             [--protocol PROTOCOL]
                             [--model-filename MODEL_FILENAME]
ipykernel_launcher.py: error: unrecognized arguments: -f /root/.local/share/jupyter/runtime/kernel-74c8fc45-2b89-4dde-aa23-19d34f3dc4ba.json


SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
