<a href="https://colab.research.google.com/github/maskot1977/PythonCourse2019/blob/master/%E7%AC%AC41%E5%9B%9E%E3%82%B1%E3%83%A2%E3%82%A4%E3%83%B3%E3%83%95%E3%82%A9%E3%83%9E%E3%83%86%E3%82%A3%E3%82%AF%E3%82%B9%E8%8B%A5%E6%89%8B%E3%81%AE%E5%9B%9E_RDKit_tutorial_%E3%83%8F%E3%83%B3%E3%82%BA%E3%82%AA%E3%83%B3%E8%B3%87%E6%96%99.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1
# This is a cell. Run the code inside it with the triangle on the left, Shift-Enter, etc.
greeting = "Hello, world"
print(greeting)

In [None]:
# 2
# Install RDKit
# NOTE(review): this downloads a script through a shortened URL and then imports
# (i.e. executes) it -- confirm the target is trusted before running.
!curl -Lo rdkit_installer.py https://git.io/fxiPZ
import rdkit_installer
%time rdkit_installer.install()

# Install py3Dmol for 3D display
!pip install py3Dmol

# Download the SDF file used by the examples below
!curl -Lo example.sdf https://gist.githubusercontent.com/philopon/e32ac48058f9b96e16f04f65326154b5/raw/c205c8443f4826fe262de5669332076e16b12684/test.sdf
# Also create a gzip-compressed copy
!cat example.sdf | gzip > example.sdf.gz

# Install mordred
!pip install mordred

In [None]:
# 3
# ライブラリをインポート
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from IPython.display import display

In [None]:
# 4 Loading a molecule
benzene_smiles = "c1ccccc1"
benzene = Chem.MolFromSmiles(benzene_smiles)
display(benzene)

In [None]:
# Malformed input makes MolFromSmiles return None
invalid_smiles = "c1ccccc"
invalid = Chem.MolFromSmiles(invalid_smiles)
display(invalid)

In [None]:
# 5 Loading multiple molecules
supplier = Chem.SDMolSupplier('example.sdf')
for mol in supplier:
    display(mol)

In [None]:
# ForwardSDMolSupplier accepts a file object, so compressed files can be read
# without decompressing them first.
import gzip

# Iterate inside the with-block: the supplier reads from the handle while
# looping, and the with-block closes the handle afterwards (the original left
# it open).
with gzip.open('example.sdf.gz') as gz_file:
    for mol in Chem.ForwardSDMolSupplier(gz_file):
        display(mol)

In [None]:
# 6 Writing a molecule out (as SMILES)
benzene_as_smiles = Chem.MolToSmiles(benzene)
print(benzene_as_smiles)

In [None]:
# Same molecule, written out as a MOL block
benzene_mol_block = Chem.MolToMolBlock(benzene)
print(benzene_mol_block)

In [None]:
# 7 Writing multiple molecules to an SDF file
from contextlib import closing

hexane = Chem.MolFromSmiles("CCCCCC")

# closing() calls out.close() when the block exits
with closing(Chem.SDWriter('foo.sdf')) as out:
    for entry in (benzene, hexane):
        out.write(entry)

In [None]:
# 8 MolVS: molecule standardization
from rdkit.Chem import MolStandardize

raw_smiles = 'C[S+2]([O-])([O-])O'
mol = Chem.MolFromSmiles(raw_smiles)
display(mol)

# Show the input above and the normalized molecule as the cell result
normalizer = MolStandardize.normalize.Normalizer()
normalized_mol = normalizer.normalize(mol)
normalized_mol

In [None]:
# 9 MolVS: keep only the largest fragment
mixture_smiles = "O=C(O)CCC.O=C(O)CCCC.O=C(O)CCCCC.O=C(O)CCCC"
mol = Chem.MolFromSmiles(mixture_smiles)
display(mol)

lfc = MolStandardize.fragment.LargestFragmentChooser()
largest_fragment = lfc.choose(mol)
largest_fragment

In [None]:
# 10 MolVS: charge neutralization
charged_mol = Chem.MolFromSmiles("O=C([O-])CCCC")
display(charged_mol)

# Show the charged input above and the uncharged molecule as the cell result
uc = MolStandardize.charge.Uncharger()
mol = uc.uncharge(charged_mol)
mol

In [None]:
# 11 Tautomer enumeration
enol = Chem.MolFromSmiles("OC(C)=C(C)C")
te = MolStandardize.tautomer.TautomerEnumerator()

# `mols` is reused by the canonicalization cell that follows
mols = te.enumerate(enol)

for mol in mols:
    display(mol)

In [None]:
# 12 Tautomer canonicalization
tc = MolStandardize.tautomer.TautomerCanonicalizer()

# Canonicalize each enumerated tautomer and show the result
for mol in mols:
    canonical_tautomer = tc.canonicalize(mol)
    display(canonical_tautomer)

In [None]:
# 13 Generating a 3D structure
from rdkit.Chem import AllChem

mol = Chem.AddHs(Chem.MolFromSmiles('OC(C)=C(C)C'))

# Embedding is stochastic; a fixed seed makes the conformer reproducible
# across "Restart & Run All".
conformer_id = AllChem.EmbedMolecule(mol, randomSeed=42)
if conformer_id < 0:
    # EmbedMolecule signals failure with -1; there is nothing to draw then.
    raise RuntimeError("3D embedding failed for 'OC(C)=C(C)C'")

IPythonConsole.drawMol3D(mol)

In [None]:
# m1: a single mordred descriptor
from mordred.AtomCount import AtomCount

carbon_count = AtomCount("C")
print(carbon_count)

# Apply the descriptor to one molecule; the value is the cell result
benzoic_acid = Chem.MolFromSmiles('c1ccccc1C(=O)O')
carbon_count(benzoic_acid)

In [None]:
# m2: register individual descriptors on a Calculator
from mordred import Calculator

calc = Calculator()
for element_symbol in ("C", "O"):
    calc.register(AtomCount(element_symbol))

benzoic_acid = Chem.MolFromSmiles('c1ccccc1C(=O)O')
calc(benzoic_acid).asdict()

In [None]:
# m3: build a Calculator from the AtomCount descriptor class
calc = Calculator(AtomCount)

benzoic_acid = Chem.MolFromSmiles('c1ccccc1C(=O)O')
atom_count_values = calc(benzoic_acid)
atom_count_values.asdict()

In [None]:
# m4: build a Calculator from the whole mordred.descriptors module
from mordred import descriptors

calc = Calculator(descriptors)

# Number of values computed for one molecule
benzoic_acid = Chem.MolFromSmiles('c1ccccc1C(=O)O')
all_values = calc(benzoic_acid)
len(all_values)

In [None]:
# m5: compute descriptors for several molecules via calc.pandas
example_mols = [Chem.MolFromSmiles(smiles) for smiles in ('c1ccccc1', 'CCCCCC')]
result = calc.pandas(example_mols)
result

In [None]:
# m6 Descriptor arithmetic
nC = AtomCount("C")
nN = AtomCount("N")
nCnN = nC * nN  # product of two descriptors, used as a descriptor itself below

mol = Chem.MolFromSmiles("c1ccncc1N")
descriptor_values = [descriptor(mol) for descriptor in (nC, nN, nCnN)]
print(*descriptor_values)

In [None]:
# e1 - Download the data
# Fetches test.sdf and train.sdf into the current directory (curl -O keeps the remote file name).
!curl -OL https://gist.githubusercontent.com/philopon/7dfae1175aa0788224ecbb961761990e/raw/839010eb0deed2308034f650ef82550f582069e7/test.sdf
!curl -OL https://gist.githubusercontent.com/philopon/7dfae1175aa0788224ecbb961761990e/raw/839010eb0deed2308034f650ef82550f582069e7/train.sdf

In [None]:
# e1 - Load the training set
# SDMolSupplier yields None for records it cannot parse; skip those so the
# property lookup below does not fail on them.
train_mols_raw = [
    mol
    for mol in Chem.SDMolSupplier('train.sdf', removeHs=False)
    if mol is not None
]
# Target values are stored as the 'hERG' property of each SDF record
train_y = [mol.GetDoubleProp('hERG') for mol in train_mols_raw]

# Keep only the largest fragment of each molecule.
# Built from train_mols_raw (not overwritten in place) so re-running is idempotent.
lfc = MolStandardize.fragment.LargestFragmentChooser()
train_mols = [lfc(mol) for mol in train_mols_raw]

for mol in train_mols:
    display(mol)

In [None]:
# e1 - Descriptor calculation
# `calc` is reused later for the test molecules.
calc = Calculator(descriptors)
train_descs = calc.pandas(train_mols)

In [None]:
# e1 - Training
# preprocessing.Imputer was removed from scikit-learn (0.22); impute.SimpleImputer
# is its replacement and also defaults to mean imputation.
from sklearn import impute, linear_model, pipeline, preprocessing

model = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer()),  # fill in missing values
    ('scaling', preprocessing.RobustScaler()),  # scaling
    ('model', linear_model.LassoCV()),  # regression model
])
# fill_missing() is applied to the descriptor table before it is passed to the pipeline
model.fit(train_descs.fill_missing(), train_y)

In [None]:
# e1 - Load the test set
# SDMolSupplier yields None for records it cannot parse; skip those so the
# property lookup below does not fail on them.
test_mols_raw = [
    mol
    for mol in Chem.SDMolSupplier('test.sdf', removeHs=False)
    if mol is not None
]
# Target values are stored as the 'hERG' property of each SDF record
test_y = [mol.GetDoubleProp('hERG') for mol in test_mols_raw]

# Keep only the largest fragment of each molecule (reuses `lfc` from the training cell).
# Built from test_mols_raw (not overwritten in place) so re-running is idempotent.
test_mols = [lfc(mol) for mol in test_mols_raw]

for mol in test_mols:
    display(mol)

In [None]:
# e1 - Descriptor calculation for the test set (same `calc` as for the training set)
test_descs = calc.pandas(test_mols)

In [None]:
# e1 - Compute R2 of the fitted descriptor model on the test set
model.score(test_descs.fill_missing(), test_y)

In [None]:
# Model using ECFP4 fingerprints, for comparison
from sklearn import feature_selection

# ECFP4 means diameter 4, i.e. a Morgan radius of 2 (the original passed 4,
# which corresponds to ECFP8).
ECFP4_RADIUS = 2

# Each fingerprint bit vector is expanded to a plain list of bits
train_fps = [list(AllChem.GetMorganFingerprintAsBitVect(mol, ECFP4_RADIUS)) for mol in train_mols]
test_fps = [list(AllChem.GetMorganFingerprintAsBitVect(mol, ECFP4_RADIUS)) for mol in test_mols]

In [None]:
# preprocessing.Imputer was removed from scikit-learn (0.22); impute.SimpleImputer
# is its replacement. Imported here so this cell does not rely on which sklearn
# submodules earlier cells happened to import.
from sklearn import impute

fp_model = pipeline.Pipeline([
    ('threshold', feature_selection.VarianceThreshold()),  # remove low-variance features
    ('imputer', impute.SimpleImputer()),  # fill in missing values
    ('scaling', preprocessing.RobustScaler()),  # scaling
    ('model', linear_model.LassoCV(max_iter=10000)),  # regression model
])

fp_model.fit(train_fps, train_y)

In [None]:
# Score of the fingerprint-based comparison model on the test set
fp_model.score(test_fps, test_y)

In [None]:
#@title リンク集
#@markdown [公式サイト](https://www.rdkit.org)
#@markdown [ソース](https://github.com/rdkit/rdkit)
#@markdown [ブログ](https://rdkit.blogspot.com)
#@markdown [RDKit Users JPのSlack](https://slackin-ojhaoyxrak.now.sh)
#@markdown [RDKit o-cha-kaiのconnpass](https://rdkit-users-jp.connpass.com/)
#@markdown [Anaconda](https://www.anaconda.com)
#@markdown [Miniconda](https://conda.io/miniconda.html)
#@markdown [公式ドキュメント](http://www.rdkit.org/docs/index.html)
#@markdown [github releases](https://github.com/rdkit/rdkit/releases)
#@markdown [Contrib](https://github.com/rdkit/rdkit/tree/master/Contrib)
#@markdown [scikit-learnのドキュメント](https://scikit-learn.org/stable/)
