In [1]:
!pip install --pre deepchem
!pip install --user --upgrade catboost
!pip install --user --upgrade ipywidgets
!pip install shap
!pip install sklearn
!pip install --upgrade numpy
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


We can now import the `deepchem` package to play with.

## Подготовка данных

Датасет содержит `SMILES`, по которым с помощью ECFP вычисляются 1024 молекулярных дескриптора.

Суть scaffold-разбиения (scaffold split) — разбить молекулы на группы на основе их молекулярного скелета (фрагмента молекулы, который остается, если удалить все боковые цепочки и заместители), а затем распределить эти группы молекул между обучающей, тестовой и валидационной выборками таким образом, чтобы каждая группа принадлежала только одной выборке. Разделение на основе скелета позволяет гарантировать, что новые молекулы не будут похожи на те, которые модель уже видела в обучающей выборке.

In [2]:
import deepchem as dc
import pandas as pd
from deepchem.molnet.load_function.molnet_loader import TransformerGenerator, _MolnetLoader

from deepchem.data import Dataset
from typing import List, Optional, Tuple, Union
import pandas as pd
import numpy as np
import random
import os
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import catboost
from catboost import *
from catboost import datasets
from catboost import CatBoostClassifier



- `TASKS` - параметры, которые будут предсказываться
- `data_source` - название файла датасета
- `feature_field` - название колонки с smiles
- `data_name` - название датасета

In [11]:
# Working folder of the task; both names serve as the default
# data_dir / save_dir arguments of load_tox21().
data_dir = save_dir = "/Solution/Water/"

In [12]:
# Semicolon-separated input CSV files that get merged on the SMILES column.
data_acute = "/Solution/Water/input/water_acute.csv"
data_long_term = "/Solution/Water/input/water_long_term.csv"

# Destination of the merged CSV; this is the file the DeepChem loader reads.
result_ds = "/Solution/Water/dataset.csv"

# Dataset name passed to loader.load_dataset() inside load_tox21().
task = "ames"

In [13]:
# Read both semicolon-separated inputs.
acute_df = pd.read_csv(data_acute, sep=';')
long_term_df = pd.read_csv(data_long_term, sep=';')

# Join the two tables on the shared 'smiles' column (default inner join) and
# persist the result, without the index, as the single dataset file.
merge_df = acute_df.merge(long_term_df, on='smiles')
merge_df.to_csv(result_ds, index=False)

In [14]:
# Target columns handed to the DeepChem CSV loader as its tasks.
TASKS = ['Acute', 'Long-term']

# Name of the CSV column that holds the SMILES strings to featurize.
feature_field = "smiles"

Класс, расширяющий `_MolnetLoader` из библиотеки DeepChem. Он используется для загрузки датасета и вычисления молекулярных дескрипторов.

In [15]:
class MyLoader(_MolnetLoader):
    """``_MolnetLoader`` subclass that builds its dataset from the merged CSV.

    The CSV location and the SMILES column name are taken from the
    module-level ``result_ds`` and ``feature_field`` variables.
    """

    def create_dataset(self) -> Dataset:
        """Featurize the CSV at ``result_ds`` and return the resulting dataset.

        The task columns come from ``self.tasks`` and the featurizer from
        ``self.featurizer``; the file is processed in shards of 8192 rows.
        """
        csv_loader = dc.data.CSVLoader(
            featurizer=self.featurizer,
            feature_field=feature_field,
            tasks=self.tasks,
        )
        dataset = csv_loader.create_dataset(result_ds, shard_size=8192)
        return dataset


def load_tox21(
    featurizer: Union[dc.feat.Featurizer, str] = 'ECFP',
    splitter: Union[dc.splits.Splitter, str, None] = 'scaffold',
    transformers: List[Union[TransformerGenerator, str]] = ['balancing'],
    reload: bool = True,
    data_dir: Optional[str] = data_dir,
    save_dir: Optional[str] = save_dir,
    **kwargs
) -> Tuple[List[str], Tuple[Dataset, ...], List[dc.trans.Transformer]]:
    """Load the merged water-toxicity CSV through ``MyLoader``.

    Despite its name, this function does not read Tox21: it builds a
    ``MyLoader`` for the module-level ``TASKS`` and returns the result of
    ``load_dataset(task, reload)``.

    Args:
        featurizer: featurizer instance or name (default ``'ECFP'``).
        splitter: splitter instance or name (default ``'scaffold'``).
        transformers: transformer generators or names (default balancing).
        reload: forwarded to ``load_dataset``.
        data_dir: forwarded to the loader; defaults to the module-level value.
        save_dir: forwarded to the loader; defaults to the module-level value.
        **kwargs: extra options forwarded to the ``MyLoader`` constructor.

    Returns:
        ``(tasks, datasets, transformers)`` as produced by ``load_dataset``.
    """
    molnet_loader = MyLoader(
        featurizer, splitter, transformers, TASKS, data_dir, save_dir, **kwargs
    )
    loaded = molnet_loader.load_dataset(task, reload)
    return loaded

In [16]:
# Run the loader with its defaults (ECFP features, scaffold split, balancing).
# NOTE(review): `datasets` rebinds the name imported via
# `from catboost import datasets` in the import cell.
tasks, datasets, transformers = load_tox21()
# The loader returns the splits in train / validation / test order.
train_dataset, valid_dataset, test_dataset = datasets

RDKit ERROR: [20:39:41] Explicit valence for atom # 1 Br, 5, is greater than permitted
Failed to featurize datapoint 2374, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
RDKit ERROR: [20:39:41] Explicit valence for atom # 1 Cl, 3, is greater than permitted
Failed to featurize datapoint 2381, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
RDKit ERROR: [20:39:41] Explicit valence for atom # 1 Si, 8, is greater than permitted
Failed to featurize datapoint 2626, None. Appending empty array
Exception message: Python argument types in
    rdkit.Che

In [17]:
# Convert the train / test / validation splits to pandas DataFrames.
df_tr, df_t, df_v = (
    split.to_dataframe()
    for split in (train_dataset, test_dataset, valid_dataset)
)

In [18]:
# Columns of the DeepChem dataframes that are not model inputs: the two label
# columns, the two w* columns (presumably per-task sample weights — confirm)
# and the ids column, which is later saved as the SMILES of each test row.
non_feature_cols = ['y1', 'y2', 'w1', 'w2', 'ids']
label_cols = ['y1', 'y2']

# Training split.
x, y = df_tr.drop(columns=non_feature_cols), df_tr[label_cols]

# Test split; the ids are kept so predictions can be written next to them.
xt, yt = df_t.drop(columns=non_feature_cols), df_t[label_cols]
x_smiles = df_t['ids'].to_numpy()

# Validation split.
xv, yv = df_v.drop(columns=non_feature_cols), df_v[label_cols]

In [19]:
def set_seed(s):
    """Seed the global RNGs of ``random`` and NumPy with ``s``.

    Also stores ``str(s)`` in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(s)
    for seeder in (random.seed, np.random.seed):
        seeder(s)


In [20]:
# Fix the Python and NumPy random seeds for the rest of the notebook.
set_seed(17)

In [21]:
def show_history(hist):
    """Plot training and validation AUC per epoch.

    Args:
        hist: object whose ``history`` mapping holds the per-epoch values
            under the keys ``'AUC'`` and ``'val_AUC'``.
    """
    train_auc = hist.history['AUC']
    val_auc = hist.history['val_AUC']
    epochs = range(1, len(train_auc) + 1)

    # Both curves are AUC values, so every label names that metric.
    plt.plot(epochs, train_auc, '-', label='Training AUC')
    plt.plot(epochs, val_auc, ':', label='Validation AUC')
    plt.title('Training and Validation AUC')
    plt.xlabel('Epoch')
    plt.ylabel('AUC')
    plt.legend(loc='lower right')
    # Render the figure explicitly.
    plt.show()

In [22]:
# Indices of every feature column of x.
# NOTE(review): this list is not passed to any fit() call in this notebook.
cat_features = list(range(x.shape[1]))

In [24]:
# Classifier for the first target (y1): multi-class loss, AUC as the
# evaluation metric, a small learning rate with a large iteration budget and
# an early-stopping window of 2000 rounds.
modely1 = CatBoostClassifier(
    loss_function='MultiClass',
    eval_metric='AUC',
    learning_rate=0.001,
    iterations=100000,
    early_stopping_rounds=2000,
)

# Train on the training split and evaluate on the validation split; console
# logging is off and the interactive training chart is on.
modely1.fit(
    X=x,
    y=y['y1'],
    eval_set=(xv, yv['y1']),
    plot=True,
    verbose=False,
)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7f62057b0080>

In [25]:
# One-vs-rest ROC AUC of the y1 model on the test split.
proba_y1 = modely1.predict_proba(xt)
roc_auc_score(yt['y1'], proba_y1, multi_class='ovr')

0.6384994277733831

In [26]:
# Classifier for the second target (y2), configured like the y1 model:
# multi-class loss, AUC as the evaluation metric and an early-stopping window
# of 2000 rounds.
modely2 = CatBoostClassifier(
    loss_function='MultiClass',
    eval_metric='AUC',
    learning_rate=0.001,
    iterations=100000,
    early_stopping_rounds=2000,
)

# Train on the training split and evaluate on the validation split; console
# logging is off and the interactive training chart is on.
modely2.fit(
    X=x,
    y=y['y2'],
    eval_set=(xv, yv['y2']),
    plot=True,
    verbose=False,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7f61f950a278>

In [27]:
# One-vs-rest ROC AUC of the y2 model on the test split.
proba_y2 = modely2.predict_proba(xt)
roc_auc_score(yt['y2'], proba_y2, multi_class='ovr')

0.6331922169494656

In [28]:
# Predicted class labels for the test split, cast to int64, one array per model.
pred1, pred2 = (
    model.predict(xt).astype('int64')
    for model in (modely1, modely2)
)

In [29]:
# Ids of the test rows, written out under the 'smiles' column.
smiles_column = list(x_smiles)

# Predictions of the y1 model, one row per test molecule.
df_savey1 = pd.DataFrame(data={'smiles': smiles_column})
df_savey1['y1'] = pred1
df_savey1.to_csv('output_classification_y1.csv', index=False)

# Predictions of the y2 model, one row per test molecule.
df_savey2 = pd.DataFrame(data={'smiles': smiles_column})
df_savey2['y2'] = pred2
df_savey2.to_csv('output_classification_y2.csv', index=False)