### Imports

In [120]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, roc_auc_score
from deepchem.molnet import load_bace_classification, load_bace_regression

import numpy as np
import pandas as pd
import deepchem as dc
from functools import partial

### Parameters

In [121]:
regression = False
deepchem_preprocessing = True ### Use preprocessing used in the benchmark file?
scaffold_split = False ### Submit the scaffold list as the basis?
descriptors = False
fingerprints = False
dataset_features = True ### BACE built-in features
models = ['rf']

hyperparams = {
  "bootstrap": True,
  "criterion": "entropy",
  "min_samples_split": 32,
  "n_estimators": 30
}

dataset_path = r"C:\Users\wojci\Documents\GitHub\czasteczkowa-inzynierka\experiments\BACE\bace.csv"


### Preprocessing

In [137]:

if deepchem_preprocessing:
    featurizer = 'ECFP'
    splitter = 'scaffold' ### Others? RandomSpliter?
    tasks, all_dataset, transformers = load_bace_classification(
                                            featurizer=featurizer, splitter=splitter, reload=False)
    
    print(transformers)
    
    train_set, valid_set, test_set = all_dataset

    print(test_set)

else:
    ### Read csv dataset
    df = pd.read_csv(dataset_path)
    df.drop_duplicates('mol')
    df = df.dropna()
    df.drop(['CID', 'canvasUID'], axis=1, inplace=True)
    if regression:
        df['Target'] = df['pIC50']
        df.drop('Class', axis=1, inplace=True)
        df.drop('pIC50', axis=1, inplace=True)
    else:
        df['Target'] = df['Class']
        df.drop('Class', axis=1, inplace=True)
        df.drop('pIC50', axis=1, inplace=True)


    ### Create dataframe with only molecules and targets
    ### Calculate descriptors and/or fingerprints or use submitted parameters
    
    #if descriptors:
        

    #if fingerprints:


    if dataset_features:
        X = df.drop(['Target', 'mol'], axis=1)
        y = df[['Target', 'Model']]
        
        X_train = X[X['Model'] == 'Train']
        y_train = y[y['Model'] == 'Train']
        X_test = X[X['Model'] == 'Test']
        y_test = y[y['Model'] == 'Test']
        X_valid = X[X['Model'] == 'Valid']
        y_valid = y[y['Model'] == 'Valid']
        
        X_train.drop('Model', axis=1, inplace=True)
        y_train.drop('Model', axis=1, inplace=True)
        X_test.drop('Model', axis=1, inplace=True)
        y_test.drop('Model', axis=1, inplace=True)
        X_valid.drop('Model', axis=1, inplace=True)
        y_valid.drop('Model', axis=1, inplace=True)
            

        train_set = dc.data.DiskDataset.from_numpy(X_train, y=y_train)
        test_set = dc.data.DiskDataset.from_numpy(X_test, y=y_test)
        valid_set = dc.data.DiskDataset.from_numpy(X_valid, y=y_valid)

        print(train_set.y)

    #X_train = dc.data.DiskDataset.from_numpy(X_train)
    #y_train = dc.data.DiskDataset.from_numpy(y_train)
    #X_test = dc.data.DiskDataset.from_numpy(X_test)
    #y_test = dc.data.DiskDataset.from_numpy(y_test)
    #X_valid = dc.data.DiskDataset.from_numpy(X_valid)
    #y_valid = dc.data.DiskDataset.from_numpy(y_valid)



    ### Clean dataset

    ### Translate SMILES into features through fingerprints or descriptors

    ### Split into train, test and validation steps


[<deepchem.trans.transformers.BalancingTransformer object at 0x0000028283D11610>]
<DiskDataset X.shape: (152, 1024), y.shape: (152, 1), w.shape: (152, 1), ids: ['O1CCC(CC1)CNC(=O)C(Cc1cc2cc(ccc2nc1N)-c1ccccc1C)C'
 'O1c2ncc(cc2C([NH2+]CC(O)C2NC(=O)C=3C=CC(=O)N(CCCCc4cc(C2)ccc4)C=3)CC12CCC2)CC(C)(C)C'
 'O1c2ncc(cc2C([NH2+]CC(O)C(NC(=O)COC)Cc2cc(ccc2)-c2occn2)CC12CCC2)CC(C)(C)C'
 ...
 'Fc1c2c(ccc1)[C@@]([NH+]=C2N)(C=1C=C(C)C(=O)N(C=1)CC)c1cc(ccc1)-c1cc(cnc1)C#CC'
 'S(=O)(=O)(CCCCC)C[C@@H](NC(=O)c1cccnc1)C(=O)N[C@H]([C@H](O)C[NH2+]Cc1cc(ccc1)CC)Cc1cc(F)cc(F)c1'
 'O1CC[C@@H](NC(=O)[C@@H](Cc2cc3cc(ccc3nc2N)-c2ccccc2C)C)CC1(C)C'], task_names: ['Class']>


### Creating models

In [123]:
def rf_model_builder(model_dir, hyperparams, tasks):
        sklearn_model = RandomForestClassifier(n_estimators=hyperparams["n_estimators"],
            min_samples_split=hyperparams["min_samples_split"], criterion=hyperparams["criterion"],
            bootstrap=hyperparams["bootstrap"])

        return dc.models.SklearnModel(sklearn_model, model_dir)

In [124]:
if regression:
    model = RandomForestRegressor(n_estimators=hyperparams["n_estimators"],
                                    min_samples_split=hyperparams["min_samples_split"],
                                    criterion=hyperparams["criterion"],
                                    bootstrap=hyperparams["bootstrap"])
else:
    model = RandomForestClassifier(n_estimators=hyperparams["n_estimators"],
                                    min_samples_split=hyperparams["min_samples_split"],
                                    criterion=hyperparams["criterion"],
                                    bootstrap=hyperparams["bootstrap"])

In [125]:
for i in range(3):
    model.fit(X_train, y_train)
    
    y_test_predicted = model.predict(X_test)
    y_valid_predicted = model.predict(X_valid)

    roc_auc_test = roc_auc_score(y_test, y_test_predicted)
    roc_auc_valid = roc_auc_score(y_valid, y_valid_predicted)

    print(roc_auc_test)
    print(roc_auc_valid)

0.695994895229944
0.2840909090909091
0.7013862354613397
0.7954545454545454
0.684819932038291
0.2840909090909091


  model.fit(X_train, y_train)
  model.fit(X_train, y_train)
  model.fit(X_train, y_train)


In [136]:
print(train_set.X.dtype)
print(train_set.y.dtype)

alt_train_set = dc.data.DiskDataset.from_numpy(X=train_set.X, y=train_set.y, w=train_set.w, ids=train_set.ids, tasks=['Class'])
print(train_set)
print(alt_train_set)

float64
float64
<DiskDataset X.shape: (1210, 1024), y.shape: (1210, 1), w.shape: (1210, 1), task_names: ['Class']>
<DiskDataset X.shape: (1210, 1024), y.shape: (1210, 1), w.shape: (1210, 1), task_names: ['Class']>


In [128]:


model = dc.models.SingletaskToMultitask(
        tasks, partial(rf_model_builder, hyperparams=hyperparams, tasks=tasks))
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)

for i in range(3):
    model.fit(alt_train_set)

    test_metric = model.evaluate(test_set, [metric], transformers)
    valid_metric = model.evaluate(valid_set, [metric], transformers)

    print(test_metric)
    print(valid_metric)

{'mean-roc_auc_score': 0.8364130434782608}
{'mean-roc_auc_score': 0.7308990760483298}
{'mean-roc_auc_score': 0.8559782608695653}
{'mean-roc_auc_score': 0.7277896233120114}
{'mean-roc_auc_score': 0.8472826086956522}
{'mean-roc_auc_score': 0.7392501776830136}


### Returning results