In [None]:
import json
import os
import pickle
from multiprocessing import Pool
from pathlib import Path
from time import gmtime, strftime, time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from guacamol_baselines.graph_ga.goal_directed_generation import \
    GB_GA_Generator
from guacamol_baselines.smiles_lstm_hc.goal_directed_generation import SmilesRnnDirectedGenerator
from utils import TPScoringFunction, calc_auc, ecfp, score

def timestamp():
    """Return the current UTC time formatted as ``YYYY-MM-DD_HH:MM:SS``."""
    now_utc = gmtime()
    return strftime("%Y-%m-%d_%H:%M:%S", now_utc)

def fit_clfs(chid, n_estimators, n_jobs):
    """
    Fit three random forests on two stratified halves of a single assay.

    The assay is read from ``./assays/processed/{chid}.csv`` (columns
    ``smiles`` and ``label``), featurised with ``ecfp`` and split 50/50.
    ``Split1`` and ``Split1_alt`` are both trained on the first half,
    ``Split2`` on the second half; each model is evaluated with ``calc_auc``
    on the half it was not trained on.

    Args:
        chid: assay identifier used to build the input file name.
        n_estimators: forwarded to every ``RandomForestClassifier``.
        n_jobs: forwarded to every ``RandomForestClassifier``.

    Returns:
        Tuple ``(clfs, aucs, balance)``: the fitted classifiers keyed by
        name, the ``calc_auc`` result per classifier under the same keys, and
        the mean label of the first and second half.
    """
    assay_file = f'./assays/processed/{chid}.csv'
    print(f'Reading data from: {assay_file}')
    assay = pd.read_csv(assay_file)
    features = np.array(ecfp(assay.smiles))
    labels = np.array(assay.label)

    # Two equally sized halves; stratification keeps the label ratio equal.
    X1, X2, y1, y2 = train_test_split(
        features, labels, test_size=0.5, stratify=labels)
    balance = (np.mean(y1), np.mean(y2))

    # (name, training half, evaluation half) -- order matters: it fixes both
    # the dictionary order and the order in which the forests are fitted.
    plan = [
        ('Split1', (X1, y1), (X2, y2)),
        ('Split1_alt', (X1, y1), (X2, y2)),
        ('Split2', (X2, y2), (X1, y1)),
    ]

    clfs = {}
    for name, (X_fit, y_fit), _ in plan:
        forest = RandomForestClassifier(
            n_estimators=n_estimators, n_jobs=n_jobs)
        forest.fit(X_fit, y_fit)
        clfs[name] = forest

    # Evaluate only after all models are trained, each on its held-out half.
    aucs = {
        name: calc_auc(clfs[name], X_eval, y_eval)
        for name, _, (X_eval, y_eval) in plan
    }

    print("AUCs:")
    for name, auc in aucs.items():
        print(f'{name}: {auc}')

    return clfs, aucs, balance

In [None]:
# Run configuration. These values used to be the parameters of an
# `optimize(chid, n_estimators, n_jobs, external_file, n_external, seed,
# optimizer_args)` function and are now plain notebook-level variables.
chid = 'CHEMBL3888429'
n_estimators = 100
n_jobs = 8
external_file = './data/guacamol_v1_test.smiles.can'
n_external = 3000
seed = 101
optimizer_args = {
    'smi_file': './data/guacamol_v1_valid.smiles.can',
    'population_size': 100,
    'offspring_size': 200,
    'generations': 5,
    'mutation_rate': 0.01,
    'n_jobs': -1,
    'random_start': True,
    'patience': 150,
    'canonicalize': False,
}

# Seed NumPy's global RNG before anything random happens.
np.random.seed(seed)

# One fresh output directory per run, named by assay and UTC timestamp.
# NOTE(review): the path segment is 'graph_ga', but the optimization cell
# further down instantiates SmilesRnnDirectedGenerator -- confirm the
# directory name is still what you want.
results_dir = os.path.join('./test', 'graph_ga', chid, timestamp())
os.makedirs(results_dir)

# Writing the run configuration to `config.json` inside `results_dir` is
# currently disabled (it relied on `locals()` of the former function).

# Train the classifiers and start collecting the results to be stored.
clfs, aucs, balance = fit_clfs(chid, n_estimators, n_jobs)
results = {'AUC': aucs, 'balance': balance}

# Persist the fitted classifiers next to the results.
clf_file = os.path.join(results_dir, 'classifiers.p')
with open(clf_file, 'wb') as f:
    pickle.dump(clfs, f)

Reading data from: ./assays/processed/CHEMBL3888429.csv
AUCs:
Split1: 0.8218614718614718
Split1_alt: 0.8126082251082252
Split2: 0.8033948940793046


In [None]:
# Create guacamol scoring function with clf trained on split 1
scoring_function = TPScoringFunction(clfs['Split1'])

# run optimization
# t0 is taken before the generator is constructed, so anything timed against
# it includes the construction as well as the optimization itself.
t0 = time()
# NOTE(review): `optimizer_args` from the configuration cell is not passed
# here; the generator is built with its defaults -- confirm this is intended.
optimizer = SmilesRnnDirectedGenerator()

# The second positional argument (100) is presumably the number of molecules
# requested -- TODO confirm against the generator's signature. The result is
# iterated step by step in the statistics cell below.
smiles_history = optimizer.generate_optimized_molecules(
    scoring_function, 100, get_history=True)

In [None]:
# Timing: `t0` was set right before the optimization in the previous cell.
# Taking `t1` here, at the start of the statistics step, defines both timers
# that are printed at the end of this cell (previously `t1` and `opt_time`
# were read without ever being assigned, which raised a NameError).
# `opt_time` is exact when the cells are run back to back (Restart & Run All);
# when run interactively it also contains any pause between the two cells.
t1 = time()
opt_time = t1 - t0

# make a list of dictionaries for every time step
statistics = []
for optimized_smiles in smiles_history:
    row = {
        'smiles': optimized_smiles,
        # predictions of every classifier for the molecules of this step
        'preds': {k: score(optimized_smiles, clf) for k, clf in clfs.items()},
        # NOTE(review): these two entries are created but never filled in
        'ratio_active': {},
        'mean_pred': {},
    }
    statistics.append(row)

results['statistics'] = statistics

stat_time = time() - t1

# add predictions on external set
# load external smiles for evaluation
with open(external_file) as f:
    external_smiles = f.read().split()
# NOTE(review): np.random.choice samples with replacement by default, so the
# external sample may contain duplicates -- confirm this is intended.
external_smiles = np.random.choice(external_smiles, n_external)
results['predictions_external'] = {
    k: score(external_smiles, clf) for k, clf in clfs.items()}

results_file = os.path.join(results_dir, 'results.json')
with open(results_file, 'w') as f:
    json.dump(results, f)

print(f'Storing results in {results_dir}')
print(f'Optimization time {opt_time:.2f}')
print(f'Statistics time {stat_time:.2f}')

Reading data from: ./assays/processed/CHEMBL3888429.csv
AUCs:
Split1: 0.8218614718614718
Split1_alt: 0.8126082251082252
Split2: 0.8033948940793046
selecting initial population...
0 | max: 0.420 | avg: 0.268 | min: 0.198 | std: 0.050 | sum: 26.821 | 8.33 sec/gen | 12.01 mol/sec | 0.32 rest 
1 | max: 0.525 | avg: 0.311 | min: 0.252 | std: 0.050 | sum: 31.097 | 1.01 sec/gen | 99.44 mol/sec | 0.35 rest 
2 | max: 0.525 | avg: 0.341 | min: 0.285 | std: 0.042 | sum: 34.073 | 1.04 sec/gen | 96.21 mol/sec | 0.32 rest 
3 | max: 0.525 | avg: 0.362 | min: 0.316 | std: 0.035 | sum: 36.232 | 0.99 sec/gen | 101.31 mol/sec | 0.34 rest 
4 | max: 0.525 | avg: 0.387 | min: 0.340 | std: 0.040 | sum: 38.667 | 0.94 sec/gen | 106.46 mol/sec | 0.32 rest 


Process ForkPoolWorker-122:
Process ForkPoolWorker-124:
Process ForkPoolWorker-123:
Process ForkPoolWorker-121:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/system/apps/userenv/renz/miniconda3/envs/guacamol/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/system/apps/userenv/renz/miniconda3/envs/guacamol/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/system/apps/userenv/renz/miniconda3/envs/guacamol/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/system/apps/userenv/renz/miniconda3/envs/guacamol/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/system/apps/userenv/renz/miniconda3/envs/guacamol/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/system/apps/userenv/renz/miniconda3/envs/guacamol/li

KeyboardInterrupt: 

KeyboardInterrupt
