## ChemML

In [1]:
import numpy as np
np.random.seed(90)

import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
from chemml.datasets import load_organic_density
# Load the organic-density dataset; the loader returns three objects,
# unpacked here as the SMILES table, the density targets and a feature table.
smiles, density, features = load_organic_density()

# `features` is not used as the model input in this notebook: the assignment
# below is left disabled, and X is built from Coulomb matrices in a later cell.
# X = features.values
y = density.values

In [3]:
from chemml.chem import Molecule

# Turn every entry of the first column of `smiles` into a ChemML Molecule,
# telling the constructor that the input format is 'smiles'.
molecules = [
    Molecule(smiles_string, 'smiles')
    for smiles_string in smiles.iloc[:, 0]
]
print(len(molecules))

500


In [4]:
# Inspect a single parsed molecule (index 100) through its repr.
molecules[100]

<chemml.chem.Molecule(
        rdkit_molecule : <rdkit.Chem.rdchem.Mol object at 0x1240788f0>,
        pybel_molecule : None,
        creator        : ('SMILES', 'Cc1cc(ncc1c1ncsc1)c1ncsc1'),
        smiles         : 'Cc1cc(-c2cscn2)ncc1-c1cscn1',
        smarts         : None,
        inchi          : None,
        xyz            : None)>

In [5]:
# geometry optimization
# For every molecule: first add hydrogens, then call to_xyz with the MMFF
# optimizer (MMFF94s variant, at most 200 iterations).
# NOTE(review): presumably to_xyz fills in each molecule's `xyz` attribute
# (shown as None in the inspection cell above) -- confirm against ChemML docs.
# The trailing 'UFF' comment appears to record an alternative optimizer value.
for mol in molecules:
    mol.hydrogens('add')
    mol.to_xyz(optimizer='MMFF', mmffVariant='MMFF94s', maxIters=200) # 'UFF'

In [7]:
# calculate CoulombMatrix
# Build the feature matrix X from the optimized molecules; X replaces the
# dataset's precomputed features as the model input.
from chemml.chem import CoulombMatrix

# NOTE(review): CMtype='SC' presumably selects the sorted Coulomb matrix
# variant -- confirm against the ChemML CoulombMatrix documentation.
cm = CoulombMatrix(CMtype='SC')
# One row per molecule; the split cells below report 1653 columns.
X = cm.represent(molecules)

In [8]:
# Train-Test split
# Hold out 10% of the samples as the test set; random_state is fixed so the
# split is the same on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
# Cell output: shapes of the training and test feature matrices.
X_train.shape, X_test.shape

((450, 1653), (50, 1653))

In [9]:
# validation set
# Carve 10% of the current training data off as X_val / y_val.
# NOTE: this overwrites X_train and y_train with the smaller remainder, so the
# cell is not idempotent -- re-running it without first re-running the
# train/test split shrinks the training set again.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=0)
# Cell output: shapes of the reduced training set and the validation set.
X_train.shape, X_val.shape

((405, 1653), (45, 1653))

In [None]:
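# TODO: normalize the features (placeholder, never executed: the cells below use X_train / X_val / X_test unscaled)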

## Defining hyperparameter space

In [10]:
from sklearn.kernel_ridge import KernelRidge
# Hyperparameter search space handed to the genetic algorithm: a tuple with one
# single-key dict per hyperparameter. The order matters here because the
# objective function reads candidates positionally (index 0, 1, 2).
space = (
    # index 0 -> KernelRidge alpha: continuous range (0.1, 10).
    # NOTE(review): 'mutation' is presumably the (mean, std) of the Gaussian
    # mutation step -- confirm against the ChemML GeneticAlgorithm docs.
    {
        'alpha': {
            'uniform': (0.1, 10),
            'mutation': (0, 1),
        },
    },
    # index 1 -> KernelRidge kernel: one of the listed kernel names.
    {
        'kernels': {
            'choice': ['rbf', 'sigmoid', 'polynomial'],
        },
    },
    # index 2 -> KernelRidge degree: integer range (1, 5).
    {
        'degree': {
            'int': (1, 5),
        },
    },
)

## Defining objective function

In [11]:
from sklearn.metrics import mean_absolute_error
def obj(individual):
    """Score one hyperparameter candidate by its validation error.

    Parameters
    ----------
    individual : sequence
        Read positionally: ``individual[0]`` is passed to KernelRidge as
        ``alpha``, ``individual[1]`` as ``kernel`` and ``individual[2]`` as
        ``degree``.

    Returns
    -------
    The mean absolute error between ``y_val`` and the predictions on
    ``X_val`` of a KernelRidge model fitted on ``X_train`` / ``y_train``.

    Notes
    -----
    The train and validation arrays are read from the notebook's global
    namespace, so the split cells must have been run before this is called.
    """
    alpha, kernel, degree = individual[0], individual[1], individual[2]
    model = KernelRidge(alpha=alpha, kernel=kernel, degree=degree)
    model.fit(X_train, y_train)
    val_predictions = model.predict(X_val)
    return mean_absolute_error(y_val, val_predictions)

## Model selection

In [12]:
from chemml.optimization import GeneticAlgorithm
# Configure the genetic algorithm to score candidates with `obj` over `space`.
# NOTE(review): fitness=("min", ) presumably requests minimization of the MAE
# returned by `obj`; the meaning of pop_size, crossover_size, mutation_size and
# algorithm is defined by chemml.optimization.GeneticAlgorithm -- confirm there.
ga = GeneticAlgorithm(evaluate=obj, space=space, fitness=("min", ),
                        pop_size = 8, crossover_size=0.5, mutation_size=0.5, algorithm=3)
# Run the search for 10 generations; the two returned objects are unpacked as
# the per-generation best models and the final best hyperparameters.
best_models_per_generation, final_best_hyperparameters = ga.search(n_generations=10)

In [13]:
# Display the per-generation summary returned by ga.search.
# NOTE(review): in the recorded output the best individual and its fitness are
# identical for all 10 generations -- worth checking whether the search is
# actually exploring (e.g. unscaled features, GA settings).
best_models_per_generation


Unnamed: 0,Best_individual,Fitness_values,Time (hours)
0,"(0.1, sigmoid, 2)",64.562413,3.8e-05
1,"(0.1, sigmoid, 2)",64.562413,3.4e-05
2,"(0.1, sigmoid, 2)",64.562413,3.6e-05
3,"(0.1, sigmoid, 2)",64.562413,3.8e-05
4,"(0.1, sigmoid, 2)",64.562413,3.9e-05
5,"(0.1, sigmoid, 2)",64.562413,4e-05
6,"(0.1, sigmoid, 2)",64.562413,3.7e-05
7,"(0.1, sigmoid, 2)",64.562413,3.7e-05
8,"(0.1, sigmoid, 2)",64.562413,3.4e-05
9,"(0.1, sigmoid, 2)",64.562413,3.9e-05
