In [1]:
# Change to the top directory
%cd ..

/home/OSDA_Generator


In [2]:
# imports
from ddc_pub import ddc_v3 as ddc
import pandas as pd
from rdkit import Chem
import numpy as np
import random
import pickle

In [3]:
# load in data
data = pd.read_excel('data/Jensen_et_al_CentralScience_OSDA_Zeolite_data.xlsx', engine='openpyxl')

In [4]:
# define our featurization for the zeolite structures and synthesis conditions
# zeolites are featurized with structural features taken from IZA (http://www.iza-structure.org/)
def featurize_zeolite(row):
    features = [row['FD'], row['max_ring_size'], row['channel_dim'], row['inc_vol'], row['accvol'], row['maxarea'], row['minarea']]
    if any([pd.isnull(x) for x in features]):
        # Several structures without official IZA designations will not have complete data
        # and therefore are removed from the data ie ITQ-21, ITQ-43, SU-74, etc
        print('Problem: ', row['Code'], 'Missing Information in Data')
        return None
    else:
        return features
# synthesis conditions are one encoded based on the most common elements encountered
def featurize_synthesis(row):
    seeds = ['seed', 'SAPO-56 seeds', 'SSZ-57', 'FAU', 'seeded with magadiite', 'seeds']
    solvents = ['ethylene glycol', 'hexanol', '2-propanol', 'triethylene glycol', 'triglycol',
                'polyethylene glycol', 'n-hexanol', 'glycol', 'propane-1,3-diol', 'butanol', 
                'glycerol', 'isobutylamine', 'tetraethylene glycol', '1-hexanol', 
               'sec-butanol', 'iso-butanol', 'ethylene glycol monomethyl ether', 'ethanol']
    acids = ['H2SO4', 'acetic acid', 'oxalic acid', 'succinic acid', 'arsenic acid', 'HNO3', 'HCl',
            'SO4']
    frameworks = ['Co', 'Mn', 'Cu', 'Zn', 'Cd', 'Cr', 'V', 'Ce', 'Nd', 'Sn', 'Zr', 'Ni',
                  'S', 'Sm', 'Dy', 'Y', 'La', 'Gd', 'In', 'Nb', 'Te', 'As', 'Hf', 'W',
                 'Se']
    common_frameworks = ['Si', 'Al', 'P', 'Ge', 'B', 'Ti', 'Ga', 'Fe']
    cations = ['Mg', 'Rb', 'Li', 'Cs', 'Sr', 'Ba', 'Be', 'Ca']
    common_cations = ['Na', 'K']
    bad = ['pictures', 'need access', 'also called azepane', 'SMILES code']
    syns = [x.strip() for x in [row['syn1'], row['syn2'], row['syn3'], row['syn4'], row['syn5'],
                       row['syn6'], row['syn7'], row['syn8']] if not pd.isnull(x)]
    if not syns:
        return None
    syn_vector = []
    for c in common_frameworks:
        if c in syns:
            syn_vector.append(1)
        else:
            syn_vector.append(0)
    for c in common_cations:
        if c in syns:
            syn_vector.append(1)
        else:
            syn_vector.append(0)
    if 'F' in syns:
        syn_vector.append(1)
    else:
        syn_vector.append(0)
    frame, cat, seed, solv, acid, oth = 0,0,0,0,0,0
    for s in syns:
        if s in frameworks:
            frame = 1
        elif s in cations:
            cat = 1
        elif s in seeds:
            seed = 1
        elif s in solvents:
            solv = 1
        elif s in acids:
            acid = 1
        elif s.count(' ') < 2 and s not in bad and len(s) > 2:
            oth = 1
    syn_vector.extend([frame, cat, seed, solv, acid, oth])
    return syn_vector

In [5]:
# featurize and augment the data
smiles, zeolites, synthesis, codes = [],[],[],[]
# Whether or not to augment the data with smiles string randomization
# SMILEs string are not unique to a molecule so by changing the SMILEs we can add more data
augment = True
for i, row in data.iterrows():
    if ' + ' not in row['smiles']: # only look at single-template synthesis
        zeo = featurize_zeolite(row=row)
        syn = featurize_synthesis(row=row)
        if zeo is not None and syn is not None:
            if augment:
                new_smiles = []
                m = Chem.MolFromSmiles(row['smiles'])
                for i in range(100): # randomize smiles string up to 100 times
                    try:
                        rand_smile = Chem.MolToSmiles(m, canonical=False, doRandom=True, isomericSmiles=False)
                        rand_mol = Chem.MolFromSmiles(rand_smile)
                        if m is not None and rand_smile not in new_smiles:
                            new_smiles.append(rand_smile)
                    except:
                        print('Problem:', row['smiles'], 'could not be randomized')
                        break
                for smile in new_smiles:
                    smiles.append(smile)
                    zeolites.append(zeo)
                    synthesis.append(syn)
                    codes.append(row['Code'])
            else:
                smiles.append(row['smiles'])
                zeolites.append(zeo)
                synthesis.append(syn)
                codes.append(row['Code'])
zeolites = np.array(zeolites)
synthesis = np.array(synthesis)
print(len(smiles), zeolites.shape, synthesis.shape, len(codes))

Problem:  ITQ-21 Missing Information in Data
Problem:  NUD-1 Missing Information in Data
Problem:  NUD-1 Missing Information in Data
Problem:  NUD-1 Missing Information in Data
Problem:  NUD-1 Missing Information in Data
Problem:  NUD-1 Missing Information in Data
Problem:  NUD-1 Missing Information in Data
Problem:  ASU-14 Missing Information in Data
Problem:  ASU-16 Missing Information in Data
Problem: [Na+].CCCCCCCCCCCCO[S]([O-])(=O)=O could not be randomized
Problem:  SU-74 Missing Information in Data
Problem:  SU-74 Missing Information in Data
Problem:  SU-M Missing Information in Data
Problem:  SU-MB Missing Information in Data
Problem:  SU-MB Missing Information in Data
Problem:  SU-74 Missing Information in Data
Problem:  nan Missing Information in Data
Problem:  ITQ-43 Missing Information in Data
Problem:  ITQ-43 Missing Information in Data
Problem:  ITQ-43 Missing Information in Data
Problem:  ITQ-43 Missing Information in Data
Problem:  ITQ-21 Missing Information in Data
Pro

In [6]:
# Need to clean the OSDAs to remove unusual characters
chars = set()
for s in smiles:
    if 'Si' in s:
        continue
    for c in s:
        chars.add(c)
chars = list(chars)
cant = ['a', 't', ' ', 'i', 'd', 'e', 'f', 'y']
new_smile, new_z, new_s, new_c = [],[],[],[]
for s, z, d, c in zip(smiles, zeolites, synthesis, codes):
    if 'Si' in s:
        continue
    found = False
    for c in s:
        if c in cant:
            found = True
    if not found:
        new_z.append(z)
        new_s.append(d)
        new_smile.append(s)
        new_c.append(c)
smiles = new_smile
codes = new_c
zeolites = np.array(new_z)
synthesis = np.array(new_s)
print(len(smiles), zeolites.shape, synthesis.shape, len(codes))
chars = "".join(chars)

179613 (179613, 7) (179613, 17) 179613


In [7]:
# Normalize the zeolite vectors
zeo_norm = pickle.load(open('models/zeolite_normalizer.pkl', 'rb'))
zeolites = zeo_norm.transform(zeolites)
print(zeolites.shape)

(179613, 7)


In [8]:
# Train test split
split =  "random" # options are None, random, or zeolite (ie CHA or SFW)
train_smiles, train_zeolites, train_synthesis, train_codes = [],[],[],[]
test_smiles, test_zeolites, test_synthesis, test_codes = [],[],[],[]
train_input, test_input = [],[]
if split is None:
    train_smiles = smiles
    train_zeolites = zeolites
    train_synthesis = synthesis
    train_codes = codes
    for z, syn in zip(zeolites, synthesis):
        train_input.append(list(z)+list(syn))
elif split == 'random':
    unique_smiles = list(np.unique(smiles))
    print(len(unique_smiles))
    test_indices = []
    random.shuffle(unique_smiles)
    print(len(unique_smiles))
    test_smiles = unique_smiles[:round(0.2*len(unique_smiles))] # 20% held out set
    print(len(test_smiles))
    for t in test_smiles:
        for i, s in enumerate(smiles):
            if t == s:
                test_indices.append(i)
    print(len(test_indices))
    for i, (s, z, syn, c) in enumerate(zip(smiles, zeolites, synthesis, codes)):
        if i in test_indices:
            test_smiles.append(s)
            test_zeolites.append(z)
            test_synthesis.append(syn)
            test_codes.append(c)
            test_input.append(list(z)+list(syn))
        else:
            train_smiles.append(s)
            train_zeolites.append(z)
            train_synthesis.append(syn)
            train_codes.append(c)
            train_input.append(list(z)+list(syn))
else:
    if split not in unique_codes:
        print('Problem:', split, 'not a zeolite in the data')
    else:
        for i, (s, z, syn, c) in enumerate(zip(smiles, zeolites, synthesis_codes)):
            if split == c:
                test_smiles.append(s)
                test_zeolites.append(z)
                test_synthesis.append(syn)
                test_codes.append(c)
                test_input.append(list(z)+list(syn))
            else:
                train_smiles.append(s)
                train_zeolites.append(z)
                train_synthesis.append(syn)
                train_codes.append(c)
                train_input.append(list(z)+list(syn))
train_zeolites = np.array(train_zeolites)
train_synthesis = np.array(train_synthesis)
test_zeolites = np.array(test_zeolites)
test_synthesis = np.array(test_synthesis)
train_input = np.array(train_input)
test_input = np.array(test_input)
print(len(train_smiles), len(train_codes), train_zeolites.shape, train_synthesis.shape)
print(len(test_smiles), len(test_codes), test_zeolites.shape, test_synthesis.shape)
print(train_input.shape, test_input.shape)

73913
73913
14783
36878
142735 142735 (142735, 7) (142735, 17)
51661 36878 (36878, 7) (36878, 17)
(142735, 24) (36878, 24)


In [9]:
# convert training smiles to binary
mol_train = [Chem.rdchem.Mol.ToBinary(Chem.MolFromSmiles(x)) for x in train_smiles]
mol_train = np.array(mol_train)
print(len(mol_train))

142735


In [10]:
# Need to pass in set of possible characters and max len of smiles
charset = ''
for x in train_smiles:
    for i in x:
        if i not in charset:
            charset = charset+i
print(len(charset))
print(charset)
max_len = 0
for x in train_smiles:
    if len(x) > max_len:
        max_len = len(x)
print(max_len)

24
C1(N)2[n+]c34PO=HlF-567S
118


In [11]:
dataset_info = dict()
dataset_info['maxlen'] = max_len
dataset_info['charset'] = charset
dataset_info['name'] = 'OSDA_model'
model = ddc.DDC(x=np.array(list(train_input)),
                y=mol_train,
                scaling=False,
                pca=False,
                dec_layers=3,
                code_layer_dim=128,
                batch_size=128,
                dataset_info=dataset_info
               )

Initializing model in train mode.
Input type is 'molecular descriptors'.
Model received 128461 train samples and 14274 validation samples.
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Latent_Input (InputLayer)       [(None, 24)]         0                                            
__________________________________________________________________________________________________
Decoder_Inputs (InputLayer)     [(None, 132, 27)]    0                                            
__________________________________________________________________________________________________
latent_to_states_model (Model)  [(None, 256), (None, 44544       Latent_Input[0][0]               
__________________________________________________________________________________________________
batch_model (Model)             (None, 132, 27)     

In [12]:
model.fit(epochs=100, 
            lr=1e-3,
            mini_epochs=10,
            patience=25,
            model_name='OSDA_model',
            verbose=1,
            save_period=50,
            lr_decay=True, 
            sch_epoch_to_start=500,
            sch_lr_init=1e-3,
            sch_lr_final=1e-6,
            checkpoint_dir='bin/'
           )


Model trained with dataset OSDA_model that has maxlen=128 and charset=C1(N)2[n+]c34PO=HlF-567S for 100 epochs.
noise_std: 0.010000, lstm_dim: 256, dec_layers: 3, td_dense_dim: 0, batch_size: 128, codelayer_dim: 24, lr: 0.001000.

Epoch 00001: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 1/1000

Epoch 00002: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 2/1000

Epoch 00003: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 3/1000

Epoch 00004: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 4/1000

KeyboardInterrupt: 

In [13]:
model.save('models/OSDA_model_example')

Model saved.
Elapsed time: 0.402 seconds.
