In [None]:
# Label stored in the 'Origin' column for the molecules generated in this run.
iteration = '10 Iter'
# NOTE(review): `new` is only assigned much further down (the concat of `data`
# and `df_generated`), so on a fresh kernel this line raises NameError. It
# appears to persist the result of a *previous* run as the input of this one —
# confirm the intended cell order.
# The frame's index is written as a column too (pandas default).
new.to_csv('data_cycles.csv')

Import packages

In [None]:
import torch
import properscoring as ps

from torch import nn
from torch import optim

from nflows.nn.nets import ResidualNet
from torch.nn import functional as F
from nflows.flows.base import Flow
from nflows.distributions.normal import StandardNormal

from nflows.transforms.base import CompositeTransform
from nflows.transforms.autoregressive import MaskedAffineAutoregressiveTransform, MaskedPiecewiseQuadraticAutoregressiveTransform, MaskedPiecewiseLinearAutoregressiveTransform
from nflows.transforms.permutations import ReversePermutation
from nflows.transforms.autoregressive import MaskedUMNNAutoregressiveTransform, MaskedPiecewiseRationalQuadraticAutoregressiveTransform
from nflows.transforms.base import (
    CompositeTransform,
    InputOutsideDomain,
    InverseTransform,
    Transform)
from nflows.transforms.splines.cubic import unconstrained_cubic_spline
from nflows.transforms.autoregressive import AutoregressiveTransform
from nflows.transforms import made as made_module
from nflows.utils import torchutils
from nflows.transforms.base import InputOutsideDomain

In [None]:
import selfies as sf
import tensorflow as tf

In [None]:
import matplotlib.pyplot as plt
import sklearn.datasets as datasets
import pandas as pd
import numpy as np
import copy

import os
import sys
from rdkit.Chem import RDConfig
sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
import sascorer
from rdkit import Chem
from rdkit.Chem import QED
from rdkit import RDLogger 
RDLogger.DisableLog('rdApp.*')  # suppress error messages

import math # just for train/test split
import time
import seaborn as sns
import matplotlib.pyplot as plt

from collections import OrderedDict

In [None]:
# Select the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:
# Release cached GPU memory held by torch's allocator before building the model.
torch.cuda.empty_cache()

Import data

In [None]:
def get_data(df):
    """Return the ``QED`` column of ``df`` as a single-column 2-D array.

    Args:
        df: DataFrame with a ``'QED'`` column.

    Returns:
        ``df['QED'].values`` reshaped to ``(-1, 1)``, i.e. one row per
        DataFrame row and one column.
    """
    # Only QED is used as context; a multi-column context (e.g. QED plus
    # length) would need a different selection here.
    qed_values = df['QED'].values
    return qed_values.reshape((-1, 1))

def preprocess_smiles_NaN(smiles):
    """Encode a SMILES string as SELFIES, falling back to a placeholder.

    Args:
        smiles: Value passed straight to ``sf.encoder``.

    Returns:
        Whatever ``sf.encoder(smiles)`` returns, or the literal string
        ``'nop'`` if the encoder raises an ``Exception``.
    """
    try:
        encoded = sf.encoder(smiles)
    except Exception:
        # Catch ``Exception`` rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop a long ``.apply`` run.
        print('Failed to convert ' + str(smiles))
        # NOTE(review): the fallback is the bare text 'nop', not the bracketed
        # SELFIES padding symbol '[nop]' used elsewhere in this notebook.
        # Kept as-is because previously saved CSVs may contain it — confirm
        # which form is intended.
        encoded = 'nop'
    return encoded

def preprocess_smiles(smiles):
    """Encode ``smiles`` with ``sf.encoder``; encoder errors propagate to the caller."""
    return sf.encoder(smiles)

def keys_int(symbol_to_int):
    """Map positions ``0, 1, 2, ...`` to the keys of ``symbol_to_int``.

    Keys are numbered in the mapping's iteration order; the values of
    ``symbol_to_int`` are not consulted.

    Args:
        symbol_to_int: Mapping whose keys are to be enumerated.

    Returns:
        A new dict ``{position: key}``.
    """
    return dict(enumerate(symbol_to_int.keys()))

def get_selfies(df):
    """Build a noisy ("dequantized") flat one-hot encoding of ``df.selfies``.

    Side effects:
        * Rebinds the module-level names ``selfies_list``,
          ``largest_selfie_len``, ``int_mol`` and ``selfies_alphabet``; later
          cells read these to decode samples.
        * Changes the ``selfies`` library's semantic constraints (reset, then
          the ``'?'`` entry set to 3).

    Args:
        df: DataFrame exposing a ``selfies`` column of SELFIES strings.

    Returns:
        A float32 TensorFlow tensor: the flat one-hot encoding produced by
        ``sf.batch_selfies_to_flat_hot`` plus element-wise uniform noise drawn
        with ``minval=0, maxval=1``. The noise is not seeded here, so repeated
        calls return different values.
    """
    global selfies_list, largest_selfie_len, int_mol, selfies_alphabet

    sf.set_semantic_constraints()  # reset constraints
    constraints = sf.get_semantic_constraints()
    # NOTE(review): '?' is presumably the catch-all entry of the constraint
    # table — confirm against the selfies documentation.
    constraints['?'] = 3
    sf.set_semantic_constraints(constraints)
    
    selfies_list = np.asanyarray(df.selfies)
    selfies_alphabet = sf.get_alphabet_from_selfies(selfies_list)
    # selfies_alphabet.add('.')  # test this
    selfies_alphabet.add('[nop]')  # Add the "no operation" symbol as a padding character; selfies_alphabet.remove('[P]')
    # Sorting fixes the symbol order, so symbol -> index is the same for a given alphabet.
    selfies_alphabet = list(sorted(selfies_alphabet))
    # selfies_list = np.asanyarray(df.selfies) # added this line for a subset only if I want to load more selfie alphabet examples
    # Longest string (in SELFIES symbols) sets the padded length used for encoding.
    largest_selfie_len = max(sf.len_selfies(s) for s in selfies_list)
    symbol_to_int = dict((c, i) for i, c in enumerate(selfies_alphabet))
    # Inverse lookup (index -> symbol), used later to turn one-hot rows back into SELFIES.
    int_mol=keys_int(symbol_to_int)
    
    onehots=sf.batch_selfies_to_flat_hot(selfies_list, symbol_to_int, largest_selfie_len)
    input_tensor = tf.convert_to_tensor(onehots, dtype='float32')
    # Uniform noise added to every 0/1 entry; a later cell applies floor() to
    # generated samples to undo this.
    noise_tensor = tf.random.uniform(shape=input_tensor.shape, minval=0, maxval=1, dtype='float32')
    data = tf.add(input_tensor, noise_tensor) # dequantized data
    return data


In [None]:
# Alternative source kept for reference:
#df = pd.read_csv('../Dataset_after_cycle_4.csv',delimiter=',').dropna(subset=['QED']).drop_duplicates(subset=['Original_SMILES'])
data = (
    pd.read_csv('data_cycles.csv', delimiter=',')
    .dropna(subset=['QED'])                        # rows without a QED value cannot be used as context
    .drop_duplicates(subset=['Original_SMILES'])   # keep the first occurrence of each SMILES
)
#data = df[df["Origin"] == 'Initial'] #df.copy()
#data['len'] = data['selfies'].apply(lambda x: sf.len_selfies(x))

In [None]:
# The 50 rows with the largest QED (ascending sort, then the tail); their QED
# values are used below as the conditioning contexts when sampling.
data_s = data.sort_values(by=['QED']).tail(50)

In [None]:
# Inputs: dequantized one-hot SELFIES; contexts: the QED column.
# get_selfies() runs first because it also sets the globals used for decoding.
x = torch.tensor(np.float32(get_selfies(data))).float()
y = torch.tensor(np.float32(get_data(data))).float()

# Fraction 1.0 => every row is used for training (no hold-out split).
index = math.floor(x.shape[0] * 1.0)

x_train, y_train = x[:index], y[:index, :]

In [None]:
# Contexts to condition the sampler on: the QED values of the top-QED subset.
# Only contexts are prepared; no matching inputs are built for this subset.
y_s = torch.tensor(np.float32(get_data(data_s))).float()

# Fraction 1.0 => all of the selected contexts are kept.
index = math.floor(y_s.shape[0] * 1.0)

y_test = y_s[:index, :]

In [None]:
# Move the training data and sampling contexts to the GPU when one is available.
if torch.cuda.is_available():
    x_train = x_train.cuda()
    y_train = y_train.cuda()
    y_test = y_test.cuda()

In [None]:
# Shapes of training inputs, training contexts and sampling contexts.
print(*(tensor.shape for tensor in (x_train, y_train, y_test)))

In [None]:
# Conditional masked autoregressive flow: `num_layers` repetitions of
# (reverse permutation -> masked affine autoregressive transform), each
# conditioned on a single context feature.
num_layers = 12 # was 5
hiddenfeatures = 16
n_features = x_train.shape[1]  # width of the flat one-hot encoding

base_dist = StandardNormal(shape=[n_features])

transforms = []
for _ in range(num_layers):
    # Construction order is kept (permutation first, then the MAF layer).
    transforms.extend([
        ReversePermutation(features=n_features),
        MaskedAffineAutoregressiveTransform(
            features=n_features,
            hidden_features=hiddenfeatures,
            context_features=1,
        ),
    ])

transform = CompositeTransform(transforms)

flow = Flow(transform, base_dist)
if torch.cuda.is_available():
    flow = flow.cuda()

optimizer = optim.Adam(flow.parameters(), lr=1e-4)
# Learning rate is multiplied by 0.3 at iterations 3000 and 5000
# (was 300, 600, 0.3, with 1000 epochs).
scheduler = optim.lr_scheduler.MultiStepLR(
    optimizer,
    milestones=[3000, 5000],
    gamma=0.3,
)

In [None]:
%%time
num_iter = 7500

for i in range(num_iter):
    optimizer.zero_grad()
    loss = -flow.log_prob(inputs=x_train, context=y_train).mean()
    if i%500 == 0:
      print('iteration',i,':',loss.item())
    loss.backward()
    optimizer.step()
    scheduler.step()

In [None]:
# NumPy copy (on the CPU) of the sampling contexts; stored later as the
# 'Context' column of the generated molecules.
y_true = y_test.cpu().numpy()

In [None]:
%%time
samples = []
with torch.no_grad():
  for i in range(y_true.shape[0]):
      sample = flow.sample(1,context=y_test[i,:].reshape((-1,1))).cpu().numpy()
      sample = sample.squeeze()
      if i%10 == 0:
          print('sample',i)
      samples.append(list(sample))

samples = np.array(samples)
print(samples.shape)

In [None]:
# Undo the dequantization: floor() maps the continuous samples back to
# integers, then clipping forces every entry to 0 or 1.
mols = tf.math.floor(samples)  # quantize data
mols = tf.clip_by_value(mols, 0, 1)  # Set negative values to 0 and values > 1 to 1
mols_list = mols.numpy().tolist()

# Add padding characters if needed: a position whose one-hot row is all zeros
# cannot be decoded, so mark it as '[nop]'. The slot is looked up by name
# rather than assumed to be the last one, which only holds when '[nop]'
# happens to sort after every other symbol in the alphabet.
alphabet_size = len(selfies_alphabet)
nop_offset = selfies_alphabet.index('[nop]')
for mol in mols_list:
    for pos in range(largest_selfie_len):
        start = alphabet_size * pos
        if not any(mol[start:start + alphabet_size]):
            mol[start + nop_offset] = 1

In [None]:
# Decode the one-hot rows to SELFIES strings, then check that each decodes to
# a SMILES string that RDKit can parse and sanitize.
mols = sf.batch_flat_hot_to_selfies(mols_list, int_mol)

valid_selfies, invalid_selfies = [], []
for selfies in mols:
    try:
        parsed = Chem.MolFromSmiles(sf.decoder(selfies), sanitize=True)
    except Exception:
        # A sample whose decoding/parsing raises is recorded as invalid
        # instead of silently vanishing from both lists.
        parsed = None
    if parsed is not None:
        valid_selfies.append(selfies)
    else:
        invalid_selfies.append(selfies)

valid_count = len(valid_selfies)
# Report a real percentage (the ratio times 100) and avoid dividing by zero
# when nothing was generated.
valid_percent = 100.0 * valid_count / len(mols) if mols else 0.0
print('%.2f' % valid_percent, '% of generated samples are valid molecules.')

In [None]:
# SMILES of every valid generated SELFIES string (without the suffix below).
smiles_generated = [sf.decoder(vs) for vs in valid_selfies]

# RDKit molecules for the same strings with the 'OP(C)(=O)F' fragment
# appended; entries RDKit cannot parse (None) are dropped.
candidate_mols = (Chem.MolFromSmiles(smi + 'OP(C)(=O)F') for smi in smiles_generated)
gen_mols = [parsed for parsed in candidate_mols if parsed]

from collections import OrderedDict
# Display the distinct generated SMILES in first-seen order.
OrderedDict.fromkeys(smiles_generated, True).keys()

In [None]:
# First free index label after the rows already in `data`. Taken from the
# index itself rather than from `data.idxmax()[0]`, which depended on whatever
# column came first and fails on non-numeric columns in recent pandas.
my_range_min = data.index.max() + 1

df_generated = pd.DataFrame(smiles_generated, columns=["Original_SMILES"])#.drop_duplicates()

# One context per generated molecule. If some samples were rejected as
# invalid, the lists no longer line up; fail loudly rather than mis-assign.
contexts = y_true.reshape(-1)
if len(contexts) != len(df_generated):
    raise ValueError(
        'Got %d generated SMILES but %d contexts; invalid samples were dropped, '
        'so contexts can no longer be matched to molecules by position.'
        % (len(df_generated), len(contexts))
    )
df_generated['Context'] = contexts
#df_generated['len'] = df_generated['Original_SMILES'].apply(lambda x: len(x))
df_generated['selfies'] = df_generated['Original_SMILES'].apply(preprocess_smiles_NaN)
# NOTE(review): pd.Series(n) is a one-element Series, so after index alignment
# only row 0 receives the sample count and every other RowID is NaN. Left
# unchanged because the intended value is unclear — confirm.
df_generated['RowID'] = pd.Series(samples.shape[0])
df_generated['Origin'] = iteration
# Shift the generated rows past the existing index so labels do not collide.
df_generated.index += my_range_min
new = pd.concat([data, df_generated])#.drop_duplicates(subset=['Original_SMILES'])

In [None]:
# (Re)compute QED and SA score for every row of `new`, scoring each molecule
# with the 'OP(C)(=O)F' fragment appended to its SMILES.
for index, row in new.iterrows():
    # Default to NaN; a value is only replaced once its computation succeeds,
    # so a failure in the SA scorer still keeps an already computed QED.
    qed = np.nan
    sas_score = np.nan
    try:
        mol = Chem.MolFromSmiles(row['Original_SMILES']+'OP(C)(=O)F')
        qed = QED.default(mol)
        sas_score = sascorer.calculateScore(mol)
    except Exception:
        # `Exception` instead of a bare `except`, so interrupting the kernel
        # actually stops this loop instead of being swallowed row by row.
        pass

    new.at[index, "QED"] = qed
    new.at[index, "SA_score"] = sas_score

In [None]:
# Number of SELFIES symbols per molecule.
new['SELFIES_Length'] = new['selfies'].apply(sf.len_selfies)

# Set the theme *before* creating the figure: this is the first set_theme()
# call in the notebook, and the style is only picked up by axes created
# afterwards.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots()

# Stacked histogram of string length, coloured by where each molecule came from.
sns.histplot(data=new, x="SELFIES_Length", hue="Origin", kde=True,
             element="step", bins=15, multiple="stack", ax=ax)
ax.set_xlabel('Selfie String Length')

In [None]:
fig, ax = plt.subplots()
sns.set_theme(style="whitegrid")

# Stacked QED histogram, coloured by the origin of each molecule.
sns.histplot(
    data=new,
    x="QED",
    hue="Origin",
    kde=True,
    element="step",
    bins=20,
    multiple="stack",
    ax=ax,
)
ax.set_xlabel('QED')
# NOTE(review): 0.712277 is presumably a reference QED value — confirm its source.
ax.axvline(x=0.712277)

In [None]:
# Rows that were generated in the current iteration.
is_current_iteration = new["Origin"] == iteration
y_new = new.loc[is_current_iteration]

In [None]:
# Requested context (y) against the QED actually computed for each molecule (x).
fig, ax = plt.subplots()
sns.scatterplot(data=new, x="QED", y="Context", hue="Origin", ax=ax)
ax.set_xlabel('Calculated QED for the sample molecule')
ax.legend(bbox_to_anchor=(1.0, 1.00))
# NOTE(review): 0.712277 is presumably a reference QED value — confirm its source.
ax.axvline(x=0.712277)
ax.axhline(y=0.712277)

In [None]:
symbol_to_int = {symbol: idx for idx, symbol in enumerate(selfies_alphabet)}

# Flat one-hot encoding of the training SELFIES, then the column-wise sum:
# entry k counts how many molecules have a 1 in flat slot k.
analysis_input = sf.batch_selfies_to_flat_hot(selfies_list, symbol_to_int, largest_selfie_len)
n_input_slots = len(analysis_input[0])
analysis_input_results = [
    sum(encoded[slot] for encoded in analysis_input)
    for slot in range(n_input_slots)
]

def split_list(lst, chunk_size):
    """Cut ``lst`` into consecutive slices of ``chunk_size`` items.

    The last slice is shorter when ``len(lst)`` is not a multiple of
    ``chunk_size``; an empty ``lst`` gives an empty list.
    """
    chunks = []
    for start in range(0, len(lst), chunk_size):
        chunks.append(lst[start:start + chunk_size])
    return chunks

# One chunk per string position: chunk p holds, for every alphabet symbol, how
# often that symbol occurs at position p in the training SELFIES.
input_analysis = split_list(analysis_input_results, len(selfies_alphabet))
# symbol_to_int # = x 

In [None]:
symbol_to_int = {symbol: idx for idx, symbol in enumerate(selfies_alphabet)}

# Same per-slot counts as for the training data, now for the generated SELFIES.
analysis_output = sf.batch_selfies_to_flat_hot(mols, symbol_to_int, largest_selfie_len)
n_output_slots = len(analysis_output[0])
analysis_output_results = [
    sum(encoded[slot] for encoded in analysis_output)
    for slot in range(n_output_slots)
]

output_analysis = split_list(analysis_output_results, len(selfies_alphabet))

# position -> per-symbol counts, plus the symbol labels under the 'Selfie' key.
histogram_inp = dict(enumerate(input_analysis))
histogram_out = dict(enumerate(output_analysis))

histogram_inp['Selfie'] = selfies_alphabet
histogram_out['Selfie'] = selfies_alphabet

In [None]:
# 4x4 grid of bar charts: for each string position, how often every alphabet
# symbol occurs there in the training data ('Input') and in the generated
# molecules ('Output').
fig, axes = plt.subplots(4, 4, figsize=(24, 24))
plt.subplots_adjust(left=0.1,
                    bottom=0.05,
                    right=0.99,
                    top=0.97,
                    wspace=0.3,
                    hspace=0.6
                   )

# Panels in row-major order.
position = list(axes.flat)

names =     ["0th", "1st", "2nd", "3rd",
             "4th", "5th", "6th", "7th",
             "8th", "9th", "10th", "11th",
             "12th", "13th", "14th", "15th"]

for ax, title in zip(position, names):
    ax.set_title(title, fontsize=16)

# Plot only the positions that exist: fewer than 16 would otherwise raise
# KeyError; positions beyond the 16 panels are not shown.
n_panels = min(len(position), len(input_analysis), len(output_analysis))
for i in range(n_panels):
    data_vis = pd.DataFrame({
        'Selfie': selfies_alphabet,
        'Input': histogram_inp[i],
        'Output': histogram_out[i],
    })

    data_vis.plot(ax=position[i], kind='bar', x='Selfie', y=['Input', 'Output'], rot = 90)

    # Counts span orders of magnitude, hence the log scale.
    position[i].set_yscale('log')
#    position[i].set_ylim(top=200)

# Deleting the duplicate x labels on rows 0 and 2 (rows 1 and 3 keep theirs).
# Done once after plotting instead of on every loop iteration.
for row_idx in (0, 2):
    for ax in axes[row_idx]:
        ax.set_xlabel('', fontsize=20)

fig.savefig('Distributions-in-out.png')

In [None]:
# Recompute QED and SA score for every row of the cycle-4 dataset and save the
# result next to the original file.
to_clean_up = pd.read_csv('../Dataset_after_cycle_4.csv',delimiter=',')#.dropna(subset=['QED']).drop_duplicates(subset=['Original_SMILES'])

for index, row in to_clean_up.iterrows():
    # Default to NaN; a value is only replaced once its computation succeeds,
    # so a failure in the SA scorer still keeps an already computed QED.
    qed = np.nan
    sas_score = np.nan
    try:
        mol = Chem.MolFromSmiles(row['Original_SMILES']+'OP(C)(=O)F')
        qed = QED.default(mol)
        sas_score = sascorer.calculateScore(mol)
    except Exception:
        # `Exception` instead of a bare `except`, so interrupting the kernel
        # actually stops this loop instead of being swallowed row by row.
        pass

    to_clean_up.at[index, "QED"] = qed
    to_clean_up.at[index, "SA_score"] = sas_score

# The frame's index is written as an extra first column (pandas default).
to_clean_up.to_csv('../Dataset_after_cycle_4-corrected.csv')

In [None]:
fig, ax = plt.subplots()
sns.set_theme(style="whitegrid")

# Stacked QED histogram of the rescored cycle-4 dataset, coloured by origin.
sns.histplot(
    data=to_clean_up,
    x="QED",
    hue="Origin",
    kde=True,
    element="step",
    bins=20,
    multiple="stack",
    ax=ax,
)
ax.set_xlabel('QED')
# NOTE(review): 0.712277 is presumably a reference QED value — confirm its source.
ax.axvline(x=0.712277)