# Export

Take SMILES sequences and generate SDF and PDBQT files, for analysis in PyRx.

In [None]:
import os
from os import listdir
import random
import pandas as pd
import numpy as np

from openbabel import openbabel # conda install -c conda-forge openbabel

from rdkit import Chem, DataStructs
import rdkit.Chem.PropertyMol
from rdkit.Chem.Fingerprints import FingerprintMols

from src.features.smiles import cleanup_list_smiles, validate_mols, convert_mols_to_smiles, encode_list_smiles

In [None]:
# Load the generated SMILES file. It has no header row, so pandas numbers the
# columns and the SMILES strings end up in column 0.
df_smiles = pd.read_csv(
    'data/processed/generated_smiles.smi',
    header=None,
    sep=',',
)
display(df_smiles)

# Plain Python list of the first column, for the helpers used further down.
list_smiles = df_smiles[0].tolist()

In [None]:
# Takes a list of molecules, shuffles them randomly, and seeds the selection with the first thirty.
# It then applies a maximum-similarity threshold between any new molecule and the molecules already selected,
# and iteratively raises that threshold until X molecules have been picked, to ensure diversity.

def initialize_generation_from_mols(list_of_mols, desired_length, seed_size=30,
                                    threshold_step=.05):
    """Pick a structurally diverse subset of molecules.

    The molecules are shuffled, the first ``seed_size`` of them form the seed
    set, and the rest are added greedily: a candidate is accepted when its
    highest Tanimoto similarity (on RDKit fingerprints) to anything already
    selected is at or below the current threshold. The threshold starts at
    ``threshold_step`` and is raised by ``threshold_step`` after every pass
    over the remaining candidates.

    A candidate whose similarity to an already selected molecule is 1 (an
    identical fingerprint) is never added.

    Args:
        list_of_mols: Molecules accepted by ``Chem.RDKFingerprint``. The list
            itself is left untouched; a shuffled copy is used internally.
        desired_length: Number of molecules wanted. Must be greater than
            ``seed_size``.
        seed_size: Number of shuffled molecules accepted unconditionally as
            the starting set.
        threshold_step: Initial similarity threshold and the amount it grows
            by after each pass. Must be positive.

    Returns:
        A list of at most ``desired_length`` molecules. Fewer are returned
        when the input does not hold enough molecules with distinct
        fingerprints to reach the target.

    Raises:
        ValueError: If ``seed_size`` is negative, ``threshold_step`` is not
            positive, or ``desired_length`` does not exceed ``seed_size``.
    """
    if seed_size < 0:
        raise ValueError("seed_size must not be negative")
    if threshold_step <= 0:
        raise ValueError("threshold_step must be positive")
    if desired_length <= seed_size:
        raise ValueError("desired_length must be greater than seed_size")

    # Shuffle a copy so the caller's list keeps its order.
    shuffled_mols = list(list_of_mols)
    random.shuffle(shuffled_mols)

    # Prepare fingerprints for the similarity calculations.
    mol_fingerprints = [Chem.RDKFingerprint(mol) for mol in shuffled_mols]

    selected_mols = shuffled_mols[:seed_size]
    selected_fingerprints = mol_fingerprints[:seed_size]
    candidates = list(zip(mol_fingerprints[seed_size:], shuffled_mols[seed_size:]))

    similarity_threshold = threshold_step
    # Stop as soon as the target is reached OR no candidate is left; the
    # second condition guarantees termination when the pool is too small.
    while candidates and len(selected_mols) < desired_length:
        deferred = []
        for fingerprint, mol in candidates:
            if len(selected_mols) >= desired_length:
                break
            similarities = DataStructs.BulkTanimotoSimilarity(
                fingerprint, selected_fingerprints
            )
            max_similarity = max(similarities, default=0.0)
            if max_similarity >= 1:
                # Identical to something already selected. The selection only
                # grows, so this candidate can never become eligible: drop it.
                continue
            if max_similarity <= similarity_threshold:
                selected_fingerprints.append(fingerprint)
                selected_mols.append(mol)
            else:
                # Too similar for now; retry once the threshold is higher.
                deferred.append((fingerprint, mol))
        candidates = deferred
        similarity_threshold += threshold_step
    return selected_mols

In [None]:
# Turn the SMILES list into molecules with the project helper, then reduce it
# to a diverse selection targeting 1000 molecules and report how many we got.
mols = initialize_generation_from_mols(
    validate_mols(list_smiles),
    1000,
)
print(len(mols))

In [None]:
# Write the SMILES held in df_smiles to SDF files, BATCH_SIZE molecules per
# file, named batch1.sdf, batch2.sdf, ... inside OUTPUT_PATH.
# NOTE(review): no 3D coordinates are generated anywhere in this cell; confirm
# that the downstream tooling builds/minimises the geometry itself.
BATCH_SIZE = 1
N = len(df_smiles)
OUTPUT_PATH = "data/processed/"

# Ceiling division: the exact number of batches needed. The previous
# int(N / BATCH_SIZE) + 1 produced one extra, empty batch (and an empty .sdf
# file) whenever N was a multiple of BATCH_SIZE.
n_batches = (N + BATCH_SIZE - 1) // BATCH_SIZE

for j in range(n_batches):

    first = j * BATCH_SIZE
    last = min(N, first + BATCH_SIZE) - 1

    print('Batch', j + 1, ' - ', first, 'to', last)

    # Parse the whole batch first so that no file is created for a batch that
    # turns out to contain no usable molecule.
    batch_mols = []
    for i in range(first, last + 1):
        smiles = df_smiles[0][i]

        # MolFromSmiles returns None (instead of raising) for SMILES it
        # cannot parse; skip those rather than crash on the next call.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            print('Skipping row', i, '- could not parse SMILES:', smiles)
            continue

        print(FingerprintMols.FingerprintMol(mol))
        batch_mols.append(mol)

    if not batch_mols:
        continue

    filename = os.path.join(OUTPUT_PATH, "batch" + str(j + 1) + ".sdf")

    writer = Chem.SDWriter(filename)
    try:
        for mol in batch_mols:
            writer.write(mol)
    finally:
        # Always flush and release the file, even if a write fails.
        writer.close()

In [None]:
# Convert every .sdf file found in OUTPUT_PATH into a .pdbqt file with the
# same base name, adding hydrogens and using the formula as the title.
# NOTE(review): ReadFile loads a single molecule per file; if BATCH_SIZE is
# ever raised above 1 the remaining records would need obConversion.Read(mol)
# in a loop - confirm before changing the batch size.
sdf_files = [f for f in os.listdir(OUTPUT_PATH) if f.endswith('.sdf')]

for sdf in sdf_files:

    obConversion = openbabel.OBConversion()
    obConversion.SetInAndOutFormats("sdf", "pdbqt")

    mol = openbabel.OBMol()
    # ReadFile reports failure through its return value; without this check
    # an empty or unreadable SDF would still be written out as a PDBQT.
    if not obConversion.ReadFile(mol, os.path.join(OUTPUT_PATH, sdf)):
        print('Skipping', sdf, '- no molecule could be read')
        continue

    mol.AddHydrogens()

    # Computed after AddHydrogens so the title includes the added hydrogens.
    formula = mol.GetFormula()
    print(formula)
    mol.SetTitle(formula)

    pdbqt_name = os.path.splitext(sdf)[0] + '.pdbqt'
    obConversion.WriteFile(mol, os.path.join(OUTPUT_PATH, pdbqt_name))

    obConversion.CloseOutFile()