# Ligand Selection and Preparation, Docking and Analysis

This notebook contains the code necessary to prepare the ligands for docking, run the docking in smina, and analyse the docking results.

In [22]:
# Ensure docking-env is activated as kernel.

# Import all libraries that are required.

import os
import subprocess

import datamol as dm
import numpy as np
import pandas as pd
from openbabel import pybel
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdDistGeom
from rdkit.Chem.rdmolfiles import MolToPDBFile
from standardiser import standardise


# Project-relative base directories; the file paths used in the cells below are built from these.
DATAPATH = "../data"        # ligand CSVs, prepared ligand files and the receptor
RESULTSPATH = "../results"  # docking outputs and logs
SOURCEPATH = "../src"       # location of the smina binary


## Ligand Selection, Validation and Standardisation

The ligand selection step involves:

- Identification of ligands (from generative model, DB's, etc) and extraction as smiles strings
- Standardisation - smiles-->mol, mol--> smiles, removal of any ligands that have no smiles
- Filtering of ligands (based on pharmacophore matching score, eos models, synthetic likelihood, etc)

Note on using Ersilia 'find similar compounds' models: these return a .csv file containing the input SMILES followed by all similar SMILES in a single cell, comma-separated. In a spreadsheet, split that cell with 'text to columns' so that the SMILES sit in one row, select them all (shift-click), cut, and paste with transpose so that they form a column. Then find every `'` character and replace it with nothing. Finally, add an 'ID' column (simple numbering 1..x is fine), because the ID is used as the file name when converting SMILES to molecules for 3D generation. Save the file.

In [11]:
# Preprocessing via datamol (dm): input and output locations for SMILES standardisation.
testcsv = os.path.join(DATAPATH, "test", "smalltest.csv")  # raw input table
smiles_column = "SMILES"  # column of `data` that holds the raw SMILES strings
data = pd.read_csv(testcsv)
clean_data = os.path.join(DATAPATH, "test", "std_smiles.csv")  # output *path* (not a DataFrame) for the standardised table

def _preprocess(row):
    """Standardise the SMILES of a single table row with datamol.

    Reads ``row[smiles_column]``, passes it through datamol's fix, sanitise,
    standardise and valence-repair steps, and stores the resulting SMILES
    under ``row["ST_SMILES"]``. The modified row is returned so the function
    can be used with ``DataFrame.apply(..., axis=1)``.
    """
    raw_smiles = row[smiles_column]

    # SMILES -> molecule, then repair and sanitise it.
    molecule = dm.to_mol(raw_smiles, ordered=True)
    molecule = dm.fix_mol(molecule, remove_singleton=True, inplace=True)
    molecule = dm.sanitize_mol(molecule, sanifix=True, charge_neutral=False)

    # Standardisation settings are kept together so they are easy to audit.
    standardize_options = dict(
        disconnect_metals=False,
        normalize=True,
        reionize=True,
        uncharge=False,
        stereo=True,
    )
    molecule = dm.standardize_mol(molecule, **standardize_options)

    # Final valence / charge clean-up before writing the SMILES back.
    molecule = dm.fix_valence(molecule, inplace=True, allow_ring_break=True)
    molecule = dm.fix_valence_charge(molecule, inplace=True)

    standard_smiles = dm.standardize_smiles(dm.to_smiles(molecule))
    row["ST_SMILES"] = standard_smiles
    return row


# Standardise every row, then drop the raw SMILES column so that only the
# standardised "ST_SMILES" column is carried forward. The column to drop is
# taken from `smiles_column` so it cannot drift from the column that
# `_preprocess` actually reads.
data_clean = data.apply(_preprocess, axis=1).drop(columns=[smiles_column])
data_clean.to_csv(clean_data, index=False)


[15:33:46] Initializing Normalizer
[15:33:46] Running Normalizer
[15:33:46] Initializing MetalDisconnector
[15:33:46] Running MetalDisconnector
[15:33:46] Initializing Normalizer
[15:33:46] Running Normalizer
[15:33:46] Initializing Normalizer
[15:33:46] Running Normalizer
[15:33:46] Initializing MetalDisconnector
[15:33:46] Running MetalDisconnector
[15:33:46] Initializing Normalizer
[15:33:46] Running Normalizer
[15:33:46] Initializing Normalizer
[15:33:46] Running Normalizer
[15:33:46] Initializing MetalDisconnector
[15:33:46] Running MetalDisconnector
[15:33:46] Initializing Normalizer
[15:33:46] Running Normalizer
[15:33:46] Initializing Normalizer
[15:33:46] Running Normalizer
[15:33:46] Initializing MetalDisconnector
[15:33:46] Running MetalDisconnector
[15:33:46] Initializing Normalizer
[15:33:46] Running Normalizer
[15:33:46] Initializing Normalizer
[15:33:46] Running Normalizer
[15:33:46] Initializing MetalDisconnector
[15:33:46] Running MetalDisconnector
[15:33:46] Initializ

In [12]:
#compute descriptors, keep only ligands with 300 <= MolWt <= 800, then drop the descriptor column again

#this takes the standardised smiles csv, filters it by molecular weight and writes the filtered csv

csv_200_descriptors = os.path.join(DATAPATH, "test", "csv_200_desc.csv")
smiles_plus_desc = os.path.join(DATAPATH, "test", "smiles_descriptors.csv") 
filtered_std_smiles = os.path.join(DATAPATH, "test", "filtered_std_smiles.csv")                                                          

from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

#df1 = pd.read_csv(std_csv_file)

def mol_descriptor(clean_data):
    """Compute every descriptor in ``Descriptors._descList`` for a set of SMILES.

    Parameters
    ----------
    clean_data : iterable of str
        SMILES strings (e.g. the ``ST_SMILES`` column of a DataFrame).
        Despite the name, this is the SMILES values, not a file path.

    Returns
    -------
    Mol_descriptors : list of tuple
        One tuple of descriptor values per input entry, in input order.
        Entries that are not strings or that RDKit cannot parse yield a tuple
        of NaN, so the result stays row-aligned with the input instead of
        crashing in ``Chem.AddHs``.
    desc_names : sequence of str
        Descriptor names, in the same order as the values in each tuple.
    """
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    desc_names = calc.GetDescriptorNames()
    missing = tuple([float("nan")] * len(desc_names))

    Mol_descriptors = []
    for smiles in clean_data:
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            # Keep a placeholder row so indices still match the input.
            Mol_descriptors.append(missing)
            continue
        # Descriptors are calculated on the molecule with explicit hydrogens.
        mol = Chem.AddHs(mol)
        Mol_descriptors.append(calc.CalcDescriptors(mol))
    return Mol_descriptors, desc_names


# Function call

def filterer(min_mw=300, max_mw=800, input_csv=None, output_csv=None):
    """Keep only ligands whose molecular weight lies in ``[min_mw, max_mw]``.

    Parameters
    ----------
    min_mw, max_mw : float
        Inclusive molecular-weight bounds (RDKit ``MolWt``). The defaults
        reproduce the thresholds this notebook has always applied.
    input_csv : str, optional
        CSV with an ``ST_SMILES`` column. Defaults to the module-level
        ``clean_data`` path.
    output_csv : str, optional
        Destination CSV. Defaults to the module-level ``filtered_std_smiles``.

    The output has the same columns as the input; the descriptor values are
    only used for filtering and are not written out. ``MolWt`` is selected by
    name rather than by column position, because the position of a descriptor
    in ``Descriptors._descList`` is not guaranteed. No intermediate CSV files
    are written.
    """
    input_csv = clean_data if input_csv is None else input_csv
    output_csv = filtered_std_smiles if output_csv is None else output_csv

    df1 = pd.read_csv(input_csv)

    Mol_descriptors, desc_names = mol_descriptor(df1['ST_SMILES'])
    df_descriptors = pd.DataFrame(Mol_descriptors, columns=desc_names, index=df1.index)

    # Rows with a missing MolWt (NaN) compare False and are therefore dropped.
    in_range = df_descriptors['MolWt'].between(min_mw, max_mw)

    df_filtered = df1.loc[in_range]
    df_filtered.to_csv(output_csv, index=False)
    #os.remove(clean_data)

filterer()

In [2]:
# Ligand standardisation code here - smiles -> mol -> standard mol -> standard smiles

csv_file = os.path.join(DATAPATH, "test", "smalltest.csv")
std_csv_file = os.path.join(DATAPATH, "test", "std_smalltest.csv")

def smiles_standardiser(csv_file, std_csv_file):
    """Standardise the ``SMILES`` column of a CSV and write the result.

    Each SMILES is converted to an RDKit molecule, standardised with
    ``standardise.run`` and converted back to SMILES. Rows whose SMILES cannot
    be parsed or standardised are dropped. The output CSV contains the
    remaining columns plus ``ST_SMILES``; the raw ``SMILES`` column is removed.

    Parameters
    ----------
    csv_file : str
        Input CSV path; must contain a ``SMILES`` column.
    std_csv_file : str
        Output CSV path.
    """
    df = pd.read_csv(csv_file)

    std_smiles = []
    for smi in df["SMILES"].tolist():
        std_smi = None  # None marks "could not be standardised"
        # Missing cells arrive as float NaN from pandas; treat them as unparsable.
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol is not None:
            try:
                std_smi = Chem.MolToSmiles(standardise.run(mol))
            except Exception:
                # Standardisation failed for this molecule; drop it below.
                std_smi = None
        std_smiles.append(std_smi)

    df["ST_SMILES"] = std_smiles

    # Build a new frame rather than mutating a filtered view in place.
    df = df[df["ST_SMILES"].notna()].drop(columns=["SMILES"])

    df.to_csv(std_csv_file, index=False)

    #os.remove(csv_file)

smiles_standardiser(csv_file, std_csv_file)

In [3]:
#compute descriptors, keep only ligands with 300 <= MolWt <= 800, then drop the descriptor column again

#this takes the standardised smiles csv, filters it by molecular weight and writes the filtered csv

csv_200_descriptors = os.path.join(DATAPATH, "test", "csv_200_desc.csv")
smiles_plus_desc = os.path.join(DATAPATH, "test", "smiles_descriptors.csv") 
filtered_std_smiles = os.path.join(DATAPATH, "test", "filtered_std_smiles.csv")                                                          

from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

#df1 = pd.read_csv(std_csv_file)

def mol_descriptor(std_csv_file):
    """Compute every descriptor in ``Descriptors._descList`` for a set of SMILES.

    Parameters
    ----------
    std_csv_file : iterable of str
        SMILES strings (e.g. the ``ST_SMILES`` column of a DataFrame).
        Despite the name, this is the SMILES values, not a file path.

    Returns
    -------
    Mol_descriptors : list of tuple
        One tuple of descriptor values per input entry, in input order.
        Entries that are not strings or that RDKit cannot parse yield a tuple
        of NaN, so the result stays row-aligned with the input instead of
        crashing in ``Chem.AddHs``.
    desc_names : sequence of str
        Descriptor names, in the same order as the values in each tuple.
    """
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    desc_names = calc.GetDescriptorNames()
    missing = tuple([float("nan")] * len(desc_names))

    Mol_descriptors = []
    for smiles in std_csv_file:
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            # Keep a placeholder row so indices still match the input.
            Mol_descriptors.append(missing)
            continue
        # Descriptors are calculated on the molecule with explicit hydrogens.
        mol = Chem.AddHs(mol)
        Mol_descriptors.append(calc.CalcDescriptors(mol))
    return Mol_descriptors, desc_names


# Function call

def filterer(min_mw=300, max_mw=800, input_csv=None, output_csv=None):
    """Keep only ligands whose molecular weight lies in ``[min_mw, max_mw]``.

    Parameters
    ----------
    min_mw, max_mw : float
        Inclusive molecular-weight bounds (RDKit ``MolWt``). The defaults
        reproduce the thresholds this notebook has always applied.
    input_csv : str, optional
        CSV with an ``ST_SMILES`` column. Defaults to the module-level
        ``std_csv_file`` path.
    output_csv : str, optional
        Destination CSV. Defaults to the module-level ``filtered_std_smiles``.

    The output has the same columns as the input; the descriptor values are
    only used for filtering and are not written out. ``MolWt`` is selected by
    name rather than by column position, because the position of a descriptor
    in ``Descriptors._descList`` is not guaranteed. No intermediate CSV files
    are written.
    """
    input_csv = std_csv_file if input_csv is None else input_csv
    output_csv = filtered_std_smiles if output_csv is None else output_csv

    df1 = pd.read_csv(input_csv)

    Mol_descriptors, desc_names = mol_descriptor(df1['ST_SMILES'])
    df_descriptors = pd.DataFrame(Mol_descriptors, columns=desc_names, index=df1.index)

    # Rows with a missing MolWt (NaN) compare False and are therefore dropped.
    in_range = df_descriptors['MolWt'].between(min_mw, max_mw)

    df_filtered = df1.loc[in_range]
    df_filtered.to_csv(output_csv, index=False)

filterer()

## Ligand Preparation

- Conversion of smiles strings to 3D conformers using RDKIT 
- Protonation at a specific pH (7.4) and conversion to .pdbqt via obabel
- OR
- Protonation at a specific pH, conversion to .sdf and merging of all .sdf files into one .sdf file containing all ligands

In [13]:
pH = 7.4
sdf_folder = os.path.join(DATAPATH, "test")

In [16]:
def prepare_ligands_sdf(filtered_std_smiles, pH, header_len=1, output_dir = sdf_folder, delim=',') -> list:
    """Generate one protonated 3D ``.sdf`` file per ligand listed in a CSV.

    For every data line, the first two fields are taken as ``ID`` and
    ``ST_SMILES``. The SMILES is embedded in 3D with RDKit (ETKDGv2, random
    starting coordinates), written to a temporary ``.pdb`` and converted to
    ``.sdf`` by the ``obabel`` command-line tool with the given pH option.
    Ligands that cannot be parsed or embedded have their ID printed and are
    skipped.

    Side effect: the IDs and SMILES of all successfully embedded ligands are
    written to ``<DATAPATH>/test/embedded_smiles.csv`` with the columns
    ``ID`` and ``ST_SMILES``.

    Parameters
    ----------
    filtered_std_smiles : str
        Path of the input CSV.
    pH : float
        Value passed to ``obabel -pH``.
    header_len : int
        Number of leading lines to skip.
    output_dir : str
        Directory that receives the ``.pdb`` / ``.sdf`` files.
    delim : str
        Field delimiter of the CSV.

    Returns
    -------
    list of str
        Paths of the ``.sdf`` files that were produced.
    """
    out_sdfs = []
    id_list = []
    smi_list = []

    print(filtered_std_smiles)

    with open(filtered_std_smiles, 'r') as handle:
        entries = handle.readlines()[header_len:]

    for entry in entries:
        fields = entry.rstrip("\r\n").split(delim)
        if len(fields) < 2:
            continue  # blank or malformed line
        # Strip so a trailing newline / stray spaces never end up in the SMILES or file name.
        ID, ST_SMILES = (field.strip() for field in fields[:2])

        # Convert smiles str to 3D coordinates
        mol = Chem.MolFromSmiles(ST_SMILES)
        if mol is None:
            print(ID)
            continue
        mol = Chem.AddHs(mol)
        ps = AllChem.ETKDGv2()
        ps.useRandomCoords = True
        # The code treats a return value of -1 as an embedding failure.
        if AllChem.EmbedMolecule(mol, ps) == -1:
            print(ID)
            continue

        id_list.append(ID)
        smi_list.append(ST_SMILES)

        # Output coords to pdb
        pdb_name = f"{output_dir}/{ID}.pdb"
        MolToPDBFile(mol, pdb_name)

        # Protonate according to pH, convert to .sdf. The argument list is the
        # same as the former `! obabel ...` line, but runs without a shell and
        # outside IPython as well.
        sdf_name = f"{output_dir}/{ID}.sdf"
        completed = subprocess.run(
            ["obabel", pdb_name, "-pH", str(pH), "-O", sdf_name],
            capture_output=True,
            text=True,
        )
        message = (completed.stdout + completed.stderr).strip()
        if message:
            print(message)

        os.remove(pdb_name)  # the .pdb is only an intermediate for obabel

        if completed.returncode != 0 or not os.path.exists(sdf_name):
            # Conversion failed; do not hand a missing file to the merge step.
            print(ID)
            continue

        out_sdfs.append(sdf_name)

    embedded = pd.DataFrame({"ID": id_list, "ST_SMILES": smi_list})
    embedded.to_csv(os.path.join(DATAPATH, "test", "embedded_smiles.csv"), index=False)

    return out_sdfs

def merge_sdfs(out_sdfs, merged_sdf):
    """Merge per-ligand ``.sdf`` files into one file and delete the inputs.

    Every record of every input file is read with RDKit. Records that RDKit
    cannot parse come back as ``None``; they are left out of the merged file
    and their position in the overall read order is written, one per line and
    without a header, to ``<DATAPATH>/test/error_valence.csv``.

    Parameters
    ----------
    out_sdfs : list of str
        Paths of the ``.sdf`` files to merge. Each file is removed after it
        has been read.
    merged_sdf : str
        Path of the merged output file.
    """
    mols = []
    for s in out_sdfs:
        suppl = Chem.SDMolSupplier(s)
        # Keep every record, including None placeholders, so the recorded
        # error positions refer to the read order.
        mols.extend([mol for mol in suppl])
        os.remove(s)

    valence_list = []
    with Chem.SDWriter(merged_sdf) as w:
        for a, mol in enumerate(mols):
            if mol is None:
                valence_list.append(a)
                continue
            w.write(mol)

    valence_error = pd.DataFrame({"index": valence_list})
    valence_error.to_csv(os.path.join(DATAPATH, "test", "error_valence.csv"), index=False, header=False)
    

def prepare_and_merge_ligands(filtered_std_smiles,  pH, header_len=1, output_dir='', delim=','):
    """Prepare all ligands from a CSV and merge them into ``merged.sdf``.

    Calls ``prepare_ligands_sdf`` and then ``merge_sdfs``; the merged file is
    written to ``<output_dir>/merged.sdf``. An empty ``output_dir`` (the
    default) means the current directory; previously it resolved to the
    filesystem root (``/merged.sdf``).
    """
    output_dir = output_dir or "."
    ligands = prepare_ligands_sdf(filtered_std_smiles, pH, header_len, output_dir, delim)
    merge_sdfs(ligands, os.path.join(output_dir, "merged.sdf"))




In [17]:
prepare_and_merge_ligands(filtered_std_smiles, pH, output_dir=sdf_folder)


../data/test/filtered_std_smiles.csv
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted


[15:38:31] UFFTYPER: Unrecognized charge state for atom: 14


1 molecule converted
5
1 molecule converted


[15:38:57] UFFTYPER: Unrecognized charge state for atom: 14


51


[15:39:24] Explicit valence for atom # 15 O, 3, is greater than permitted
[15:39:24] ERROR: Could not sanitize molecule ending on line 90
[15:39:24] ERROR: Explicit valence for atom # 15 O, 3, is greater than permitted
[15:39:24] Explicit valence for atom # 15 O, 3, is greater than permitted
[15:39:24] ERROR: Could not sanitize molecule ending on line 90
[15:39:24] ERROR: Explicit valence for atom # 15 O, 3, is greater than permitted


In [201]:
# Combine the embedding-error and valence-error index lists into one file.
# NOTE(review): in the code visible in this notebook the line that writes
# "error_embedding.csv" is commented out, so this read presumably depends on a
# file left over from an earlier run -- confirm before relying on it.
# NOTE(review): "error_valence.csv" is written without a header, while
# read_csv is called here with its default header handling -- verify that the
# first index is not being consumed as a column name.
df1 = pd.read_csv(os.path.join(DATAPATH, "test", "error_embedding.csv"))
df1.to_csv(os.path.join(DATAPATH, "test", "error_embedding_2.csv"), index=False)

df2 = pd.read_csv(os.path.join(DATAPATH, "test", "error_valence.csv"))
df2.to_csv(os.path.join(DATAPATH, "test", "error_valence_2.csv"), index=False)

# Re-read the copies that were just written.
df1 = pd.read_csv(os.path.join(DATAPATH, "test", "error_embedding_2.csv"))
df2 = pd.read_csv(os.path.join(DATAPATH, "test", "error_valence_2.csv"))

# Stack both tables row-wise and save them without a header row.
merged = pd.concat([df1, df2])
merged.to_csv(os.path.join(DATAPATH, "test", "error_indexes.csv"), index=False, header=None)

In [None]:
def prepare_ligands_sdf(filtered_std_smiles, pH, header_len=1, output_dir = sdf_folder, delim=',') -> list:
    """Generate one protonated 3D ``.sdf`` file per ligand listed in a CSV.

    For every data line, the first two fields are taken as ``ID`` and
    ``ST_SMILES``. The SMILES is embedded in 3D with RDKit (ETKDGv2, random
    starting coordinates), written to a temporary ``.pdb`` and converted to
    ``.sdf`` by the ``obabel`` command-line tool with the given pH option.
    Ligands that cannot be parsed or embedded have their ID printed and are
    skipped.

    Parameters
    ----------
    filtered_std_smiles : str
        Path of the input CSV.
    pH : float
        Value passed to ``obabel -pH``.
    header_len : int
        Number of leading lines to skip.
    output_dir : str
        Directory that receives the ``.pdb`` / ``.sdf`` files.
    delim : str
        Field delimiter of the CSV.

    Returns
    -------
    list of str
        Paths of the ``.sdf`` files that were produced.
    """
    out_sdfs = []

    print(filtered_std_smiles)

    with open(filtered_std_smiles, 'r') as handle:
        entries = handle.readlines()[header_len:]

    for entry in entries:
        fields = entry.rstrip("\r\n").split(delim)
        if len(fields) < 2:
            continue  # blank or malformed line
        # Strip so a trailing newline / stray spaces never end up in the SMILES or file name.
        ID, ST_SMILES = (field.strip() for field in fields[:2])

        # Convert smiles str to 3D coordinates
        mol = Chem.MolFromSmiles(ST_SMILES)
        if mol is None:
            print(ID)
            continue
        mol = Chem.AddHs(mol)
        ps = AllChem.ETKDGv2()
        ps.useRandomCoords = True
        # The code treats a return value of -1 as an embedding failure.
        if AllChem.EmbedMolecule(mol, ps) == -1:
            print(ID)
            continue

        # Output coords to pdb
        pdb_name = f"{output_dir}/{ID}.pdb"
        MolToPDBFile(mol, pdb_name)

        # Protonate according to pH, convert to .sdf. The argument list is the
        # same as the former `! obabel ...` line, but runs without a shell and
        # outside IPython as well.
        sdf_name = f"{output_dir}/{ID}.sdf"
        completed = subprocess.run(
            ["obabel", pdb_name, "-pH", str(pH), "-O", sdf_name],
            capture_output=True,
            text=True,
        )
        message = (completed.stdout + completed.stderr).strip()
        if message:
            print(message)

        os.remove(pdb_name)  # the .pdb is only an intermediate for obabel

        if completed.returncode != 0 or not os.path.exists(sdf_name):
            # Conversion failed; do not hand a missing file to the merge step.
            print(ID)
            continue

        out_sdfs.append(sdf_name)

    return out_sdfs


def merge_sdfs(out_sdfs, merged_sdf):
    """Merge per-ligand ``.sdf`` files into one file and delete the inputs.

    Every record of every input file is read with RDKit. Records that RDKit
    cannot parse come back as ``None``; their index within the file is printed
    and they are left out of the merged file.

    Parameters
    ----------
    out_sdfs : list of str
        Paths of the ``.sdf`` files to merge. Each file is removed after it
        has been read.
    merged_sdf : str
        Path of the merged output file.
    """
    mols = []
    for s in out_sdfs:
        suppl = Chem.SDMolSupplier(s)
        for i, mol in enumerate(suppl):
            if mol is None:
                print(i)
                continue
            # Append inside the loop: the former `else` belonged to the `for`
            # and therefore kept only the last record of each file.
            mols.append(mol)
        os.remove(s)

    with Chem.SDWriter(merged_sdf) as w:
        for mol in mols:
            w.write(mol)
             
    

def prepare_and_merge_ligands(filtered_std_smiles,  pH, header_len=1, output_dir='', delim=','):
    """Prepare all ligands from a CSV and merge them into ``merged.sdf``.

    Calls ``prepare_ligands_sdf`` and then ``merge_sdfs``; the merged file is
    written to ``<output_dir>/merged.sdf``. An empty ``output_dir`` (the
    default) means the current directory; previously it resolved to the
    filesystem root (``/merged.sdf``).
    """
    output_dir = output_dir or "."
    ligands = prepare_ligands_sdf(filtered_std_smiles, pH, header_len, output_dir, delim)
    merge_sdfs(ligands, os.path.join(output_dir, "merged.sdf"))




In [None]:
# Scratch check: does ETKDG embedding succeed for this single SMILES?
# NOTE(review): the string starts with 'C@H]' -- an opening '[' looks to be
# missing. Confirm the intended SMILES; the literal is left as it was.
mol = Chem.MolFromSmiles('C@H]2(OC1O[C@@H]([C@@H](O)[C@H](O)[C@H]1O)CO)[C@H](O)[C@@H](O)C(O[C@@H]2CO)O[C@H]3[C@H](O)[C@@H](O)[C@H](O[C@@H]3CO)O')
if mol is None:
    # Guard against an unparsable SMILES instead of crashing in AddHs.
    result = None
    print("RDKit could not parse the SMILES; nothing to embed")
else:
    mol = Chem.AddHs(mol)
    ps = AllChem.ETKDGv2()
    ps.useRandomCoords = True
    result = AllChem.EmbedMolecule(mol, ps)


print(result)

In [76]:
# Sanity check: compare the number of records in the merged SDF with the
# number of rows in the filtered SMILES table.
suppl = Chem.SDMolSupplier('../data/smiles/merged.sdf')
sdfs = [x for x in suppl]  # NOTE(review): presumably includes None for records RDKit cannot parse -- confirm
smiles = pd.read_csv('../data/smiles/filtered_std_smiles.csv')

print(f"num of std_smiles: {len(smiles)}\tnum of merged sdfs: {len(sdfs)}")

num of std_smiles: 5255	num of merged sdfs: 5206


In [None]:
import csv

# Map the value in column 1 of every master-list row to that row's number.
# Files are opened in text mode with newline='' as the csv module expects on
# Python 3; the former 'rb'/'wb' modes make csv.reader / csv.writer fail.
with open('../data/test/maintest.csv', 'r', newline='') as master:
    master_indices = {r[1]: i for i, r in enumerate(csv.reader(master)) if len(r) > 1}

with open('../data/test/testmain.csv', 'r', newline='') as main:
    with open('../data/test/testfilt.csv', 'w', newline='') as filt:
        reader = csv.reader(main)
        writer = csv.writer(filt)

        # Copy the header and append the name of the extra column.
        # NOTE(review): the column is labelled 'ID' but receives a
        # FOUND / NOT FOUND message -- confirm the intended header.
        writer.writerow(next(reader, []) + ['ID'])

        for row in reader:
            # Column 3 holds the key that is looked up in the master list;
            # rows that are too short are reported as not found.
            index = master_indices.get(row[3]) if len(row) > 3 else None
            if index is not None:
                message = 'FOUND in master list (row {})'.format(index)
            else:
                message = 'NOT FOUND in master list'
            writer.writerow(row + [message])

# Docking with smina

- Receptor is prepped manually for this project using ADTools (waters removed, polar hydrogens added, Gasteiger charges added, saved as .pdbqt file)
- Flexible side chains identified (not done below...)
- Ligand (CHO) can be readded as hetatm/different chain for autobox generation
- smina run (search space defined on command line) and outputs saved

In [77]:
# The below is smina, run using the single .sdf file with all ligands, nconf=1.

receptor = os.path.join(DATAPATH, "protein", "pabb_model1.pdbqt")
ligands = os.path.join(DATAPATH, "smiles", "merged.sdf")
log = os.path.join(RESULTSPATH, "outputs", "log.txt")
docking_output = os.path.join(RESULTSPATH, "outputs", "outputs.sdf")
smina = os.path.join(SOURCEPATH, "smina.static")

# Make sure the output directory exists before smina tries to write into it.
os.makedirs(os.path.dirname(docking_output), exist_ok=True)

# Argument list instead of a concatenated shell string: no shell is involved,
# so paths containing spaces or shell metacharacters are passed through intact.
docking_cmd = [
    smina,
    "-r", receptor,
    "-l", ligands,
    "-o", docking_output,
    "--log", log,
    "--seed", "42",
    # Search box centre and size.
    "--center_x", "74",
    "--center_y", "44",
    "--center_z", "57",
    "--size_x", "22",
    "--size_y", "22",
    "--size_z", "24",
    "--exhaustiveness", "8",
    "--num_modes", "1",
    "--addH", "off",
    "--scoring", "vinardo",
]

docking_run = subprocess.run(docking_cmd)
# Exit status of smina, shown as the cell output (0 means success).
docking_run.returncode

   _______  _______ _________ _        _______ 
  (  ____ \(       )\__   __/( (    /|(  ___  )
  | (    \/| () () |   ) (   |  \  ( || (   ) |
  | (_____ | || || |   | |   |   \ | || (___) |
  (_____  )| |(_)| |   | |   | (\ \) ||  ___  |
        ) || |   | |   | |   | | \   || (   ) |
  /\____) || )   ( |___) (___| )  \  || )   ( |
  \_______)|/     \|\_______/|/    )_)|/     \|


smina is based off AutoDock Vina. Please cite appropriately.

Weights      Terms
-0.045       gauss(o=0,_w=0.8,_c=8)
0.8          repulsion(o=0,_c=8)
-0.035       hydrophobic(g=0,_b=2.5,_c=8)
-0.6         non_dir_h_bond(g=-0.6,_b=0,_c=8)
0            num_tors_div

Using random seed: 42

0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************

mode |   affinity | dist from best mode
     | (kcal/mol) | rmsd l.b.| rmsd u.b.
-----+------------+----------+----------
1       -5.5       0.000      0.000    
Refi

0

## Analysis

Here, the top poses will be selected and analysed.

- See SDSorter, with 'reduceconf' and 'nbestx'
- Also see 'sminalog_analysis.py'

In [43]:
from rdkit.Chem import PandasTools

# Load the docked poses written by the smina cell. That cell writes
# "outputs.sdf"; this cell previously read "output.sdf", which a fresh
# run of the notebook never produces.
fn = os.path.join(RESULTSPATH, "outputs", "outputs.sdf")

# One row per pose; no molecule column is kept, SMILES go into 'smiles'.
df = PandasTools.LoadSDF(fn, embedProps=True, molColName=None, smilesName='smiles')

In [44]:
df.head()

Unnamed: 0,minimizedAffinity,ID,smiles
0,-5.46148,../data/smiles/4.pdb,C[C@@H]1COc2c(N3CC[NH+](C)CC3)c(F)cc3c(=O)c(C(...
1,-5.93024,../data/smiles/8.pdb,O=C(O)c1cn(C2CC2)c2cc(N3CC[NH2+]CC3)c(F)cc2c1=O
2,-5.66769,../data/smiles/9.pdb,CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CC[NH2+]CC3)cc21
3,-6.49326,../data/smiles/17.pdb,NS(=O)(=O)c1cc(Cl)c(Cl)c(S(N)(=O)=O)c1
4,-5.89199,../data/smiles/26.pdb,CC[N@@H+]1CCC[C@H]1CNC(=O)c1cc(S(N)(=O)=O)ccc1OC


In [45]:
# Reduce each molecule title (the path of the intermediate .pdb file, e.g.
# "../data/smiles/4.pdb") to the bare ligand ID ("4"). str.lstrip / str.rstrip
# remove *sets of characters*, not a prefix / suffix, so they would also eat
# leading or trailing ID characters such as 'd', 'a', 'p' or 'b'.
def _ligand_id_from_title(title):
    """Return the file name of ``title`` without directory and ``.pdb`` suffix."""
    name = os.path.basename(title)
    return name[:-len('.pdb')] if name.endswith('.pdb') else name

df['ID'] = df['ID'].map(_ligand_id_from_title)

In [46]:
df.head()

Unnamed: 0,minimizedAffinity,ID,smiles
0,-5.46148,4,C[C@@H]1COc2c(N3CC[NH+](C)CC3)c(F)cc3c(=O)c(C(...
1,-5.93024,8,O=C(O)c1cn(C2CC2)c2cc(N3CC[NH2+]CC3)c(F)cc2c1=O
2,-5.66769,9,CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CC[NH2+]CC3)cc21
3,-6.49326,17,NS(=O)(=O)c1cc(Cl)c(Cl)c(S(N)(=O)=O)c1
4,-5.89199,26,CC[N@@H+]1CCC[C@H]1CNC(=O)c1cc(S(N)(=O)=O)ccc1OC


In [48]:
df.to_csv(os.path.join(RESULTSPATH, "outputs", "affinity_and_smiles.csv"), index=False)

In [None]:
#this takes the standardised smiles csv and links it to the affinity scores

logfile = os.path.join(RESULTSPATH, "outputs", "log.txt")  # smina log written by the docking cell
affinities = os.path.join(RESULTSPATH, "outputs", 'affinities.csv')  # temporary file used by affinities_to_smiles
affinities_header = os.path.join(RESULTSPATH, "outputs", "affinities_header.csv")  # temporary file used by affinities_to_smiles
# NOTE(review): no visible cell writes "final_merged_smiles.csv" -- confirm where it comes from.
final_smiles = os.path.join(DATAPATH, "smiles", "final_merged_smiles.csv")
affinities_and_smiles = os.path.join(RESULTSPATH, "outputs", "affinities_with_stdsmiles.csv")  # final merged table

def affinities_to_smiles(logfile, final_smiles, affinities_and_smiles):
    """Attach the best-pose affinity from a smina log to a SMILES table.

    The log is scanned for result-table separator lines (lines containing
    ``-+``). The line that follows each separator is the first pose of a
    ligand, and its second whitespace-separated field is taken as the
    affinity. The affinities are added, in log order, as an ``AFFINITY``
    column next to the rows of ``final_smiles``, and the combined table is
    written to ``affinities_and_smiles``.

    Parameters
    ----------
    logfile : str
        Path of the smina log file.
    final_smiles : str
        CSV whose rows are in the same order as the ligands in the log.
    affinities_and_smiles : str
        Output CSV path.

    Notes
    -----
    Rows are paired purely by position; if the two inputs differ in length
    the shorter side is padded with missing values. A first-pose line whose
    affinity field is not numeric is recorded as NaN so later rows stay
    aligned. No intermediate files are written.
    """
    best_affinities = []
    with open(logfile, "r") as f:
        for line in f:
            if '-+' not in line:
                continue
            nextline = next(f, None)  # first pose of this ligand, if any
            if nextline is None:
                break  # separator was the last line of the log
            fields = nextline.split()
            try:
                best_affinities.append(float(fields[1]))
            except (IndexError, ValueError):
                best_affinities.append(float("nan"))

    df1 = pd.read_csv(final_smiles)
    df2 = pd.DataFrame({"AFFINITY": best_affinities})

    merged = pd.concat([df1, df2], axis="columns")
    # merged.drop(columns=["ID"]) here would leave SMILES and affinity only.

    merged.to_csv(affinities_and_smiles, index=False)


In [None]:
# Double check the lowest scoring compound manually (low scoring is better).

# The below are the top 5 binders from the cho_sim50 search - .csv output taken and 

from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw

# SMILES use '\' for bond stereochemistry, so the literals are raw strings:
# in a normal string '\C' is an invalid escape sequence (a warning today,
# an error in future Python versions). The string values are unchanged.
a = Chem.MolFromSmiles(r'CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)O)=C(CSC(=O)c3ccco3)CS[C@H]12)c1csc(N)n1')
b = Chem.MolFromSmiles(r'Nc1nc2c(c(=O)[nH]1)C[C@@H](CCc1ccc(C(=O)N[C@@H](CCC(=O)O)C(=O)O)cc1)CN2')
c = Chem.MolFromSmiles(r'O=C(N/N=C/c1ccccc1O)c1ccncc1')
d = Chem.MolFromSmiles(r'COc1ccc(Cc2cnc(N)nc2N)cc1OC')
e = Chem.MolFromSmiles(r'Fc1ccc(C(c2ccc(F)cc2)N2CCN(C/C=C/c3ccccc3)CC2)cc1')
test = Chem.MolFromSmiles(r'C[C@@H](NC(=O)[C@@H](CC(=O)N1CCC(N2CCCCC2)CC1)N1C(=O)[C@@H](N2C(=O)OC[C@@H]2c2ccccc2)[C@H]1/C=C/c1ccccc1)c1cccc(C(F)(F)F)c1')
# Render all six molecules as one grid image (the cell's displayed output).
Draw.MolsToGridImage((a,b,c,d,e, test), subImgSize=(250,250))


In [None]:
# Finding correlations between the affinity and MW / rotatable bonds / heteroatoms.
# NOTE(review): this reads "affinities_and_smiles.csv", whereas the cells above
# write "affinity_and_smiles.csv" and "affinities_with_stdsmiles.csv", and no
# visible cell adds MolWt / NumHeteroatoms / NumRotatableBonds columns --
# confirm which file and columns are intended.

csv_file = os.path.join(RESULTSPATH, "outputs", "affinities_and_smiles.csv")

df = pd.read_csv(csv_file)


print(df['AFFINITY'].corr(df['MolWt']))
print(df['AFFINITY'].corr(df['NumHeteroatoms']))
print(df['AFFINITY'].corr(df['NumRotatableBonds']))

# Remember: a lower (more negative) affinity means predicted tighter binding, so a
# negative correlation means the property goes along with tighter binding.