In [None]:
import os
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdCoordGen
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler 
from sklearn.decomposition import PCA
import random
import string

%matplotlib inline

# Project directories, given as paths relative to the notebook's working directory.
DATAPATH = "../data"  # root for data files (e.g. the smiles/ and validation_lists/ sub-folders)
RESULTSPATH = "../results"  # root for result files (e.g. the outputs/ sub-folder)
SOURCEPATH = "../src"  # presumably the project's source-code directory — TODO confirm


In [None]:
# Correlation (pandas default: Pearson) between the docking affinity and
# molecular weight / heteroatom count / rotatable-bond count.

csv_file = os.path.join(RESULTSPATH, "outputs", "smiles_200descriptors.csv")
df = pd.read_csv(csv_file)

affinity = df['minimizedAffinity']

# One correlation coefficient is printed per descriptor, in this order:
# MolWt, NumHeteroatoms, NumRotatableBonds.
for descriptor in ('MolWt', 'NumHeteroatoms', 'NumRotatableBonds'):
    print(affinity.corr(df[descriptor]))

# Remember: a lower affinity = predicted tighter binding, so a negative
# correlation means that larger descriptor values go with tighter predicted binding.

In [None]:
# Manually double-check the lowest-scoring compounds (a low score is better).

# The SMILES below are the top 5 binders from the cho_sim50 search
# (presumably copied from that search's .csv output — TODO confirm).

from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw

top_binder_smiles = (
    'Nc1c(-c2cc(Cc3ccc(COc4ccccn4)cc3)no2)ccc[n+]1CO[P@](=O)([O-])O',
    'Cc1cn(-c2cc(NC(=O)c3ccc(C)c(Nc4nccc(-c5cnccn5)n4)c3)cc(C(F)(F)F)c2)cn1',
    'O=C(NCc1ccc(OC(F)(F)F)c(F)c1)[C@H]1CN(c2nc3nc(C4CC4)ncc3s2)CCN1S(=O)(=O)c1ccc(C(F)(F)F)cc1',
    'Cc1cc2cc(Oc3ccnc(Nc4cccc(CS(=O)(=O)NCC[NH+](C)C)c4)n3)ccc2[nH]1',
    'CNC(=O)c1c(-c2ccc(F)cc2)oc2ccc(-c3cc(C(=O)NC4(c5ncccn5)CC4)ccc3C)c(F)c12',
)

# Parse in the listed order; a..e keep their original one-letter names.
a, b, c, d, e = (Chem.MolFromSmiles(smiles) for smiles in top_binder_smiles)

# Extra molecule that is parsed but not included in the grid below.
test = Chem.MolFromSmiles('C[C@@H](NC(=O)[C@@H](CC(=O)N1CCC(N2CCCCC2)CC1)N1C(=O)[C@@H](N2C(=O)OC[C@@H]2c2ccccc2)[C@H]1/C=C/c1ccccc1)c1cccc(C(F)(F)F)c1')

# Last expression of the cell, so the grid image is what the notebook displays.
Draw.MolsToGridImage((a,b,c,d,e), subImgSize=(250,250))

In [None]:
# Randomly sample a small fraction of the rows of the smallmols dataset to dock
# with the default scoring function, for comparison. (The original note says this
# yields "100 ish" rows; the exact count depends on the size of the input file.)

SAMPLE_FRACTION = 0.03  # probability of keeping each data row
SAMPLE_SEED = 42  # fixed seed so Restart & Run All reproduces the same sample

csv = os.path.join(DATAPATH, "smiles", "filtered_std_smiles.csv")
vina_csv = os.path.join(DATAPATH, "validation_lists", "smallmols_for_vina.csv")

# Dedicated generator: the sample no longer depends on (or disturbs) the state
# of the module-level `random` generator used elsewhere in the notebook.
sampler = random.Random(SAMPLE_SEED)

# Row 0 is the header and is always kept; every other row is skipped unless its
# random draw falls below SAMPLE_FRACTION.
df = pd.read_csv(
    csv,
    skiprows=lambda row_number: row_number > 0 and sampler.random() >= SAMPLE_FRACTION,
)

df.to_csv(vina_csv, index=False)

In [None]:
# Insert descriptors, remove every molecule with MW < 300 or MW > 800, then remove
# the descriptor columns again. This section also contains the code that adds the
# full set of (~200) RDKit descriptors to the SMILES csv.

from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

# All three files of this step live in the same validation_lists folder.
validation_dir = os.path.join(DATAPATH, "validation_lists")

csv_200_descriptors = os.path.join(validation_dir, "csv_200_desc.csv")  # descriptor table
smiles_plus_desc = os.path.join(validation_dir, "smiles_descriptors.csv")  # SMILES + MolWt
filtered_std_smiles = os.path.join(validation_dir, "filtered_std_smiles.csv")  # final filtered output
def mol_descriptor(std_smiles_csv):
    """Compute every descriptor listed in ``Descriptors._descList`` for each SMILES.

    Parameters
    ----------
    std_smiles_csv : iterable of str
        SMILES strings. Despite the name, the argument is iterated directly
        (e.g. a DataFrame column); it is not opened as a file.

    Returns
    -------
    Mol_descriptors : list of tuple
        One tuple of descriptor values per input entry, in input order. An entry
        that is not a string or cannot be parsed by RDKit yields a tuple of NaN,
        so the rows stay positionally aligned with the input.
    desc_names : sequence of str
        Descriptor names, in the same order as the values inside each tuple.
    """
    # TODO (original author's note): compute MolWt here and append it directly;
    # avoid creating an intermediate csv/df if possible. Ensure ordering/indexes.
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(
        [x[0] for x in Descriptors._descList]
    )
    desc_names = calc.GetDescriptorNames()

    # Placeholder row for entries RDKit cannot handle. Using float("nan") keeps
    # the block free of any dependency beyond RDKit itself.
    missing_row = (float("nan"),) * len(desc_names)

    Mol_descriptors = []
    for smiles in std_smiles_csv:
        # MolFromSmiles returns None (it does not raise) for an unparsable string;
        # non-string cells such as NaN from pandas are treated the same way.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            Mol_descriptors.append(missing_row)
            continue

        # Add explicit hydrogens before computing the descriptors.
        mol = Chem.AddHs(mol)
        Mol_descriptors.append(calc.CalcDescriptors(mol))
    return Mol_descriptors, desc_names


# Molecular-weight filter: defined below, called at the end of this cell.

def filterer(min_mw=300, max_mw=800):
    """Keep only the SMILES rows whose molecular weight lies in [min_mw, max_mw].

    Reads the csv at the notebook-level path ``std_smiles_csv``, computes the
    RDKit descriptors of its ``ST_SMILES`` column with ``mol_descriptor``, and
    writes the rows whose ``MolWt`` is inside the range to the notebook-level
    path ``filtered_std_smiles``. The output has the same columns as the input
    (no descriptor column is kept).

    Parameters
    ----------
    min_mw : float, default 300
        Lowest molecular weight kept (inclusive).
    max_mw : float, default 800
        Highest molecular weight kept (inclusive).

    Raises
    ------
    KeyError
        If the input has no ``ST_SMILES`` column or the descriptor set has no
        ``MolWt`` entry.

    Notes
    -----
    The input file ``std_smiles_csv`` is DELETED once the filtered file has been
    written, so this function cannot be re-run on the same input.
    """
    # NOTE(review): `std_smiles_csv` is not assigned in this cell; it is expected
    # to exist as a notebook-level variable from an earlier cell — TODO confirm.
    smiles_df = pd.read_csv(std_smiles_csv)

    descriptor_rows, descriptor_names = mol_descriptor(smiles_df['ST_SMILES'])
    descriptors_df = pd.DataFrame(
        descriptor_rows, columns=descriptor_names, index=smiles_df.index
    )

    # Select the molecular weight by name rather than by column position: the
    # position of MolWt in RDKit's descriptor list is not fixed across versions.
    # `between` is inclusive on both ends; rows with a missing MolWt compare
    # False and are therefore dropped.
    in_range = descriptors_df['MolWt'].between(min_mw, max_mw)

    # Filtering the original frame directly means no temporary MolWt column
    # (and no intermediate csv files) has to be created and removed again.
    df_filtered = smiles_df[in_range]
    df_filtered.to_csv(filtered_std_smiles, index=False)

    # Kept from the original workflow: the unfiltered input is removed.
    os.remove(std_smiles_csv)

# Run the filter. Side effects: writes the file at `filtered_std_smiles` and
# deletes the input file at `std_smiles_csv`.
filterer()