* import data

In [None]:
# pandas is used by this and later cells, but no visible cell imports it;
# import it here so the notebook survives Restart Kernel -> Run All
import pandas as pd

# raw data set; the descriptor cell further down re-reads the same file into `df`
data = pd.read_csv('hob_data_set_MSMcleanup.csv')

# RDKit descriptors
* add descriptor calculations

In [None]:
# we generate a list of all available metrics for view as follows:
# Descriptors._descList has one entry per RDKit descriptor; the cells below read
# entry[0] as the descriptor's name and entry[1] as the function that computes it
from rdkit import Chem
from rdkit.Chem import Descriptors
# NOTE(review): the visible cells only reach descriptor functions through
# Descriptors._descList, so this wildcard import looks unnecessary -- confirm
# nothing else relies on the names it brings in before removing it
from rdkit.Chem.Descriptors import *

metric_list = Descriptors._descList

In [None]:
# descriptors chosen for this analysis, in the order their columns get added

# whole-molecule descriptors (one value each)
metrics_to_add = [
    'MolWt', 'ExactMolWt', 'qed', 'MolLogP', 'LabuteASA', 'TPSA',
    'MaxPartialCharge', 'MinPartialCharge',
    'MaxAbsPartialCharge', 'MinAbsPartialCharge',
    'NumHAcceptors', 'NumHDonors', 'HeavyAtomCount', 'NumHeteroatoms',
    'NumRotatableBonds', 'NHOHCount', 'NOCount', 'FractionCSP3',
    'RingCount', 'NumAliphaticRings', 'NumAromaticRings',
    'NumAliphaticHeterocycles', 'NumAromaticHeterocycles',
    'NumSaturatedHeterocycles', 'NumSaturatedRings',
    'MolMR', 'BalabanJ', 'BertzCT', 'HallKierAlpha',
]

# binned surface-area descriptors: PEOE_VSA1-14, SMR_VSA1-10, SlogP_VSA1-12
metrics_to_add += [f'PEOE_VSA{bin_no}' for bin_no in range(1, 15)]
metrics_to_add += [f'SMR_VSA{bin_no}' for bin_no in range(1, 11)]
metrics_to_add += [f'SlogP_VSA{bin_no}' for bin_no in range(1, 13)]

In [None]:
# lookup table from descriptor name to the RDKit function that computes it,
# so a descriptor can be called by its title
metric_dict = {entry[0]: entry[1] for entry in metric_list}
# show the available descriptor names as the cell output
metric_dict.keys()

In [None]:
# now we can actually add the descriptor calculations to the data
df = pd.read_csv('hob_data_set_MSMcleanup.csv')

# Compute every requested descriptor for every molecule. Results are collected
# as one record per row and attached in a single step, which avoids writing
# the frame one cell at a time.
descriptor_rows = []
for drugSMILES in df['smile']:
    # a missing SMILES is read from the CSV as NaN (not a str), and
    # Chem.MolFromSmiles returns None when it cannot parse the string
    drugmol = Chem.MolFromSmiles(drugSMILES) if isinstance(drugSMILES, str) else None

    if drugmol is None:
        # keep the row, but mark every descriptor as missing instead of crashing
        descriptor_rows.append({metric: float('nan') for metric in metrics_to_add})
        continue

    descriptor_rows.append({metric: metric_dict[metric](drugmol) for metric in metrics_to_add})

# cast to float so integer-valued descriptors share one numeric dtype with
# the NaN placeholders used for unparseable molecules
descriptor_table = pd.DataFrame(descriptor_rows, index=df.index, columns=metrics_to_add).astype(float)
df = pd.concat([df, descriptor_table], axis=1)

* some of the metrics we've added give the amount of the drug's surface area (SA) over which a given property falls within a certain range
  * the full possible range of values for a property is divided into 10-14 subranges
  * the metric tells you the amount of surface area for which that property's value lies within that subrange
  * thus, if you add together all 10-14 subranges of a "VSA" metric, you get the molecule's total surface area
* I would like to test whether it would be more relevant to describe instead the percentage of SA fitting that property range
* thus, here I will create 'percent SA' alternatives for each "VSA" metric (stored as fractions of the total SA, i.e. not multiplied by 100, in the `fracVSA_*` columns)

In [None]:
# list all VSA type of metrics: PEOE_VSA1-14, SMR_VSA1-10, SlogP_VSA1-12
VSAs = ([f'PEOE_VSA{i}' for i in range(1, 15)]
        + [f'SMR_VSA{i}' for i in range(1, 11)]
        + [f'SlogP_VSA{i}' for i in range(1, 13)])

# list just one VSA metric, which can be summed for total surface area
SMR_VSA = [f'SMR_VSA{i}' for i in range(1, 11)]


def _frac_column_name(vsa_name):
    """Return the fractional-VSA column name for an absolute VSA column.

    The family prefix is kept and the bin number is zero-padded to two digits,
    e.g. 'PEOE_VSA3' -> 'fracVSA_PEOE03' and 'SlogP_VSA12' -> 'fracVSA_SlogP12'.
    """
    family, _, bin_number = vsa_name.partition('_VSA')
    return f'fracVSA_{family}{bin_number.zfill(2)}'


# get the total SA for each molecule by adding the SMR_VSA bins column by column
# (same left-to-right order as summing them one value at a time); the SMILES do
# not need to be parsed again because only existing columns are read here
df['VSA_total'] = sum(df[VSA_i] for VSA_i in SMR_VSA)

# express all VSA metrics as a fraction of total SA, one new column per metric
for each_VSA in VSAs:
    df[_frac_column_name(each_VSA)] = df[each_VSA] / df['VSA_total']

* clean up and export

In [None]:
# rename bioavailability column to short title
df = df.rename(columns={'value (oral BA %)':'BA_pct'})

# binned surface-area descriptors and their fractional counterparts,
# each listed exactly once so the export has no duplicated columns
vsa_cols = ([f'PEOE_VSA{i}' for i in range(1, 15)]
            + [f'SMR_VSA{i}' for i in range(1, 11)]
            + [f'SlogP_VSA{i}' for i in range(1, 13)])
frac_vsa_cols = ([f'fracVSA_PEOE{i:02d}' for i in range(1, 15)]
                 + [f'fracVSA_SMR{i:02d}' for i in range(1, 11)]
                 + [f'fracVSA_SlogP{i:02d}' for i in range(1, 13)])

# list of all current columns
all_columns = (['Name','smile','BA_pct','MolWt','ExactMolWt','qed','MolLogP','LabuteASA','TPSA',
                'MaxPartialCharge','MinPartialCharge','MaxAbsPartialCharge','MinAbsPartialCharge',
                'NumHAcceptors','NumHDonors','HeavyAtomCount','NumHeteroatoms','NumRotatableBonds',
                'NHOHCount','NOCount','FractionCSP3','RingCount','NumAliphaticRings',
                'NumAromaticRings','NumAliphaticHeterocycles','NumAromaticHeterocycles',
                'NumSaturatedHeterocycles','NumSaturatedRings','MolMR','BalabanJ','BertzCT',
                'HallKierAlpha']
               + vsa_cols + frac_vsa_cols + ['VSA_total'])

# re-order df columns: MolMR and VSA_total move up next to the other whole-molecule values
new_col_order = (['Name','smile','BA_pct','MolWt','ExactMolWt','qed','MolLogP','MolMR','VSA_total',
                  'LabuteASA','TPSA','MaxPartialCharge','MinPartialCharge','MaxAbsPartialCharge',
                  'MinAbsPartialCharge','NumHAcceptors','NumHDonors','HeavyAtomCount',
                  'NumHeteroatoms','NumRotatableBonds','NHOHCount','NOCount','FractionCSP3',
                  'RingCount','NumAliphaticRings','NumAromaticRings','NumAliphaticHeterocycles',
                  'NumAromaticHeterocycles','NumSaturatedHeterocycles','NumSaturatedRings',
                  'BalabanJ','BertzCT','HallKierAlpha']
                 + vsa_cols + frac_vsa_cols)
# select from df (the frame built in the cells above); a missing column raises KeyError
temp2 = df[new_col_order]

# export (the row index is written as the first, unnamed column; the reload
# cell drops it again as 'Unnamed: 0')
temp2.to_csv('hob_data_w_calcs_v3.csv')

In [None]:
# read the exported file back in to verify it, discarding the 'Unnamed: 0' column
df = pd.read_csv('hob_data_w_calcs_v3.csv').drop(columns='Unnamed: 0')