# PHASE 4: Functional Groups

"""
PHASE 4: Determine functional groups of the new hypertension molecules 

Created on Tuesday Apr 25 2023 
Updated on Monday May 22 2023 - added new functional groups for ACE
Updated again on Friday May 26 2023  - updated smiles of functional groups
Updated on Thursday 01 June 2023 - removed all functional groups that are not associated with ACE inhibitors or ARBs
Updated on Monday 17 July 2023 - updated new smiles
Updated on Monday 16 October 2023 - updated functional group smiles 

@author: Odifentse M Lehasa

The purpose of this notebook is to determine which functional groups are present in each molecule.
In this phase, we use the new molecules from the previous phase (phase 3) that met the physicochemical property criteria.
Later (in phases 5 and 6), we can determine the class of each hypertension molecule based on its functional groups.

"""

## STEP 0: IMPORT LIBRARIES

In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import BRICS
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import Lipinski
import pandas as pd


##  STEP 1: GET DATA

In [4]:
# Load the phase-3 output: every new molecule that passed the physicochemical
# property criteria. The location is held in a named variable so it is easy to
# spot and change.
# NOTE(review): this is an absolute, machine-specific path; consider deriving it
# from a configurable data directory.
phase3_passed_csv = '/Users/odilehasa/Hypertension/Final_Experiments/FINAL - October/Output/3.1. New molecules - passed property criteria.csv'

df_molProp = pd.read_csv(phase3_passed_csv)
df_molProp

Unnamed: 0.1,Unnamed: 0,Canonical SMILES,Aromatic Rings (No.),Aliphatic Rings (No.),AVG Molecular weight,Exact Molecular weight,LogP,Hdonors,Hacceptors,Rotatable bonds,Heavy Atoms (No.),QED,Property Forecast Index,PSA,Druggable (Lipinski),Druggable (Physicochemical)
0,0,NCCCC[C@@H](C(=O)N1[C@H](CO)C[C@H]2CCCC[C@@H]2...,0,5,473.746,473.398128,4.8494,3,5,8,34,0.494569,4.8494,69.80,1,1
1,2,CO[C@@H]1C[C@H]2CCCC[C@@H]2N1[C@@H](CCCCN)C(=O...,0,4,421.626,421.330442,2.8730,3,6,8,30,0.589344,2.8730,79.03,1,1
2,3,NCCCC[C@@H](C(=O)N1[C@H](CO)C[C@H]2CCCC[C@@H]2...,1,4,475.634,475.315855,2.3181,4,9,8,34,0.492189,3.3181,128.69,1,1
3,4,NCCCC[C@@H](C(=O)N1[C@H](CO)C[C@H]2CCCC[C@@H]2...,1,4,473.727,473.307599,4.7033,3,5,8,33,0.536939,5.7033,69.80,1,1
4,5,NCCCC[C@@H](C(=O)N1[C@H](CO)C[C@H]2CCCC[C@@H]2...,1,4,459.639,459.332174,2.1549,4,9,8,33,0.508852,3.1549,124.26,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5404,9976,CC(C)(O)[C@@H]1C[C@@H]2CCCC[C@@H]2N1[C@@H](CCC...,1,4,489.661,489.331505,2.7066,4,9,8,35,0.478929,3.7066,128.69,1,1
5405,9977,NCCCC[C@@H](C(=O)N1[C@H](c2noc(=O)[nH]2)C[C@@H...,1,4,461.607,461.300205,1.9280,4,9,8,33,0.504578,2.9280,128.69,1,1
5406,9979,CO[C@@H]1C[C@@H]2CCCC[C@@H]2N1[C@@H](CCCCN)C(=...,1,4,461.607,461.300205,2.5396,3,9,8,33,0.571290,3.5396,117.69,1,1
5407,9994,NCCCC[C@@H](C(=O)N1[C@H](c2noc(=O)[nH]2)C[C@@H...,1,4,497.687,497.213047,1.5455,4,9,8,33,0.458140,2.5455,128.69,1,1


# STEP 2: FUNCTIONAL GROUP TEST

In [89]:

# Count, for every molecule loaded from phase 3, how many times each ACE / ARB
# functional group occurs in it, then save the counts as a CSV file.

# SMARTS for each functional group of interest. The list name and the index
# positions are unchanged so that anything else referring to them keeps working.
#   0-2: ACE groups (carboxyl, sulfhydryl, phosphinyl)
#   3-7: ARB groups (biphenyl, two benzimidazole forms, two tetrazole forms)
drug_functional_group = ['*C(=O)O','*[SH]','*OP(=O)','c1ccc(cc1)c2ccccc2','c1nc2ccccc2n1C','c1nc2ccccc2[nH]1','c1nn[nH]n1','[nH]1nnnc1']  # add all functional groups

# Parse every SMARTS once, up front, instead of re-parsing all of them for each molecule.
functional_group_queries = [Chem.MolFromSmarts(smarts) for smarts in drug_functional_group]


def first_nonempty_matches(mol, *queries):
    """Return the substructure matches of the first query that hits ``mol``.

    Mirrors the ``matches_a or matches_b`` logic: the queries are tried in
    order and the first non-empty match tuple is returned. If no query
    matches, an empty tuple is returned.

    Parameters
    ----------
    mol : RDKit molecule to search.
    *queries : RDKit query molecules (parsed SMARTS), tried in order.

    Returns
    -------
    tuple
        The match tuples returned by ``mol.GetSubstructMatches`` for the first
        query with at least one match, or ``()`` when none match.
    """
    matches = ()
    for query in queries:
        matches = mol.GetSubstructMatches(query)
        if matches:
            break
    return matches


# create a list to store results of each new molecule
functional_list = list()

# Iterate over the SMILES values directly rather than via positional labels.
for smiles in df_molProp['Canonical SMILES']:

    mol = Chem.MolFromSmiles(smiles)  # molecule parsed from its SMILES string
    if mol is None:
        # MolFromSmiles returns None for SMILES it cannot parse; fail with the
        # offending string instead of an opaque AttributeError further down.
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")

    # ACE functional groups
    carboxyl_matches = mol.GetSubstructMatches(functional_group_queries[0])
    sulfhydryl_matches = mol.GetSubstructMatches(functional_group_queries[1])
    phosphinyl_matches = mol.GetSubstructMatches(functional_group_queries[2])

    # ARB functional groups
    biphenyl_matches = mol.GetSubstructMatches(functional_group_queries[3])
    # Benzimidazole: try the N-substituted form (index 4) first, then the [nH]
    # form (index 5). Previously index 5 was tested twice and index 4 never.
    benzimidazole_matches = first_nonempty_matches(mol, functional_group_queries[4], functional_group_queries[5])
    # Tetrazole: the two SMARTS place the [nH] at different ring positions.
    tetrazol_matches = first_nonempty_matches(mol, functional_group_queries[6], functional_group_queries[7])

    # combine above results: SMILES regenerated by RDKit followed by one count per group
    functional_total = (
        Chem.MolToSmiles(mol),
        len(carboxyl_matches),
        len(sulfhydryl_matches),
        len(phosphinyl_matches),
        len(biphenyl_matches),
        len(benzimidazole_matches),
        len(tetrazol_matches),
    )
    functional_list.append(functional_total)


# save list as dataframe
# NOTE: the 'Sulfhydrl' spelling is kept as-is because the column names are part
# of the CSV written below, which other notebooks may read.
df_functional = pd.DataFrame(functional_list, columns =['Canonical SMILES','Carboxyl Functional Group (No.)','Sulfhydrl Functional Group (No.)','Phosphinyl Functional Group (No.)','Biphenyl Functional Group (No.)','Benzimidazole Functional Group (No.)','Tetrazol Functional Group (No.)'])

df_functional.to_csv('4. Functional group assignment.csv')
df_functional


Unnamed: 0,Canonical SMILES,Carboxyl Functional Group (No.),Sulfhydrl Functional Group (No.),Phosphinyl Functional Group (No.),Biphenyl Functional Group (No.),Benzimidazole Functional Group (No.),Tetrazol Functional Group (No.)
0,NCCCC[C@@H](C(=O)N1[C@H](CO)C[C@H]2CCCC[C@@H]2...,0,0,0,0,0,0
1,CO[C@@H]1C[C@H]2CCCC[C@@H]2N1[C@@H](CCCCN)C(=O...,0,0,0,0,0,0
2,NCCCC[C@@H](C(=O)N1[C@H](CO)C[C@H]2CCCC[C@@H]2...,0,0,0,0,0,0
3,NCCCC[C@@H](C(=O)N1[C@H](CO)C[C@H]2CCCC[C@@H]2...,0,0,0,0,0,0
4,NCCCC[C@@H](C(=O)N1[C@H](CO)C[C@H]2CCCC[C@@H]2...,0,0,0,0,0,1
...,...,...,...,...,...,...,...
5404,CC(C)(O)[C@@H]1C[C@@H]2CCCC[C@@H]2N1[C@@H](CCC...,0,0,0,0,0,0
5405,NCCCC[C@@H](C(=O)N1[C@H](c2noc(=O)[nH]2)C[C@@H...,0,0,0,0,0,0
5406,CO[C@@H]1C[C@@H]2CCCC[C@@H]2N1[C@@H](CCCCN)C(=...,0,0,0,0,0,0
5407,NCCCC[C@@H](C(=O)N1[C@H](c2noc(=O)[nH]2)C[C@@H...,0,0,0,0,0,0


# STEP 3: FILTER OUT THOSE WITHOUT REQUIRED FUNCTIONAL GROUPS

In [90]:
# Add a per-molecule total of all functional-group counts, then keep only the
# molecules in which at least one functional group was found.

group_count_columns = [
    'Carboxyl Functional Group (No.)',
    'Sulfhydrl Functional Group (No.)',
    'Phosphinyl Functional Group (No.)',
    'Biphenyl Functional Group (No.)',
    'Benzimidazole Functional Group (No.)',
    'Tetrazol Functional Group (No.)',
]

# Row-wise total across the six count columns.
df_functional['Functional Count'] = df_functional[group_count_columns].sum(axis=1)

# Drop the molecules whose total is zero (no functional group matched).
has_functional_group = df_functional['Functional Count'].gt(0)
functional = df_functional.loc[has_functional_group]
df_drugfunc = functional


### Save the new dataframe as a CSV file



In [91]:
# Write the molecules that carry at least one functional group to a CSV file
# in the current working directory, then display the saved frame.
functional_groups_csv = '4.1. New molecules - with functional groups.csv'
df_drugfunc.to_csv(functional_groups_csv)
df_drugfunc

Unnamed: 0,Canonical SMILES,Carboxyl Functional Group (No.),Sulfhydrl Functional Group (No.),Phosphinyl Functional Group (No.),Biphenyl Functional Group (No.),Benzimidazole Functional Group (No.),Tetrazol Functional Group (No.),Functional Count
4,NCCCC[C@@H](C(=O)N1[C@H](CO)C[C@H]2CCCC[C@@H]2...,0,0,0,0,0,1,1
6,NCCCC[C@@H](C(=O)N1[C@H](CO)C[C@H]2CCCC[C@@H]2...,0,0,0,0,0,1,1
9,NCCCC[C@@H](C(=O)N1[C@H](CO)C[C@H]2CCCC[C@@H]2...,1,0,0,0,0,0,1
18,NCCCC[C@@H](C(=O)N1[C@H](CO)C[C@H]2CCCC[C@@H]2...,0,0,0,0,0,1,1
20,NCCCC[C@@H](C(=O)N1[C@H](CO)C[C@H]2CCCC[C@@H]2...,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...
5376,NCCCC[C@@H](C(=O)N1[C@H](C2CCCCC2)C[C@@H]2CCC[...,1,0,0,0,0,0,1
5387,C[C@H](CS)C(=O)N[C@@H](CCCCN)C(=O)N1[C@H](c2no...,0,1,0,0,0,0,1
5396,CCCCC(=O)O[C@@H](CCCCN)C(=O)N1[C@H](c2noc(=O)[...,1,0,0,0,0,0,1
5397,CCC(=O)O[C@@H](CCCCN)C(=O)N1[C@H](c2noc(=O)[nH...,1,0,0,0,0,0,1


# ---END HERE---