In [1]:
#importing databases

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import PandasTools
import pandas as pd
import numpy as np
import re

### Data retrieved from drug bank (manual downloaded)

In [2]:
#handling sdf file
approved_molecules_raw = Chem.SDMolSupplier("open structures.sdf")
approved_molecules = [molecule for molecule in approved_molecules_raw]

In [3]:
#transforming sdf files into dataframes
frame = PandasTools.LoadSDF("open structures.sdf",smilesName='SMILES',molColName='Molecule',
           includeFingerprints=True)

In [4]:
#checking the name of columns
frame.columns

Index(['DRUGBANK_ID', 'SECONDARY_ACCESSION_NUMBERS', 'COMMON_NAME',
       'CAS_NUMBER', 'UNII', 'SYNONYMS', 'ID', 'SMILES', 'Molecule'],
      dtype='object')

In [145]:
approved_drugs = pd.read_csv("structure links.csv")
approved_drugs.columns #checking colum names

Index(['DrugBank ID', 'Name', 'CAS Number', 'Drug Groups', 'InChIKey', 'InChI',
       'SMILES', 'Formula', 'KEGG Compound ID', 'KEGG Drug ID',
       'PubChem Compound ID', 'PubChem Substance ID', 'ChEBI ID', 'ChEMBL ID',
       'HET ID', 'ChemSpider ID', 'BindingDB ID'],
      dtype='object')

In [146]:
approved_drugs = approved_drugs[["Name", "DrugBank ID", "Drug Groups", "SMILES"]] #selecting only useful columns
approved_drugs.head(5)

Unnamed: 0,Name,DrugBank ID,Drug Groups,SMILES
0,Bivalirudin,DB00006,approved; investigational,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...
1,Leuprolide,DB00007,approved; investigational,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...
2,Goserelin,DB00014,approved,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...
3,Gramicidin D,DB00027,approved,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...
4,Desmopressin,DB00035,approved,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...


In [147]:
#letting the drug names be all LOWER
approved_drugs["Name"] = approved_drugs["Name"].str.lower() 
approved_drugs["active"] = approved_drugs["Name"]

approved_drugs = approved_drugs.drop("Name", axis = 1) #removing the "name variable"
approved_drugs.head(5)

Unnamed: 0,DrugBank ID,Drug Groups,SMILES,active
0,DB00006,approved; investigational,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,bivalirudin
1,DB00007,approved; investigational,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,leuprolide
2,DB00014,approved,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,goserelin
3,DB00027,approved,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...,gramicidin d
4,DB00035,approved,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,desmopressin


In [153]:
print(approved_drugs["active"].shape)
print(approved_drugs.drop_duplicates(subset = "active", keep = "first").shape) #we dont have any duplicates

(2715,)
(2715, 4)


### Data retrieved from FDA acess data (manual downloaded)

In [154]:
#this dataset was retrieved from: https://www.fda.gov/drugs/drug-approvals-and-databases/compilation-cder-new-molecular-entity-nme-drug-and-new-biologic-approvals

fda_total = pd.read_excel("NME_NBA_1985_2021_total.xlsx")
fda_total = fda_total[["Active Ingredient/Moiety", "NDA/BLA", "Approval Year"]]

In [155]:
#filtering for further lipinski analysis YEAR > 2004 ANN ONLY NDA (NOT BIOLOGICAL)
fda_lipinski  = fda_total.loc[(fda_total["NDA/BLA"] == "NDA") & (fda_total["Approval Year"] >= 2004)].reset_index(drop = True)

#cleaning the names of the dataset
from skimpy import clean_columns
fda_lipinski = clean_columns(fda_lipinski)

#one thing worth mentioning is the salt is always after the active ingredient (almost always) so lets separate the two columns
fda_lipinski

Unnamed: 0,active_ingredient_moiety,nda_bla,approval_year
0,tiotropium bromide,NDA,2004
1,pemetrexed disodium,NDA,2004
2,cinacalcet hydrochloride,NDA,2004
3,telithromycin,NDA,2004
4,human secretin,NDA,2004
...,...,...,...
472,asciminib,NDA,2021
473,vosoritide,NDA,2021
474,maribavir,NDA,2021
475,pafolacianine,NDA,2021


In [160]:
#we split the active ingredients into active and respective salt

fda_lipinski[["active", "salt"]] = fda_lipinski["active_ingredient_moiety"].str.split(expand = True, n = 1)
print(fda_lipinski.shape) #so we have 477 approved drugs in this period (2021-2004) according to FDA database

#lets see if we have any duplicates
fda_lipinski.drop_duplicates(subset = "active", keep = "first").shape #we have only one duplicate



(477, 5)


(471, 5)

In [190]:
actives_unique = list(fda_lipinski["active"].unique())
non_unique = fda_lipinski["active"].duplicated()

fda_lipinski.loc[non_unique] #these are the duplicated

Unnamed: 0,active_ingredient_moiety,nda_bla,approval_year,active,salt
14,pentetate zinc trisodium,NDA,2004,pentetate,zinc trisodium
265,"elvitegravir, cobicistat, emtricitabine, tenof...",NDA,2015,"elvitegravir,","cobicistat, emtricitabine, tenofovir alafenamide"
281,sofosbuvir and velpatasvir,NDA,2016,sofosbuvir,and velpatasvir
328,sodium zirconium cyclosilicate,NDA,2018,sodium,zirconium cyclosilicate
423,decitabine and cedazuridine,NDA,2020,decitabine,and cedazuridine
436,gallium Ga 68 PSMA-11,NDA,2020,gallium,Ga 68 PSMA-11


In [112]:
#merging the two dataframes using the active ingredient as key
fda_smiles = pd.merge(fda_lipinski, approved_drugs, how = "left", on ="active")

#cleaning column names
fda_smiles = clean_columns(fda_smiles)
fda_smiles[fda_smiles["smiles"].isnull()]


Unnamed: 0,active_ingredient_moiety,nda_bla,approval_year,active,salt,name,drug_bank_id,drug_groups,smiles
4,human secretin,NDA,2004,human,secretin,,,,
13,pentetate calcium trisodium,NDA,2004,pentetate,calcium trisodium,,,,
14,pentetate zinc trisodium,NDA,2004,pentetate,zinc trisodium,,,,
15,lanthanum carbonate hydrate,NDA,2004,lanthanum,carbonate hydrate,,,,
16,omega-3-acid ethyl esters,NDA,2004,omega-3-acid,ethyl esters,,,,
...,...,...,...,...,...,...,...,...,...
453,dasiglucagon,NDA,2021,dasiglucagon,,,,,
456,pegcetacoplan,NDA,2021,pegcetacoplan,,,,,
457,piflufolastat F 18,NDA,2021,piflufolastat,F 18,,,,
473,vosoritide,NDA,2021,vosoritide,,,,,


In [104]:
from urllib.request import urlopen
from urllib.parse import quote