# Lindsay Shields
## December 19, 2022

The purpose of this notebook is to take standard output from ChemAxon Metabolizer and create subsets of products containing specific functional groups. Important criteria for this extraction include:

- Route is Amide Hydrolysis, Imide Hydrolysis, or Lactam Hydrolysis
- the parent compound uses ONLY one route, and that route is one of the desired routes (listed above)
    - if the parent compound uses any other mechanism in addition to one of the desired routes, exclude it from the subset
- save each group as its own tab


In [228]:
import pandas as pd
import re
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [221]:
# Load the Metabolizer export from the test-data folder and preview the first rows
df = pd.read_csv(filepath_or_buffer='./TestData/EDSP32k_Hydv1.8.csv')
df.head(n=5)

Unnamed: 0,#SMILES,name,CAS #,Synthesis Code,Generation,Phase,Formation,Degradation,Production,Accumulation,Route,Exact Mass
0,C=O,Formaldehyde,"50-00-0, 630-08-0, 13007-92-6, 13463-39-3, 134...",1,0,0,0,0,100.00%,< 0.001%,,30.01056468
1,CC1CC2C3CCC4=CC(=O)C=CC4(C)C3(F)C(O)CC2(C)C1(O...,Dexamethasone,"50-02-2, 378-44-9, 1249-18-9, 000050-02-2, 000...",2,0,0,0,0,100.00%,< 0.001%,,392.1999022
2,CC(=O)OCC(=O)C1(O)CCC2C3CCC4=CC(=O)CCC4(C)C3C(...,Hydrocortisone acetate,"50-03-3, 000050-03-3",3,0,0,0,98,100.00%,< 0.001%,,404.2198888
3,CC(O)=O,,,Carboxylic Acid Ester Hydrolysis(3):1/1,1,0,49,0,50.00%,50.00%,Carboxylic Acid Ester Hydrolysis,60.02112937
4,CC12CC(O)C3C(CCC4=CC(=O)CCC34C)C1CCC2(O)C(=O)CO,,,Carboxylic Acid Ester Hydrolysis(3):1/2,1,0,49,0,50.00%,50.00%,Carboxylic Acid Ester Hydrolysis,362.2093241


In [229]:
# Separate the parent compounds (rows with Generation == 0) into their own dataframe.
# For these rows the synthesis code is used directly as the parent id.
# .assign() builds the new column on a fresh frame, so nothing is written onto a
# filtered slice of df (the pattern that raises SettingWithCopyWarning).
parents = df.loc[df['Generation'] == 0].assign(
    Parent_ID=lambda frame: frame['Synthesis Code']
)

parents[['Parent_ID', '#SMILES']]

Unnamed: 0,Parent_ID,#SMILES
0,1,C=O
1,2,CC1CC2C3CCC4=CC(=O)C=CC4(C)C3(F)C(O)CC2(C)C1(O...
2,3,CC(=O)OCC(=O)C1(O)CCC2C3CCC4=CC(=O)CCC4(C)C3C(...
5,4,CC(=O)OCC(=O)C1(O)CCC2C3CCC4=CC(=O)CCC4(C)C3C(...
8,5,CCC1(C(=O)NC(=O)NC1=O)c1ccccc1
...,...,...
59060,32578,COc1cc(ccc1[N+]#N)[N+]([O-])=O
59061,32579,CC(=O)Oc1ccc(Cl)cc1Cl
59064,32580,CC1CCC(CC1N=C=O)N=C=O
59065,32581,CCC(CC)(C(=O)OC)C(=O)OC


In [230]:
# Extract the parent number -- the digits inside the first "(...)" of the
# synthesis code -- and save it as column 'Parent_ID'.
# expand=False makes str.extract return a Series, so exactly one column is
# assigned; synthesis codes without a "(digits)" group end up as NaN.
df['Parent_ID'] = df['Synthesis Code'].str.extract(r"\((\d+)\)", expand=False)

In [231]:
# For every parent ID, collect the distinct 'Route' values found across its
# rows, then turn the result back into a two-column dataframe
group = (
    df.groupby('Parent_ID')['Route']
    .unique()
    .reset_index()
)
group

Unnamed: 0,Parent_ID,Route
0,10001,[Lactam Hydrolysis]
1,10002,[Lactam Hydrolysis]
2,10004,[Nitrile Hydrolysis]
3,10005,[Amide Hydrolysis]
4,10007,[Carboxylic Acid Ester Hydrolysis]
...,...,...
12798,9992,[Lactam Hydrolysis]
12799,9993,[Nitrile Hydrolysis]
12800,9995,[Carboxylic Acid Ester Hydrolysis]
12801,9997,[Carboxylic Acid Ester Hydrolysis]


In [232]:
# One list of parent IDs per functional group of interest
lac = []
imi = []
ami = []

# Route name -> the list that collects parents using only that route
route_buckets = {
    'Lactam Hydrolysis': lac,
    'Imide Hydrolysis': imi,
    'Amide Hydrolysis': ami,
}

# Walk the grouped rows. A parent is kept only when its 'Route' entry holds
# exactly one value and that value is lactam, imide, or amide hydrolysis;
# its parent ID is then appended to the matching list.
for par, check in zip(group['Parent_ID'], group['Route']):
    if len(check) != 1:
        continue
    bucket = route_buckets.get(check[0])
    if bucket is not None:
        bucket.append(par)



### Make data subsets for each group

In [233]:
# Keep only the rows whose parent ID is in the lactam-only list
lactam = df.loc[df['Parent_ID'].isin(lac)]
lactam

Unnamed: 0,#SMILES,name,CAS #,Synthesis Code,Generation,Phase,Formation,Degradation,Production,Accumulation,Route,Exact Mass,Parent_ID
43,CCCCC(C(O)=O)C(=O)N(NC1=CC=CC=C1)C1=CC=CC=C1,,,Lactam Hydrolysis(22):1,1,0,2401,14,100.00%,99.42%,Lactam Hydrolysis,326.1630426,22
79,OC(=O)C(NC(=O)CC1=CC=CS1)C1NC(C(O)=O)=C(C[N+]2...,,,Lactam Hydrolysis(38):1,1,0,2401,14,99.42%,98.84%,Lactam Hydrolysis,434.0838895,38
134,CCC\C(CC(O)=O)=N\C(N)=S,,,Lactam Hydrolysis(74):1,1,0,2401,0,100.00%,100.00%,Lactam Hydrolysis,188.0619488,74
291,C\C(CC(O)=O)=N\C(N)=S,,,Lactam Hydrolysis(165):1,1,0,2401,0,100.00%,100.00%,Lactam Hydrolysis,160.0306487,165
295,NC(N)=NC(=O)CC(O)=O,,,Lactam Hydrolysis(168):1,1,0,2401,0,100.00%,100.00%,Lactam Hydrolysis,145.0487411,168
...,...,...,...,...,...,...,...,...,...,...,...,...,...
58224,N\C=N/C1=C(C=NN1C(=O)C1=CC=CC=C1)C(O)=O,,,Lactam Hydrolysis(32303):1,1,0,2401,0,100.00%,100.00%,Lactam Hydrolysis,258.0752902,32303
58226,CCCC(=O)N1N=CC(C(O)=O)=C1\N=C/N,,,Lactam Hydrolysis(32304):1,1,0,2401,0,100.00%,100.00%,Lactam Hydrolysis,224.0909403,32304
58269,CCCCCCCCCCCCNCC(CC(O)=O)C(=O)OC,,,Lactam Hydrolysis(32316):1,1,0,2401,98,96.08%,92.16%,Lactam Hydrolysis,329.2566086,32316
58422,CC(=O)NC(=N)NC1=C(N=CN1COCCO)C(O)=O,,,Lactam Hydrolysis(32378):1,1,0,2401,14,99.42%,98.84%,Lactam Hydrolysis,285.1073186,32378


In [234]:
# Keep only the rows whose parent ID is in the imide-only list
imide = df.loc[df['Parent_ID'].isin(imi)]
imide

Unnamed: 0,#SMILES,name,CAS #,Synthesis Code,Generation,Phase,Formation,Degradation,Production,Accumulation,Route,Exact Mass,Parent_ID
9,CCC(C(O)=O)(C(=O)NC(N)=O)C1=CC=CC=C1,,,Imide Hydrolysis(5):1,1,0,16807,112,99.71%,99.05%,Imide Hydrolysis,250.0953569,5
15,CCC(CC)(C(O)=O)C(=O)NC(=O)NC,,,Imide Hydrolysis(8):1,1,0,16807,112,49.93%,49.60%,Imide Hydrolysis,216.111007,8
16,CCC(CC)(C(O)=O)C(=O)N(C)C(N)=O,,,Imide Hydrolysis(8):2,1,0,16807,112,49.93%,49.60%,Imide Hydrolysis,216.111007,8
18,CCC(NC(=O)NC)(C(O)=O)C1=CC=CC=C1,,,Imide Hydrolysis(9):1,1,0,16807,98,99.71%,99.13%,Imide Hydrolysis,236.1160924,9
48,OC(=O)C1=CC=CC=C1C(=O)NC1CCC(=O)NC1=O,,,Imide Hydrolysis(24):1,1,0,16807,33628,33.33%,< 0.001%,Imide Hydrolysis,276.0746215,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...
57923,NC(=O)\N=C/C(C(O)=O)C(O)=O,,,Imide Hydrolysis(32162):1,1,0,16807,0,100.00%,100.00%,Imide Hydrolysis,174.0276713,32162
57970,CC(=O)OC1=CC=CC=C1C(=O)OCNC(=O)CCC(O)=O,,,Imide Hydrolysis(32185):1,1,0,16807,210,98.85%,97.61%,Imide Hydrolysis,309.0848518,32185
58055,NC(=O)NC1(CCC2=CC=CC=C2C1)C(O)=O,,,Imide Hydrolysis(32231):1,1,0,16807,98,99.71%,99.13%,Imide Hydrolysis,234.1004423,32231
58104,CN(C)C(=O)N(\C=C(\F)C(O)=O)C(N)=O,,,Imide Hydrolysis(32253):1,1,0,16807,196,100.00%,98.83%,Imide Hydrolysis,219.065534,32253


In [235]:
# Keep only the rows whose parent ID is in the amide-only list
amide = df.loc[df['Parent_ID'].isin(ami)]
amide

Unnamed: 0,#SMILES,name,CAS #,Synthesis Code,Generation,Phase,Formation,Degradation,Production,Accumulation,Route,Exact Mass,Parent_ID
57,CN1CC(C=C2C1CC1=CNC3=CC=CC2=C13)C(O)=O,,,Amide Hydrolysis(26):1/1,1,0,7,0,50.00%,50.00%,Amide Hydrolysis,268.1211778,26
58,CCNCC,,,Amide Hydrolysis(26):1/2,1,0,7,0,50.00%,50.00%,Amide Hydrolysis,73.08914936,26
83,OC(=O)C1=CC(Cl)=CC=C1O,,,Amide Hydrolysis(41):1/1,1,0,7,0,50.00%,50.00%,Amide Hydrolysis,171.9927217,41
84,NC1=C(Cl)C=C(C=C1)[N+]([O-])=O,,,Amide Hydrolysis(41):1/2,1,0,7,0,50.00%,50.00%,Amide Hydrolysis,172.0039551,41
108,NC1=CC=C(C=C1)C(O)=O,,,Amide Hydrolysis(56):1/1,1,0,7,0,50.00%,50.00%,Amide Hydrolysis,137.0476785,56
...,...,...,...,...,...,...,...,...,...,...,...,...,...
58958,COC1=CC=C(CN)C=C1,,,Amide Hydrolysis(32530):1/2,1,0,7,0,50.00%,50.00%,Amide Hydrolysis,137.084064,32530
58960,OC(=O)C(=O)C1=CNC2=CC=C(C=C12)[N+]([O-])=O,,,Amide Hydrolysis(32531):1/1,1,0,7,0,50.00%,50.00%,Amide Hydrolysis,234.0276713,32531
58961,NCC1=CC=C(O)C=C1,,,Amide Hydrolysis(32531):1/2,1,0,7,0,50.00%,50.00%,Amide Hydrolysis,123.0684139,32531
59031,OC(=O)C1CC=CCC1C(O)=O,,,Amide Hydrolysis(32557):1/1,1,0,7,0,50.00%,50.00%,Amide Hydrolysis,170.0579088,32557


### Get summary counts

#### Before

In [236]:
# Rows whose route is one of the three hydrolysis routes of interest,
# regardless of which other routes the same parent uses
want = ['Amide Hydrolysis', 'Imide Hydrolysis', 'Lactam Hydrolysis']
before_summary = df.loc[df['Route'].isin(want)]

# Count of non-null parent IDs per route
before_summary.groupby('Route')['Parent_ID'].count()

Route
Amide Hydrolysis     6562
Imide Hydrolysis      741
Lactam Hydrolysis    1239
Name: Parent_ID, dtype: int64

#### After

In [237]:
# Row count of each filtered subset, printed in the same order as the "before" table
for label, subset in (
    ('Amide Hydrolysis  ', amide),
    ('Imide Hydrolysis  ', imide),
    ('Lactam Hydrolysis ', lactam),
):
    print(label, len(subset))

Amide Hydrolysis   5774
Imide Hydrolysis   694
Lactam Hydrolysis  1148


### Check that subsets meet criteria

#### Amide example spot check

In [238]:
# Parent compound 188 forms products using both Amide Hydrolysis and
# Halogenated Aliphatics: Nucleophilic Substitution
group.loc[group['Parent_ID'].eq('188')]

Unnamed: 0,Parent_ID,Route
3763,188,[Halogenated Aliphatics: Nucleophilic Substitu...


In [239]:
# Rows for parent 188 in the final amide hydrolysis subset (none are expected)
amide.loc[amide['Parent_ID'].eq('188')]

Unnamed: 0,#SMILES,name,CAS #,Synthesis Code,Generation,Phase,Formation,Degradation,Production,Accumulation,Route,Exact Mass,Parent_ID


In [240]:
# Routes recorded for parent 274
group.loc[group['Parent_ID'].eq('274')]

Unnamed: 0,Parent_ID,Route
7930,274,"[Carboxylic Acid Ester Hydrolysis, Amide Hydro..."


In [241]:
# Rows for parent 274 in the amide subset
amide.loc[amide['Parent_ID'].eq('274')]

Unnamed: 0,#SMILES,name,CAS #,Synthesis Code,Generation,Phase,Formation,Degradation,Production,Accumulation,Route,Exact Mass,Parent_ID


#### Lactam example spot check

In [242]:
# Routes recorded for parent 8290
group.loc[group['Parent_ID'].eq('8290')]

Unnamed: 0,Parent_ID,Route
11939,8290,"[Lactone Hydrolysis, Lactam Hydrolysis]"


In [243]:
# Rows for parent 8290 in the lactam subset
lactam.loc[lactam['Parent_ID'].eq('8290')]

Unnamed: 0,#SMILES,name,CAS #,Synthesis Code,Generation,Phase,Formation,Degradation,Production,Accumulation,Route,Exact Mass,Parent_ID


In [244]:
# Routes recorded for parent 8323
group.loc[group['Parent_ID'].eq('8323')]

Unnamed: 0,Parent_ID,Route
11962,8323,"[Lactam Hydrolysis, Halogenated Aliphatics: Nu..."


In [245]:
# Rows for parent 8323 in the lactam subset
lactam.loc[lactam['Parent_ID'].eq('8323')]

Unnamed: 0,#SMILES,name,CAS #,Synthesis Code,Generation,Phase,Formation,Degradation,Production,Accumulation,Route,Exact Mass,Parent_ID


#### Imide example spot check

In [246]:
# Routes recorded for parent 8625
group.loc[group['Parent_ID'].eq('8625')]

Unnamed: 0,Parent_ID,Route
12092,8625,"[Nitrile Hydrolysis, Imide Hydrolysis]"


In [247]:
# Rows for parent 8625 in the imide subset
imide.loc[imide['Parent_ID'].eq('8625')]

Unnamed: 0,#SMILES,name,CAS #,Synthesis Code,Generation,Phase,Formation,Degradation,Production,Accumulation,Route,Exact Mass,Parent_ID


In [248]:
# Routes recorded for parent 11339
group.loc[group['Parent_ID'].eq('11339')]

Unnamed: 0,Parent_ID,Route
662,11339,"[Imide Hydrolysis, Lactam Hydrolysis]"


In [249]:
# Rows for parent 11339 in the imide subset
imide.loc[imide['Parent_ID'].eq('11339')]

Unnamed: 0,#SMILES,name,CAS #,Synthesis Code,Generation,Phase,Formation,Degradation,Production,Accumulation,Route,Exact Mass,Parent_ID


### Save data to excel

In [250]:
# Write each subset to its own worksheet of a single workbook, using XlsxWriter
# as the engine. The context manager closes the writer (and so writes the file)
# on exit, even if one of the to_excel calls raises.
with pd.ExcelWriter('FunctionalGroup_Exctracted.xlsx', engine='xlsxwriter') as writer:
    amide.to_excel(writer, sheet_name='Amide', index=False)
    imide.to_excel(writer, sheet_name='Imide', index=False)
    lactam.to_excel(writer, sheet_name='Lactam', index=False)
    # Parent compounds: only the id and structure columns
    parents[['Parent_ID', '#SMILES']].to_excel(writer, sheet_name='ParentCompounds', index=False)