### Lindsay Shields
#### October 5, 2022

This Jupyter Notebook contains a Python function written by Lindsay Shields to format prediction output from Metabolizer.

In [1]:
# load python library required to run code
import pandas as pd

#### Function

In [2]:
def FormatPrediction(file):
    """Reformat a Metabolizer prediction table into one row per predicted product.

    Rows with ``Generation == 0`` are treated as parent compounds: their
    'Synthesis Code' is used as the parent ID and their '#SMILES' as the parent
    structure. Every other row is treated as a product; its parent ID is the
    first parenthesised run of digits in its 'Synthesis Code'
    (e.g. ``"Amide Hydrolysis(20):1/1"`` -> ``"20"``).

    Parameters
    ----------
    file : str, path object or file-like object
        Tab-delimited prediction output, passed straight to ``pandas.read_table``.
        It must contain the columns '#SMILES', 'Synthesis Code' and 'Generation'.

    Returns
    -------
    pandas.DataFrame
        Product rows only, keeping their original row labels, with the columns
        Parent_ID, Parent_SMILES, Product_SMILES, Route, Synthesis Code,
        Generation, Phase, Formation, Degradation, Production, Accumulation and
        Exact Mass, in that order. Parent_ID holds the digits as text. Any of
        these columns that is absent from the input is filled with missing values,
        and a file without product rows yields an empty table.
    """
    # Read 'Synthesis Code' as text. Parent rows carry a bare number there, and
    # letting pandas infer the type can turn those into integers (always when the
    # file holds parent rows only), which breaks the .str accessor below and
    # stops the parent keys from matching the text IDs extracted from products.
    df = pd.read_table(file, dtype={'Synthesis Code': str})

    is_parent = df['Generation'] == 0

    # Lookup table: parent ID (text) -> parent SMILES
    prnt = dict(zip(df.loc[is_parent, 'Synthesis Code'], df.loc[is_parent, '#SMILES']))

    # Keep only the product rows; copy so the new columns are not written into a view
    products = df.loc[~is_parent].copy()
    products['Product_SMILES'] = products['#SMILES']

    # expand=False returns a Series, so the result maps cleanly onto one column
    products['Parent_ID'] = products['Synthesis Code'].str.extract(r"\((\d+)\)", expand=False)
    products['Parent_SMILES'] = products['Parent_ID'].map(prnt)

    # reindex both orders the columns for easier manual viewing and discards
    # everything not listed ('#SMILES', 'name', ...), so no separate drop is needed
    return products.reindex(columns=['Parent_ID', 'Parent_SMILES', 'Product_SMILES', 'Route',
                                     'Synthesis Code', 'Generation', 'Phase', 'Formation',
                                     'Degradation', 'Production', 'Accumulation', 'Exact Mass'])

#### Apply function to file and save to variable

To use this function on other files, change "OutputFile" to the name of your file. Be sure that the file:
- is located in the same folder as this notebook
- is named exactly as typed in the code, including the file extension if there is one.

Ex. newFile=FormatPrediction("Anaerobic_Lit_3gen.out")

In [3]:
# Format the prediction file named "OutputFile" (a relative path) and keep the
# result in newFile; the bare name on the last line displays the table.
newFile=FormatPrediction("OutputFile")
newFile

Unnamed: 0,Parent_ID,Parent_SMILES,Product_SMILES,Route,Synthesis Code,Generation,Phase,Formation,Degradation,Production,Accumulation,Exact Mass
20,20,CN(C(=O)OC1=CC=CC=C1)C1=CC=CC=C1C(N)=O,CN(C(=O)OC1=CC=CC=C1)C1=CC=CC=C1C(O)=O,Amide Hydrolysis,Amide Hydrolysis(20):1/1,1,0,7,0,50.000%,50.000%,271.084458
21,20,CN(C(=O)OC1=CC=CC=C1)C1=CC=CC=C1C(N)=O,N,Amide Hydrolysis,Amide Hydrolysis(20):1/2,1,0,7,0,50.000%,50.000%,17.026549
33,31,O=C(NC1=CC=CC=C1)OCC#C,NC1=CC=CC=C1,Carbamate Hydrolysis,Carbamate Hydrolysis(31):1/1,1,0,343,0,50.000%,50.000%,93.057849
34,31,O=C(NC1=CC=CC=C1)OCC#C,OCC#C,Carbamate Hydrolysis,Carbamate Hydrolysis(31):1/2,1,0,343,0,50.000%,50.000%,56.026215
36,32,CC1=CC=CC(NC(=O)OCC(F)(F)F)=C1,CC1=CC=CC(N)=C1,Carbamate Hydrolysis,Carbamate Hydrolysis(32):1/1,1,0,343,0,50.000%,50.000%,107.073499
...,...,...,...,...,...,...,...,...,...,...,...,...
435,182,CC1=C(C=CC(OC(=O)NC2=CC=CC=C2)=C1)[N+]([O-])=O,CC1=C(C=CC(O)=C1)[N+]([O-])=O,Carbamate Hydrolysis,Carbamate Hydrolysis(182):1/2,1,0,343,0,50.000%,50.000%,153.042593
437,183,CNC(=O)OC1=C(C=C(C=C1)[N+]([O-])=O)[N+]([O-])=O,CN,Carbamate Hydrolysis,Carbamate Hydrolysis(183):1/1,1,0,343,0,50.000%,50.000%,31.042199
438,183,CNC(=O)OC1=C(C=C(C=C1)[N+]([O-])=O)[N+]([O-])=O,OC1=C(C=C(C=C1)[N+]([O-])=O)[N+]([O-])=O,Carbamate Hydrolysis,Carbamate Hydrolysis(183):1/2,1,0,343,0,50.000%,50.000%,184.012021
459,203,CN(C(=O)OC1=CC=CC=C1)C1=CC=CC=C1C(N)=O,CN(C(=O)OC1=CC=CC=C1)C1=CC=CC=C1C(O)=O,Amide Hydrolysis,Amide Hydrolysis(203):1/1,1,0,7,0,50.000%,50.000%,271.084458


#### Save newly formatted dataframe (newFile) to csv

Replace "newOutputFile.csv" with whatever you want to name your new file.

Be sure to include the file extension, '.csv' in this case, but you can also save it in tab-delimited format. 


Ex. (tab-delimited) newFile.to_csv("newOutputFile.tsv", sep='\t', index=False)


The index parameter is set to False to prevent the inclusion of a row index column (Excel already numbers the rows, so you don't need another).

The sep parameter sets the delimiter; the default is a comma unless something else is specified.



In [11]:
# Write the formatted table as comma-separated text; index=False leaves the
# row labels out of the file.
newFile.to_csv("newOutputFile.csv",index=False)

#### Reformat so all products are on the same row

In [107]:
# Collapse the table to one row per parent: every product SMILES that shares a
# (Parent_ID, Parent_SMILES) pair is gathered into a single Python list.
byRow = (
    newFile
    .groupby(['Parent_ID', 'Parent_SMILES'])['Product_SMILES']
    .apply(list)
    .reset_index()
)
byRow

Unnamed: 0,Parent_ID,Parent_SMILES,Product_SMILES
0,100,O=C(NCCC1=CC=CC=C1)OC1=CC(=CC=C1)C1=CC=CC=C1,"[NCCC1=CC=CC=C1, OC1=CC(=CC=C1)C1=CC=CC=C1]"
1,101,O=C(NCC1=C2C=CC=CC2=CC=C1)OC1=CC(=CC=C1)C1=CC=...,"[NCC1=C2C=CC=CC2=CC=C1, OC1=CC(=CC=C1)C1=CC=CC..."
2,102,FC1=C(C=C(OC(=O)NC2CCCCC2)C=C1)C1=CC=CC=C1,"[NC1CCCCC1, OC1=CC(=C(F)C=C1)C1=CC=CC=C1]"
3,103,O=C(NCC1=CC=CC=C1)OC1=CC=CC=C1,"[NCC1=CC=CC=C1, OC1=CC=CC=C1]"
4,104,O=C(NC1CCCCC1)OC1=CC=C(C=C1)C1=CC=CC=C1,"[NC1CCCCC1, OC1=CC=C(C=C1)C1=CC=CC=C1]"
...,...,...,...
124,95,CCCCNC(=O)OC1=CC(=CC=C1)C1=CC=CC=C1,"[CCCCN, OC1=CC(=CC=C1)C1=CC=CC=C1]"
125,96,O=C(NCCCCC1=CC=CC=C1)OC1=CC(=CC=C1)C1=CC=CC=C1,"[NCCCCC1=CC=CC=C1, OC1=CC(=CC=C1)C1=CC=CC=C1]"
126,97,CNC(=O)OC1=CC(C)=CC(C)=C1[N+]([O-])=O,"[CN, CC1=CC(C)=C(C(O)=C1)[N+]([O-])=O]"
127,98,O=C(NC1CCCCC1)OC1=CC(=CC=C1)C1=CC=CC=C1,"[NC1CCCCC1, OC1=CC(=CC=C1)C1=CC=CC=C1]"


In [108]:
#expand the list of products to their own columns
# One column is created per product position (Product_SMILES_1, Product_SMILES_2, ...)
# so parents with a different number of products no longer raise a ValueError;
# parents with fewer products than the longest list are padded with None.
# The guard keeps the cell harmless when it is re-run after the list column
# has already been expanded and removed.
if 'Product_SMILES' in byRow.columns:
    product_lists = byRow['Product_SMILES'].to_list()
    widest = max(map(len, product_lists), default=0)
    for pos in range(widest):
        byRow[f'Product_SMILES_{pos + 1}'] = [
            products[pos] if pos < len(products) else None
            for products in product_lists
        ]
    byRow = byRow.drop(columns=['Product_SMILES'])
byRow

Unnamed: 0,Parent_ID,Parent_SMILES,Product_SMILES_1,Product_SMILES_2
0,100,O=C(NCCC1=CC=CC=C1)OC1=CC(=CC=C1)C1=CC=CC=C1,NCCC1=CC=CC=C1,OC1=CC(=CC=C1)C1=CC=CC=C1
1,101,O=C(NCC1=C2C=CC=CC2=CC=C1)OC1=CC(=CC=C1)C1=CC=...,NCC1=C2C=CC=CC2=CC=C1,OC1=CC(=CC=C1)C1=CC=CC=C1
2,102,FC1=C(C=C(OC(=O)NC2CCCCC2)C=C1)C1=CC=CC=C1,NC1CCCCC1,OC1=CC(=C(F)C=C1)C1=CC=CC=C1
3,103,O=C(NCC1=CC=CC=C1)OC1=CC=CC=C1,NCC1=CC=CC=C1,OC1=CC=CC=C1
4,104,O=C(NC1CCCCC1)OC1=CC=C(C=C1)C1=CC=CC=C1,NC1CCCCC1,OC1=CC=C(C=C1)C1=CC=CC=C1
...,...,...,...,...
124,95,CCCCNC(=O)OC1=CC(=CC=C1)C1=CC=CC=C1,CCCCN,OC1=CC(=CC=C1)C1=CC=CC=C1
125,96,O=C(NCCCCC1=CC=CC=C1)OC1=CC(=CC=C1)C1=CC=CC=C1,NCCCCC1=CC=CC=C1,OC1=CC(=CC=C1)C1=CC=CC=C1
126,97,CNC(=O)OC1=CC(C)=CC(C)=C1[N+]([O-])=O,CN,CC1=CC(C)=C(C(O)=C1)[N+]([O-])=O
127,98,O=C(NC1CCCCC1)OC1=CC(=CC=C1)C1=CC=CC=C1,NC1CCCCC1,OC1=CC(=CC=C1)C1=CC=CC=C1


In [106]:
#save new dataframe as a csv
# index=False leaves the row labels out of the file.
byRow.to_csv('JL_PredFormat_ProdRow.csv',index=False)