## Functions
### Lindsay Shields

#### October 2022

This Notebook contains functions that format standard output from Metabolizer as well as perform other tasks that have been requested using the newly formated data.

In [45]:
#Import python libraries necessary for functions
import pandas as pd
import re
import numpy as np
from openbabel import pybel
from molmass import Formula
from chempy import Substance
import warnings
warnings.filterwarnings('ignore')

### Full Format function

This function was written in Python using the following python libraries:
- pandas
- numpy

The purpose of this function is to format the standard output from Metabolizer into a more readable format with features that include:
- Separating out parent and products
- Extracting Parent ID
- Providing Intial and immediate parent for each product
- Proividing immediate scheme for each product

This function will not work on output with only one generation

Expect this to fun for a little while

In [5]:
def FullFormatPrediction(file):
    #read in data
    df=pd.read_table(file)
    
    #make a new column with only product_SMILES
    df['Product_SMILES']=df.loc[df['Generation']!= 0,'#SMILES']
    
    # make a dictionary with parent number as key and parent smiles as value
    prnt_value=df.loc[df['Generation']==0,'#SMILES']
    prnt_key=df.loc[df['Generation']==0, 'Synthesis Code']

    prnt=dict(zip(prnt_key,prnt_value))
    
    #extract parent number from synthesis code and save as column 'Parent_num'
    df['Parent_ID']=df['Synthesis Code'].str.extract(r"\((\d+)\)")
    
    #Make new coluumn 'Inital Parent SMILES' and fill it using parent dicitonary
    df['Initial_Parent_SMILES']=df['Parent_ID'].map(prnt)
    
    #remove unnecessary column
    df.drop(columns=['#SMILES','name'],inplace=True)
    #remove rows with only parent smiles
    df.drop(index=df.loc[df['Generation']==0].index,inplace=True)
    #reset the index 
    df.reset_index(drop=True,inplace=True)
    
    #seperate out reaction steps listed in 'Route' and save each step to its own column
    g=df['Route'].str.split(r'(?<!\d),(?!\d)',regex=True,expand=True).add_prefix("Step_")
    newdf=pd.concat([df,g],axis=1)

    #loop that assigns immediate parent of predicted product based on the product from previous steps in reaction scheme
    newdf['Immediate_Parent_SMILES']=''
    leader=list(newdf['Step_1'].unique()) #second steps is the branch determining factor
    for i in newdf.index:
        l=newdf['Step_1'][i]

        if l is not None:
            tmp=newdf[newdf['Step_1']==l]
            gen=tmp['Generation'][i]

            if gen ==2:
                originalProduct=newdf.loc[newdf['Generation']==1,'Product_SMILES'].iloc[0]
                newdf['Immediate_Parent_SMILES'][i]=originalProduct
            elif gen > 2:
                prior=tmp.loc[tmp['Generation']==(gen-1),'Product_SMILES'].iloc[0]
                newdf['Immediate_Parent_SMILES'][i]=prior
        else:
            newdf['Immediate_Parent_SMILES'][i]=newdf['Initial_Parent_SMILES'][i]

    #make a dictionary where generation is the key and a column name is the value
    gens={1:'Step_0',2:'Step_1',3:'Step_2',4:'Step_3'}
    newdf['Scheme']=''

    #loop that assigns immediate reaction step name to the approprate reaction in the scheme
    for i in newdf.index:
        gen=newdf['Generation'][i]
        newdf['Scheme'][i]=newdf[gens[gen]][i]

    #reorder column headers for better manual viewing
    newdf=newdf.reindex(columns=['Parent_ID','Initial_Parent_SMILES','Scheme','Immediate_Parent_SMILES','Product_SMILES','Route',
                           'Synthesis Code', 'Generation', 'Phase', 'Formation', 'Degradation','Production', 'Accumulation',
                           'Exact Mass'])
    return newdf

#### Example

In [34]:
FFP_df=FullFormatPrediction('./TestData/Anaerobic_Lit_3gen.out')
FFP_df.head()

Unnamed: 0,Parent_ID,Initial_Parent_SMILES,Scheme,Immediate_Parent_SMILES,Product_SMILES,Route,Synthesis Code,Generation,Phase,Formation,Degradation,Production,Accumulation,Exact Mass
0,1,CC1=CC2=CC=CC=C2C=C1,Carboxylation: Napthalene to 2-napthyl carboxy...,CC1=CC2=CC=CC=C2C=C1,OC(=O)C1=CC2=CC=CC=C2C=C1,Carboxylation: Napthalene to 2-napthyl carboxy...,Carboxylation: Napthalene to 2-napthyl carboxy...,1,0,16807,2401,87.500%,75.000%,172.052429
1,1,CC1=CC2=CC=CC=C2C=C1,Addition: Methylbenzene to fumarate adduct,CC1=CC2=CC=CC=C2C=C1,OC(=O)CC(CC1=C[C]=2=[C](C=C1)=CC=CC=2)C(O)=O,Addition: Methylbenzene to fumarate adduct,Addition: Methylbenzene to fumarate adduct(1):1,1,0,2401,16807,12.500%,< 0.001%,258.089209
2,1,CC1=CC2=CC=CC=C2C=C1,Activation: CoA addition to 2-naphthoic acid,OC(=O)C1=CC2=CC=CC=C2C=C1,CC(C)(COP(O)(=O)OP(O)(=O)OC[C@H]1O[C@H]([C@H](...,Carboxylation: Napthalene to 2-napthyl carboxy...,Activation: CoA addition to 2-naphthoic acid(C...,2,0,2401,16807,12.500%,< 0.001%,921.157075
3,1,CC1=CC2=CC=CC=C2C=C1,Rearrangement: Alkyl fumurate adduct,OC(=O)C1=CC2=CC=CC=C2C=C1,OC(=O)C(CCC1=C[C]=2=[C](C=C1)=CC=CC=2)C(O)=O,"Addition: Methylbenzene to fumarate adduct,Rea...",Rearrangement: Alkyl fumurate adduct(Addition...,2,0,16807,16807,12.500%,< 0.001%,258.089209
4,1,CC1=CC2=CC=CC=C2C=C1,"Reduction: 2-Napthoyl-CoA to 5,6,7,8-tetrahyd...",CC(C)(COP(O)(=O)OP(O)(=O)OC[C@H]1O[C@H]([C@H](...,CC(C)(COP(O)(=O)OP(O)(=O)OC[C@H]1O[C@H]([C@H](...,Carboxylation: Napthalene to 2-napthyl carboxy...,"Reduction: 2-Napthoyl-CoA to 5,6,7,8-tetrahyd...",3,0,16807,16807,12.500%,< 0.001%,925.188375


### Basic Format Function

This function was written in Python using the following python libraries:
- pandas
- numpy

The purpose of this function is to format the standard output from Metabolizer into a more readable format with features that include:
- Separating out parent and products
- Extracting Parent ID

This function will work on all output from Metabolizer

In [10]:
def BasicFormatPrediction(file):
    #read in data
    df=pd.read_table(file)
    
    #make a new column with only product_SMILES
    df['Product_SMILES']=df.loc[df['Generation']!= 0,'#SMILES']

    # make a dictionary with parent number as key and parent smiles as value
    prnt_value=df.loc[df['Generation']==0,'#SMILES']
    prnt_key=df.loc[df['Generation']==0, 'Synthesis Code']

    prnt=dict(zip(prnt_key,prnt_value))

    #extract parent number from synthesis code and save as column 'Parent_num'
    df['Parent_ID']=df['Synthesis Code'].str.extract(r"\((\d+)\)")

    #Make new coluumn 'Inital Parent SMILES' and fill it using parent dicitonary
    df['Initial_Parent_SMILES']=df['Parent_ID'].map(prnt)

    #remove unnecessary column
    df.drop(columns=['#SMILES','name'],inplace=True)
    #remove rows with only parent smiles
    df.drop(index=df.loc[df['Generation']==0].index,inplace=True)
    #reset the index 
    df.reset_index(drop=True,inplace=True)
    
    #reorder column headers for better manual viewing
    df=df.reindex(columns=['Parent_ID','Initial_Parent_SMILES','Product_SMILES','Route',
                       'Synthesis Code', 'Generation', 'Phase', 'Formation', 'Degradation','Production', 'Accumulation',
                       'Exact Mass'])

    return df

#### Example

In [35]:
BFP_df=BasicFormatPrediction('./TestData/OutputFile')
BFP_df.head()

Unnamed: 0,Parent_ID,Initial_Parent_SMILES,Product_SMILES,Route,Synthesis Code,Generation,Phase,Formation,Degradation,Production,Accumulation,Exact Mass
0,20,CN(C(=O)OC1=CC=CC=C1)C1=CC=CC=C1C(N)=O,CN(C(=O)OC1=CC=CC=C1)C1=CC=CC=C1C(O)=O,Amide Hydrolysis,Amide Hydrolysis(20):1/1,1,0,7,0,50.000%,50.000%,271.084458
1,20,CN(C(=O)OC1=CC=CC=C1)C1=CC=CC=C1C(N)=O,N,Amide Hydrolysis,Amide Hydrolysis(20):1/2,1,0,7,0,50.000%,50.000%,17.026549
2,31,O=C(NC1=CC=CC=C1)OCC#C,NC1=CC=CC=C1,Carbamate Hydrolysis,Carbamate Hydrolysis(31):1/1,1,0,343,0,50.000%,50.000%,93.057849
3,31,O=C(NC1=CC=CC=C1)OCC#C,OCC#C,Carbamate Hydrolysis,Carbamate Hydrolysis(31):1/2,1,0,343,0,50.000%,50.000%,56.026215
4,32,CC1=CC=CC(NC(=O)OCC(F)(F)F)=C1,CC1=CC=CC(N)=C1,Carbamate Hydrolysis,Carbamate Hydrolysis(32):1/1,1,0,343,0,50.000%,50.000%,107.073499


### List products in single row

This function was written in Python using the following python libraries:
- pandas
- numpy

The purpose of this function is to generate a data table where predited products are listed in a single row. This function is dependent on the output from either FullFormatPrediction() or BasicFormatPrediction().

This function will work on ouput with two predicted products

In [36]:
def ByRow(df):
    #group products by parent id and save them as a list
    new_df=df.groupby(['Parent_ID','Initial_Parent_SMILES'])['Product_SMILES'].apply(list).reset_index()
    
    #expand the list of products to their own columns
    new_df[['Product_SMILES_1','Product_SMILES_2']]=new_df['Product_SMILES'].to_list()
    new_df.drop(columns=['Product_SMILES'],inplace=True)
    
    return new_df

#### Example

In [39]:
table=ByRow(BFP_df)
table

Unnamed: 0,Parent_ID,Initial_Parent_SMILES,Product_SMILES_1,Product_SMILES_2
0,100,O=C(NCCC1=CC=CC=C1)OC1=CC(=CC=C1)C1=CC=CC=C1,NCCC1=CC=CC=C1,OC1=CC(=CC=C1)C1=CC=CC=C1
1,101,O=C(NCC1=C2C=CC=CC2=CC=C1)OC1=CC(=CC=C1)C1=CC=...,NCC1=C2C=CC=CC2=CC=C1,OC1=CC(=CC=C1)C1=CC=CC=C1
2,102,FC1=C(C=C(OC(=O)NC2CCCCC2)C=C1)C1=CC=CC=C1,NC1CCCCC1,OC1=CC(=C(F)C=C1)C1=CC=CC=C1
3,103,O=C(NCC1=CC=CC=C1)OC1=CC=CC=C1,NCC1=CC=CC=C1,OC1=CC=CC=C1
4,104,O=C(NC1CCCCC1)OC1=CC=C(C=C1)C1=CC=CC=C1,NC1CCCCC1,OC1=CC=C(C=C1)C1=CC=CC=C1
...,...,...,...,...
124,95,CCCCNC(=O)OC1=CC(=CC=C1)C1=CC=CC=C1,CCCCN,OC1=CC(=CC=C1)C1=CC=CC=C1
125,96,O=C(NCCCCC1=CC=CC=C1)OC1=CC(=CC=C1)C1=CC=CC=C1,NCCCCC1=CC=CC=C1,OC1=CC(=CC=C1)C1=CC=CC=C1
126,97,CNC(=O)OC1=CC(C)=CC(C)=C1[N+]([O-])=O,CN,CC1=CC(C)=C(C(O)=C1)[N+]([O-])=O
127,98,O=C(NC1CCCCC1)OC1=CC(=CC=C1)C1=CC=CC=C1,NC1CCCCC1,OC1=CC(=CC=C1)C1=CC=CC=C1


### Adding mol information function

This function was written in Python using the following python libraries:
- pandas
- numpy
- Pybel
- molmass
- Chempy

The purpose of this function is to gather additional information on predicted products and add them to the end of the dataframe. Features include:
- Molecular formula
- Average molecular weight
- Exact molecilar weight

This function is dependent on the output from either FullFormatPrediction() or BasicFormat().

This function will work on all output from Metabolizer

In [40]:
def AddMolInfo(df):

    form=[]
    agwt=[]
    exwt=[]

    for i in df['Product_SMILES']:
        #read smile strings as smile strings
        mol=pybel.readstring('smi',str(i)) #use pybel to get molecular formula from smiles
        #from smiles strings calculate molecular formula
        formula=mol.formula
        f=Substance.from_formula(formula)
        ef=Formula(formula)
        
        #from formula calculate average molecular weight of molecule rounded to three decimals
        avgWt=float(round(f.mass,3))
        #from smiles calculate exact molecular weight of a molecule
        exaWt=ef.isotope.mass
        
        #store variable to respective list
        form.append(formula)
        agwt.append(avgWt)
        exwt.append(exaWt)
        
    #make new column for new variables 
    df['Molecular_Formula']=form
    df['Avg_Molecular_Weight']=agwt
    df['Exact_Molecular_Weight']=exwt
    
    return df

#### Example

In [43]:
MoreInfo=AddMolInfo(BFP_df)
MoreInfo.head()

#can also use
#MoreInfo=AddMolInfo(FFP_df)
#MoreInfo.head()

Unnamed: 0,Parent_ID,Initial_Parent_SMILES,Product_SMILES,Route,Synthesis Code,Generation,Phase,Formation,Degradation,Production,Accumulation,Exact Mass,Molecular_Formula,Avg_Molecular_Weight,Exact_Molecular_Weight
0,20,CN(C(=O)OC1=CC=CC=C1)C1=CC=CC=C1C(N)=O,CN(C(=O)OC1=CC=CC=C1)C1=CC=CC=C1C(O)=O,Amide Hydrolysis,Amide Hydrolysis(20):1/1,1,0,7,0,50.000%,50.000%,271.084458,C15H13NO4,271.272,271.084458
1,20,CN(C(=O)OC1=CC=CC=C1)C1=CC=CC=C1C(N)=O,N,Amide Hydrolysis,Amide Hydrolysis(20):1/2,1,0,7,0,50.000%,50.000%,17.026549,H3N,17.031,17.026549
2,31,O=C(NC1=CC=CC=C1)OCC#C,NC1=CC=CC=C1,Carbamate Hydrolysis,Carbamate Hydrolysis(31):1/1,1,0,343,0,50.000%,50.000%,93.057849,C6H7N,93.129,93.057849
3,31,O=C(NC1=CC=CC=C1)OCC#C,OCC#C,Carbamate Hydrolysis,Carbamate Hydrolysis(31):1/2,1,0,343,0,50.000%,50.000%,56.026215,C3H4O,56.064,56.026215
4,32,CC1=CC=CC(NC(=O)OCC(F)(F)F)=C1,CC1=CC=CC(N)=C1,Carbamate Hydrolysis,Carbamate Hydrolysis(32):1/1,1,0,343,0,50.000%,50.000%,107.073499,C7H9N,107.156,107.073499
