In [8]:
import numpy as np
import pandas as pd
import math
import ast

# Load the compound, reaction and enzymatic-reaction tables. Each CSV is read
# with its first column as a throw-away index and then re-keyed by UNIQUE-ID.
df_cpd = pd.read_csv('../datasets/df_cpd.csv', index_col=0).set_index(keys='UNIQUE-ID')
df_rxn = pd.read_csv('../datasets/df_rxns.csv', index_col=0).set_index(keys='UNIQUE-ID')
df_enz = pd.read_csv('../datasets/df_enzrxns.csv', index_col=0).set_index(keys='UNIQUE-ID')

In [2]:
def recover_list(df, column):
    """Convert a column of list-formatted strings (as read from .csv) into lists, in place.

    Every string cell is parsed with ``ast.literal_eval``; every non-string
    cell is replaced by an empty list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; ``df[column]`` is overwritten.
    column : str
        Name of the column to convert.

    Returns
    -------
    None
        The frame is modified in place.

    Raises
    ------
    AssertionError
        If the first cell of the column is already a list, i.e. the column
        has been converted before.
    """
    cells = df[column]
    # Inspect the first cell by position: `.iloc[0]` does not depend on the
    # index labels, and the length guard keeps an empty frame from raising.
    if len(cells) > 0:
        assert not isinstance(cells.iloc[0], list), "TypeError: The data type is already a list, it should not be converted again"
    df[column] = [ast.literal_eval(cell) if isinstance(cell, str) else [] for cell in cells]
    return

In [5]:
# Change PubChemID into int type in df_cpd
def recover_cpd_pubchemid(df = df_cpd):
    """Cast the 'PubChemID' column to int, in place, and return the frame.

    Missing IDs are filled with 0 before the cast. The default argument is
    the module-level ``df_cpd`` as it exists when this cell is run.
    """
    df['PubChemID'] = df['PubChemID'].fillna(0).astype(int)
    return df

# Recover list format of df_rxn
def recover_rxn(df = df_rxn):
    """Turn the list-formatted string columns of the reaction table into lists.

    The conversion is done in place by ``recover_list``; the same frame is
    returned for convenience.
    """
    for list_column in ('EC-NUMBER', 'ERXN-NUMBER', 'SUBSTRATES', 'PRODUCTS'):
        recover_list(df, list_column)
    return df

# Recover list format of df_enz
def recover_enz(df = df_enz):
    """Turn the list-formatted string columns of the enzymatic-reaction table into lists.

    The conversion is done in place by ``recover_list``; the same frame is
    returned for convenience.
    """
    for list_column in ('REACTION', 'ALTERNATIVE-SUBSTRATES', '^SUBSTRATE', 'KM', 'KCAT', 'VMAX'):
        recover_list(df, list_column)
    return df


In [19]:
# Apply the post-load fixes: integer PubChem IDs for the compound table and
# real lists for the list-formatted columns of the other two tables.
# NOTE: recover_list asserts that a column has not been converted yet, so
# running this cell a second time on the same frames raises AssertionError.
df_cpd = recover_cpd_pubchemid(df_cpd)
df_rxn = recover_rxn(df_rxn)
df_enz = recover_enz(df_enz)

In [4]:
def get_inchi(ID):
    """Return the InChI string stored in ``df_cpd`` for the compound ``ID``.

    ``ID`` is looked up among the row labels of ``df_cpd``; the string '0'
    is returned when it is not present.
    """
    # `in` on a Series tests its index labels, not its values.
    return df_cpd['INCHI'][ID] if ID in df_cpd['PubChemID'] else '0'

def get_smiles(ID):
    """Return the SMILES string stored in ``df_cpd`` for the compound ``ID``.

    ``ID`` is looked up among the row labels of ``df_cpd``; the string '0'
    is returned when it is not present.
    """
    # `in` on a Series tests its index labels, not its values.
    return df_cpd['SMILES'][ID] if ID in df_cpd['PubChemID'] else '0'

def get_pubchem(ID):
    """Return the PubChem ID stored in ``df_cpd`` for the compound ``ID``.

    ``ID`` is looked up among the row labels of ``df_cpd``. Note that the
    fallback for an unknown ``ID`` is the *string* '0', not a number.
    """
    # `in` on a Series tests its index labels, not its values.
    return df_cpd['PubChemID'][ID] if ID in df_cpd['PubChemID'] else '0'

In [2]:

# Build df_master: one row per (EC number, reaction) pair

def rxn_to_EC(df = df_rxn):
    """Expand the reaction table to one row per (EC number, reaction) pair.

    A reaction whose 'EC-NUMBER' list is empty contributes a single row
    labelled 'No_Data'. The returned frame has the columns 'EC-NUMBER',
    'UNIQUE-ID', 'ERXN-NUMBER', 'SUBSTRATES', 'PRODUCTS' and 'GIBBS', the
    last four being looked up in ``df`` by reaction ID.
    """
    ec_labels = []
    reaction_ids = []

    for reaction_id, reaction in df.iterrows():
        ec_numbers = reaction['EC-NUMBER']
        if len(ec_numbers) == 0:
            ec_labels.append('No_Data')
            reaction_ids.append(reaction_id)
            continue
        for position in range(len(ec_numbers)):
            ec_labels.append(ec_numbers[position])
            reaction_ids.append(reaction_id)

    df_master = pd.DataFrame({'EC-NUMBER' : ec_labels,
                              'UNIQUE-ID' : reaction_ids})

    # Copy the per-reaction data onto every expanded row.
    for carried_column in ('ERXN-NUMBER', 'SUBSTRATES', 'PRODUCTS', 'GIBBS'):
        df_master[carried_column] = [df[carried_column][reaction_id] for reaction_id in reaction_ids]

    return df_master
# Build df_sorted: sort by EC number, then combine the rows that share one

def sort_df(df):
    """Return a copy of ``df`` sorted by 'EC-NUMBER' with missing GIBBS marked.

    The result gets a fresh 0..n-1 index, and every missing 'GIBBS' value is
    replaced by the string 'No-Data'. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns 'EC-NUMBER' and 'GIBBS'.

    Returns
    -------
    pandas.DataFrame
        The sorted frame. 'GIBBS' is only converted to object dtype when at
        least one value had to be replaced.
    """
    df_sorted = df.sort_values(by=['EC-NUMBER']).reset_index(drop=True)

    # isna() copes with non-float cells (strings, lists) that math.isnan
    # would reject with a TypeError.
    missing = df_sorted['GIBBS'].isna()
    if missing.any():
        # Cast first so a string can live next to the floats, and assign the
        # whole column back instead of writing through a chained lookup,
        # which pandas does not guarantee to reach the frame.
        gibbs = df_sorted['GIBBS'].astype(object)
        gibbs[missing] = 'No-Data'
        df_sorted['GIBBS'] = gibbs

    return df_sorted

def combine_EC(df):
    """Combine the rows of ``df`` that share an 'EC-NUMBER' into a single row.

    The previous implementation tracked the "current" EC number by hand and
    in doing so lost the first row of every multi-row group, labelled
    single-row groups with the preceding EC number and never emitted the
    final group. Rows are now collected per EC number directly.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'EC-NUMBER', 'UNIQUE-ID', 'ERXN-NUMBER',
        'SUBSTRATES', 'PRODUCTS' and 'GIBBS'.

    Returns
    -------
    pandas.DataFrame
        One row per EC number, indexed by 'EC-NUMBER', in order of first
        appearance. For an EC number that occurs once, each cell holds that
        row's value unchanged; for one that occurs several times, each cell
        holds the list of the values of its rows.
    """
    carried_columns = ['UNIQUE-ID', 'ERXN-NUMBER', 'SUBSTRATES', 'PRODUCTS', 'GIBBS']

    # dicts keep insertion order, so groups come out in first-seen order.
    groups = {}
    for _, row in df.iterrows():
        bucket = groups.setdefault(row['EC-NUMBER'], {name: [] for name in carried_columns})
        for name in carried_columns:
            bucket[name].append(row[name])

    combined = {'EC-NUMBER': list(groups)}
    for name in carried_columns:
        # A lone row keeps its bare value; only real groups become lists.
        combined[name] = [bucket[name][0] if len(bucket[name]) == 1 else bucket[name]
                          for bucket in groups.values()]

    df_sorted_master = pd.DataFrame(combined)
    return df_sorted_master.set_index(keys=['EC-NUMBER'])
# Drop partial EC numbers (fewer than four levels, e.g. 'EC-1.1')

def drop_EC_redundancy(df):
    """Return ``df`` without the rows whose index label is a partial EC number.

    A label counts as partial when it contains fewer dots than a full
    four-level number such as 'EC-1.1.1.1'.

    Rows are selected with a single boolean mask. Dropping labels one at a
    time raised KeyError as soon as a partial label occurred twice, because
    the first drop already removed every row carrying it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index labels are EC-number strings.

    Returns
    -------
    pandas.DataFrame
        The rows with a full-depth label, in their original order. The input
        frame is not modified.
    """
    full_depth = 'EC-1.1.1.1'.count('.')
    keep = np.asarray([label.count('.') >= full_depth for label in df.index], dtype=bool)
    return df.loc[keep]
# first version df_master_1st

def master_direction(df):
    """Build the direction-annotated master table from ``df``.

    The 'UNIQUE-ID' and 'ERXN-NUMBER' columns are removed, a 'DIRECTION'
    column holding 1 is added, every 'GIBBS' cell is normalised and the
    'SUBSTRATES' / 'PRODUCTS' columns are converted with ``recover_list``.
    ``df`` itself is left untouched.
    """
    def _normalise_gibbs(value):
        # A bracketed string is split on commas; the pieces stay strings.
        if type(value) is str and value[0] == '[':
            return value[1:-1].split(",")
        # Lists and the 'No-Data' marker pass through unchanged.
        if type(value) is list or value == 'No-Data':
            return value
        return float(value)

    df_master = df.drop(['UNIQUE-ID', 'ERXN-NUMBER'], axis = 1)
    df_master['DIRECTION'] = 1
    df_master['GIBBS'] = [_normalise_gibbs(value) for value in df_master['GIBBS']]

    for list_column in ('SUBSTRATES', 'PRODUCTS'):
        recover_list(df_master, list_column)

    return df_master

# Flatten grouped rows, then expand to one row per substrate / product

def flatten_df(df):
    """Expand grouped rows so that every output row describes one reaction.

    A row is treated as grouped when its 'SUBSTRATES' cell is a non-empty
    list whose first element is itself a list. Such a row is expanded into
    one output row per element of 'SUBSTRATES', 'PRODUCTS' and 'GIBBS'; any
    other row is copied through unchanged.

    An empty 'SUBSTRATES' list used to raise IndexError when its first
    element was inspected; it is now copied through like any ungrouped row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'EC-NUMBER', 'SUBSTRATES', 'PRODUCTS', 'GIBBS'.

    Returns
    -------
    pandas.DataFrame
        Frame with the same four columns and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        Raised by the DataFrame constructor when a grouped row holds
        different numbers of substrates, products and Gibbs values.
    """
    ec_numbers = []
    substrates = []
    products = []
    gibbs = []

    for _, row in df.iterrows():
        row_substrates = row['SUBSTRATES']
        is_grouped = (isinstance(row_substrates, list)
                      and len(row_substrates) > 0
                      and isinstance(row_substrates[0], list))
        if is_grouped:
            # The EC number is repeated once per grouped reaction.
            ec_numbers.extend([row['EC-NUMBER']] * len(row_substrates))
            substrates.extend(row_substrates)
            products.extend(row['PRODUCTS'])
            gibbs.extend(row['GIBBS'])
        else:
            ec_numbers.append(row['EC-NUMBER'])
            substrates.append(row_substrates)
            products.append(row['PRODUCTS'])
            gibbs.append(row['GIBBS'])

    df_master_flattened = pd.DataFrame({'EC-NUMBER': ec_numbers,
                                        'SUBSTRATES': substrates,
                                        'PRODUCTS': products,
                                        'GIBBS': gibbs})

    return df_master_flattened

def _one_row_per_item(df, column):
    """Return an ('EC-NUMBER', column) frame with one row per list item of ``column``."""
    ec_numbers = []
    items = []
    for _, row in df.iterrows():
        cell = row[column]
        if isinstance(cell, list):
            # An empty list contributes no rows at all.
            ec_numbers.extend([row['EC-NUMBER']] * len(cell))
            items.extend(cell)
        else:
            ec_numbers.append(row['EC-NUMBER'])
            items.append(cell)
    return pd.DataFrame({'EC-NUMBER': ec_numbers, column: items})


def single_liner(df):
    """Split ``df`` into per-substrate and per-product tables.

    Both tables are built from ``df``. The product table used to be built
    from a notebook-level variable, ``df_master_flattened``, instead of the
    argument, so the function failed with NameError (or silently used stale
    data) unless that variable happened to exist in the kernel.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'EC-NUMBER', 'SUBSTRATES' and 'PRODUCTS'.
        List cells are expanded item by item; any other cell is kept as is.

    Returns
    -------
    list
        ``[df_subs, df_pdts]``: two frames with the columns
        ('EC-NUMBER', 'SUBSTRATES') and ('EC-NUMBER', 'PRODUCTS').
    """
    df_subs = _one_row_per_item(df, 'SUBSTRATES')
    df_pdts = _one_row_per_item(df, 'PRODUCTS')
    return [df_subs, df_pdts]

# Drop rows whose EC-NUMBER is malformed: fewer than four levels, or wrapped in '|' characters

def drop_single(df):
    """Return ``df`` without the rows that carry a malformed 'EC-NUMBER'.

    An EC number is considered malformed when it has fewer dots than a full
    four-level number such as 'EC-1.1.1.1', or when it both starts and ends
    with '|'.

    The previous loop restarted from ``df`` on every iteration
    (``df.drop(item)``), so only the last flagged row was actually removed.
    All flagged rows are now removed with a single boolean mask.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column 'EC-NUMBER'.

    Returns
    -------
    pandas.DataFrame
        The remaining rows with their original index labels. The input frame
        is not modified.
    """
    full_depth = 'EC-1.1.1.1'.count('.')

    def _is_malformed(ec_number):
        if ec_number.count('.') < full_depth:
            return True
        return ec_number[0] == '|' and ec_number[-1] == '|'

    keep = np.asarray([not _is_malformed(ec_number) for ec_number in df['EC-NUMBER']], dtype=bool)
    return df.loc[keep]

def subs_EC(df, ec:str):
    """Select the rows of ``df`` whose 'EC-NUMBER' starts with ``ec``.

    The prefix used to be compared against the first four characters only,
    so any ``ec`` that was not exactly four characters long (for example
    'EC-1.2') could never match. Prefixes of any length are now supported;
    four-character prefixes such as 'EC-1' select the same rows as before.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'EC-NUMBER' and 'SUBSTRATES'.
    ec : str
        EC-number prefix to select, e.g. 'EC-1' or 'EC-1.2'.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns 'EC-NUMBER' and 'SUBSTRATES' and a fresh
        0..n-1 index, holding the matching rows in their original order.
    """
    matches = [(ec_number, substrate)
               for ec_number, substrate in zip(df['EC-NUMBER'], df['SUBSTRATES'])
               # Non-string cells (e.g. NaN) cannot match a prefix; skip them.
               if isinstance(ec_number, str) and ec_number.startswith(ec)]

    df_subs_ec = pd.DataFrame({'EC-NUMBER': [ec_number for ec_number, _ in matches],
                               'SUBSTRATES': [substrate for _, substrate in matches]})
    return df_subs_ec


FileNotFoundError: [Errno 2] File b'../datasets/df_cpd.csv' does not exist: b'../datasets/df_cpd.csv'