In [6]:
import pandas as pd
import numpy as np
from os.path import join
import os
import re
from goatools import obo_parser
from pubchempy import *

import warnings
warnings.filterwarnings('ignore')

In [7]:
import warnings
from rdkit import RDLogger

# Suppress specific warnings
warnings.filterwarnings("ignore", category=UserWarning,
                        message=".*Chem.MolFromInchi.*")
warnings.filterwarnings("ignore", category=UserWarning,
                        message=".*SettingWithCopyWarning.*")
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)


# Suppress RDKit warnings
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

## 1. Reading information about all GO Terms:

We downloaded the "go.obo" file with information about all GO Terms from: http://geneontology.org/docs/download-ontology/

### (a) Storing the definition, name, and ID of all GO terms in a pandas DataFrame:

In [8]:
# filename_go_obo = join("..", "..", "data", "GOA", "go_terms", 'go.obo')
# obo_reader = obo_parser.OBOReader(filename_go_obo)
# df = pd.DataFrame(columns = ["GO ID", "Definition", "Name"])

# file1 = open(filename_go_obo, 'r')
# Lines = file1.read()
# start = 0
# while start != -1:
#     start = Lines.index('[Term]\n')
#     Lines = Lines[start+1:]
#     GO_Term = Lines[: Lines.index('[Term]\n')]
#     definition =  GO_Term[GO_Term.index("\ndef")+6:]
#     definition = definition[:definition.index("\n")]
#     name = GO_Term[GO_Term.index("\nname")+7:]
#     name = name[:name.index("\n")]
#     namespace = GO_Term[GO_Term.index("\nnamespace")+12:]
#     namespace = namespace[:namespace.index("\n")]
    
#     ID = GO_Term[GO_Term.index("\nid")+5:]
#     ID = ID[:ID.index("\n")]
    
#     df = df.append({"GO ID" : ID , "Definition" : definition, "Name" : name, "Namespace": namespace}, ignore_index = True)

In [31]:
# pd.concat used instead of append, append deprecated
# code improved when handling cases when there are no more [Term] sections left in the file

filename_go_obo = join("..", "..", "data", "GOA", "go_terms", 'go.obo')
obo_reader = obo_parser.OBOReader(filename_go_obo)
df = pd.DataFrame(columns=["GO ID", "Definition", "Name", "Namespace"])

file1 = open(filename_go_obo, 'r')
Lines = file1.read()
start = 0

while True:
    try:
        start = Lines.index('[Term]\n')
        Lines = Lines[start+1:]
        GO_Term = Lines[:Lines.index('[Term]\n')] if '[Term]\n' in Lines else Lines

        definition = GO_Term[GO_Term.index("\ndef")+6:GO_Term.index("\n", GO_Term.index("\ndef")+6)]
        name = GO_Term[GO_Term.index("\nname")+7:GO_Term.index("\n", GO_Term.index("\nname")+7)]
        namespace = GO_Term[GO_Term.index("\nnamespace")+12:GO_Term.index("\n", GO_Term.index("\nnamespace")+12)]
        ID = GO_Term[GO_Term.index("\nid")+5:GO_Term.index("\n", GO_Term.index("\nid")+5)]

        new_row = {"GO ID": ID, "Definition": definition, "Name": name, "Namespace": namespace}
        df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
    except ValueError:
        break

file1.close()
print(df)


            GO ID                                         Definition  \
0      GO:0000001  "The distribution of mitochondria, including t...   
1      GO:0000002  "The maintenance of the structure and integrit...   
2      GO:0000003  "OBSOLETE. The production of new individuals t...   
3      GO:0000005  "OBSOLETE. Assists in the correct assembly of ...   
4      GO:0000006  "Enables the transfer of zinc ions (Zn2+) from...   
...           ...                                                ...   
47820  GO:2001313  "The chemical reactions and pathways involving...   
47821  GO:2001314  "The chemical reactions and pathways resulting...   
47822  GO:2001315  "The chemical reactions and pathways resulting...   
47823  GO:2001316  "The chemical reactions and pathways involving...   
47824  GO:2001317  "The chemical reactions and pathways resulting...   

                                                    Name           Namespace  
0                              mitochondrion inheritance

In [35]:
df.to_pickle(join("..", "..", "data", "GOA", "go_terms", "all_GO_terms.pkl"))
df

Unnamed: 0,GO ID,Definition,Name,Namespace
0,GO:0000001,"""The distribution of mitochondria, including t...",mitochondrion inheritance,biological_process
1,GO:0000002,"""The maintenance of the structure and integrit...",mitochondrial genome maintenance,biological_process
2,GO:0000003,"""OBSOLETE. The production of new individuals t...",obsolete reproduction,biological_process
3,GO:0000005,"""OBSOLETE. Assists in the correct assembly of ...",obsolete ribosomal chaperone activity,molecular_function
4,GO:0000006,"""Enables the transfer of zinc ions (Zn2+) from...",high-affinity zinc transmembrane transporter a...,molecular_function
...,...,...,...,...
47820,GO:2001313,"""The chemical reactions and pathways involving...",UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,biological_process
47821,GO:2001314,"""The chemical reactions and pathways resulting...",UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,biological_process
47822,GO:2001315,"""The chemical reactions and pathways resulting...",UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,biological_process
47823,GO:2001316,"""The chemical reactions and pathways involving...",kojic acid metabolic process,biological_process


### (b) Removing all GO terms without information about transporter:

In [None]:
# droplist = []
# for ind in df.index:
#     name, definition = df["Name"][ind].lower(), df["Definition"][ind].lower()
#     if (not "transport" in name and not "abc-type" in name and not "permease" in name 
#          and not "transport" in definition and not "abc-type" in definition and not "permease" in definition):
#         droplist.append(ind)
# df.drop(droplist, inplace = True)
# df.reset_index(inplace = True, drop = True)
# df

#### (b.1) The following code expands the search for transporter proteins!

In [39]:
filename = '/data_link/servilla/SPOT2/data/GOA/go_terms/all_GO_terms.pkl'

df = pd.read_pickle(filename)
print(df)

droplist = []
keywords = ["transport", "abc-type", "permease", "transporter", "carrier", "channel", "pump",
            "symporter", "antiporter", "translocase", "ion transport", "solute transport", 
            "sodium transport", "potassium transport", "calcium transport", "hydrogen transport", 
            "proton transport", "anion transport", "cation transport", "glucose transport", 
            "amino acid transport", "lipid transport", "nucleotide transport", "phosphate transport", 
            "sulfate transport", "facilitated diffusion", "active transport", "passive transport", 
            "secondary active transport", "primary active transport"]

for ind in df.index:
    name, definition = df["Name"][ind].lower(), df["Definition"][ind].lower()
    if not any(keyword in name or keyword in definition for keyword in keywords):
        droplist.append(ind)

df.drop(droplist, inplace=True)
df.reset_index(inplace=True, drop=True)
df


            GO ID                                         Definition  \
0      GO:0000001  "The distribution of mitochondria, including t...   
1      GO:0000002  "The maintenance of the structure and integrit...   
2      GO:0000003  "OBSOLETE. The production of new individuals t...   
3      GO:0000005  "OBSOLETE. Assists in the correct assembly of ...   
4      GO:0000006  "Enables the transfer of zinc ions (Zn2+) from...   
...           ...                                                ...   
47820  GO:2001313  "The chemical reactions and pathways involving...   
47821  GO:2001314  "The chemical reactions and pathways resulting...   
47822  GO:2001315  "The chemical reactions and pathways resulting...   
47823  GO:2001316  "The chemical reactions and pathways involving...   
47824  GO:2001317  "The chemical reactions and pathways resulting...   

                                                    Name           Namespace  
0                              mitochondrion inheritance

Unnamed: 0,GO ID,Definition,Name,Namespace
0,GO:0000006,"""Enables the transfer of zinc ions (Zn2+) from...",high-affinity zinc transmembrane transporter a...,molecular_function
1,GO:0000007,"""Enables the transfer of a solute or solutes f...",low-affinity zinc ion transmembrane transporte...,molecular_function
2,GO:0000017,"""The directed movement of alpha-glucosides int...",alpha-glucoside transport,biological_process
3,GO:0000036,"""Binding an acyl group and presenting it for p...",acyl carrier activity,molecular_function
4,GO:0000039,"""OBSOLETE. (Was not defined before being made ...",obsolete plasma membrane long-chain fatty acid...,molecular_function
...,...,...,...,...
3835,GO:2001225,"""Any process that modulates the frequency, rat...",regulation of chloride transport,biological_process
3836,GO:2001226,"""Any process that stops, prevents or reduces t...",negative regulation of chloride transport,biological_process
3837,GO:2001257,"""Any process that modulates the frequency, rat...",regulation of cation channel activity,biological_process
3838,GO:2001258,"""Any process that stops, prevents or reduces t...",negative regulation of cation channel activity,biological_process


#### (b.2) The following code expands the search for transporter proteins and removes regulation!

In [40]:
filename = '/data_link/servilla/SPOT2/data/GOA/go_terms/all_GO_terms.pkl'

df = pd.read_pickle(filename)
print(df)

droplist = []
transport_keywords = ["transport", "abc-type", "permease", "transporter", "carrier", "channel", "pump",
                      "symporter", "antiporter", "translocase", "ion transport", "solute transport",
                      "sodium transport", "potassium transport", "calcium transport", "hydrogen transport",
                      "proton transport", "anion transport", "cation transport", "glucose transport",
                      "amino acid transport", "lipid transport", "nucleotide transport", "phosphate transport",
                      "sulfate transport", "facilitated diffusion", "active transport", "passive transport",
                      "secondary active transport", "primary active transport"]

for ind in df.index:
    name = df["Name"][ind].lower()
    if "regulation" in name or not any(keyword in name for keyword in transport_keywords):
        droplist.append(ind)

df.drop(droplist, inplace=True)
df.reset_index(inplace=True, drop=True)
df


            GO ID                                         Definition  \
0      GO:0000001  "The distribution of mitochondria, including t...   
1      GO:0000002  "The maintenance of the structure and integrit...   
2      GO:0000003  "OBSOLETE. The production of new individuals t...   
3      GO:0000005  "OBSOLETE. Assists in the correct assembly of ...   
4      GO:0000006  "Enables the transfer of zinc ions (Zn2+) from...   
...           ...                                                ...   
47820  GO:2001313  "The chemical reactions and pathways involving...   
47821  GO:2001314  "The chemical reactions and pathways resulting...   
47822  GO:2001315  "The chemical reactions and pathways resulting...   
47823  GO:2001316  "The chemical reactions and pathways involving...   
47824  GO:2001317  "The chemical reactions and pathways resulting...   

                                                    Name           Namespace  
0                              mitochondrion inheritance

Unnamed: 0,GO ID,Definition,Name,Namespace
0,GO:0000006,"""Enables the transfer of zinc ions (Zn2+) from...",high-affinity zinc transmembrane transporter a...,molecular_function
1,GO:0000007,"""Enables the transfer of a solute or solutes f...",low-affinity zinc ion transmembrane transporte...,molecular_function
2,GO:0000017,"""The directed movement of alpha-glucosides int...",alpha-glucoside transport,biological_process
3,GO:0000036,"""Binding an acyl group and presenting it for p...",acyl carrier activity,molecular_function
4,GO:0000039,"""OBSOLETE. (Was not defined before being made ...",obsolete plasma membrane long-chain fatty acid...,molecular_function
...,...,...,...,...
2341,GO:2001103,"""The directed movement of a maltohexaoseacetat...",maltohexaose transport,biological_process
2342,GO:2001104,"""The directed movement of a heptasaccharideace...",heptasaccharide transport,biological_process
2343,GO:2001105,"""The directed movement of a maltoheptaoseaceta...",maltoheptaose transport,biological_process
2344,GO:2001142,"""The directed movement of a nicotinateacetate ...",nicotinate transport,biological_process


### (c) Removing all regulation proteins:

In [None]:
# droplist = []
# for ind in df.index:
#     if  "regulation" in df["Name"][ind].lower():
#         droplist.append(ind)
        
# df.drop(droplist, inplace = True)
# len(droplist)
# df

## 2. Mapping substrate names to metabolite IDs:

### (a) Extracting substrate names:

Summary:
Lists (starter and endings): Contain substrings to remove from protein names.
Function get_substrate: Cleans a given protein name by removing specified prefixes and suffixes and additional processing if specific phrases are found.
Function ends_with: Checks if a given string ends with a specific substring.
These functions and lists are likely used to standardize or clean protein names by removing common prefixes and suffixes and handling special cases, aiding in consistent analysis or comparison of protein names.

In [26]:
# starter = ["obsolete ", 
#            "high-affinity secondary active ", "low-affinity ", "secondary active ",
#            "high-affinity ", "low-affinity ", "abc-type ", "proton-dependent ", "atpase-coupled ",
#           "mitochondrion to", "mitochondrial "]
# endings = [" secondary active transmembrane transporter activity",
#            " transmembrane transporter activity",
#            " transporter activity"
#            " transmembrane transporter",
#            " transmembrane transport",
#            " activity",
#           " transporter",
#           " transport",
#           " transfer",
#           " autotransporter",
#           " channel activity"]


# def get_substrate(name):
#     for end in endings:
#         name = name.replace(end, "")
#     for start in starter:
#         name = name.replace(start, "")
#     if "import into cell" in name:
#         name = name[:name.find(" involved")]
#     return(name)

# def ends_with(full_string, sub_string):
#     if not sub_string in full_string:
#         return(False)
#     if full_string[full_string.find(sub_string): ] == sub_string:
#         return(True)
#     else:
#         return(False)

In [16]:
# df["substrate"] = ""


# transmembrane_transporter_activity = []
# for ind in df.index:
#     name = df["Name"][ind].lower()
#     if ends_with(full_string = name, sub_string = "transporter activity"):
#         transmembrane_transporter_activity.append(name)
#         substrate = get_substrate(name = name)
#     elif ends_with(full_string = name, sub_string = " transfer activity"):
#         transmembrane_transporter_activity.append(name)
#         substrate = get_substrate(name = name)
#     elif ends_with(full_string = name, sub_string = "transport"):
#         transmembrane_transporter_activity.append(name)
#         substrate = get_substrate(name = name)
#     elif ends_with(full_string = name, sub_string = "transporter"):
#         transmembrane_transporter_activity.append(name)
#         substrate = get_substrate(name = name)
#     elif ends_with(full_string = name, sub_string = "channel activity"):
#         transmembrane_transporter_activity.append(name)
#         substrate = get_substrate(name = name)
#     else: 
#         substrate = ""
#     try:
#         if substrate[0] == " ":
#             substrate = substrate[1:]
#     except IndexError: pass
#     df["substrate"][ind] = substrate

# df = df.loc[df["substrate"] != ""]

# df.to_pickle(join("..", "..", "data", "GOA", "go_terms", "df_GO_with_substrates.pkl"))

##### (a.1) By adding these additional terms, you expand the scope of transport-related activities that your code can identify and process. This ensures a more comprehensive capture of relevant transmembrane transporter activities and related substrates.

In [41]:
starter = ["obsolete ", 
           "high-affinity secondary active ", "low-affinity ", "secondary active ",
           "high-affinity ", "low-affinity ", "abc-type ", "proton-dependent ", "atpase-coupled ",
           "mitochondrion to", "mitochondrial "]

endings = [" secondary active transmembrane transporter activity",
           " transmembrane transporter activity",
           " transporter activity",
           " transmembrane transporter",
           " transmembrane transport",
           " activity",
           " transporter",
           " transport",
           " transfer",
           " autotransporter",
           " channel activity",
           " carrier activity",
           " symporter activity",
           " antiporter activity",
           " uniporter activity",
           " pore activity",
           " ion channel activity",
           " substrate-specific transmembrane transporter activity",
           " solute:proton symporter activity",
           " solute:proton antiporter activity",
           " voltage-gated channel activity",
           " anion channel activity",
           " cation channel activity",
           " sodium ion transport",
           " potassium ion transport",
           " calcium ion transport",
           " chloride ion transport",
           " glucose transport",
           " amino acid transport",
           " lipid transport"]


In [42]:
df["substrate"] = ""

transmembrane_transporter_activity = []
for ind in df.index:
    name = df["Name"][ind].lower()
    if any(ends_with(full_string=name, sub_string=ending) for ending in endings):
        transmembrane_transporter_activity.append(name)
        substrate = get_substrate(name=name)
    else:
        substrate = ""
    
    try:
        if substrate[0] == " ":
            substrate = substrate[1:]
    except IndexError:
        pass
    df.at[ind, "substrate"] = substrate

df = df.loc[df["substrate"] != ""]

df.to_pickle(join("..", "..", "data", "GOA", "go_terms", "df_GO_with_substrates.pkl"))
df


Unnamed: 0,GO ID,Definition,Name,Namespace,substrate
0,GO:0000006,"""Enables the transfer of zinc ions (Zn2+) from...",high-affinity zinc transmembrane transporter a...,molecular_function,zinc
1,GO:0000007,"""Enables the transfer of a solute or solutes f...",low-affinity zinc ion transmembrane transporte...,molecular_function,zinc ion
2,GO:0000017,"""The directed movement of alpha-glucosides int...",alpha-glucoside transport,biological_process,alpha-glucoside
3,GO:0000036,"""Binding an acyl group and presenting it for p...",acyl carrier activity,molecular_function,acyl carrier
4,GO:0000039,"""OBSOLETE. (Was not defined before being made ...",obsolete plasma membrane long-chain fatty acid...,molecular_function,plasma membrane long-chain fatty acid
...,...,...,...,...,...
2341,GO:2001103,"""The directed movement of a maltohexaoseacetat...",maltohexaose transport,biological_process,maltohexaose
2342,GO:2001104,"""The directed movement of a heptasaccharideace...",heptasaccharide transport,biological_process,heptasaccharide
2343,GO:2001105,"""The directed movement of a maltoheptaoseaceta...",maltoheptaose transport,biological_process,maltoheptaose
2344,GO:2001142,"""The directed movement of a nicotinateacetate ...",nicotinate transport,biological_process,nicotinate


### (b)  Mapping substrates to IDs:

 ##### The code extracts unique substrate names from the "substrate" column of the original DataFrame df and creates a new DataFrame df_unmapped with a single column named "metabolites" containing these unique substrate names.

In summary, this following line creates a new DataFrame containing a single column 'metabolites', which lists all unique values from the 'substrate' column of the df DataFrame.

In [46]:
df_unmapped = pd.DataFrame({"metabolites" : list(set(list(df["substrate"])))})

In [47]:
df_unmapped

Unnamed: 0,metabolites
0,"zinc, cadmium uptake permease"
1,large uncharged polar molecule
2,d-xylose:proton symporter
3,[acyl-carrier-protein] s-malonyltransferase
4,urea
...,...
1316,fatty acid
1317,sodium:dicarboxylate symporter
1318,l-alpha-amino acid
1319,microcin


#### (b)(i) Mapping to  KEGG Compound IDs

In [None]:
# drugs_df = pd.read_pickle(join("..", "..", "data", "substrates", "KEGG_drugs_df.pkl"))
# compounds_df = pd.read_pickle(join("..", "..", "data", "substrates",  "KEGG_substrate_df.pkl"))
# KEGG_substrate_df = compounds_df.append(drugs_df).reset_index(drop = True)

# ##If we have multiple IDs for the same substrate name, we keep the first ID:
# droplist = []
# for ind in KEGG_substrate_df.index:
#     if not ind in droplist:
#         substrate = KEGG_substrate_df["substrate"][ind]
#         help_df = KEGG_substrate_df.loc[KEGG_substrate_df["substrate"] == substrate]
#         if len(help_df) > 1 :
#             droplist = droplist + list(help_df.index)[1:]

# KEGG_substrate_df.drop(droplist, inplace = True)

# KEGG_substrate_df["substrate"] = [name.lower() for name in KEGG_substrate_df["substrate"]]
# df_unmapped["substrate"] = [name.lower() for name in df_unmapped["metabolites"]]

# df_unmapped = df_unmapped.merge(KEGG_substrate_df, on = "substrate", how = "left")
# print("For %s out of %s substrates, we could not map the substrate name to a KEGG ID." %
#       (sum(pd.isnull(df_unmapped["KEGG ID"])), len(df_unmapped)))

In [48]:
# Append error again.
import pandas as pd
from os.path import join

# Load the DataFrames from pickle files
drugs_df = pd.read_pickle(
    join("..", "..", "data", "substrates", "KEGG_drugs_df.pkl"))
compounds_df = pd.read_pickle(
    join("..", "..", "data", "substrates", "KEGG_substrate_df.pkl"))

# Concatenate the DataFrames
KEGG_substrate_df = pd.concat([compounds_df, drugs_df]).reset_index(drop=True)

# If we have multiple IDs for the same substrate name, we keep the first ID
droplist = []
for ind in KEGG_substrate_df.index:
    if ind not in droplist:
        substrate = KEGG_substrate_df["substrate"][ind]
        help_df = KEGG_substrate_df.loc[KEGG_substrate_df["substrate"] == substrate]
        if len(help_df) > 1:
            droplist = droplist + list(help_df.index)[1:]

KEGG_substrate_df.drop(droplist, inplace=True)

# Convert substrate names to lowercase
KEGG_substrate_df["substrate"] = KEGG_substrate_df["substrate"].str.lower()
df_unmapped["substrate"] = df_unmapped["metabolites"].str.lower()

print(KEGG_substrate_df)
print(df_unmapped)

# Merge with df_unmapped
df_unmapped = df_unmapped.merge(KEGG_substrate_df, on="substrate", how="left")

# Print the result
print("For %s out of %s substrates, we could not map the substrate name to a KEGG ID." %
      (sum(pd.isnull(df_unmapped["KEGG ID"])), len(df_unmapped)))

df_unmapped

      KEGG ID                        substrate
0      C00001                              h2o
1      C00001                            water
2      C00002                              atp
3      C00002        adenosine 5'-triphosphate
4      C00003                             nad+
...       ...                              ...
54866  D12903                apraglutide (inn)
54867  D12904         apraglutide sodium (jan)
54868  D12905          gadoquatrane (usan/inn)
54869  D12906       gadoquatrane hydrate (jan)
54870  D12907  actinium ac 225 dotatate (usan)

[54286 rows x 2 columns]
                                      metabolites  \
0                   zinc, cadmium uptake permease   
1                  large uncharged polar molecule   
2                       d-xylose:proton symporter   
3     [acyl-carrier-protein] s-malonyltransferase   
4                                            urea   
...                                           ...   
1316                                   

Unnamed: 0,metabolites,substrate,KEGG ID
0,"zinc, cadmium uptake permease","zinc, cadmium uptake permease",
1,large uncharged polar molecule,large uncharged polar molecule,
2,d-xylose:proton symporter,d-xylose:proton symporter,
3,[acyl-carrier-protein] s-malonyltransferase,[acyl-carrier-protein] s-malonyltransferase,
4,urea,urea,C00086
...,...,...,...
1316,fatty acid,fatty acid,C00162
1317,sodium:dicarboxylate symporter,sodium:dicarboxylate symporter,
1318,l-alpha-amino acid,l-alpha-amino acid,
1319,microcin,microcin,


#### (b)(ii) Mapping to PubChem IDs

In [49]:
def get_ID_from_name(name):
    cs = get_compounds(name, 'name')
    inchi, cid = np.nan, np.nan
    
    for c in cs:
        
        try: inchi = c.inchi
        except AttributeError: pass
        
        try: cid = c.cid
        except AttributeError: pass
        
        if not pd.isnull(inchi) and not pd.isnull(cid):
            return(inchi, cid)
    return(inchi, cid)

In [50]:
# Add columns for PubChem CID and InChI to df_unmapped
df_unmapped["PubChem CID"] = np.nan
df_unmapped["InChI"] = np.nan

In [51]:
df_unmapped

Unnamed: 0,metabolites,substrate,KEGG ID,PubChem CID,InChI
0,"zinc, cadmium uptake permease","zinc, cadmium uptake permease",,,
1,large uncharged polar molecule,large uncharged polar molecule,,,
2,d-xylose:proton symporter,d-xylose:proton symporter,,,
3,[acyl-carrier-protein] s-malonyltransferase,[acyl-carrier-protein] s-malonyltransferase,,,
4,urea,urea,C00086,,
...,...,...,...,...,...
1316,fatty acid,fatty acid,C00162,,
1317,sodium:dicarboxylate symporter,sodium:dicarboxylate symporter,,,
1318,l-alpha-amino acid,l-alpha-amino acid,,,
1319,microcin,microcin,,,


In [52]:
for ind in df_unmapped.index:
    if ind > -1:
        if pd.isnull(df_unmapped["KEGG ID"][ind]):
            df_unmapped["InChI"][ind], df_unmapped["PubChem CID"][ind] = get_ID_from_name(name = df_unmapped["substrate"][ind])

In [None]:
df_unmapped = df_unmapped.drop(columns = ["metabolites"])
df_unmapped

#### (b)(iii) Mapping all substrate IDs to CHEBI IDs

First, we create a txt files with all KEGG CIDs and alls PubChem CIDs:

In [55]:
#create txt file with all CIDs in matches:
all_KEGG_IDs = list(set(df_unmapped["KEGG ID"].loc[~pd.isnull(df_unmapped["KEGG ID"])]))

f = open(join("..", "..", "data", "GOA", "go_terms", "all_KEGG_CIDs.txt"),"w") 
for cid in all_KEGG_IDs:
    f.write(str(cid) + "\n")
f.close()

#create txt file with all CIDs in matches:
all_PubChem_IDs = list(set(df_unmapped["PubChem CID"].loc[~pd.isnull(df_unmapped["PubChem CID"])]))

f = open(join("..", "..", "data", "GOA", "go_terms", "all_PubChem_CIDs.txt"),"w") 
for cid in all_PubChem_IDs:
    f.write(str(int(cid)) + "\n")
f.close()

The txt-files can be used as the input for the webservice http://csbg.cnb.csic.es/mbrole2/conversion.php to map the the IDs to CHEBI IDs. 

In [58]:
KEGG_to_CHEBI = pd.read_csv(join("..", "..", "data", "GOA", "go_terms",  "mbrole2_conversion_KEGG.tsv"), sep= "\t")
KEGG_to_CHEBI.rename(columns = {"Input" : "KEGG ID", "Output" : "ChEBI"}, inplace = True)
KEGG_to_CHEBI.drop(columns = ["Input_source", "Output_source"], inplace = True)
KEGG_to_CHEBI

df_unmapped["ChEBI"] = np.nan
for ind in df_unmapped.index:
    CID = df_unmapped["KEGG ID"][ind]
    try:
        df_unmapped["ChEBI"][ind] = list(KEGG_to_CHEBI["ChEBI"].loc[KEGG_to_CHEBI["KEGG ID"] == CID])[0]
    except IndexError:
        pass
df_unmapped

Unnamed: 0,substrate,KEGG ID,PubChem CID,InChI,ChEBI
0,"zinc, cadmium uptake permease",,,,
1,large uncharged polar molecule,,,,
2,d-xylose:proton symporter,,,,
3,[acyl-carrier-protein] s-malonyltransferase,,,,
4,urea,C00086,,,CHEBI:16199
...,...,...,...,...,...
1316,fatty acid,C00162,,,CHEBI:35366
1317,sodium:dicarboxylate symporter,,,,
1318,l-alpha-amino acid,,,,
1319,microcin,,,,


In [59]:
Pubchem_to_CHEBI = pd.read_csv(join("..", "..", "data", "GOA", "go_terms",  "mbrole2_conversion_Pubchem.tsv"), sep= "\t")
Pubchem_to_CHEBI.rename(columns = {"Input" : "PubChem CID", "Output" : "ChEBI"}, inplace = True)
Pubchem_to_CHEBI.drop(columns = ["Input_source", "Output_source"], inplace = True)
Pubchem_to_CHEBI

for ind in df_unmapped.index:
    if pd.isnull(df_unmapped["ChEBI"][ind]):
        try:
            CID = int(df_unmapped["PubChem CID"][ind])
            df_unmapped["ChEBI"][ind] = list(Pubchem_to_CHEBI["ChEBI"].loc[Pubchem_to_CHEBI["PubChem CID"] == CID])[0]
        except:
            pass
df_unmapped

Unnamed: 0,substrate,KEGG ID,PubChem CID,InChI,ChEBI
0,"zinc, cadmium uptake permease",,,,
1,large uncharged polar molecule,,,,
2,d-xylose:proton symporter,,,,
3,[acyl-carrier-protein] s-malonyltransferase,,,,
4,urea,C00086,,,CHEBI:16199
...,...,...,...,...,...
1316,fatty acid,C00162,,,CHEBI:35366
1317,sodium:dicarboxylate symporter,,,,
1318,l-alpha-amino acid,,,,
1319,microcin,,,,


In [60]:
df = df.merge(df_unmapped, how = "left", on = "substrate")
print(df)

df_GO_metabolite =df.loc[~pd.isnull(df["ChEBI"])]
df_GO_metabolite

           GO ID                                         Definition  \
0     GO:0000006  "Enables the transfer of zinc ions (Zn2+) from...   
1     GO:0000007  "Enables the transfer of a solute or solutes f...   
2     GO:0000017  "The directed movement of alpha-glucosides int...   
3     GO:0000036  "Binding an acyl group and presenting it for p...   
4     GO:0000039  "OBSOLETE. (Was not defined before being made ...   
...          ...                                                ...   
1899  GO:2001103  "The directed movement of a maltohexaoseacetat...   
1900  GO:2001104  "The directed movement of a heptasaccharideace...   
1901  GO:2001105  "The directed movement of a maltoheptaoseaceta...   
1902  GO:2001142  "The directed movement of a nicotinateacetate ...   
1903  GO:2001143  "The directed movement of a N-methylnicotinate...   

                                                   Name           Namespace  \
0     high-affinity zinc transmembrane transporter a...  molecular_f

Unnamed: 0,GO ID,Definition,Name,Namespace,substrate,KEGG ID,PubChem CID,InChI,ChEBI
0,GO:0000006,"""Enables the transfer of zinc ions (Zn2+) from...",high-affinity zinc transmembrane transporter a...,molecular_function,zinc,,23994.0,InChI=1S/Zn,CHEBI:30185
1,GO:0000007,"""Enables the transfer of a solute or solutes f...",low-affinity zinc ion transmembrane transporte...,molecular_function,zinc ion,C00038,,,CHEBI:29105
6,GO:0000064,"""Enables the transfer of L-ornithine from one ...",L-ornithine transmembrane transporter activity,molecular_function,l-ornithine,C00077,,,CHEBI:15729
7,GO:0000095,"""Enables the transfer of S-adenosylmethionine ...",S-adenosyl-L-methionine transmembrane transpor...,molecular_function,s-adenosyl-l-methionine,C00019,,,CHEBI:67040
11,GO:0000102,"""Enables the transfer of L-methionine from one...",L-methionine secondary active transmembrane tr...,molecular_function,l-methionine,C00073,,,CHEBI:16643
...,...,...,...,...,...,...,...,...,...
1895,GO:2001099,"""The directed movement of a maltotetraoseaceta...",maltotetraose transport,biological_process,maltotetraose,C02052,,,CHEBI:28460
1899,GO:2001103,"""The directed movement of a maltohexaoseacetat...",maltohexaose transport,biological_process,maltohexaose,C01936,,,CHEBI:27445
1901,GO:2001105,"""The directed movement of a maltoheptaoseaceta...",maltoheptaose transport,biological_process,maltoheptaose,,13908996.0,InChI=1S/C42H72O36/c43-1-8-15(50)16(51)24(59)3...,CHEBI:143183
1902,GO:2001142,"""The directed movement of a nicotinateacetate ...",nicotinate transport,biological_process,nicotinate,C00253,,,CHEBI:32544


In [61]:
df_GO_metabolite.to_pickle(join("..", "..", "data", "GOA", "go_terms", "GO_terms_with_sub_IDs.pkl")) # GO terms with substrate IDs