In [1]:
import re
import cPickle as pickle
from bioservices.kegg import KEGG
import pandas as pd

In [3]:
# Create a KEGG client and fetch entry C00037 as a quick check; the cell
# output below shows the flat-file text that comes back.
k = KEGG()
k.get('C00037')

u'ENTRY       C00037                      Compound\nNAME        Glycine;\n            Aminoacetic acid;\n            Gly\nFORMULA     C2H5NO2\nEXACT_MASS  75.032\nMOL_WEIGHT  75.0666\nREMARK      Same as: D00011\nREACTION    R00364 R00365 R00366 R00367 R00368 R00369 R00371 R00372 \n            R00373 R00374 R00395 R00478 R00497 R00565 R00588 R00610 \n            R00611 R00652 R00751 R00775 R00830 R00899 R00945 R01221 \n            R01424 R01723 R01766 R01957 R02452 R02551 R02729 R03121 \n            R03284 R03425 R03579 R03654 R03718 R03956 R03975 R04144 \n            R04486 R04777 R04951 R05055 R05704 R05835 R05841 R06171 \n            R07226 R07463 R08195 R08196 R08252 R08701 R08744 R09099 \n            R09717 R09718 R10060 R10062 R10179 R10685 R10722 R10908 \n            R10994\nPATHWAY     map00120  Primary bile acid biosynthesis\n            map00230  Purine metabolism\n            map00260  Glycine, serine and threonine metabolism\n            map00310  Lysine degradation\n      

## Load the queries

In [6]:
# Load the cached KEGG query results (used below as a mapping of KEGG id ->
# query result). The file is opened in a `with` block so the handle is closed
# as soon as the data has been read.
# NOTE: unpickling can execute arbitrary code; only load a file you produced
# yourself or otherwise trust.
with open("../data/kegg_queries.p", "rb") as kegg_queries_file:
    kegg_queries = pickle.load(kegg_queries_file)

## Find queries that returned a 404 error

In [25]:
# One row per (key, value) pair of kegg_queries: column 0 holds the key,
# column 1 the stored query result.
df = pd.DataFrame(list(kegg_queries.items()))
df.head()

In [26]:
# Preview the first rows of df (repeats the preview from the previous cell).
df.head()

Unnamed: 0,0,1
0,K02040,ENTRY K02040 KO\nNA...
1,K01369,ENTRY K01369 KO\nNA...
2,K01368,ENTRY K01368 KO\nNA...
3,K01365,ENTRY K01365 KO\nNA...
4,K01364,ENTRY K01364 KO\nNA...


In [39]:
# Write out the rows whose stored result (second column) equals 404.
df.loc[df.iloc[:, 1] == 404].to_csv("../data/bad_keggs.csv", index=False)

## Get Level 1 and Level 2 classifications from the KEGG orthology

In [115]:
# Keep only the rows whose stored result (second column) is not 404.
df_good_keggs = df.loc[df.iloc[:, 1] != 404]
df_good_keggs.head()

Unnamed: 0,0,1
0,K02040,ENTRY K02040 KO\nNA...
1,K01369,ENTRY K01369 KO\nNA...
2,K01368,ENTRY K01368 KO\nNA...
3,K01365,ENTRY K01365 KO\nNA...
4,K01364,ENTRY K01364 KO\nNA...


In [None]:
#match = re.search('KEGG Orthology \(KO\) .*\\n.*\\n.*', kegg_queries[kegg_query])

In [43]:
# Plain Python list of the ids (first column) that have a usable result.
good_keggs = df_good_keggs.iloc[:, 0].tolist()

In [65]:
# Matches the line containing "KEGG Orthology (KO) " together with the two
# lines that follow it. Written as a raw string and compiled once: the
# previous non-raw literal depended on the invalid escape sequence "\(".
KO_HIERARCHY_PATTERN = re.compile(r'KEGG Orthology \(KO\) .*\n.*\n.*')


def _ko_levels(entry_text):
    """Return the (level 1, level 2) labels found in one KEGG entry text.

    The labels are the stripped contents of the first and second line after
    the line that contains "KEGG Orthology (KO) ". When the text has no such
    line followed by two more lines, ('NA', 'NA') is returned.
    """
    match = KO_HIERARCHY_PATTERN.search(entry_text)
    if match is None:
        return 'NA', 'NA'
    # "." never matches a newline, so the matched text is exactly three lines.
    matched_lines = match.group().split("\n")
    return matched_lines[1].strip(), matched_lines[2].strip()


metabolism_level_1_dict = {}
metabolism_level_2_dict = {}

for good_kegg in good_keggs:
    # The id 'K88' is skipped; the reason is not recorded in this notebook —
    # TODO confirm it is still needed.
    if good_kegg == 'K88':
        continue
    level_1, level_2 = _ko_levels(kegg_queries[good_kegg])
    metabolism_level_1_dict[good_kegg] = level_1
    metabolism_level_2_dict[good_kegg] = level_2

In [66]:
# Turn each id -> label mapping into a two-column table (id, label).
metabolism_level_1_df = pd.DataFrame(list(metabolism_level_1_dict.items()))
metabolism_level_2_df = pd.DataFrame(list(metabolism_level_2_dict.items()))

In [71]:
# Distinct level 1 labels, in order of first appearance.
metabolism_level_1_df.iloc[:, 1].unique()

array([u'Cellular Processes', u'Human Diseases', 'NA', u'Metabolism',
       u'Environmental Information Processing',
       u'Genetic Information Processing', u'Organismal Systems'], dtype=object)

In [72]:
# Distinct level 2 labels, in order of first appearance.
metabolism_level_2_df.iloc[:, 1].unique()

array([u'Transport and catabolism', u'Drug resistance', 'NA',
       u'Lipid metabolism', u'Signal transduction', u'Membrane transport',
       u'Amino acid metabolism', u'Overview',
       u'Metabolism of cofactors and vitamins', u'Carbohydrate metabolism',
       u'Folding, sorting and degradation', u'Energy metabolism',
       u'Translation', u'Excretory system', u'Digestive system',
       u'Metabolism of other amino acids',
       u'Glycan biosynthesis and metabolism',
       u'Metabolism of terpenoids and polyketides', u'Infectious diseases',
       u'Xenobiotics biodegradation and metabolism',
       u'Nucleotide metabolism', u'Development', u'Replication and repair',
       u'Cell growth and death', u'Cancers', u'Cellular commiunity',
       u'Endocrine and metabolic diseases', u'Immune system',
       u'Biosynthesis of other secondary metabolites',
       u'Signaling molecules and interaction', u'Transcription',
       u'Cell motility', u'Circulatory system', u'Endocrine syste

In [117]:
# Size of the level 1 table as (rows, columns).
metabolism_level_1_df.shape

(10012, 2)

In [118]:
# Size of the level 2 table as (rows, columns); compare with the level 1 table.
metabolism_level_2_df.shape

(10012, 2)

In [121]:
# Give both tables named columns; the shared "kegg" column is the key the
# following merge joins on. This relabels the existing frames in place.
metabolism_level_1_df.columns = ["kegg", "level_1"]
metabolism_level_2_df.columns = ["kegg", "level_2"]

In [131]:
# Combine the level 1 and level 2 labels into one table keyed by "kegg".
KOresult = metabolism_level_1_df.merge(metabolism_level_2_df, on='kegg')

In [132]:
# Preview the rows whose third column (level_2) is the placeholder "NA".
KOresult.loc[KOresult.iloc[:, 2] == "NA"].head()

Unnamed: 0,kegg,level_1,level_2
5,K01361,,
6,K01360,,
8,K01362,,
9,K09173,,
12,K05841,,


In [135]:
# Save the merged level 1 / level 2 table without the index column.
# NOTE(review): unclassified ids carry the string "NA", which pandas.read_csv
# treats as a missing value by default — confirm downstream readers expect that.
KOresult.to_csv("../data/KO_levels.csv", index = False)

## Deal with missing KEGG Orthology classifications (NA)

In [75]:
# Rows whose level 1 label (second column) is the placeholder 'NA', and the
# ids (first column) of those rows as a plain list.
na_keggs = metabolism_level_1_df.loc[metabolism_level_1_df.iloc[:, 1] == 'NA']
na_keggs_list = na_keggs.iloc[:, 0].tolist()

In [89]:
# Peek at the first four ids without a level 1 label.
na_keggs_list[:4]

['K01361', 'K01360', 'K01362', 'K09173']

## Find KEGGs that have a BRITE level

In [96]:
# Captures the remainder of the first line that starts with "BRITE".
# Anchoring to the start of a line avoids matching the word inside another
# field's text, and the capture group avoids the truncation that
# split("BRITE")[1] produced when "BRITE" occurred again on the same line.
BRITE_LINE_PATTERN = re.compile(r'^BRITE(.*)', re.MULTILINE)

# Maps each id without a level 1 label to the stripped text of its BRITE
# line, or to 'NA' when the entry text has no such line.
brite_level = {}
for kegg in na_keggs_list:
    # The id 'K88' is skipped; the reason is not recorded in this notebook —
    # TODO confirm it is still needed.
    if kegg == 'K88':
        continue
    match = BRITE_LINE_PATTERN.search(kegg_queries[kegg])
    brite_level[kegg] = match.group(1).strip() if match else 'NA'

In [97]:
# Two-column table built from the id -> BRITE text mapping.
brite_level_df = pd.DataFrame(list(brite_level.items()))

In [98]:
# Preview the first rows of the BRITE table.
brite_level_df.head()

Unnamed: 0,0,1
0,K00184,
1,K00185,
2,K00183,
3,K00180,Enzymes [BR:ko01000]
4,K01361,Enzymes [BR:ko01000]


In [99]:
# Distinct BRITE texts, in order of first appearance.
brite_level_df.iloc[:, 1].unique()

array(['NA', u'Enzymes [BR:ko01000]',
       u'Transcription factors [BR:ko03000]',
       u'Secretion system [BR:ko02044]',
       u'G protein-coupled receptors [BR:ko04030]',
       u'GTP-binding proteins [BR:ko04031]', u'Transporters [BR:ko02000]',
       u'Chaperones and folding catalysts [BR:ko03110]',
       u'KEGG modules [BR:ko00002]', u'Ribosome biogenesis [BR:ko03009]',
       u'Transfer RNA biogenesis [BR:ko03016]',
       u'Polyketide biosynthesis proteins [BR:ko01008]',
       u'Lectins [BR:ko04091]', u'Spliceosome [BR:ko03041]',
       u'Ubiquitin system [BR:ko04121]', u'Proteoglycans [BR:ko00535]',
       u'Prokaryotic defense system [BR:ko02048]',
       u'Cell adhesion molecules and their ligands [BR:ko04516]',
       u'Messenger RNA biogenesis [BR:ko03019]', u'Exosome [BR:ko04147]',
       u'Chromosome [BR:ko03036]',
       u'Lipopolysaccharide biosynthesis proteins [BR:ko01005]',
       u'Ion channels [BR:ko04040]',
       u'Transcription machinery [BR:ko03021]',
   

In [114]:
# Save the distinct BRITE texts as a one-column CSV.
# NOTE(review): this file lands next to the notebook, while the other exports
# go to ../data/ — confirm the location is intentional.
pd.DataFrame(brite_level_df.iloc[:, 1].unique()).to_csv("Brite_list.csv", index=False)

In [112]:
# Spot-check the ids whose BRITE text is the secretion system category.
brite_level_df.loc[brite_level_df.iloc[:, 1] == 'Secretion system [BR:ko02044]'].head()

Unnamed: 0,0,1
9,K11909,Secretion system [BR:ko02044]
68,K02663,Secretion system [BR:ko02044]
203,K12276,Secretion system [BR:ko02044]
205,K12273,Secretion system [BR:ko02044]
207,K12271,Secretion system [BR:ko02044]


In [137]:
# Replace the positional column labels (0, 1) with descriptive names.
# This relabels the existing frame in place.
brite_level_df.columns = ["kegg", "brite_level"]

In [138]:
# Save the id -> BRITE text table without the index column.
brite_level_df.to_csv("../data/BRITE_levels.csv", index = False)