# Compare KEGG lists

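This notebook compares the KEGG candidate lists produced by different methods (the PCA/KS test, extreme mean and median abundance ratios relative to HE, and the Ayasdi groups) by intersecting each of them with the "PC KEGGs" list.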

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pylab as plt
%matplotlib inline

Vendor:  Continuum Analytics, Inc.
Package: mkl
Message: trial mode expires in 30 days


In [2]:
# Load the PCA / KS-test candidate table (one row per KEGG, with a `ks_score`
# column used for thresholding further down), then show its shape and first rows.
df_pca = pd.read_csv("../results/kegg-pca-list.csv")


print("pca df shape:", df_pca.shape)
df_pca.head()

('pca df shape:', (10193, 6))


Unnamed: 0,name,ks_score,ks_sign,p_value,t_test_p_value,diff_means
0,K00867.type_I_pantothenate_kinase.,0.971429,-,3.613776e-13,0.000920864,-7e-05
1,K04108.4.hydroxybenzoyl.CoA_reductase_subunit_...,0.942857,-,1.917799e-12,1.018815e-05,-7.8e-05
2,K00721.dolichol.phosphate_mannosyltransferase.,0.942857,+,1.917799e-12,4.79536e-14,0.000444
3,K03753.molybdopterin.guanine_dinucleotide_bios...,0.935714,-,2.575939e-12,1.185121e-06,-6.6e-05
4,K13641.IclR_family_transcriptional_regulator._...,0.935714,-,2.575939e-12,0.0001708861,-0.000159


In [3]:
# Load the per-KEGG mean-ratio table (CD/LS/UC versus HE columns plus
# `KEGG_names`), then show its shape and first rows.
df_extreme_mean = pd.read_csv("../results/KEGG-mean-HE-ratios-03-01-2018.csv")

print("df shape:", df_extreme_mean.shape)
df_extreme_mean.head()

('df shape:', (10012, 4))


Unnamed: 0,CD_HE_mean_ratio,KEGG_names,LS_HE_mean_ratio,UC_HE_mean_ratio
0,1.319099,K00001(alcohol_dehydrogenase),1.689633,1.004756
1,7.043951,K00002(alcohol_dehydrogenase_(NADP+)),4.649663,0.210511
2,13.911981,K00003(homoserine_dehydrogenase),10.850482,1.594664
3,1.615451,"K00004((R,R)-butanediol_dehydrogenase_/_diacet...",4.744729,0.011772
4,21.931516,K00005(glycerol_dehydrogenase),21.087698,0.902234


In [4]:
# Load the per-KEGG median-ratio table (same layout as the mean-ratio table,
# with `*_median_ratio` columns), then show its shape and first rows.
df_extreme_median = pd.read_csv("../results/KEGG-median-HE-ratios-15-12-2017.csv")

print("df shape:", df_extreme_median.shape)

df_extreme_median.head()

('df shape:', (10012, 4))


Unnamed: 0,CD_HE_median_ratio,KEGG_names,LS_HE_median_ratio,UC_HE_median_ratio
0,1.377436,K00001(alcohol_dehydrogenase),1.785583,0.998465
1,4.88728,K00002(alcohol_dehydrogenase_(NADP+)),2.597071,0.751038
2,4.087813,K00003(homoserine_dehydrogenase),3.138586,0.57882
3,5.447837,"K00004((R,R)-butanediol_dehydrogenase_/_diacet...",5.358153,0.838109
4,4.884947,K00005(glycerol_dehydrogenase),5.966119,1.313708


In [5]:
# Load the KEGGs belonging to Ayasdi group 4 (the CSV carries the median-ratio
# columns alongside `KEGG_names`) and preview them.
df_ayasdi_group_4 = pd.read_csv("../results/ayasdi/ayasdi-group-4_21-12-2017.csv")

print(df_ayasdi_group_4.shape)
df_ayasdi_group_4.head()

(267, 4)


Unnamed: 0,CD_HE_median_ratio,KEGG_names,LS_HE_median_ratio,UC_HE_median_ratio
0,1.377436,K00001(alcohol_dehydrogenase),1.785583,0.998465
1,4.087813,K00003(homoserine_dehydrogenase),3.138586,0.57882
2,5.447837,"K00004((R,R)-butanediol_dehydrogenase_/_diacet...",5.358153,0.838109
3,4.884947,K00005(glycerol_dehydrogenase),5.966119,1.313708
4,4.607254,K00011(aldehyde_reductase),1.753996,2.296813


In [6]:
# Load the KEGGs belonging to Ayasdi group 6 (same column layout as group 4)
# and preview them.
df_ayasdi_group_6 = pd.read_csv("../results/ayasdi/ayasdi-group-6_21-12-2017.csv")
print(df_ayasdi_group_6.shape)
df_ayasdi_group_6.head()

(272, 4)


Unnamed: 0,CD_HE_median_ratio,KEGG_names,LS_HE_median_ratio,UC_HE_median_ratio
0,1.904074,K00457(4-hydroxyphenylpyruvate_dioxygenase),2.234622,4.5e-05
1,1.0,K00660(chalcone_synthase),1.0,1.0
2,3.228943,K00821(acetylornithine/N-succinyldiaminopimela...,3.583197,0.531882
3,1.719006,K00868(pyridoxine_kinase),1.499722,0.809106
4,4618.807672,K00883(2-dehydro-3-deoxygalactonokinase),40268.472348,6389.835896


In [7]:
# Load the reference list of "PC KEGGs" (a single column of bare KEGG ids) that
# every other candidate list in this notebook is intersected with.
# NOTE(review): this is an absolute, machine-specific path, so the cell cannot be
# re-run on another machine. Consider storing the spreadsheet next to the other
# inputs (e.g. under ../results/) and reading it via a relative path.
df_pattern = pd.read_excel("/Users/myazdaniUCSD/Downloads/PC KEGGs.xlsx")
print(df_pattern.shape)
df_pattern.head()

(39, 1)


Unnamed: 0,PC KEGGs
0,K00323
1,K00324
2,K00325
3,K00330
4,K00348


In [8]:
def clean_up_keggs(keggs_list, delim = "("):
    """Reduce annotated KEGG names to their bare identifiers.

    Each entry is cut at the first occurrence of ``delim`` and only the text
    before it is kept, e.g. ``"K00001(alcohol_dehydrogenase)"`` -> ``"K00001"``.
    Entries that do not contain ``delim`` are returned unchanged.

    Parameters
    ----------
    keggs_list : iterable of str
        Annotated KEGG names.
    delim : str
        Separator between the identifier and its description.

    Returns
    -------
    list of str
        The identifiers, in the same order as the input.
    """
    identifiers = []
    for annotated_name in keggs_list:
        # Everything up to the first delimiter is the KEGG id.
        identifiers.append(annotated_name.split(delim, 1)[0])
    return identifiers

## Intersection with PCA/KS test

In [9]:
# Keep KEGGs whose KS score exceeds 0.8. Names in the PCA table separate the
# KEGG id from its description with ".", hence the non-default delimiter.
ks_keggs = clean_up_keggs(list(df_pca["name"][df_pca["ks_score"] > .8]),
                          delim = ".")

# Overlap between the PC KEGG list and the high-KS candidates.
set(df_pattern["PC KEGGs"]) & set(ks_keggs)

set()

In [10]:
# Same comparison with the KS-score cutoff relaxed to 0.5.
# Note: this rebinds `ks_keggs`, replacing the list built with the 0.8 cutoff.
ks_keggs = clean_up_keggs(list(df_pca["name"][df_pca["ks_score"] > .5]),
                          delim = ".")

# Overlap between the PC KEGG list and the relaxed KS candidates.
set(df_pattern["PC KEGGs"]) & set(ks_keggs)

{u'K00324',
 u'K00325',
 u'K00330',
 u'K00348',
 u'K00351',
 u'K00371',
 u'K00425',
 u'K00426',
 u'K00607',
 u'K00633',
 u'K00666',
 u'K03671',
 u'K11472',
 u'K14779'}

In [11]:
def intersection_with_ratios_helper(df_ratio, field, over_abund_thresh = 100, under_abund_thresh = .01):
    """Return the KEGG ids whose ratio in ``field`` is extreme.

    A row is considered extreme when its ratio is strictly greater than
    ``over_abund_thresh`` or strictly less than ``under_abund_thresh``.

    Parameters
    ----------
    df_ratio : pandas.DataFrame
        Table holding the ratio column ``field`` and a ``KEGG_names`` column.
    field : str
        Name of the ratio column to threshold.
    over_abund_thresh : float
        Ratios above this value count as over-abundant.
    under_abund_thresh : float
        Ratios below this value count as under-abundant.

    Returns
    -------
    list of str
        Bare KEGG identifiers (as produced by ``clean_up_keggs``) of the
        extreme rows, in table order. The count is also printed.
    """
    ratios = df_ratio[field]
    extreme_indx = (ratios > over_abund_thresh) | (ratios < under_abund_thresh)

    # Take the names from the same frame the mask was computed on. Reading them
    # from a different, global frame only works when both frames happen to share
    # an identical index and row order.
    extreme_keggs = clean_up_keggs(list(df_ratio["KEGG_names"][extreme_indx]))

    print(len(extreme_keggs), "KEGGs are extreme")
    return extreme_keggs

## Intersection with CD-to-HE mean ratios

In [12]:
# KEGGs whose CD/HE mean ratio falls outside the helper's default thresholds.
cd_he_mean_keggs = intersection_with_ratios_helper(df_extreme_mean, "CD_HE_mean_ratio")

# Overlap with the PC KEGG list.
set(df_pattern["PC KEGGs"]) & set(cd_he_mean_keggs)

(885, 'KEGGs are extreme')


{u'K00324', u'K00325', u'K00330', u'K00371', u'K14779'}

## Intersection with LS-to-HE mean ratios

In [13]:


# KEGGs whose LS/HE mean ratio falls outside the helper's default thresholds.
ls_he_mean_keggs = intersection_with_ratios_helper(df_extreme_mean, "LS_HE_mean_ratio")

# Overlap with the PC KEGG list.
set(df_pattern["PC KEGGs"]) & set(ls_he_mean_keggs)

(1728, 'KEGGs are extreme')


{u'K00324', u'K00325', u'K00360', u'K00371', u'K01085', u'K11472', u'K14271'}

## Intersection with UC-to-HE mean ratios

In [14]:
# KEGGs whose UC/HE mean ratio falls outside the helper's default thresholds.
uc_he_mean_keggs = intersection_with_ratios_helper(df_extreme_mean, "UC_HE_mean_ratio")

# Overlap with the PC KEGG list.
set(df_pattern["PC KEGGs"]) & set(uc_he_mean_keggs)

(888, 'KEGGs are extreme')


{u'K11472', u'K11548', u'K14779'}

## Intersection with CD-to-HE median ratios

In [15]:
# Thresholds the CD/HE median ratio column of the median-ratio table using the
# helper's default cut-offs.
cd_he_median_keggs = intersection_with_ratios_helper(df_extreme_median, "CD_HE_median_ratio")

# Overlap with the PC KEGG list.
set(df_pattern["PC KEGGs"]) & set(cd_he_median_keggs)

(1361, 'KEGGs are extreme')


{u'K00324', u'K00325', u'K00371', u'K10094', u'K14779'}

## Intersection with LS-to-HE median ratios

In [16]:
# Thresholds the LS/HE median ratio column of the median-ratio table using the
# helper's default cut-offs.
ls_he_median_keggs = intersection_with_ratios_helper(df_extreme_median, "LS_HE_median_ratio")

# Overlap with the PC KEGG list.
set(df_pattern["PC KEGGs"]) & set(ls_he_median_keggs)

(2204, 'KEGGs are extreme')


{u'K00324',
 u'K00325',
 u'K00360',
 u'K00371',
 u'K00643',
 u'K01085',
 u'K11472',
 u'K12242',
 u'K14271',
 u'K14766'}

## Intersection with UC-to-HE median ratios

In [17]:
# Thresholds the UC/HE median ratio column of the median-ratio table using the
# helper's default cut-offs.
uc_he_median_keggs = intersection_with_ratios_helper(df_extreme_median, "UC_HE_median_ratio")

# Overlap with the PC KEGG list.
set(df_pattern["PC KEGGs"]) & set(uc_he_median_keggs)

(1558, 'KEGGs are extreme')


{u'K00324',
 u'K00371',
 u'K01085',
 u'K11472',
 u'K11548',
 u'K12242',
 u'K14779',
 u'K14780'}

## Intersection with Ayasdi group 4

In [18]:
# Strip the Ayasdi group 4 names down to bare KEGG ids (default "(" delimiter)
# and intersect them with the PC KEGG list.
ayasdi_keggs_group_4 = clean_up_keggs(df_ayasdi_group_4["KEGG_names"])
set(df_pattern["PC KEGGs"]) & set(ayasdi_keggs_group_4)

{u'K00325',
 u'K00351',
 u'K00371',
 u'K00426',
 u'K00430',
 u'K00604',
 u'K00609',
 u'K00610',
 u'K00633'}

## Intersection with Ayasdi group 6

In [19]:
# Strip the Ayasdi group 6 names down to bare KEGG ids (default "(" delimiter)
# and intersect them with the PC KEGG list.
ayasdi_keggs_group_6 = clean_up_keggs(df_ayasdi_group_6["KEGG_names"])
set(df_pattern["PC KEGGs"]) & set(ayasdi_keggs_group_6)

set()