# Merge KEGG lists and visualize over 3D scatter plots

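This notebook compares the KEGG candidate lists produced by different methods (extreme mean/median ratios, the PC KEGG list, random forest, Ayasdi groups, and PC loadings) by overlaying them on 3D scatter plots of the LS/CD/UC-to-HE ratios.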

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pylab as plt
%matplotlib inline

Vendor:  Continuum Analytics, Inc.
Package: mkl
Message: trial mode expires in 27 days


In [2]:
# Per-KEGG mean ratios (LS, CD and UC relative to HE), read from the results folder.
mean_ratios_csv = "../results/KEGG-mean-HE-ratios-03-01-2018.csv"
df_extreme_mean = pd.read_csv(mean_ratios_csv)

# Quick sanity check on the table size, then preview the first rows.
print("df shape:", df_extreme_mean.shape)
df_extreme_mean.head()

('df shape:', (10012, 4))


Unnamed: 0,CD_HE_mean_ratio,KEGG_names,LS_HE_mean_ratio,UC_HE_mean_ratio
0,1.319099,K00001(alcohol_dehydrogenase),1.689633,1.004756
1,7.043951,K00002(alcohol_dehydrogenase_(NADP+)),4.649663,0.210511
2,13.911981,K00003(homoserine_dehydrogenase),10.850482,1.594664
3,1.615451,"K00004((R,R)-butanediol_dehydrogenase_/_diacet...",4.744729,0.011772
4,21.931516,K00005(glycerol_dehydrogenase),21.087698,0.902234


In [3]:
# Per-KEGG median ratios (LS, CD and UC relative to HE), read from the results folder.
median_ratios_csv = "../results/KEGG-median-HE-ratios-15-12-2017.csv"
df_extreme_median = pd.read_csv(median_ratios_csv)

# Quick sanity check on the table size, then preview the first rows.
print("df shape:", df_extreme_median.shape)

df_extreme_median.head()

('df shape:', (10012, 4))


Unnamed: 0,CD_HE_median_ratio,KEGG_names,LS_HE_median_ratio,UC_HE_median_ratio
0,1.377436,K00001(alcohol_dehydrogenase),1.785583,0.998465
1,4.88728,K00002(alcohol_dehydrogenase_(NADP+)),2.597071,0.751038
2,4.087813,K00003(homoserine_dehydrogenase),3.138586,0.57882
3,5.447837,"K00004((R,R)-butanediol_dehydrogenase_/_diacet...",5.358153,0.838109
4,4.884947,K00005(glycerol_dehydrogenase),5.966119,1.313708


In [4]:
# Candidate list stored as a single "PC KEGGs" column of bare KEGG ids.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until the spreadsheet is moved next to the other results.
pattern_xlsx = "/Users/myazdaniUCSD/Downloads/PC KEGGs.xlsx"
df_pattern = pd.read_excel(pattern_xlsx)
print(df_pattern.shape)
df_pattern.head()

(39, 1)


Unnamed: 0,PC KEGGs
0,K00323
1,K00324
2,K00325
3,K00330
4,K00348


In [5]:
def clean_up_keggs(keggs_list, delim = "("):
    """Strip the descriptive suffix from annotated KEGG names.

    Each entry is cut at the first occurrence of ``delim`` and only the part
    before it is kept, e.g. ``"K00001(alcohol_dehydrogenase)"`` -> ``"K00001"``.
    Entries that do not contain ``delim`` are returned unchanged.

    Parameters
    ----------
    keggs_list : iterable of str
        Annotated KEGG names.
    delim : str, optional
        Separator between the identifier and its description (default ``"("``).

    Returns
    -------
    list of str
        The leading identifier of every entry, in input order.
    """
    kegg_ids = []
    for annotated_name in keggs_list:
        pieces = annotated_name.split(delim)
        kegg_ids.append(pieces[0])
    return kegg_ids

In [6]:
# Add a "kegg_id" column (the part of KEGG_names before the first "(") to both
# ratio tables so they can be joined against the candidate lists.
for ratios_frame in (df_extreme_mean, df_extreme_median):
    ratios_frame["kegg_id"] = clean_up_keggs(list(ratios_frame["KEGG_names"]))

In [7]:
# Keep only the ratio rows whose kegg_id appears in the "PC KEGGs" column.
pattern_join = dict(left_on="kegg_id", right_on="PC KEGGs", how="inner")
df_extreme_mean_pattern = df_extreme_mean.merge(df_pattern, **pattern_join)
df_extreme_median_pattern = df_extreme_median.merge(df_pattern, **pattern_join)

In [8]:
# Random-forest candidate list; its annotated names live in the "RF KEGGS" column.
df_rf = pd.read_csv("../results/KEGGs-RF-multiclassificaiton_2018-03-11.csv")
rf_annotated_names = list(df_rf["RF KEGGS"])
df_rf["kegg_id"] = clean_up_keggs(rf_annotated_names, delim="(")

# No `on=` is given, so pandas joins on every column name the two frames share.
# NOTE(review): confirm "kegg_id" is the only shared column (e.g. no "Unnamed: 0").
df_extreme_mean_rf = df_extreme_mean.merge(df_rf, how="inner")
df_extreme_median_rf = df_extreme_median.merge(df_rf, how="inner")


In [9]:
# Ayasdi group 4 candidate list; annotated names are in "KEGG_names".
df_ayasdi_group_4 = pd.read_csv("../results/ayasdi/ayasdi-group-4_21-12-2017.csv")
group_4_annotated_names = list(df_ayasdi_group_4["KEGG_names"])
df_ayasdi_group_4["kegg_id"] = clean_up_keggs(group_4_annotated_names, delim="(")

# No `on=` is given, so pandas joins on every column name the two frames share
# (at least "KEGG_names" and "kegg_id" here).
# NOTE(review): confirm no other column, such as a saved index, is shared.
df_extreme_mean_ayasdi_group_4 = df_extreme_mean.merge(df_ayasdi_group_4, how="inner")
df_extreme_median_ayasdi_group_4 = df_extreme_median.merge(df_ayasdi_group_4, how="inner")

In [10]:
# Ayasdi group 6 candidate list; annotated names are in "KEGG_names".
df_ayasdi_group_6 = pd.read_csv("../results/ayasdi/ayasdi-group-6_21-12-2017.csv")
group_6_annotated_names = list(df_ayasdi_group_6["KEGG_names"])
df_ayasdi_group_6["kegg_id"] = clean_up_keggs(group_6_annotated_names, delim="(")

# No `on=` is given, so pandas joins on every column name the two frames share
# (at least "KEGG_names" and "kegg_id" here).
# NOTE(review): confirm no other column, such as a saved index, is shared.
df_extreme_mean_ayasdi_group_6 = df_extreme_mean.merge(df_ayasdi_group_6, how="inner")
df_extreme_median_ayasdi_group_6 = df_extreme_median.merge(df_ayasdi_group_6, how="inner")

In [11]:
# Candidates picked from the top-2 PC loadings; annotated names are in
# the "PC loadings keggs" column.
df_pc_loadings = pd.read_csv("../results/KEGGs-top2pc-loadings_2018-03-11.csv")
pc_loading_annotated_names = list(df_pc_loadings["PC loadings keggs"])
df_pc_loadings["kegg_id"] = clean_up_keggs(pc_loading_annotated_names, delim="(")

# No `on=` is given, so pandas joins on every column name the two frames share.
# NOTE(review): confirm "kegg_id" is the only shared column (e.g. no "Unnamed: 0").
df_extreme_mean_pc_loadings = df_extreme_mean.merge(df_pc_loadings, how="inner")
df_extreme_median_pc_loadings = df_extreme_median.merge(df_pc_loadings, how="inner")

In [12]:
import plotly.plotly as py
import plotly.graph_objs as go

import plotly.offline as offline
import plotly.plotly as py
from plotly.graph_objs import *




from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Set up plotly's offline notebook mode before any iplot() call below.
offline.init_notebook_mode()

In [13]:
# A KEGG counts as "extreme" when at least one of its three mean ratios is
# above the over-abundance threshold or below the under-abundance threshold.
over_abund_thresh = 100
under_abund_thresh = .01

mean_ratio_values = df_extreme_mean[["LS_HE_mean_ratio",
                                     "CD_HE_mean_ratio",
                                     "UC_HE_mean_ratio"]]
extereme_indx = ((mean_ratio_values > over_abund_thresh) |
                 (mean_ratio_values < under_abund_thresh)).any(axis=1)

df_extreme_mean_thresh = df_extreme_mean[extereme_indx]

df_extreme_mean_thresh.shape

(2810, 5)

In [14]:
# Overlay every candidate KEGG list on one 3D scatter of log10 mean ratios
# (x = LS/HE, y = CD/HE, z = UC/HE).  Each spec pairs a merged frame with its
# legend label and marker styling; one Scatter3d trace is built per spec, so a
# new candidate list only needs one extra entry here.
# NOTE(review): np.log10 maps a ratio of exactly 0 to -inf; confirm such rows
# are absent or that dropping them from the plot is acceptable.
mean_trace_specs = [
    (df_extreme_mean_pattern, "PC KEGGs",
     dict(size=12,
          line=dict(color='rgba(217, 217, 217, 0.14)', width=0.5),
          opacity=0.8)),
    (df_extreme_mean_thresh, "Union of mean ratios <br> over 1e2 and 1e-2",
     dict(color='rgb(127, 127, 127)', size=6, symbol='circle',
          line=dict(color='rgb(204, 204, 204)', width=1),
          opacity=0.2)),
    (df_extreme_mean_rf, "RF",
     dict(color='rgb(254, 104, 104)', size=12, symbol='circle',
          line=dict(color='rgb(254, 104, 104)', width=1),
          opacity=0.3)),
    (df_extreme_mean_ayasdi_group_4, "Ayasdi G4",
     dict(color='rgb(104, 254, 104)', size=6, symbol='circle',
          line=dict(color='rgb(104, 254, 104)', width=1),
          opacity=0.3)),
    (df_extreme_mean_ayasdi_group_6, "Ayasdi G6",
     dict(color='rgb(254, 4, 254)', size=6, symbol='circle',
          line=dict(color='rgb(254, 4, 254)', width=1),
          opacity=0.3)),
    (df_extreme_mean_pc_loadings, "PC loadings",
     dict(color='rgb(4, 4, 254)', size=6, symbol='circle',
          line=dict(color='rgb(4, 4, 254)', width=1),
          opacity=0.3)),
]

data = [
    go.Scatter3d(
        x=np.log10(frame["LS_HE_mean_ratio"].values),
        y=np.log10(frame["CD_HE_mean_ratio"].values),
        z=np.log10(frame["UC_HE_mean_ratio"].values),
        text=list(frame["KEGG_names"]),  # hover text: full annotated KEGG name
        mode='markers',
        name=label,
        marker=marker_style
    )
    for frame, label, marker_style in mean_trace_specs
]
# Keep the individual trace variables available under their usual names.
trace1, trace2, trace3, trace4, trace5, trace6 = data

layout = go.Layout(
    scene=dict(
        xaxis=dict(title='x: LS to HE ratio'),
        yaxis=dict(title='y: CD to HE ratio'),
        zaxis=dict(title='z: UC to HE ratio')
    ),
    width=900,
    # Plotly margins must be non-negative; a negative left margin is outside
    # the allowed range, so the tightest valid value (0) is used instead.
    margin=dict(l=0, r=0, b=0, t=0)
)
fig = go.Figure(data=data, layout=layout)



In [15]:
## Render the mean-ratio 3D scatter built in the previous cell inline.
offline.iplot(fig)


In [16]:
# A KEGG counts as "extreme" when at least one of its three median ratios is
# above the over-abundance threshold or below the under-abundance threshold.
over_abund_thresh = 100
under_abund_thresh = .01

median_ratio_values = df_extreme_median[["LS_HE_median_ratio",
                                         "CD_HE_median_ratio",
                                         "UC_HE_median_ratio"]]
extereme_indx = ((median_ratio_values > over_abund_thresh) |
                 (median_ratio_values < under_abund_thresh)).any(axis=1)

df_extreme_median_thresh = df_extreme_median[extereme_indx]

df_extreme_median_thresh.shape

(3568, 5)

In [17]:
# Overlay every candidate KEGG list on one 3D scatter of log10 median ratios
# (x = LS/HE, y = CD/HE, z = UC/HE).  Each spec pairs a merged frame with its
# legend label and marker styling; one Scatter3d trace is built per spec.
# Building the traces from a list (rather than hand-numbered trace variables)
# guarantees that every candidate list ends up in the figure: the Ayasdi G6
# trace used to be overwritten by the PC-loadings trace and was never plotted.
# NOTE(review): np.log10 maps a ratio of exactly 0 to -inf; confirm such rows
# are absent or that dropping them from the plot is acceptable.
median_trace_specs = [
    (df_extreme_median_pattern, "PC KEGGs",
     dict(size=12,
          line=dict(color='rgba(217, 217, 217, 0.14)', width=0.5),
          opacity=0.8)),
    (df_extreme_median_thresh, "Union of median ratios<br> over 1e2 and 1e-2",
     dict(color='rgb(127, 127, 127)', size=6, symbol='circle',
          line=dict(color='rgb(204, 204, 204)', width=1),
          opacity=0.2)),
    (df_extreme_median_rf, "RF",
     dict(color='rgb(254, 104, 104)', size=12, symbol='circle',
          line=dict(color='rgb(254, 104, 104)', width=1),
          opacity=0.3)),
    (df_extreme_median_ayasdi_group_4, "Ayasdi G4",
     dict(color='rgb(104, 254, 104)', size=5, symbol='circle',
          line=dict(color='rgb(104, 254, 104)', width=1),
          opacity=0.3)),
    # Legend label fixed from the typo "Ayasdi 64".
    (df_extreme_median_ayasdi_group_6, "Ayasdi G6",
     dict(color='rgb(254, 4, 254)', size=5, symbol='circle',
          line=dict(color='rgb(254, 4, 254)', width=1),
          opacity=0.3)),
    (df_extreme_median_pc_loadings, "PC loadings",
     dict(color='rgb(4, 4, 254)', size=5, symbol='circle',
          line=dict(color='rgb(4, 4, 254)', width=1),
          opacity=0.3)),
]

data = [
    go.Scatter3d(
        x=np.log10(frame["LS_HE_median_ratio"].values),
        y=np.log10(frame["CD_HE_median_ratio"].values),
        z=np.log10(frame["UC_HE_median_ratio"].values),
        text=list(frame["KEGG_names"]),  # hover text: full annotated KEGG name
        mode='markers',
        name=label,
        marker=marker_style
    )
    for frame, label, marker_style in median_trace_specs
]
# Keep the individual trace variables available; trace5 is now Ayasdi G6 and
# trace6 is PC loadings, matching the numbering used for the mean-ratio figure.
trace1, trace2, trace3, trace4, trace5, trace6 = data

layout = go.Layout(
    scene=dict(
        xaxis=dict(title='x: LS to HE ratio'),
        yaxis=dict(title='y: CD to HE ratio'),
        zaxis=dict(title='z: UC to HE ratio')
    ),
    width=900,
    # Plotly margins must be non-negative; a negative left margin is outside
    # the allowed range, so the tightest valid value (0) is used instead.
    margin=dict(l=0, r=0, b=0, t=0)
)
fig = go.Figure(data=data, layout=layout)




In [18]:
## Render the median-ratio 3D scatter built in the previous cell inline.
offline.iplot(fig)