In [1]:
import plotly.graph_objs as go
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode
import re
import numpy as np
import yaml
import pandas as pd
init_notebook_mode(connected=True)


Matplotlib is building the font cache using fc-list. This may take a moment.



In [2]:
# Load the notebook settings (plotly credentials, Elasticsearch host) from config.yml.
# NOTE: this used to be a bare yaml.load(open(...)). safe_load only builds plain
# YAML types, so a tampered config file cannot trigger arbitrary object
# construction, and the `with` block makes sure the file handle is closed.
with open('config.yml') as config_file:
    config = yaml.safe_load(config_file)

In [3]:
# Store the plotly account details taken from the config file.
tools.set_credentials_file(
    username=config['plotly_user'],
    api_key=config['plotly_api_key'],
)

# Flip to True to use plotly.plotly's iplot instead of the offline one.
plotly_online = False
if not plotly_online:
    from plotly.offline import iplot
else:
    from plotly.plotly import iplot

In [4]:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
# Elasticsearch connection; the host comes from the 'elasticsearch_host' config entry.
client = Elasticsearch(
    hosts=config['elasticsearch_host'],
)

### SM dist results

In [5]:
# Search the "sm" index for documents whose ds_name matches RB_a2s1.
# Alternative clauses kept for reference (currently disabled):
#   .filter("term", category="search")
#   .query("match", ds_name="Bruker//Rat//Brain//a2s1")
#   .query(~Q("match", description="beta"))
s = (
    Search(using=client, index="sm")
    .query("match", ds_name="RB_a2s1")
)

In [6]:
# scan() hands back a one-shot generator. Materialise it into a list so that the
# cells iterating over `response` can be re-run on their own without silently
# producing an empty result from an already-exhausted generator.
response = list(s.scan())

In [7]:
# One row per search hit, indexed by (sf, adduct). The hit fields chaos,
# image_corr and pattern_match are stored under the short column names
# moc, spat and spec.
metric_columns = ['sf', 'adduct', 'msm', 'moc', 'spat', 'spec']
records = []
for hit in response:
    records.append((hit.sf, hit.adduct, hit.msm, hit.chaos, hit.image_corr, hit.pattern_match))
dist_df = pd.DataFrame(records, columns=metric_columns).set_index(['sf', 'adduct'])

In [8]:
# Sanity check: shape of the rows whose (sf, adduct) key repeats an earlier row.
dist_df.loc[dist_df.index.duplicated()].shape

(0, 4)

In [9]:
# Preview the first rows of the SM engine results table.
dist_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,msm,moc,spat,spec
sf,adduct,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C40H80NO8P,+H,0.974686,0.999397,0.987155,0.987964
C40H80NO8P,+Na,0.958924,0.999108,0.981799,0.977573
C41H83N2O6P,+Na,0.954489,0.999095,0.9828,0.972073
C27H44O2,+H,0.939304,0.999454,0.948845,0.990486
C37H68O4,+H,0.913609,0.998794,0.93683,0.976391


In [10]:
#dist_df = pd.read_csv('sm_dist_msm.csv').set_index(['sf', 'adduct']).drop(['formula_db', 'ds_name'], axis=1)

In [11]:
#dist_df.head()

### Ref pipeline results

In [12]:
# Show the kernel's working directory; the relative file paths used in this notebook resolve against it.
pwd

u'/Users/palmer/Documents/python_codebase/engine_annotation_utils/example'

In [13]:
# Read the reference pipeline results, drop rows with missing values, keep only
# the +H / +Na / +K adducts and index by (sf, adduct).
ref_raw = pd.read_csv('RB_a2s1_spatial_all_adducts_full_results.txt')
ref_complete = ref_raw.dropna()
keep_rows = ref_complete['adduct'].isin(['+H', '+Na', '+K'])
ref_df = ref_complete[keep_rows].set_index(['sf', 'adduct'])

In [14]:
# Preview the first rows of the reference pipeline table.
ref_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,mz,moc,spat,spec,msm
sf,adduct,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C12H18O8,+K,329.063346,0.996271,0.037435,0.96796,0.036101
C12H18O8,+H,291.107447,0.990395,0.0,0.965248,0.0
C12H18O8,+Na,313.089418,0.996748,0.0,0.964289,0.0
C12H18O9,+K,345.058243,0.998403,0.038135,0.957136,0.036442
C12H18O9,+H,307.102382,0.995199,0.036683,0.959727,0.035037


In [15]:
# Sanity check: list reference rows whose (sf, adduct) key repeats an earlier row.
ref_df.loc[ref_df.index.duplicated()]

Unnamed: 0_level_0,Unnamed: 1_level_0,mz,moc,spat,spec,msm
sf,adduct,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [16]:
# Preview the last rows of the reference pipeline table.
ref_df.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,mz,moc,spat,spec,msm
sf,adduct,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C9H17N4O9P,+K,395.036494,0.991296,0.0,0.97345,0.0
C9H17N4O9P,+Na,379.062553,0.99476,0.027157,0.974596,0.026328
C17H15NO,+K,288.078538,0.986694,0.009046,0.944659,0.008432
C17H15NO,+H,250.122653,0.991016,0.0,0.942523,0.0
C17H15NO,+Na,272.104605,0.993179,0.0,0.952641,0.0


### Plotting

In [17]:
# Keep only the (sf, adduct) keys present in both tables. Columns coming from
# ref_df that clash with dist_df names get a '_ref' suffix; rows with any
# missing value are dropped.
joined_df = dist_df.join(ref_df, how='inner', rsuffix='_ref')
plot_df = joined_df.dropna()
plot_df.shape

(15408, 9)

In [18]:
# Preview the joined table: un-suffixed metrics come from dist_df, *_ref ones from ref_df.
plot_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,msm,moc,spat,spec,mz,moc_ref,spat_ref,spec_ref,msm_ref
sf,adduct,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
C102H170N2O22P2,+H,0.0,0.0,0.0,0.0,1839.18254,0.98432,0.166277,0.66927,0.109539
C102H170N2O22P2,+Na,0.044213,0.980158,0.067293,0.670313,1861.164463,0.989823,0.075237,0.674998,0.050268
C10Cl10O,+H,0.0,0.0,0.0,0.0,490.684687,0.997411,0.0,0.599449,0.0
C10Cl12,+H,0.0,0.0,0.0,0.0,546.62466,0.996009,0.0,0.579306,0.0
C10H10N2O,+K,0.0,0.0,0.0,0.0,213.042476,0.988157,0.003408,0.97355,0.003279


In [19]:
text_tmpl = '{}{}<br>Ref pipe: moc={:.3f} spat={:.3f} spec={:.3f}<br>SM dist: moc={:.3f} spat={:.3f} spec={:.3f}'


def _msm_scatter_trace(df, adduct):
    """Build the MSM scatter trace for a single adduct.

    df must be indexed by (sf, adduct) and hold the SM engine metrics in the
    msm/moc/spat/spec columns and the reference pipeline metrics in the
    matching *_ref columns (the layout produced by the dist_df/ref_df join).

    Returns a go.Scatter with the SM engine MSM on x, the reference MSM on y
    and one hover label per point.
    """
    adduct_df = df.xs(adduct, level='adduct')
    # The template's first slot is "Ref pipe", so the *_ref values are passed
    # first and the un-suffixed SM engine values second.
    hover_text = [
        text_tmpl.format(sf, adduct, moc_ref, spat_ref, spec_ref, moc, spat, spec)
        for sf, moc_ref, spat_ref, spec_ref, moc, spat, spec in zip(
            adduct_df.index,
            adduct_df['moc_ref'], adduct_df['spat_ref'], adduct_df['spec_ref'],
            adduct_df['moc'], adduct_df['spat'], adduct_df['spec'])
    ]
    return go.Scatter(
        x=adduct_df['msm'],
        y=adduct_df['msm_ref'],
        text=hover_text,
        mode='markers',
        name=adduct,
    )


# Only +H and +K are plotted; plot_df also contains +Na rows.
traceH = _msm_scatter_trace(plot_df, '+H')
traceK = _msm_scatter_trace(plot_df, '+K')

data = [traceH, traceK]
fig = go.Figure(data=data, layout=go.Layout(
    autosize=False,
    height=800,
    width=800,
    hovermode='closest',
    title='MSM values',
    # x carries the un-suffixed (SM engine) msm column and y the msm_ref column,
    # so the axis titles follow that orientation. autorange picks the limits.
    xaxis=dict(autorange=True, title='SM engine MSM values', type='linear'),
    yaxis=dict(autorange=True, title='Reference MSM values', type='linear'),
))
iplot(fig, filename='ref_dist_msm_scatter')

In [20]:
# Pearson correlation matrix between the SM engine MSM and the reference MSM,
# computed over rows without missing values.
tmp_df = plot_df.dropna()
np.corrcoef(tmp_df[['msm', 'msm_ref']].values.T)

array([[ 1.        ,  0.84704339],
       [ 0.84704339,  1.        ]])