In [None]:
import numpy as np
import pandas as pd
import itertools
import glob
import peptide_forest
# Use the fully qualified option key: on pandas >= 1.4 the bare pattern
# "max_columns" also matches "styler.render.max_columns" and raises
# OptionError ("Pattern matched multiple keys").
pd.set_option("display.max_columns", 1000)
import plotly
import plotly.graph_objs as go
import plotly.offline as offline
import plotly.io as pio
# Set plotly's default figure template to "plotly_white".
pio.templates.default = "plotly_white"

In [None]:
import ursgal
# Show the installed ursgal version as the cell output.
ursgal.__version__

# Reading percolator outputs

In [None]:
# Load every percolator result table and tag its rows with the search engine
# whose name appears in the file path.
# sorted(): glob returns paths in arbitrary, OS-dependent order; sorting makes
# the row order of the combined frame reproducible between runs.
percolator_csvs = sorted(glob.glob("../data/E32/*percolator_3_4_0_validated.csv"))
if not percolator_csvs:
    # pd.concat([]) would otherwise fail with "No objects to concatenate",
    # which hides the real cause (wrong working directory / missing data).
    raise FileNotFoundError(
        "No files matching ../data/E32/*percolator_3_4_0_validated.csv were found"
    )

dfs = []
for csv_path in percolator_csvs:
    _df = pd.read_csv(csv_path)
    for e in ['mascot', 'msfragger', 'msgfplus', 'omssa', 'xtandem']:
        if e in csv_path:
            _df['engine'] = e
    dfs.append(_df)

# ignore_index=True gives every row a unique label instead of repeating each
# file's own 0..n index in the combined frame.
df = pd.concat(dfs, ignore_index=True)

In [None]:
# Summary statistics of the combined percolator table.
df.describe()

In [None]:
# Replace missing modifications with an empty string.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: that form is chained assignment, which emits a warning on
# pandas 2.x and leaves `df` unchanged once Copy-on-Write is active.
df['Modifications'] = df['Modifications'].fillna('')


In [None]:
def create_seqmod(r):
    """Build the ``sequence#modifications`` identifier for one row.

    Args:
        r: Mapping-like row (e.g. a pandas Series or a dict) providing the
            'Sequence' and 'Modifications' entries.

    Returns:
        ``r['Sequence']`` alone when the row has no modifications, otherwise
        ``r['Sequence'] + "#" + r['Modifications']``.
    """
    mods = r['Modifications']
    # A missing value (NaN / None) is treated like the empty string so the
    # function does not raise TypeError when the column was not filled first.
    if not isinstance(mods, str) or mods == '':
        return r['Sequence']
    return r['Sequence'] + "#" + mods

# Add the 'seq#mod' column by applying create_seqmod to every row, then
# preview the first rows.
df['seq#mod'] = df.apply(create_seqmod, axis=1)
df.head()

In [None]:
# Per engine: number of distinct target seq#mod values with q-value < 0.01.
for engine_name, engine_psms in df.groupby('engine'):
    is_target = ~engine_psms['Is decoy']
    below_cut = engine_psms['q-value'] < 0.01
    accepted = engine_psms.loc[is_target & below_cut, 'seq#mod']
    print(engine_name, accepted.nunique())

In [None]:
# Create an ursgal controller and look up the 'venndiagram_1_1_0' node so that
# it can be called below as a plain Python function (`venn_main`).
uc = ursgal.UController(verbose=False)
venn_main = uc.unodes['venndiagram_1_1_0']['class'].import_engine_as_python_function()
# NOTE: this lookup is clumsy; the AirFlow nodes would make it prettier.

In [None]:
# One labelled set per engine: the target seq#mod values with q-value < 0.01.
data = [
    {
        'label': engine_name,
        'data': set(
            engine_psms.loc[
                ~engine_psms['Is decoy'] & (engine_psms['q-value'] < 0.01),
                'seq#mod',
            ].tolist()
        ),
    }
    for engine_name, engine_psms in df.groupby('engine')
]

results = venn_main(data=data, width=1500, height=1500)


In [None]:
# SVG is imported from the public IPython.display namespace rather than the
# internal IPython.core.display module.
from IPython.display import SVG

# Render the venn diagram file inline.
SVG(filename="VennDiagram.svg")

# Reading peptide forest output

In [None]:
# Peptide forest result table to analyse. The alternative input
# "../01Apr_E13.csv" was previously assigned here and immediately overwritten;
# swap the path below to analyse that file instead.
output = "../01Apr_E32.csv"
final_df = pd.read_csv(output)

# Assign the filled column back instead of calling fillna(inplace=True) on the
# attribute access: that is chained assignment, which warns on pandas 2.x and
# leaves `final_df` unchanged once Copy-on-Write is active.
final_df['Modifications'] = final_df['Modifications'].fillna("")
final_df['seq#mod'] = final_df.apply(create_seqmod, axis=1)

In [None]:
# Preview the first rows of the peptide forest table.
final_df.head()

In [None]:
# Engine names are whatever follows the "Score_processed_" prefix in a column
# name. The same prefix is used for filtering and splitting, so a column that
# merely contains "Score_processed" (without the trailing underscore) can no
# longer pass the filter and then fail the split with IndexError.
score_prefix = "Score_processed_"
all_eng = [
    c.split(score_prefix)[1] for c in final_df.columns if score_prefix in c
]

In [None]:
# Ten logarithmically spaced q-value thresholds from 1e-4 up to 1e-1.
q_val_cuts = np.logspace(start=-4, stop=-1, num=10)
print(q_val_cuts)

In [None]:
# Colour triplets per engine slot. Index 0 is used for the curves computed
# from `final_df`, index 1 for the percolator curves computed from `df`;
# index 2 is currently unused.
color_set = {
    0 : ['sandybrown', 'chocolate', 'goldenrod'],
    1 : ['mediumvioletred', 'deeppink', 'lightpink'],
    2 : ['blueviolet', 'orchid', 'plum'],
    3 : ['royalblue', 'skyblue', 'slateblue'],
    4 : ['olive', 'springgreen', 'lawngreen'],
    5 : ['maroon', 'brown', 'sienna']
}


def _accepted_counts(frame, q_value_column, cuts):
    """Count distinct target seq#mod values below each q-value cut.

    Args:
        frame: DataFrame with boolean 'Is decoy', a 'seq#mod' column and the
            given q-value column.
        q_value_column: Name of the column holding the q-values to threshold.
        cuts: Iterable of q-value thresholds.

    Returns:
        List with one count per cut, in the order of `cuts`.
    """
    targets = frame[~frame['Is decoy']]
    return [
        targets.loc[targets[q_value_column] < cut, 'seq#mod'].nunique()
        for cut in cuts
    ]


def _slot_color(pos, shade):
    """Return the colour for slot `pos`, wrapping around the palette.

    Wrapping avoids a KeyError when there are more engines than palette rows.
    """
    return color_set[pos % len(color_set)][shade]


data = []

# One solid curve per engine column found in the peptide forest table.
for pos, eng in enumerate(all_eng):
    data.append(
        go.Scatter(
            x=q_val_cuts,
            y=_accepted_counts(final_df, f'q-value_{eng}', q_val_cuts),
            mode='lines+markers',
            name=eng,
            line=dict(
                color=_slot_color(pos, 0)
            )
        )
    )

# One dashed curve per engine from the percolator results (groupby already
# yields the groups sorted by engine name).
for pos, (name, grp) in enumerate(df.groupby('engine')):
    data.append(
        go.Scatter(
            x=q_val_cuts,
            y=_accepted_counts(grp, 'q-value', q_val_cuts),
            mode='lines+markers',
            name=f"{name}:percolator v3.4",
            line=dict(
                dash='dash',
                color=_slot_color(pos, 1)
            )
        )
    )

In [None]:
# Plot all curves with a logarithmic q-value axis.
fig = go.Figure(data=data)
fig.update_layout(
    xaxis=dict(type="log", title='q-value'),
    yaxis=dict(title='accepted unique seq#mod'),
)
fig.show()

# Comparing engine vanilla results

In [None]:
# Venn diagram of the target seq#mod values accepted below the q-value
# threshold for every engine column except 'RF-reg'.
q_value = 0.01
is_target = ~final_df['Is decoy']

data = [
    {
        'label': eng,
        'data': set(
            final_df.loc[
                is_target & (final_df[f'q-value_{eng}'] < q_value),
                'seq#mod',
            ].tolist()
        ),
    }
    for eng in all_eng
    if eng != 'RF-reg'
]

results = venn_main(
    data=data,
    output_file="Venn_Comparison.svg",
    width=1200,
    height=1000,
)


In [None]:
# SVG is imported from the public IPython.display namespace rather than the
# internal IPython.core.display module.
from IPython.display import SVG

# Render the comparison venn diagram file inline.
SVG(filename="Venn_Comparison.svg")

# Score distributions

In [None]:
# Import the submodule explicitly: `import plotly` alone does not guarantee
# that `plotly.subplots` is available as an attribute.
from plotly.subplots import make_subplots

eng = "mascot_1_0_0"
# eng = "msfragger_20190222"
score_col = f'Score_processed_{eng}'
q_value_col = f'q-value_{eng}'
min_score = 5  # only rows scoring above this value are plotted

fig = make_subplots(specs=[[{"secondary_y": True}]])

# Sort by score (rebinding instead of inplace=True); presumably so that the
# q-value line is drawn in score order -- TODO confirm.
final_df = final_df.sort_values([score_col])

# Compute the shared row selections once instead of repeating each mask.
above_min = final_df[final_df[score_col] > min_score]
is_decoy = above_min['Is decoy']
with_q_value = above_min[above_min[q_value_col] < 1]

fig.add_trace(
    go.Histogram(
        x=above_min.loc[~is_decoy, score_col],
        name="target"
    )
)
fig.add_trace(
    go.Histogram(
        x=above_min.loc[is_decoy, score_col],
        name="decoy"
    )
)
# q-value versus score, drawn against the secondary y axis.
fig.add_trace(
    go.Scatter(
        x=with_q_value[score_col],
        y=with_q_value[q_value_col],
        name="q-value",
        marker_color='rgba(22, 22, 22, .9)'
    ),
    secondary_y=True
)
# Overlay both histograms
fig.update_layout(
    barmode='overlay',
    title=f"Score distribution for target and decoy for {eng}"
)
fig.update_xaxes(title="Score processed")
# The first call titles every y axis; the second overrides the secondary one.
fig.update_yaxes(title="Frequency")
fig.update_yaxes(title="q-value", secondary_y=True)
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.45)
fig.show()

In [None]:
# Show every row of the peptide forest table whose 'Spectrum ID' is 14673.
final_df[final_df['Spectrum ID'] == 14673]

In [None]:
def _accepted_seqmods(frame, q_value_column):
    """Return the set of target seq#mod values with q-value below 0.01."""
    keep = ~frame['Is decoy'] & (frame[q_value_column] < 0.01)
    return set(frame.loc[keep, 'seq#mod'].tolist())


# Percolator results for two engines ...
percolator_sets = [
    {
        'label': "{0}:percolator v3.4".format(engine),
        'data': _accepted_seqmods(df[df['engine'] == engine], 'q-value'),
    }
    for engine in ['msgfplus', 'mascot']
]

# ... compared with three q-value columns of the peptide forest table.
forest_sets = [
    {
        'label': column_suffix,
        'data': _accepted_seqmods(final_df, f'q-value_{column_suffix}'),
    }
    for column_suffix in ['RF-reg', 'mascot_1_0_0', 'msgfplus_v2018_06_28']
]

data = percolator_sets + forest_sets

results = venn_main(data=data)


In [None]:
# SVG is imported from the public IPython.display namespace rather than the
# internal IPython.core.display module.
from IPython.display import SVG

# Render the venn diagram file inline.
SVG(filename="VennDiagram.svg")