In [None]:
import numpy as np
import pandas as pd
import itertools
from sklearn.preprocessing import maxabs_scale, StandardScaler, minmax_scale
from scipy.spatial import distance
import plotly
import plotly.graph_objs as go
import plotly.offline as offline
from plotly.colors import n_colors
import plotly.express as px
import plotly.io as pio
# Use the fully-qualified option names: in recent pandas the bare patterns
# "max_columns" / "max_rows" also match the "styler.render.*" options, which
# makes set_option raise OptionError (pattern matched multiple keys).
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 500)

In [None]:
import plotly.io as pio
# Default width used by plotly's kaleido scope for static image export.
pio.kaleido.scope.default_width = 1200

In [None]:
# Dataset tag interpolated into the input file names and the exported figure names.
DATASET = "E13"

In [None]:
# TMT channel labels, in the order used throughout the notebook.
tmts = "126 127L 127H 128L 128H 129L 129H 130L 130H 131L".split()

# Expected value per channel as (EColi, Human), listed in the same order as `tmts`.
expected_values = dict(
    zip(
        tmts,
        [
            (0, 1),
            (1, 1),
            (1, 0),
            (0.5, 1),
            (0.5, 0),
            (0.2, 1),
            (0.2, 0),
            (0.1, 1),
            (0.1, 0),
            (1, 1),
        ],
    )
)

In [None]:
# Read the long-format quant table and pivot it to one row per spectrum
# with one column per label holding the quant value.
quant_file = f"/Users/tr341516/Downloads/04854_F1_R8_P0109699{DATASET}_TMT10_quant_pots.csv"
quant_df = (
    pd.read_csv(quant_file, index_col=0)
    .rename(columns={"spectrum_id": "Spectrum ID"})
    .pivot(index="Spectrum ID", columns="label", values="quant_value")
)

In [None]:
# Impute missing values with the minimum observed value of the same column.
# DataFrame.fillna with a Series fills column by column (matched on column label).
# This replaces the per-column `quant_df[c].fillna(..., inplace=True)`, which acts
# on a temporary object under pandas copy-on-write and leaves the NaNs untouched.
quant_df = quant_df.fillna(quant_df.min())

# log2(x + 1) transform of every value.
quant_df = quant_df.apply(lambda x: np.log2(x + 1))

# Scale every row (axis=1) by its maximum absolute value, writing the result back
# in place so index and column labels are kept.
quant_df.loc[:, :] = maxabs_scale(quant_df.values, axis=1)

In [None]:
# Load the unified peptide_forest result table for DATASET (bz2-compressed CSV,
# first column used as the index) and preview the first rows.
result_csv = f"/Users/tr341516/Downloads/PeptideForestPerm/mzml/peptide_forest_3_b7c293e21fc9526b001e0cad761d595e/04854_F1_R8_P0109699{DATASET}_TMT10_________unified_peptide_forest_3.csv.bz2"
output_df = pd.read_csv(result_csv, index_col=0, compression="bz2")
output_df.head()

In [None]:
# Engine names are the suffixes of all columns containing "Score_processed_<engine>".
all_eng = [c.split("Score_processed_")[1] for c in output_df.columns if "Score_processed" in c]

# Add species column
def determine_species(row):
    """Encode which organism tags occur in a row's "Protein ID" string.

    The match is a case-insensitive substring test. Each tag contributes one bit:
    "HUMAN" -> 1, "ECOLI" -> 2, "CONT" -> 4. The sum is capped at 4, so any row
    containing "CONT" returns 4 regardless of the other tags.

    Returns:
        0 (no tag), 1 (human only), 2 (E. coli only), 3 (human and E. coli)
        or 4 ("CONT" present).
    """
    protein_id = row["Protein ID"].upper()
    tag_bits = (("HUMAN", 1), ("ECOLI", 2), ("CONT", 4))
    code = sum(bit for tag, bit in tag_bits if tag in protein_id)
    return min(code, 4)

# Row-wise species annotation; the integer codes are defined in determine_species.
output_df["species code"] = output_df.apply(determine_species, axis=1)

In [None]:
# Join the result table with the per-spectrum quant columns on "Spectrum ID"
# (pd.merge default: inner join) and replace the index with a fresh 0..n-1 range.
_df = pd.merge(output_df, quant_df, on="Spectrum ID").reset_index(drop=True)

In [None]:
# Work on a copy so that _df keeps the merged table unchanged.
df = _df.copy()

In [None]:
# Display names for the engine keys and aggregate groups; used below for axis
# titles, legend ordering and relabelling of the "Search Engine" columns.
repl_dict = {
    "comet_2020_01_4": "Comet 2020.01.4",
    "msamanda_2_0_0_17442": "MS Amanda 2.0.0.17442",
    "msfragger_3_0": "MSFragger 3.0",
    "msgfplus_2021_03_22": "MSGF+ 2021.03.22",
    "omssa_2_1_9": "OMSSA 2.1.9",
    "xtandem_alanine": "X!Tandem Alanine",
    "mascot_2_6_2": "Mascot 2.6.2",
    "peptide_forest": "PeptideForest (all)",
    "peptide forest only": "PeptideForest (unique)",
    "any engine": "Any engine (excluding PeptideForest)",
    "most engine": "Majority of all engines (excluding PeptideForest)"
    }

## UMAP embedding of the TMT channel profiles

In [None]:
import umap
from sklearn.decomposition import PCA

In [None]:
def create_fig_data(df=None, quant_df=None, mask=None, n=None, metric="euclidean", random_state=None):
    """Embed the quant columns of the selected rows with UMAP and return a plot table.

    Args:
        df: Table holding the columns named in ``quant_df.columns`` plus any
            annotation columns that should be available for plotting.
        quant_df: Only its ``columns`` are used, to pick the quant columns of ``df``.
        mask: Boolean row selector for ``df``.
        n: If given, a random sample of ``n`` selected rows is embedded instead
            of all selected rows.
        metric: Distance metric passed to ``umap.UMAP``.
        random_state: Optional seed passed to both the row sampling and UMAP so
            the embedding can be reproduced. ``None`` (default) keeps both unseeded.

    Returns:
        DataFrame with one row per embedded row of ``df``: the quant columns, the
        original row label of ``df`` in "Spectrum ID", the coordinates "umap_1" and
        "umap_2", and every column of ``df`` (name clashes get the suffix
        "_big_table").
    """
    reducer = umap.UMAP(metric=metric, random_state=random_state)

    tmt_data = df[mask][quant_df.columns]
    if n is not None:
        tmt_data = tmt_data.sample(n, random_state=random_state)

    scaled_tmt_data = StandardScaler().fit_transform(tmt_data.values)
    embedding = reducer.fit_transform(scaled_tmt_data)

    # reset_index() moves the row labels of `df` into a column (named "index" for an
    # unnamed index); under the name "Spectrum ID" it is the key for joining back
    # onto `df` below. Non-inplace calls avoid mutating a slice of `df`.
    tmt_data = tmt_data.reset_index().rename(columns={"index": "Spectrum ID"})

    # After the reset both frames share a 0..n-1 index, so this join is positional.
    fig_data = tmt_data.join(pd.DataFrame(embedding, columns=["umap_1", "umap_2"]))
    fig_data = fig_data.join(df, on="Spectrum ID", rsuffix="_big_table")
    return fig_data

In [None]:
# For every column whose name contains "reported_by", print the number of
# non-decoy rows for which that column is set.
for c in [c for c in df.columns if "reported_by" in c]:
    print(len(df[~df["Is decoy"] & df[c]]), c)

In [None]:
from plotly.subplots import make_subplots
from sklearn.linear_model import Perceptron

# UMAP view of the quant profiles of non-decoy human / E. coli PSMs of one engine
# at several q-value cuts (one subplot row per cut).
# Left column: species taken from the protein mapping ("species code").
# Right column: species predicted by a Perceptron that is fitted and evaluated
# on the same PSMs, so the printed score is a training-set accuracy.
eng = "mascot_2_6_2"
q_cuts = [0.5, 0.1, 0.01, 0.001]
species = {
    1: "human",
    2: "ecoli",
}
species_colors = {
    1: plotly.colors.DEFAULT_PLOTLY_COLORS[0],
    2: plotly.colors.DEFAULT_PLOTLY_COLORS[1],
}

# Row count and figure height follow the list of cuts instead of being hard-coded.
fig = make_subplots(rows=len(q_cuts), cols=2, vertical_spacing=0.05, subplot_titles=("Mapped Species", "Predicted Species"))

for i, cut in enumerate(q_cuts):
    mask = ~df["Is decoy"] & (df[f'q-value_{eng}'] <= cut) & (df["species code"].isin([1,2]))
    features = df[mask][quant_df.columns].values
    labels = df[mask]["species code"].values

    clf = Perceptron(alpha=0.001, random_state=42)
    clf.fit(features, labels)
    print(eng, clf.score(features, labels))
    # Predict once per cut; the result is reused for both species traces.
    predicted = clf.predict(features)

    # With n=None the rows of fig_data are in the same order as df[mask], so the
    # positional boolean array `predicted == code` can be used to select from it.
    fig_data = create_fig_data(df=df, quant_df=quant_df, mask=mask, n=None, metric="canberra")

    # Column 1: colour by mapped species.
    for code, name in species.items():
        f_mask = fig_data["species code"] == code
        fig.add_trace(
            go.Scattergl(
                x=fig_data[f_mask]["umap_1"],
                y=fig_data[f_mask]["umap_2"],
                mode='markers',
                marker=dict(color=species_colors[code], size=1.5),
                name=name,
                legendgroup=name,
                showlegend=(i == 0),  # one legend entry per species only
            ),
            row=i+1,
            col=1
        )

    # Column 2: colour by predicted species.
    for code, name in species.items():
        f_mask = predicted == code
        fig.add_trace(
            go.Scattergl(
                x=fig_data[f_mask]["umap_1"],
                y=fig_data[f_mask]["umap_2"],
                mode='markers',
                marker=dict(color=species_colors[code], size=1.5),
                name=name,
                legendgroup="skip",
                showlegend=False,
            ),
            row=i+1,
            col=2
        )

    fig.update_yaxes(title_text=f"{repl_dict[eng]} @ {cut} q-cut", row=i+1, col=1)

fig.update_layout(
    autosize=True,
    width=800,
    height=len(q_cuts)*400,
    legend={"itemsizing": "constant"})

fig.write_image(f"masterplots/fig_{DATASET}_clustering_qcuts.png")
fig.show()

In [None]:
from plotly.subplots import make_subplots
from sklearn.linear_model import Perceptron
# roc_auc_score is used below but was never imported anywhere in the notebook.
from sklearn.metrics import roc_auc_score

# UMAP view of the quant profiles of non-decoy human / E. coli PSMs accepted at
# q <= 0.01, one subplot row per engine in `all_eng`.
# Left column: species taken from the protein mapping ("species code").
# Right column: species predicted by a Perceptron fitted on the same PSMs; the
# AUC is therefore computed on the training data, from hard class predictions.
species = {
    1: "human",
    2: "ecoli",
}
species_colors = {
    1: plotly.colors.DEFAULT_PLOTLY_COLORS[0],
    2: plotly.colors.DEFAULT_PLOTLY_COLORS[1],
}

# One row per engine (previously a hard-coded 8).
fig = make_subplots(rows=len(all_eng), cols=2, vertical_spacing=0.05, subplot_titles=("Mapped Species", "Predicted Species"))

aucs = {}
for i, eng in enumerate(all_eng):
    mask = ~df["Is decoy"] & (df[f'q-value_{eng}'] <= 0.01) & (df["species code"].isin([1,2]))
    features = df[mask][quant_df.columns].values
    labels = df[mask]["species code"].values

    clf = Perceptron(alpha=0.001, random_state=42)
    clf.fit(features, labels)
    # Predict once per engine; reused for the AUC and for both species traces.
    y_pred = clf.predict(features)
    auc = roc_auc_score(labels, y_pred, average="weighted")
    print(eng, auc)
    aucs[eng] = auc

    # With n=None the rows of fig_data are in the same order as df[mask], so the
    # positional boolean array `y_pred == code` can be used to select from it.
    fig_data = create_fig_data(df=df, quant_df=quant_df, mask=mask, n=None, metric="canberra")

    # Column 1: colour by mapped species.
    for code, name in species.items():
        f_mask = fig_data["species code"] == code
        fig.add_trace(
            go.Scattergl(
                x=fig_data[f_mask]["umap_1"],
                y=fig_data[f_mask]["umap_2"],
                mode='markers',
                marker=dict(color=species_colors[code], size=1.5),
                name=name,
                legendgroup=name,
                showlegend=(i == 0),  # one legend entry per species only
            ),
            row=i+1,
            col=1
        )

    # Column 2: colour by predicted species.
    for code, name in species.items():
        f_mask = y_pred == code
        fig.add_trace(
            go.Scattergl(
                x=fig_data[f_mask]["umap_1"],
                y=fig_data[f_mask]["umap_2"],
                mode='markers',
                marker=dict(color=species_colors[code], size=1.5),
                name=name,
                legendgroup="skip",
                showlegend=False,
            ),
            row=i+1,
            col=2
        )

    fig.update_yaxes(title_text=repl_dict[eng], row=i+1, col=1)

fig.update_layout(
    autosize=True,
    width=800,
    height=len(all_eng)*400,
    legend={"itemsizing": "constant"})

fig.write_image(f"masterplots/fig_{DATASET}_clustering.png")
fig.show()
print(aucs)

# Figure 1

In [None]:
q_val_cuts = np.logspace(-4, -1, num=20)

# Percolator result tables, read at most once per engine (None = not available).
_percolator_cache = {}


def _psm_stats(psms):
    """Summarise a set of accepted PSMs.

    Returns the tuple (decoy fraction, n target PSMs, n target peptidoforms,
    n target proteins); (pd.NA, 0, 0, 0) for an empty selection. Peptidoforms are
    distinct (Sequence, Modifications) pairs, proteins are the distinct entries of
    "Protein ID" after splitting on "<|>".
    """
    if len(psms) == 0:
        return (pd.NA, 0, 0, 0)
    targets = psms[~psms["Is decoy"]]
    peptidoforms = targets.groupby(["Sequence", "Modifications"]).ngroups
    proteins = len(set().union(*targets["Protein ID"].dropna().str.split(r"<|>").apply(set).to_list()))
    # Counting directly also covers selections with only targets or only decoys.
    n_targets = len(targets)
    n_decoys = len(psms) - n_targets
    return (n_decoys / len(psms), n_targets, peptidoforms, proteins)


def _load_percolator(eng):
    """Return the Percolator-validated PSM table for `eng`, or None if it cannot be read."""
    if eng not in _percolator_cache:
        try:
            _percolator_cache[eng] = pd.read_csv(f"/Users/tr341516/Downloads/PeptideForestPerm/mzml/percolator_3_5_0_a2e99f70a9d13c13e5ff16cfad6cad18/04854_F1_R8_P0109699{DATASET}_TMT10_{eng.replace('msfragger_3_0', 'msfragger_3')}_unified_percolator_3_5_0_validated.csv")
        except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
            # Best effort as before, but say why the engine has no Percolator numbers.
            print(f"No Percolator results for {eng}: {err!r}")
            _percolator_cache[eng] = None
    return _percolator_cache[eng]


# data[q_cut][key] = (search mode, decoy fraction, n PSMs, n peptidoforms, n proteins)
# key is the engine name for the plain search and "<engine>Perc" for Percolator.
data = {}
for q_cut in q_val_cuts:
    data[q_cut] = {}
    for c in [x for x in df.columns if "q-value_" in x]:
        eng = c.replace("q-value_", "")
        data[q_cut][eng] = ("with engine only", *_psm_stats(df[df[c] <= q_cut]))

        # Percolator
        if "peptide_forest" in eng:
            # No Percolator run is read for PeptideForest; its own numbers are
            # repeated under the Percolator search mode.
            data[q_cut][eng + "Perc"] = ("with Percolator 3.5.0", *data[q_cut][eng][1:])
            continue

        perc_key = eng.replace("msfragger_3_0", "MSFragger 3.0") + "Perc"
        pdf = _load_percolator(eng)
        if pdf is None:
            data[q_cut][perc_key] = ("with Percolator 3.5.0", pd.NA, 0, 0, 0)
            continue
        try:
            perc_stats = _psm_stats(pdf[pdf["q-value"] <= q_cut])
        except KeyError as err:
            # A Percolator table lacking an expected column is reported, not hidden.
            print(f"Percolator table for {eng} lacks column {err}")
            perc_stats = (pd.NA, 0, 0, 0)
        data[q_cut][perc_key] = ("with Percolator 3.5.0", *perc_stats)


In [None]:
# Reshape the nested stats dict to long format: one row per (key, q-value cut).
# NOTE(review): the following cell starts with these same two statements, so this
# cell is redundant when the notebook is run top to bottom.
plt_df1 = pd.DataFrame(data)
plt_df1 = pd.melt(plt_df1.reset_index(), id_vars=['index'], value_vars=q_val_cuts)

In [None]:
# Long format: one row per (key, q-value cut), with the stats tuple in "stack".
plt_df1 = pd.DataFrame(data)
plt_df1 = pd.melt(plt_df1.reset_index(), id_vars=['index'], value_vars=q_val_cuts)
plt_df1.columns = ["Search Engine", "q-value cutoff", "stack"]
# Spread the 5-tuple in "stack" over five named columns.
plt_df1[["Search performed", "FDR [%]", "n PSMs", "n Peptidoforms", "n Proteins"]] = plt_df1["stack"].apply(pd.Series)
# Drop the "Perc" marker from the keys, then map engine keys to display names.
# With regex=True the keys of repl_dict are applied as regular-expression patterns.
plt_df1["Search Engine"] = plt_df1["Search Engine"].str.replace("Perc", "")
plt_df1["Search Engine"] = plt_df1["Search Engine"].replace(repl_dict, regex=True)

In [None]:
# Stack the three count columns into ("Type", "n") pairs for faceted plotting.
# NOTE(review): "FDR [%]" is in neither id_vars nor value_vars, so that column is
# not present in plt_df1 after this cell.
plt_df1 = pd.melt(plt_df1, id_vars=["Search performed", "Search Engine", "q-value cutoff", "stack"], value_vars=["n PSMs", "n Peptidoforms", "n Proteins"])
plt_df1.columns = ["Search performed", "Search Engine", "q-value cutoff", "stack", "Type", "n"]

In [None]:
# Store the search-mode labels as plain strings.
plt_df1["Search performed"] = plt_df1["Search performed"].astype(str)

In [None]:
# Inspect the rows labelled "PeptideForest (all)".
plt_df1[plt_df1["Search Engine"] == "PeptideForest (all)"]

In [None]:
# Overview: counts against q-value cutoff, one facet row per count type and one
# facet column per search mode, one line per engine.
fig = px.line(
    plt_df1,
    x="q-value cutoff",
    y="n",
    color="Search Engine",
    log_x=True,
    markers=True,
    # Legend order: the engine display names (the last three entries of repl_dict
    # are aggregate groups and are left out).
    category_orders={"Search Engine": list(repl_dict.values())[:-3]},
    facet_row="Type",
    facet_col="Search performed",
)

# Facet annotations read "<column>=<value>"; keep only the value.
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

fig.update_layout(
    paper_bgcolor='rgba(255,255,255,1)',
    plot_bgcolor='rgba(255,255,255,1)',
    autosize=False,
    width=800,
    height=1100,
)
fig.update_xaxes(
    showline=True,
    linewidth=2,
    linecolor='black',
    gridcolor="lightgray",
    type="log",
    showexponent="none",
    exponentformat="power",
    tickmode='array',
    tickvals=[0.0001, 0.001, 0.01, 0.1],
)
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', gridcolor="lightgray")

fig.write_image("masterplots/test.png", scale=2)
fig.show()

In [None]:
# One figure per search mode: counts against q-value cutoff, one line per engine.
#
# Fixes relative to the previous version of this cell:
# * the second `fig.update_layout(` was never closed (SyntaxError); its size
#   settings and the image export now mirror the first figure;
# * rows are split on "Search performed". The earlier filter looked for
#   "Percolator" in "Search Engine", which can never match once the "Perc" marker
#   has been stripped from the engine names, so the first figure mixed both search
#   modes and the second showed PeptideForest only.
# NOTE(review): the export of the second figure (…_n_w_percolator.png) is added by
# analogy with the first; drop it if that file is not wanted.
for search_performed, suffix in (
    ("with engine only", "wo_percolator"),
    ("with Percolator 3.5.0", "w_percolator"),
):
    fig = px.line(
        plt_df1[plt_df1["Search performed"] == search_performed],
        x="q-value cutoff",
        y="n",
        color="Search Engine",
        log_x=True,
        markers=True,
        category_orders={"Search Engine": list(repl_dict.values())[:-3]},
        facet_row="Type",
        facet_col="Search performed",
    )
    fig.update_layout(
        xaxis=dict(
            showexponent='all',
            exponentformat='power'
        )
    )
    fig.update_layout(
        autosize=False,
        width=885,
        height=700,)
    fig.write_image(f"masterplots/fig_{DATASET}_n_{suffix}.png")
    fig.show()

In [None]:
# Inspect the long-format table.
plt_df1

In [None]:
plt_df1["Search performed"].value_counts()

# Decoy fraction against q-value cutoff, one figure per search mode.
#
# Fixes relative to the previous version of this cell:
# * plt_df1 has no "FDR [%]" column any more once the counts were melted into
#   ("Type", "n"); the value is recovered from position 1 of the stats tuple that
#   is still carried in "stack". Only the rows of one "Type" are used, because
#   every (engine, cutoff) pair is repeated once per count type.
# * file names use DATASET instead of a hard-coded "E32";
# * rows are split on "Search performed" (the "Percolator" substring no longer
#   occurs in "Search Engine").
# NOTE(review): the stored value is decoys / all accepted PSMs, i.e. a fraction,
# although the column is labelled "FDR [%]".
fdr_df = plt_df1[plt_df1["Type"] == "n PSMs"].copy()
fdr_df["FDR [%]"] = fdr_df["stack"].map(lambda stats: np.nan if pd.isna(stats[1]) else float(stats[1]))
fdr_df = fdr_df.dropna(subset=["FDR [%]"])

for search_performed, suffix in (
    ("with engine only", "wo_percolator"),
    ("with Percolator 3.5.0", "w_percolator"),
):
    fig = px.line(
        fdr_df[fdr_df["Search performed"] == search_performed],
        x="q-value cutoff",
        y="FDR [%]",
        color="Search Engine",
        log_x=True,
        log_y=True,
        markers=True,
        category_orders={"Search Engine": list(repl_dict.values())[:-3]},
    )
    fig.update_layout(
        xaxis=dict(
            showexponent='all',
            exponentformat='power'
        ),
        yaxis=dict(
            showexponent='all',
            exponentformat='power'
        ),
    )
    fig.write_image(f"masterplots/fig_{DATASET}_fdr_{suffix}.png")
    fig.show()

In [None]:
# Boolean flag columns per q-value cut:
#   top_target_<engine>_at_<cut>              q-value of that engine <= cut
#   top_target_any_at_<cut>                   at least one engine (PeptideForest excluded) <= cut
#   top_target_majority_at_<cut>              more than half of those engines <= cut
#   top_target_peptide_forest_only_at_<cut>   PeptideForest <= cut and no other engine <= cut
q_val_cuts = np.logspace(-4, -1, num=4)

# q-value columns of the individual engines, PeptideForest excluded.
engine_q_cols = [c for c in df.columns if "q-value_" in c and "peptide_forest" not in c]

for cut in sorted(q_val_cuts, reverse=True):
    for eng in all_eng:
        # A missing q-value compares as False, as with the former index lookup.
        df[f"top_target_{eng}_at_{cut}"] = df[f'q-value_{eng}'] <= cut

    n_engines_below_cut = (df[engine_q_cols] <= cut).sum(axis=1)
    df[f"top_target_any_at_{cut}"] = n_engines_below_cut >= 1
    # Strict majority. The former threshold `>= (len(all_eng) - 1) // 2` accepted
    # e.g. 3 of 7 engines, which is not a majority.
    df[f"top_target_majority_at_{cut}"] = n_engines_below_cut > len(engine_q_cols) / 2
    df[f"top_target_peptide_forest_only_at_{cut}"] = (df['q-value_peptide_forest'] <= cut) & ~df[f"top_target_any_at_{cut}"]

In [None]:
# NOTE(review): `plt_df1row` is not defined anywhere in this notebook, so this
# statement raised NameError on a fresh kernel. It is only applied when the name
# exists in the session; delete the cell if the row is no longer needed.
if "plt_df1row" in globals():
    plt_df1 = pd.concat([plt_df1, plt_df1row], axis=0)

In [None]:
# Inspect the long-format table.
plt_df1

In [None]:
# Canberra distance between each PSM's min-max scaled channel profile and the
# expected profile of its species, collected per engine / engine group.
channels = list(expected_values)
sum_col = "zom off tmts tschaennels"  # (sic) row sum over all TMT channel columns


def _canberra_to_expected(frame, selection, reference):
    """Canberra distance of every selected row's channel profile to `reference`.

    The channel columns of the selected rows are min-max scaled per row first.
    Returns a 1-D float array; it is empty when nothing is selected (minmax_scale
    would raise on an empty input).
    """
    profiles = frame.loc[selection, channels]
    if profiles.empty:
        return np.array([])
    scaled = minmax_scale(profiles, axis=1)
    return np.array([distance.canberra(row, reference) for row in scaled])


# The channel sum is only computed for non-decoy human / E. coli rows; all other
# rows get NaN through index alignment.
df[sum_col] = df[(df["species code"].isin([1,2])) & (~df["Is decoy"])][channels].sum(axis=1)
inds_ecoli = df[df["species code"] == 2].sort_values(sum_col, ascending=False).head(5000).index
inds_human = df[df["species code"] == 1].sort_values(sum_col, ascending=False).head(5000).index
# The indices are labels, so select with .loc (.iloc only gave the same rows while
# df still had its default 0..n-1 index). The copy makes the writes below act on
# an independent frame instead of a slice.
df = df.loc[inds_ecoli.union(inds_human)].copy()

# Express every channel as a fraction of the row's channel sum.
for channel in channels:
    df.loc[:, channel] /= df[sum_col]

# There is no reported_by column for PeptideForest; treat every row as reported.
df["reported_by_peptide_forest"] = True

data = {"ecoli": {}, "human": {}}
ref_ecoli = [v[0] for v in expected_values.values()]
ref_human = [v[1] for v in expected_values.values()]
q_val_cut = 0.01

# Row selection per result key, in the order the keys are stored in `data`.
selections = {
    eng: df[f"top_target_{eng}_at_{q_val_cut}"] & df[f"reported_by_{eng}"]
    for eng in all_eng
}
selections["any engine"] = df[f"top_target_any_at_{q_val_cut}"]
selections["most engine"] = df[f"top_target_majority_at_{q_val_cut}"]
selections["peptide forest only"] = df[f"top_target_peptide_forest_only_at_{q_val_cut}"]

is_target = ~df["Is decoy"]
for key, selected in selections.items():
    data["human"][key] = _canberra_to_expected(df, (df["species code"] == 1) & is_target & selected, ref_human)
    data["ecoli"][key] = _canberra_to_expected(df, (df["species code"] == 2) & is_target & selected, ref_ecoli)

In [None]:
# Long format: one row per single distance value, labelled with engine and species.
plt_df = (
    pd.DataFrame(data)
    .reset_index()
    .melt(id_vars=["index"], value_vars=["ecoli", "human"])
    # Every cell holds an array of distances; explode gives one row per value.
    .explode("value")
    .reset_index(drop=True)
    .rename(columns={"index": "Search Engine", "variable": "Species", "value": "Canberra distance"})
)
plt_df

In [None]:
# Map the internal keys to display names with the notebook's `repl_dict`.
# The identical second copy of the dict that used to be defined here is gone, so
# the labels have a single source and cannot drift apart.
plt_df["Search Engine"] = plt_df["Search Engine"].replace(repl_dict)

In [None]:
# List the distinct engine labels present in the table.
plt_df["Search Engine"].unique()

In [None]:
# Box plot of the Canberra distances, one box per engine / engine group, added
# in reverse repl_dict order.
fig = go.Figure()
# Group by the column name itself: grouping by a one-element list requires tuple
# keys in get_group on recent pandas.
grpby = plt_df.groupby("Search Engine")
for name in list(repl_dict.values())[::-1]:
    if name not in grpby.groups:
        # No rows for this label; skip it instead of failing with KeyError.
        continue
    grp = grpby.get_group(name)
    fig.add_trace(go.Box(x=grp["Canberra distance"], name=name))

fig.update_layout(xaxis_showgrid=False, xaxis_zeroline=False, showlegend=False)
fig.update_yaxes(title="Search Engine")
fig.update_xaxes(title="Canberra distance")
fig.write_image(f"masterplots/fig_{DATASET}_distance_dist.png")

fig.show()

In [None]:
from scipy.stats import ks_2samp, kstest

# Two-sample Kolmogorov-Smirnov test of the "PeptideForest (all)" distances
# against the distances of every engine label, including PeptideForest itself.
is_peptide_forest = plt_df["Search Engine"] == "PeptideForest (all)"
dist_peptide_forest = plt_df.loc[is_peptide_forest, "Canberra distance"].values

for name, d in plt_df.groupby("Search Engine"):
    print(name)
    ks_result = ks_2samp(dist_peptide_forest, d["Canberra distance"].values, mode="exact")
    print(ks_result, "\n")