In [None]:
# @IMPORT-MERGE
import numpy as np
import pandas as pd
from munch import Munch
from plaster.tools.zplots import zplots
from plaster.run.plots import plots
from plaster.run.plots import plots_dev as pdev
from plaster.run.plots import plots_dev_pro as ppro
from plaster.run.plots import plots_dev_ptm as pptm
from plaster.run.run import RunResult
from plaster.run.job import JobResult,MultiJobResult
from plaster.tools.ipynb_helpers.displays import hd
from plaster.tools.log.log import error, debug
from plaster.tools.utils.utils import json_print,np_safe_divide,munch_abbreviation_string


In [None]:
# @REMOVE-FROM-TEMPLATE
#
# Set up zplots and keep the handle it returns in `z`.
z = zplots.setup()

# Folder of the job whose results this report loads.
# If you are running this report by dropping it into a job folder,
# then use "./" as the folder instead before running.
job_folder = "../../../jobs_folder/yoda_small_multi_2__survey/"
job = JobResult(job_folder)


## Optionally change proteins or PTMs of interest

In [None]:
# You typically do not need to edit this cell, just execute it.
#
# The job defines its own proteins-of-interest (POI) and any PTMs for the
# proteins in the job.  Both can be overridden here, which affects how the
# survey decides which protease/label-schemes are "best".  Whether the POI and
# PTMs come from the original job definition or from this cell, the domain can
# be *reduced* further with the pro_subset and ptm_subset filters in the next
# cell.
#
# Set the proteins of interest (an empty list sets none):
# job.set_pros_of_interest( protein_ids=[ 'P10636-8', 'P2798'] )
#
# Set the PTM locations of a protein (empty string or ;-delimited list):
# job.set_pro_ptm_locs( protein_id='P10636-8', ptms='181;182;185')
#
# See the current PTM setting of a protein:
# job.get_pro_ptm_locs( protein_id='P10636-8' )

# Show the current proteins of interest, including any PTMs set on them:
# the two run columns are dropped and one row is kept per pro_id.
print( "Proteins of Interest")
pros_of_interest = job.get_pros_of_interest()
dropped_columns = ['run_i','run_name']
poi_df = pros_of_interest.drop( dropped_columns, axis='columns' )
poi_df = poi_df.drop_duplicates('pro_id')
display(poi_df)


## Edit Filters and Find Best Runs

In [None]:
def best_runs_for_objective(filters,title_extra=''):
    """
    Query the notebook-level `job` for its best runs and render a small report.

    The report consists of an h1 heading naming `filters.objective` (followed
    by `title_extra`), an h3 "Filters" heading, the filters themselves via
    `json_print`, a blank line, and finally the resulting table.

    Side effect: pandas' global 'display.max_columns' option is set to None
    and is left that way after the call.

    Arguments:
        filters: passed unchanged as `filters=` to `job.get_nn_stats_df`; must
            expose an `objective` attribute for the heading.
        title_extra: optional text appended to the h1 heading.

    Returns:
        The value returned by `job.get_nn_stats_df( filters=filters )`.
    """
    # The query runs before any heading is emitted.
    stats_df = job.get_nn_stats_df( filters=filters )

    heading = f'Best runs for objective: {filters.objective} {title_extra}'
    hd('h1', heading)

    hd('h3', 'Filters' )
    json_print(filters)
    print()

    # Lift the column limit before rendering the table.
    pd.set_option('display.max_columns', None)
    display(stats_df)

    return stats_df


# Edit the filters here, then run this cell.
# Allowed values for each setting are given in the comment next to it.
filters = Munch(
    allow_proline_at_2=True,          # True or False
    exclude_runs=[],                  # [] or list of runs to exclude, e.g. ['gluc_ph4_c_k_de_y_9880']
    include_runs=[],                  # [] or list of runs to consider, e.g. ['gluc_ph4_c_k_de_y_9880']
    max_dyes_per_ch=5,                # None, or integer
    max_pep_len=50,                   # None, or integer
    max_ptms_per_pep=None,            # None, or integer
    multi_peptide_metric="dist_min",  # None, 'dist_min', or 'dist_avg'
    n_best_schemes=10,                # integer - display top n best protease/label schemes
    n_peps_per_scheme=1,              # integer - display top n peps per best scheme found
    objective="protein_id",           # 'protein_id', 'coverage', or 'ptms'
    poi_only=True,                    # limit to 'proteins of interest'?
    pro_subset=[],                    # reduce domain of proteins to consider, e.g. ['Q14997']
    ptm_subset=[],                    # reduce domain of ptms to consider, e.g. [181,184]
    verbose=0,                        # set to 1 for various info on filtering (dev)
)

best_runs = best_runs_for_objective(filters)

# To save the best_runs dataframe to a CSV named for the filter settings,
# uncomment the two lines below.

# user = ''
# best_runs.to_csv(f'./survey_best_runs_{user}_{munch_abbreviation_string(filters)}.csv',index=False,float_format="%g")


In [None]:
# The following is an example of how you might choose to look at best runs for protein
# identification for two proteins, first individually to see which runs are the very
# best for each protein individually, and then together to see which runs produce the
# best combined result via composite nearest-neighbor distance for their best peptides.
# This example uses the yoda_small_multi_2__survey job (or similar) which seeks to identify
# two proteins in the mixture.
#
# The example works on a *copy* of `filters`, so the settings chosen in the filter
# cell are not modified and re-running this cell always starts from those settings.
# Change `if False:` to `if True:` to run the example.

if False:
    # Shallow copy is sufficient: only top-level keys are reassigned below,
    # the list values of `filters` are never mutated in place.
    pro_filters = Munch(filters)

    pro_filters.poi_only = True     # only look at proteins of interest, which we'll further limit below
    pro_filters.multi_peptide_metric = 'dist_min' # it's ok if this is on even when doing 1 protein

    pro_filters.pro_subset = ['Q14997']  # find best runs for this protein
    best_runs = best_runs_for_objective(pro_filters, 'Q14997' )

    pro_filters.pro_subset = ['P40306'] # find best runs for this protein
    best_runs = best_runs_for_objective(pro_filters, 'P40306')

    pro_filters.pro_subset = []  # remove specific subset, so we'll be looking at all proteins of interest (those two)
    best_runs = best_runs_for_objective(pro_filters, 'Both together')

    pro_filters.multi_peptide_metric = None  # Now just get best peptides from *any* POI to see which proteins need help
    pro_filters.n_best_schemes = 50
    best_runs = best_runs_for_objective(pro_filters, 'Best peps either protein')



In [None]:
# The following is an example of how you might choose to look at best runs for ptm
# identification for a handful of PTM locations - first by individual location and
# then together.  It depends on what question you are asking.  Do you want to find
# the best individual runs per PTM location (like the train_and_test_template_ptm
# will illustrate for you), or do you want to find runs that, while probably not as
# optimal for any given location, will give you some measure of the "best" result
# for all locations combined?
#
# The example works on a *copy* of `filters`, so the settings chosen in the filter
# cell are not modified and re-running this cell always starts from those settings.
# Change `if False:` to `if True:` to run the example.
#
# NOTE(review): this example never sets `objective`; it uses whatever the filter
# cell chose ('protein_id' as written there).  Presumably 'ptms' is intended for
# PTM identification - confirm and set it in the filter cell if so.

if False:
    # Shallow copy is sufficient: only top-level keys are reassigned below,
    # the list values of `filters` are never mutated in place.
    ptm_filters = Munch(filters)

    ptm_filters.ptm_subset = [181]  # find best runs for this ptm
    best_runs = best_runs_for_objective(ptm_filters, 'PTM 181' )

    ptm_filters.ptm_subset = [199] # find best runs for this ptm
    best_runs = best_runs_for_objective(ptm_filters, 'PTM 199')

    ptm_filters.ptm_subset = [181,199]  # find best runs if you want one run to see both PTMs
    best_runs = best_runs_for_objective(ptm_filters, 'Both together')

    ptm_filters.multi_peptide_metric = None  # Now just get best PTM peptides across runs to see which PTMs need help
    ptm_filters.n_best_schemes = 50
    best_runs = best_runs_for_objective(ptm_filters, 'Best runs either PTM')
