In [None]:
# @IMPORT-MERGE
import numpy as np
import pandas as pd
from munch import Munch
from plaster.tools.zplots import zplots
from plaster.run.plots import plots
from plaster.run.plots import plots_dev as pdev
from plaster.run.plots import plots_dev_ptm as pptm
from plaster.run.run import RunResult
from plaster.run.job import JobResult
from plaster.tools.ipynb_helpers.displays import hd
from plaster.tools.log.log import error, debug
from plaster.tools.utils.utils import json_print,np_safe_divide,munch_abbreviation_string


In [None]:
# @REMOVE-FROM-TEMPLATE
#
# Initialize the zplots plotting helper; the returned handle is kept in `z`.
z = zplots.setup()

# Load the results of the job this report is about.  The relative path below
# points at one specific job folder -- replace it with your own.
job = JobResult("../../../jobs_folder/tau8_local_bg3_3color_may27/")

# If you are running this report by dropping it into a job folder,
# then comment out the line above and uncomment this line before running the report:
# job = JobResult("./")


## Optionally Edit your Proteins or PTMs of interest
* These are typically specified when you create a plaster job
* You needn't specify anything here unless you want to override those settings.
* Execute this cell to see the current setting.
* This cell loads all PR data and may take some minutes.

In [None]:
# Classifier to report on: None to use the best available, or e.g. 'rf' to request
# a specific one.  A requested classifier that is not among the available ones is
# not used; the first available classifier is chosen instead (see further below).
which_classifier = None

#
# Add entries to this list to specify proteins of interest, and optionally locations
# of interest on those proteins.  Note that if you don't set anything here, any
# --protein_of_interest you specified via pgen will be used instead, and any PTM
# locations given in a --protein_csv file will be used for that protein.
#
# Each entry is a tuple ( protein_id, ptms ).  In the examples below, ptms is a
# string of ';'-separated locations, or '' when no locations are given.
#
proteins_locations = [
# These are examples.  Add your own that are not commented out.
#     ( 'P10636-8', '181;184;185;199;202;214;231;237;404' ),
#     ( 'P02768'  , '25' ),
#     ( 'Q13885'  , '' )
]

# You should not edit anything below this point unless you're adventurous.  :)
#
# TODO: much/all of this code should get moved into a python file that is called from here.
#===========================================================================================

# Register any proteins (and their PTM locations) listed above with the job.
if proteins_locations:
    job.set_pros_of_interest( protein_ids=[entry[0] for entry in proteins_locations] )
    for poi, ptms in proteins_locations:
        job.set_pro_ptm_locs( protein_id=poi, ptms=ptms )

# If there are proteins of interest, reporting will be focused on those.
# If those have PTM locations of interest, reporting will be further focused on those.
# The flags include_xxx_only determine which proteins/peptides are pulled into
# the reporting -- you can override those if you want.
#
# 'abundance' is only shown when the prep result of the first run provides it.
columns = ['pro_id', 'pro_ptm_locs']
if 'abundance' in job.runs[0].prep.pros().columns:
    columns.append('abundance')

# One row per protein of interest, restricted to the columns chosen above.
proteins_of_interest = (
    job.get_pros_of_interest()
    .drop_duplicates('pro_id')
    [columns]
)

# PTM locations for each distinct protein of interest, in order of appearance.
ptms_for_proteins = [
    job.get_pro_ptm_locs(pro_id)
    for pro_id in proteins_of_interest['pro_id'].unique()
]

# poi only if there are some specified
include_poi_only = not proteins_of_interest.empty
# ptm only if, additionally, every protein of interest has PTM locations
include_ptm_only = include_poi_only and all( ptms_for_proteins )

# This section tells you what the reporting will be based on, and
# loads precision/recall/scoring information for that domain.
#
if not proteins_of_interest.empty:
    print( "Proteins of interest:" )
    display( proteins_of_interest )
    print()

# Choose a classifier based on availability and user request at top of cell.
#
available_classifiers = job.runs[0].get_available_classifiers()
if len(available_classifiers) == 0:
    # Fail with an explicit message instead of an opaque IndexError on [0] below.
    raise ValueError( "No classifiers are available for the first run of this job." )

if which_classifier in available_classifiers:
    chosen_classifier = which_classifier
else:
    # None (no preference) or an unavailable request: use the first available one.
    chosen_classifier = available_classifiers[0]
    if which_classifier is not None:
        # Tell the user their explicit request could not be honored rather than
        # silently reporting on a different classifier.
        print( f"WARNING: requested classifier '{which_classifier}' is not available; using '{chosen_classifier}' instead." )

print( f"Available classifiers : {available_classifiers}\n")


# Arguments shared by both PR-report requests below.
prs_args = Munch({
    'include_poi_only': include_poi_only,
    'include_ptm_only': include_ptm_only,
    'force_compute_prs': False,
    'classifier': chosen_classifier,
})

print( "Loading PR information for peptides based on this:" )
json_print( prs_args )
print( "\nTakes a minute...")

# The same report is requested twice: once as-is, and once with abundance included.
all_runs_pr = job.peps_prs_report_df(
    **prs_args,
)
all_runs_pr_abund = job.peps_prs_report_df(
    **prs_args,
    pr_with_abundance=True,
)
print( "done." )

## Edit your filters and find best runs

In [None]:
def best_runs_for_ptm_locations(filters):
    """
    Show and return the best runs per PTM location for the given filters.

    Prints the filters, asks the job for the best-precision runs over
    all_runs_pr, reports which PTMs were removed by filtering and which
    remain, plots the per-peptide/per-run breakout, and displays the result.

    Arguments:
        filters: filter settings, forwarded unchanged to
                 job.get_best_precision_runs_for_ptms and to the plot.

    Returns:
        The first element returned by job.get_best_precision_runs_for_ptms
        (the table that is also displayed).
    """
    hd('h1','Best runs (protease+labels) per PTM location')
    hd('h3', 'Filters' )
    json_print(filters)
    print()

    result = job.get_best_precision_runs_for_ptms( all_runs_pr, filters )
    best_runs_df, remain_ptms, removed_ptms = result

    print(f"PTMs removed by filtering: {removed_ptms}")
    print(f"PTMs observable          : {remain_ptms}")

    pptm.plot_pr_breakout_peps_runs(
        job,
        best_runs_df,
        filters,
        _size=640,
        _max_legend_items=15,
    )

    display(best_runs_df)
    return best_runs_df
    

# Edit the filters here, then run this cell
#
# `filters` is handed to best_runs_for_ptm_locations() (which forwards it to the
# job's best-precision search and to the plot), and its abbreviated string form
# becomes part of the names of the CSV files written by this notebook.
#
filters = Munch(
    allow_proline_at_2=True,       # True or False
    classifier=chosen_classifier,  # edit which_classifier in cell above to change this.
    exclude_runs=[],               # [] or List of runs to exclude, e.g. ['gluc_ph4_c_k_de_y_9880']
    include_runs=[],               # [] or List of runs to consider, e.g. ['gluc_ph4_c_k_de_y_9880']
    max_pep_len=50,                # None, or some integer
    max_ptms_per_pep=None,         # None, or some integer
    min_recall=0.1,                # floating point number between 0 and 1
    n_best_runs=1,                 # integer >= 1
    ptm_subset=[],                 # None, [], or list of PTMs to consider, e.g. [181,404]
)


# Do not truncate columns when frames are displayed.
pd.set_option('display.max_columns', None)
best_pr = best_runs_for_ptm_locations(filters)

# The following lines save your best_pr dataframe to a CSV named for the filter settings.
# `user` is optional: it contributes to the file name only if it was already defined.
user = globals().get('user', '')
best_pr.to_csv(
    f'./report_best_pr__{user}__{munch_abbreviation_string(filters)}.csv',
    index=False,
    float_format="%g",
)


# Runs that produced at least one best-precision-at-recall

In [None]:

#
# Execute this cell to get a standard report on each run that produced at least one
# best precision-recall for a peptide.
#
# Or call run_report with your run_i of interest.
#
def run_report( run_i ):
    """
    Display a report for a single run of the job.

    Shows, in order: the standard run report, the table of peptide precision
    at the requested filters.min_recall for this run, and a confusion-matrix
    comparison for this run's best row in best_pr (highest prec, then recall).

    Arguments:
        run_i: index into job.runs of the run to report on.

    If the run has no rows in best_pr (e.g. it was called with a run that did
    not produce any "best pr" under the current filters), the confusion matrix
    is skipped with a message instead of failing with an IndexError.
    """
    run = job.runs[run_i]
    hd('h1','_________________________________________________________________')
    plots.standard_run_report( run, classifier=filters.classifier )

    hd('h2', f"Peptides with PTMs, requested min_recall={filters.min_recall} ({filters.classifier})")
    df = pdev.peps_prec_at_min_recall_df( all_runs_pr[all_runs_pr.run_i==run_i], min_recall=filters.min_recall).drop('ptm',axis=1)
    display( df )
    print()

    hd('h2', "Confusion Matrix, with & without score threshold (best precision pep_i)")
    run_best_pr = best_pr[best_pr.run_i==run_i]
    if run_best_pr.empty:
        # Nothing to pick a "best precision" peptide from for this run.
        print( f"run_i={run_i} has no rows in best_pr under the current filters; skipping confusion matrix." )
        return

    # Best row = highest precision, ties broken by highest recall.
    row = run_best_pr.sort_values(by=['prec','recall'],ascending=[False,False]).iloc[0]
    pdev.plot_confusion_matrix_compare( run, row.pep_i, row.score, classifier=filters.classifier )


# Summarize which runs appear in best_pr.
# run_info.run_iz is a list of run_i sorted by best->worst based on filter
# best = produces most peptides with "best pr" of all runs
run_info = pdev._run_iz_count_pep_iz( best_pr )

# Set to True to get a standard run report on each run that produced a "best pr"
if False:
    for run_i in run_info.run_iz:
        run_report( run_i )


#
# Or get a report on a specific run_i
#
if True:
    if len(run_info.run_iz) > 0:
        run_i = run_info.run_iz[0]  # run_iz is sorted from best->worst
        run_report( run_i )
    else:
        # Avoid an IndexError on run_iz[0] when no run is listed.
        print( "No runs produced a best precision-recall under the current filters; nothing to report." )



# Explore fluorosequences

In [None]:
#
# To explore details for a given fluorosequence:
#
# Edit the flu and run you want to explore & set to True

if True:
    # The run to look in (first entry of run_info.run_iz) and the fluorosequence to inspect.
    run_i = run_info.run_iz[0]
    flu = '.0.0....2...... ;4,0,1'

    # Keep only the PR rows of that run whose flustr equals the chosen flu.
    in_run_with_flu = (all_runs_pr.run_i == run_i) & (all_runs_pr.flustr == flu)
    peps_prs_df = all_runs_pr[in_run_with_flu]

    pdev.plot_flu_info(
        job.runs[run_i],
        flu,
        peps_prs_df=peps_prs_df,
        min_recall=filters.min_recall,
        classifier=filters.classifier,
    )



# runs_pr_falses.csv for selected runs

In [None]:
# @REMOVE-FROM-TEMPLATE
# Precision thresholds for the runs_pr_falses report; the next cell iterates over
# this tuple, so keep the trailing comma when there is a single value.
PGEN_report_precisions = (0.9,)


In [None]:
#==========================================================================================
# Edit your desired parameters here
#==========================================================================================
# Precision thresholds to report at; one set of rows is produced per value.
precisions = PGEN_report_precisions  # see above cell, or cell at top of notebook
# Forwarded to false_rates_all_peps__ptm_info for every run
# (presumably the number of false calls listed per peptide -- TODO confirm).
n_falses = 1
# Forwarded to false_rates_all_peps__ptm_info for every run.
protein_of_interest_only = False

# This controls the ordering of the columns in the csv
cols = [
    'run_i', 'run_name',
    'pro_i', 'pro_id',
    'pep_i', 'pep_start', 'pep_stop',
    'at_prec', 'recall_at_prec', 'score_at_prec',
    'ptms', 'P2', 'seqstr', 'seqlen',
    'flustr', 'flu_pros',
    'false_i', 'false_type', 'false_pro_i', 'false_pep_i', 'false_flustr', 'false_weight',
]

# This controls the default sorting: (column, ascending?) in priority order.
# `sort` and `ascend` are derived from it so they always stay the same length.
sort_spec = [
    ('run_i', True),
    ('pro_i', True),
    ('pep_start', True),
    ('at_prec', False),
    ('recall_at_prec', False),
    ('pep_i', True),
    ('false_weight', False),
]
sort = [column for column, _ in sort_spec]
ascend = [ascending for _, ascending in sort_spec]

#==========================================================================================

def pr_falses_for_best_runs(_run_info, prec, n_falses, protein_of_interest_only, classifier):
    """
    Collect the false-rate table of every run listed in _run_info.run_iz.

    For each run_i, the run's test call bag for `classifier` is asked for
    false_rates_all_peps__ptm_info(prec, n_falses, protein_of_interest_only);
    the resulting frame is tagged with "run_i" and "run_name" columns.

    Returns:
        All per-run frames concatenated into one DataFrame with a fresh
        0..n-1 index.  pd.concat raises ValueError if run_iz is empty.
    """
    def _falses_for_run(run_i):
        # One run's false-rate frame, tagged with the run it came from.
        run = job.runs[run_i]
        call_bag = run.test_call_bag( classifier=classifier )
        run_df = call_bag.false_rates_all_peps__ptm_info(prec, n_falses, protein_of_interest_only)
        run_df["run_i"] = run_i
        run_df["run_name"] = run.manifest.run_name
        return run_df

    per_run_frames = [_falses_for_run(run_i) for run_i in _run_info.run_iz]
    return pd.concat(per_run_frames).reset_index(drop=True)

# One falses table per requested precision, stacked, sorted by the default
# sorting and reduced to the csv column order.
pep_false_df = (
    pd.concat([
        pr_falses_for_best_runs(
            run_info,
            prec,
            n_falses,
            protein_of_interest_only=protein_of_interest_only,
            classifier=filters.classifier,
        )
        for prec in precisions
    ])
    .sort_values(by=sort, ascending=ascend)
    .reset_index()[cols]
)

if True:
    hd('h3','peptides with non-zero recall at precision thresholds (avail as pep_false_df)')

    # The full table goes to disk; the file name encodes the precisions and the filters.
    filename = f"./runs_pr_falses__{'_'.join(map(str,precisions))}__{munch_abbreviation_string(filters)}.csv"
    pep_false_df.to_csv(
        filename,
        index=False,
        float_format="%g",
    )
    print( f"Wrote full pep_false_df to: {filename}")

    # Only rows with non-zero recall are shown inline.
    display(pep_false_df[pep_false_df['recall_at_prec'] > 0])
