## Evaluation breakdown by sequence region (CDR vs. FWK)

In [1]:
import pandas as pd
import os
import epam.evaluation as ev
from epam.utils import pcp_path_of_aaprob_path, load_and_filter_pcp_df
from netam.sequences import translate_sequence
from scripts.annotate_pcps import aa_regions_of_row, aa_seq_by_region, get_cdr_fwk_seqs

In [2]:
# All paths below are relative, so switch to the project root first.
# The root can be overridden with the EPAM_ROOT environment variable; when it is
# unset, the original hard-coded checkout location is used unchanged.
os.chdir(os.environ.get('EPAM_ROOT', '/home/mjohnso4/epam/'))
# aaprob file used for the single-dataset checks in the following cells.
test_aaprob_path = 'output/ford-flairr-seq-prod_pcp_2024-04-01_MASKED_NI_noN_no-naive/set2/SHMple_default/combined_aaprob.hdf5'

In [3]:
# Derive the PCP file path from the aaprob path and load the (filtered) PCP table.
test_pcp_path = pcp_path_of_aaprob_path(test_aaprob_path)
full_test_pcp_df = load_and_filter_pcp_df(test_pcp_path)
# TEMP: only the first 3 rows are used while developing; .copy() gives an
# independent frame so the column assignments below do not touch the full table.
test_pcp_df = full_test_pcp_df.head(3).copy()
# Apply translate_sequence element-wise to the 'parent' and 'child' columns.
# Series.map replaces the row-wise DataFrame.apply(axis=1): same values, but it
# always yields a Series (row-wise apply returns a DataFrame on an empty frame).
test_pcp_df['parent_aa'] = test_pcp_df['parent'].map(translate_sequence)
test_pcp_df['child_aa'] = test_pcp_df['child'].map(translate_sequence)

In [4]:
# Spot-check the region helpers on each row of the test subset: print the parent
# AA sequence, the region -> (start, end) mapping, and the sequence split by region.
for index, row in test_pcp_df.iterrows():
    parent_aa = row['parent_aa']
    print(parent_aa)

    regions = aa_regions_of_row(row)
    print(regions)

    seq_by_region = aa_seq_by_region(parent_aa, regions)
    print(seq_by_region)

    # Run the CDR/FWK split on this row as well; the four results are kept in
    # variables for inspection but are not printed.
    parent_fwk_seq, parent_cdr_seq, child_fwk_seq, child_cdr_seq = get_cdr_fwk_seqs(row)

EVQLVQSGAEVKKPGESLKISCKGSGYSFTSYWIGWVRQMPGKGLEWMGIIYPGDSDTRYSPSFQGQVTISADKSISTAYLQWSSLKASDTAMYYCASSGCSGGSCYSGGGPFYWGQGTLVTVSS
{'FWK1': (0, 25), 'CDR1': (25, 33), 'FWK2': (33, 50), 'CDR2': (50, 58), 'FWK3': (58, 96), 'CDR3': (96, 114), 'FWK4': (114, 125)}
{'FWK1': 'EVQLVQSGAEVKKPGESLKISCKGS', 'CDR1': 'GYSFTSYW', 'FWK2': 'IGWVRQMPGKGLEWMGI', 'CDR2': 'IYPGDSDT', 'FWK3': 'RYSPSFQGQVTISADKSISTAYLQWSSLKASDTAMYYC', 'CDR3': 'ASSGCSGGSCYSGGGPFY', 'FWK4': 'WGQGTLVTVSS'}
QVQLVQSGAEVKKPGASVKVSCKASGYTFTNFGISWVRQAPGQGLEWMGWISAYNGNTNYAQRLQGRVTMTTDTSTSTAYMELRSLRSDDTAVYYCARLWSTGYQGDDAFDIWGQGTMVTVSS
{'FWK1': (0, 25), 'CDR1': (25, 33), 'FWK2': (33, 50), 'CDR2': (50, 58), 'FWK3': (58, 96), 'CDR3': (96, 112), 'FWK4': (112, 123)}
{'FWK1': 'QVQLVQSGAEVKKPGASVKVSCKAS', 'CDR1': 'GYTFTNFG', 'FWK2': 'ISWVRQAPGQGLEWMGW', 'CDR2': 'ISAYNGNT', 'FWK3': 'NYAQRLQGRVTMTTDTSTSTAYMELRSLRSDDTAVYYC', 'CDR3': 'ARLWSTGYQGDDAFDI', 'FWK4': 'WGQGTMVTVSS'}
EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDN

In [None]:
# get_cdr_fwk_seqs is applied row by row; zip(*...) then regroups the per-row
# results into one sequence of values per output column.
cdr_fwk_seqs_by_row = test_pcp_df.apply(get_cdr_fwk_seqs, axis=1)
parent_fwk_col, parent_cdr_col, child_fwk_col, child_cdr_col = zip(*cdr_fwk_seqs_by_row)

test_pcp_df['parent_fwk_seq'] = parent_fwk_col
test_pcp_df['parent_cdr_seq'] = parent_cdr_col
test_pcp_df['child_fwk_seq'] = child_fwk_col
test_pcp_df['child_cdr_seq'] = child_cdr_col

# List the columns to confirm the four new ones are present.
print(test_pcp_df.columns)

In [7]:
# Evaluate the single test aaprob file; the trailing bare name displays the
# returned metrics as the cell output.
test_region_metrics = ev.evaluate_dataset_by_region(test_aaprob_path)
test_region_metrics

{'data_set': 'pcp_inputs/ford-flairr-seq-prod_pcp_2024-04-01_MASKED_NI_noN_no-naive.csv',
 'pcp_count': 3111,
 'model': 'SHMple_default',
 'sub_accuracy': 0.2986861313868613,
 'r_precision': 0.13222031962297245,
 'cross_entropy': 0.14771044408299822,
 'fwk_sub_accuracy': 0.333164351617155,
 'fwk_r_precision': 0.13147488708342037,
 'fwk_cross_entropy': 0.09313059112880417,
 'cdr_sub_accuracy': 0.2518589920132195,
 'cdr_r_precision': 0.2067651338260777,
 'cdr_cross_entropy': 0.05457985264522234,
 'avg_k_subs': 5.504660880745741,
 'avg_k_fwk_subs': 3.1703632272581164,
 'avg_k_cdr_subs': 2.3342976534876247}

In [9]:
def evaluate_all_models(aaprob_paths, model_performance_path):
    """
    Run ev.evaluate_dataset_by_region() on each aaprob matrix and save all results in a single CSV.
    The CSV has one column per metric and one row per model/data set combo, in the order the paths were given.

    Parameters:
    aaprob_paths (list): Paths of the aaprob matrices to evaluate. Each one holds the predictions of one model on a given data set.
    model_performance_path (str): Path the CSV of model performance metrics is written to.

    """
    performance_rows = []
    for aaprob_path in aaprob_paths:
        # Each evaluation result becomes one row of the output table.
        performance_rows.append(ev.evaluate_dataset_by_region(aaprob_path))

    performance_table = pd.DataFrame(performance_rows)
    performance_table.to_csv(model_performance_path, index=False)

In [13]:
# Output directory of each data set (relative to the working directory).
flairr = 'output/ford-flairr-seq-prod_pcp_2024-04-01_MASKED_NI_noN_no-naive/'
race = 'output/rodriguez-airr-seq-race-prod_pcp_2024-04-01_MASKED_NI_noN_no-naive/'
wyatt = 'output/wyatt-10x-1p5m_pcp_2024-04-01_NI_noN_no-naive/'
tang = 'output/tang-deepshm-prod_pcp_2024-04-01_MASKED_NI_noN_no-naive_rearranged/'

# Sub-directory of each model inside a data set's output directory.
ablang1 = 'set1/AbLang1/'
ablang2_mask = 'set1/AbLang2_mask/'
ablang2_wt = 'set1/AbLang2_wt/'
esm = 'set1/ESM1v_mask/'
shmple = 'set2/SHMple_default/'
shmple_prod = 'set2/SHMple_productive/'
shmple_esm = 'set3/SHMpleESM_mask/'

# Single source of truth for which models are evaluated and in what order;
# every *_models list below is built from it so the data sets cannot drift apart.
model_dirs = [ablang1, ablang2_mask, ablang2_wt, esm, shmple, shmple_prod, shmple_esm]

# One aaprob file per (data set, model) pair.
flairr_models = [f"{flairr}{model_dir}combined_aaprob.hdf5" for model_dir in model_dirs]
race_models = [f"{race}{model_dir}combined_aaprob.hdf5" for model_dir in model_dirs]
wyatt_models = [f"{wyatt}{model_dir}combined_aaprob.hdf5" for model_dir in model_dirs]
tang_models = [f"{tang}{model_dir}combined_aaprob.hdf5" for model_dir in model_dirs]

# Destination of the by-region performance CSV for each data set.
flairr_out = f"{flairr}combined_performance_by_region.csv"
race_out = f"{race}combined_performance_by_region.csv"
wyatt_out = f"{wyatt}combined_performance_by_region.csv"
tang_out = f"{tang}combined_performance_by_region.csv"

In [14]:
# Print the list of aaprob paths assembled for the flairr data set.
print(flairr_models)

['output/ford-flairr-seq-prod_pcp_2024-04-01_MASKED_NI_noN_no-naive/set1/AbLang1/combined_aaprob.hdf5', 'output/ford-flairr-seq-prod_pcp_2024-04-01_MASKED_NI_noN_no-naive/set1/AbLang2_mask/combined_aaprob.hdf5', 'output/ford-flairr-seq-prod_pcp_2024-04-01_MASKED_NI_noN_no-naive/set1/AbLang2_wt/combined_aaprob.hdf5', 'output/ford-flairr-seq-prod_pcp_2024-04-01_MASKED_NI_noN_no-naive/set1/ESM1v_mask/combined_aaprob.hdf5', 'output/ford-flairr-seq-prod_pcp_2024-04-01_MASKED_NI_noN_no-naive/set2/SHMple_default/combined_aaprob.hdf5', 'output/ford-flairr-seq-prod_pcp_2024-04-01_MASKED_NI_noN_no-naive/set2/SHMple_productive/combined_aaprob.hdf5', 'output/ford-flairr-seq-prod_pcp_2024-04-01_MASKED_NI_noN_no-naive/set3/SHMpleESM_mask/combined_aaprob.hdf5']


In [15]:
# Evaluate every path in flairr_models and write the metrics table to flairr_out.
evaluate_all_models(flairr_models, flairr_out)

In [16]:
# Evaluate every path in race_models and write the metrics table to race_out.
evaluate_all_models(race_models, race_out)

In [17]:
# Evaluate every path in wyatt_models and write the metrics table to wyatt_out.
evaluate_all_models(wyatt_models, wyatt_out)

In [None]:
# Evaluate every path in tang_models and write the metrics table to tang_out.
# NOTE(review): this cell has no execution count, so the tang CSV may not have
# been generated in the saved run — confirm before relying on it.
evaluate_all_models(tang_models, tang_out)