# How does model3-rollup perform if allowed the same number of abstentions as model 2?

Model 2's AUC is boosted by abstaining on difficult samples. How high could model 3's AUC get if it also got to abstain on the same number of samples?

Compare model 3's AUC to model 2 more fairly by considering abstentions. If we assume that the N% abstentions are the worst predictions, what would an apples-to-apples AUC comparison be? Take out the worst N% of model 3's predictions (e.g. true 0 but predicted 0.99 -- rank by absolute difference from the truth), and recompute AUC.

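Multiclass way to implement this: represent the true labels as one-hot vectors and compare them to the predicted probability vectors. Measure the difference between the two vectors for each sample, e.g. the sum of squares or the sum of absolute values (the implementation below uses the Euclidean norm). Rank samples by that difference and drop the N% with the largest difference, i.e. the worst predictions. Then compute a new AUC for model 3 with those N% of worst predictions removed.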

In [1]:
import numpy as np
import pandas as pd

from malid import config, logger
from malid.external import model_evaluation
from malid.datamodels import (
    GeneLocus,
    TargetObsColumnEnum,
    SampleWeightStrategy,
    combine_classification_option_names,
)

In [2]:
from IPython.display import display, Markdown

In [3]:
from malid.trained_model_wrappers import (
    ConvergentClusterClassifier,
    SequenceClassifier,
    RollupSequenceClassifier,
)

In [4]:
# Classification target and sample-weighting strategy; process() below passes
# these when looking up the model base directories and building the output path.
target_obs_column = TargetObsColumnEnum.disease
sample_weight_strategy = SampleWeightStrategy.ISOTYPE_USAGE

In [5]:
# Names of the specific model 2 (convergent clustering) and model 3 (sequence
# disease) variants to compare, as configured for the metamodel's base models.
# process() below uses them to select entries from each experiment set.
model2_name = config.metamodel_base_model_names.model_name_convergent_clustering
model3_name = config.metamodel_base_model_names.model_name_sequence_disease

In [6]:
def process(gene_locus: GeneLocus) -> None:
    """Re-score model3-rollup after letting it abstain as often as model 2 did.

    Loads the saved test-set performance of model 2 and of model3-rollup for
    one gene locus, drops the global fold, and then marks model 3's worst
    predictions as abstentions. "Worst" means the largest Euclidean distance
    between the one-hot true label and the predicted probability vector. The
    number of predictions dropped equals the number of samples model 2
    abstained on. The revised performance is displayed and exported.

    Args:
        gene_locus: A single gene locus (checked with
            ``GeneLocus.validate_single_value``).

    Side effects:
        Displays/prints reports, and writes classification reports, confusion
        matrices and a model comparison TSV under
        ``.../rollup_models/.../with_abstentions_to_match_model2``.
    """
    display(Markdown(f"## {gene_locus}"))
    GeneLocus.validate_single_value(gene_locus)

    convergent_cluster_models_base_dir = (
        ConvergentClusterClassifier._get_model_base_dir(
            gene_locus=gene_locus, target_obs_column=target_obs_column
        )
    )

    rollup_models_base_dir = RollupSequenceClassifier._get_model_base_dir(
        sequence_models_base_dir=SequenceClassifier._get_model_base_dir(
            gene_locus=gene_locus,
            target_obs_column=target_obs_column,
            sample_weight_strategy=sample_weight_strategy,
        )
    )

    # Load model2 **test set** performance
    model2_experiment_set = model_evaluation.ExperimentSet.load_from_disk(
        output_prefix=convergent_cluster_models_base_dir
        / "test_performance_of_train_smaller_model"
    )
    # Load model3-rollup **test set** performance
    model3_experiment_set = model_evaluation.ExperimentSet.load_from_disk(
        output_prefix=rollup_models_base_dir / "train_smaller_model"
    )

    # Remove global fold (we trained global fold model, but now get evaluation scores on cross-validation folds only)
    for experiment_set in (model2_experiment_set, model3_experiment_set):
        # TODO: make kdict support: del self.model_outputs[:, fold_id]
        # Materialize the keys first so we never delete from model_outputs
        # while iterating over something that may be a live view of it.
        for key in list(experiment_set.model_outputs[:, -1].keys()):
            logger.info(f"Removing {key} (global fold)")
            del experiment_set.model_outputs[key]

    model3_experiment_set_summary_original = model3_experiment_set.summarize()
    model3_original_stats = (
        model3_experiment_set_summary_original.get_model_comparison_stats()
    )
    print("Original model3-rollup performance:")
    display(model3_original_stats.loc[[model3_name]])
    print()

    model2_experiment_set_summary = model2_experiment_set.summarize()
    model2_stats = model2_experiment_set_summary.get_model_comparison_stats()
    model2_performance = model2_experiment_set_summary.model_global_performances[
        model2_name
    ]
    print("Original model2 performance:")
    display(model2_stats.loc[[model2_name]])
    print()
    print(
        f"Model 2 abstained on {model2_performance.abstention_proportion:0.2%} of samples"
    )

    # Sanity check: confirm that "sample_size including abstentions" matches between the two.
    # Reuses the summary computed above rather than summarizing model 3 a second time.
    assert (
        model3_original_stats.loc[model3_name]["sample_size including abstentions"]
        == model2_stats.loc[model2_name]["sample_size including abstentions"]
    )

    # Discarded attempt: Make model 3 abstain on the exact same samples as model 2 did
    #     samples_to_abstain = (
    #         model2_experiment_set_summary.model_global_performances[model2_name]
    #         .cv_abstentions_metadata["specimen_label"]
    #         .values
    #     )

    # Update: Model 2 got to choose which samples to abstain on, so it has that advantage.
    # Let's instead try to remove the "worst offenders" in model 3 to see the best AUC we could possibly get with the same number of abstentions.
    # Not necessarily abstaining on the same exact samples as model 2 anymore

    model3_performance_original = (
        model3_experiment_set_summary_original.model_global_performances[model3_name]
    )
    y_preds_proba = model3_performance_original.cv_y_preds_proba

    # One-hot encode the true labels, with columns in the same order as the
    # predicted probabilities (classes never seen in y_true become all-zero columns).
    y_true_one_hot = pd.get_dummies(
        model3_performance_original.cv_y_true_without_abstention
    ).reindex(
        columns=y_preds_proba.columns,
        fill_value=0,
    )

    # Get Euclidean distance between each y_true and accompanying y_preds_proba entry (alternative: sklearn cdist, then extract diagonal)
    differences = np.linalg.norm(y_true_one_hot - y_preds_proba, axis=1)
    assert differences.shape[0] == y_preds_proba.shape[0]

    # Get indices of top N highest-difference entries (same number of abstentions as in model2).
    # Slice from an explicit non-negative start: the shorthand `[-n:]` would
    # select *every* entry when n == 0 (i.e. when model 2 never abstained).
    n_to_abstain = model2_performance.cv_abstentions.shape[0]
    ranked_indices = differences.argsort()
    indices_to_abstain_on = ranked_indices[
        max(ranked_indices.shape[0] - n_to_abstain, 0) :
    ]

    # Convert those to specimen labels
    samples_to_abstain = (
        model3_performance_original.cv_metadata.iloc[indices_to_abstain_on]["index"]
        .values
    )

    print(f"Making model3-rollup abstain on: {samples_to_abstain}")
    revised_model3_outputs = model3_experiment_set.model_outputs[model3_name, :].copy()
    # Only values of existing keys are replaced inside the loop, so the set of
    # keys being iterated does not change.
    for (
        model_name,
        fold_id,
    ), model_single_fold_performance in revised_model3_outputs.items():
        mask = model_single_fold_performance.test_metadata.index.isin(
            samples_to_abstain
        )
        print(f"In fold {fold_id}, switching {mask.sum()} specimens to abstentions")
        revised_model3_outputs[
            model_name, fold_id
        ] = model_single_fold_performance.apply_abstention_mask(mask)

    model3_experiment_set_revised = model_evaluation.ExperimentSet(
        revised_model3_outputs
    )
    model3_experiment_set_revised_summary = model3_experiment_set_revised.summarize()
    model3_performance_revised = (
        model3_experiment_set_revised_summary.model_global_performances[model3_name]
    )

    # Sanity checks: model 3 now abstains exactly as often as model 2, on the same total sample count
    assert model3_performance_revised.n_abstentions == model2_performance.n_abstentions
    assert (
        model3_performance_revised.sample_size_with_abstentions
        == model2_performance.sample_size_with_abstentions
    )

    print()
    print("New model3-rollup performance:")
    display(model3_experiment_set_revised_summary.get_model_comparison_stats())

    # Export model3_experiment_set_revised_summary
    output_dir = (
        config.paths.sequence_models_output_dir
        / gene_locus.name
        / "rollup_models"
        / combine_classification_option_names(
            target_obs_column=target_obs_column,
            sample_weight_strategy=sample_weight_strategy,
        )
        / "with_abstentions_to_match_model2"
    )
    output_dir.mkdir(exist_ok=True, parents=True)
    model3_experiment_set_revised_summary.export_all_models(
        func_generate_classification_report_fname=lambda model_name: output_dir
        / f"sequence_prediction_rollup_with_abstentions_to_match_model2.{model_name}.train_smaller_model.report.txt",
        func_generate_confusion_matrix_fname=lambda model_name: output_dir
        / f"sequence_prediction_rollup_with_abstentions_to_match_model2.{model_name}.train_smaller_model.confusion_matrix.png",
        confusion_matrix_pred_label="Rollup of sequence predictions (with model2's number of abstentions)",
        dpi=300,
    )
    model3_experiment_set_revised_summary.get_model_comparison_stats().to_csv(
        output_dir
        / "sequence_prediction_rollup_with_abstentions_to_match_model2.train_smaller_model.compare_model_scores.tsv",
        sep="\t",
    )
    print(f"Exported to {output_dir}")

In [7]:
# Run the comparison for every gene locus in config.gene_loci_used, leaving
# two blank lines after each report.
for gene_locus in config.gene_loci_used:
    process(gene_locus)
    print("\n")

## GeneLocus.BCR

Original model3-rollup performance:


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,sample_size,n_abstentions,sample_size including abstentions,abstention_rate,missing_classes
lasso_multiclass,0.829 +/- 0.032 (in 3 folds),0.833 +/- 0.034 (in 3 folds),0.835 +/- 0.026 (in 3 folds),0.846 +/- 0.026 (in 3 folds),0.687 +/- 0.042 (in 3 folds),0.556 +/- 0.067 (in 3 folds),0.688,0.557,480,0,480,0.0,False



Original model2 performance:


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,Accuracy per fold with abstention,MCC per fold with abstention,...,au-PRC (macro OvO) per fold with abstention,Accuracy global with abstention,MCC global with abstention,Unknown/abstention proportion global with abstention,Abstention label global with abstention,sample_size,n_abstentions,sample_size including abstentions,abstention_rate,missing_classes
lasso_multiclass,0.926 +/- 0.009 (in 3 folds),0.933 +/- 0.010 (in 3 folds),0.927 +/- 0.014 (in 3 folds),0.935 +/- 0.014 (in 3 folds),0.761 +/- 0.010 (in 3 folds),0.650 +/- 0.022 (in 3 folds),0.761,0.649,0.744 +/- 0.031 (in 3 folds),0.630 +/- 0.046 (in 3 folds),...,0.945 +/- 0.000 (in 1 folds),0.744,0.628,0.023,Unknown,469,11,480,0.022917,False



Model 2 abstained on 2.29% of samples
Making model3-rollup abstain on: ['M281redo-S034' 'M281redo-S012' 'M454-S058' 'M418-S009' 'M418-S031'
 'M418-S182' 'M404-S014' 'M281redo-S013' 'M456-S006' 'M418-S174'
 'M281redo-S011']
In fold 0, switching 4 specimens to abstentions
In fold 2, switching 5 specimens to abstentions
In fold 1, switching 2 specimens to abstentions

New model3-rollup performance:


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,Accuracy per fold with abstention,MCC per fold with abstention,Unknown/abstention proportion per fold with abstention,Accuracy global with abstention,MCC global with abstention,Unknown/abstention proportion global with abstention,Abstention label global with abstention,sample_size,n_abstentions,sample_size including abstentions,abstention_rate,missing_classes
lasso_multiclass,0.856 +/- 0.021 (in 3 folds),0.859 +/- 0.023 (in 3 folds),0.857 +/- 0.017 (in 3 folds),0.867 +/- 0.017 (in 3 folds),0.703 +/- 0.036 (in 3 folds),0.576 +/- 0.062 (in 3 folds),0.704,0.578,0.687 +/- 0.042 (in 3 folds),0.561 +/- 0.065 (in 3 folds),0.023 +/- 0.010 (in 3 folds),0.688,0.562,0.023,Unknown,469,11,480,0.022917,False


Exported to /users/maximz/code/boyd-immune-repertoire-classification/out/unirep_fine_tuned/sequence_models/BCR/rollup_models/disease_sample_weight_strategy_ISOTYPE_USAGE/with_abstentions_to_match_model2




## GeneLocus.TCR

Original model3-rollup performance:


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,sample_size,n_abstentions,sample_size including abstentions,abstention_rate,missing_classes
lasso_multiclass,0.881 +/- 0.012 (in 3 folds),0.888 +/- 0.016 (in 3 folds),0.857 +/- 0.029 (in 3 folds),0.869 +/- 0.032 (in 3 folds),0.710 +/- 0.042 (in 3 folds),0.614 +/- 0.052 (in 3 folds),0.71,0.613,414,0,414,0.0,False



Original model2 performance:


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,sample_size,n_abstentions,...,ROC-AUC (macro OvO) per fold with abstention,au-PRC (weighted OvO) per fold with abstention,au-PRC (macro OvO) per fold with abstention,Accuracy per fold with abstention,MCC per fold with abstention,Unknown/abstention proportion per fold with abstention,Accuracy global with abstention,MCC global with abstention,Unknown/abstention proportion global with abstention,Abstention label global with abstention
lasso_multiclass,0.885 +/- 0.005 (in 3 folds),0.886 +/- 0.004 (in 3 folds),0.879 +/- 0.011 (in 3 folds),0.884 +/- 0.011 (in 3 folds),0.702 +/- 0.016 (in 3 folds),0.575 +/- 0.023 (in 3 folds),0.702,0.569,413,1,...,0.889 +/- 0.002 (in 2 folds),0.885 +/- 0.007 (in 2 folds),0.890 +/- 0.003 (in 2 folds),0.701 +/- 0.017 (in 3 folds),0.573 +/- 0.021 (in 3 folds),0.007 +/- 0.000 (in 1 folds),0.7,0.567,0.002,Unknown



Model 2 abstained on 0.24% of samples
Making model3-rollup abstain on: ['M124-S008']
In fold 0, switching 1 specimens to abstentions
In fold 2, switching 0 specimens to abstentions
In fold 1, switching 0 specimens to abstentions

New model3-rollup performance:


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,Accuracy per fold with abstention,MCC per fold with abstention,...,au-PRC (macro OvO) per fold with abstention,Accuracy global with abstention,MCC global with abstention,Unknown/abstention proportion global with abstention,Abstention label global with abstention,sample_size,n_abstentions,sample_size including abstentions,abstention_rate,missing_classes
lasso_multiclass,0.883 +/- 0.014 (in 3 folds),0.889 +/- 0.018 (in 3 folds),0.858 +/- 0.031 (in 3 folds),0.870 +/- 0.033 (in 3 folds),0.712 +/- 0.044 (in 3 folds),0.616 +/- 0.054 (in 3 folds),0.712,0.615,0.710 +/- 0.042 (in 3 folds),0.614 +/- 0.052 (in 3 folds),...,0.851 +/- 0.006 (in 2 folds),0.71,0.613,0.002,Unknown,413,1,414,0.002415,False


Exported to /users/maximz/code/boyd-immune-repertoire-classification/out/unirep_fine_tuned/sequence_models/TCR/rollup_models/disease_sample_weight_strategy_ISOTYPE_USAGE/with_abstentions_to_match_model2


