## Emulate `analyze_metamodels.ipynb` to follow up on `in_house_peak_disease_leave_one_cohort_out`: exclude the samples where at least one replicate failed sequencing QC

We found these samples in `paired_sample_batch_effects.ipynb` by applying our QC min-clone-count filters to the individual replicates.

In [1]:
import os

# Select the cross-validation split strategy through the MALID_CV_SPLIT environment variable.
# This cell runs before the `malid` imports in the next cell.
# NOTE(review): presumably malid's config reads this variable at import time, so the
# ordering matters — confirm before moving this assignment.
os.environ["MALID_CV_SPLIT"] = "in_house_peak_disease_leave_one_cohort_out"

In [2]:
from pathlib import Path
from typing import Dict, Generator, List, Optional, Tuple, Union
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
import itertools
import genetools

%matplotlib inline
import seaborn as sns
from IPython.display import display, Markdown

from malid import config, logger, helpers
from wrap_glmnet import GlmnetLogitNetWrapper
from malid.train import train_metamodel
import crosseval
from malid.datamodels import (
    DataSource,
    TargetObsColumnEnum,
    GeneLocus,
    healthy_label,
    map_cross_validation_split_strategy_to_default_target_obs_column,
)
from malid.trained_model_wrappers import BlendingMetamodel

In [3]:
# Analysis settings shared by the cells below.
# The two fold names are combined further down into the filename prefix
# "<base_model_train_fold_name>_applied_to_<metamodel_fold_label_train>_model"
# used to locate the saved metamodel results.
base_model_train_fold_name = "train_smaller"
metamodel_fold_label_train = "validation"
# Gene locus setting taken from the project config.
# NOTE(review): the config attribute is plural ("gene_loci_used"), so this may be a
# combination of several loci — confirm.
gene_locus = config.gene_loci_used
# Classification target analyzed in this notebook: disease.
target_obs_column = TargetObsColumnEnum.disease

In [4]:
# Look up the metamodel flavors available for this gene locus and classification target.
# The result is indexed by flavor name below.
# NOTE(review): the first configured fold ID and use_stubs_instead_of_submodels=True are
# passed here; presumably only the flavor configuration is needed, not trained
# submodels — confirm.
flavors = train_metamodel.get_metamodel_flavors(
    gene_locus=gene_locus,
    target_obs_column=target_obs_column,
    fold_id=config.all_fold_ids[0],
    base_model_train_fold_name=base_model_train_fold_name,
    use_stubs_instead_of_submodels=True,
)

metamodel_flavor = "default"
# Index by the variable rather than repeating the literal, so the flavor name
# (also used to build paths below) and its config can never drift apart.
metamodel_config = flavors[metamodel_flavor]

In [5]:
# Directory containing the previously saved metamodel results (should already exist):
metamodels_base_dir = BlendingMetamodel._get_metamodel_base_dir(
    gene_locus=gene_locus,
    target_obs_column=target_obs_column,
    metamodel_flavor=metamodel_flavor,
)

# Both output locations share the same <gene locus>/<target>/<flavor> suffix.
_output_suffix = Path(gene_locus.name, target_obs_column.name, metamodel_flavor)
output_base_dir = (
    config.paths.second_stage_blending_metamodel_output_dir / _output_suffix
)
highres_output_base_dir = (
    config.paths.high_res_outputs_dir / "metamodel" / _output_suffix
)
# The output directories might not exist yet, so create them (no error if present):
for _output_dir in (output_base_dir, highres_output_base_dir):
    _output_dir.mkdir(parents=True, exist_ok=True)

# Common filename prefix, applied under each of the three base directories.
fname_prefix = "{}_applied_to_{}_model".format(
    base_model_train_fold_name, metamodel_fold_label_train
)
model_prefix, results_output_prefix, highres_results_output_prefix = (
    base_dir / fname_prefix
    for base_dir in (metamodels_base_dir, output_base_dir, highres_output_base_dir)
)

computed_abstentions = None

# Load the saved experiment results and summarize them.
experiment_set = crosseval.ExperimentSet.load_from_disk(output_prefix=model_prefix)

# The default y_true from BlendingMetamodel._featurize() is
# target_obs_column.value.blended_evaluation_column_name.
# DROP_INCOMPLETE_FOLDS is used because alternate classification targets might not be
# well-split in the small validation set of cross-validation folds that were designed
# to stratify disease. For some targets, folds with only a single class in the
# metamodel training data (i.e. the validation set) may need to be dropped automatically.
_remove_incomplete_strategy = crosseval.RemoveIncompleteStrategy.DROP_INCOMPLETE_FOLDS
experiment_set_global_performance = experiment_set.summarize(
    remove_incomplete_strategy=_remove_incomplete_strategy
)

In [6]:
# Pick one metamodel out of the summarized results: the entry keyed "ridge_cv".
# Everything below looks at this model's predictions only.
model_global_performance = experiment_set_global_performance.model_global_performances[
    "ridge_cv"
]

In [7]:
# review classification for each specimen
# The result is a table with one row per specimen (true label, predicted label,
# predicted probabilities, and specimen metadata), as displayed in the next cell.
# When run, this call logged "Removing class absent from y_true: ..." messages for
# Lupus, Influenza, T1D and HIV (see the cell output).
individual_classifications = model_global_performance.get_all_entries()

{"message": "Removing class absent from y_true: Lupus", "time": "2024-02-22T07:03:23.592817"}


{"message": "Removing class absent from y_true: Influenza", "time": "2024-02-22T07:03:23.596782"}


{"message": "Removing class absent from y_true: T1D", "time": "2024-02-22T07:03:23.598231"}


{"message": "Removing class absent from y_true: HIV", "time": "2024-02-22T07:03:23.599855"}


In [8]:
# Display the per-specimen classification table for manual review.
individual_classifications

Unnamed: 0,y_true,y_pred,max_predicted_proba,second_highest_predicted_proba,difference_between_top_two_predicted_probas,specimen_label,age,disease,disease.rollup,disease.separate_past_exposures,disease_severity,disease_subtype,ethnicity_condensed,isotype_proportion:IGHA,isotype_proportion:IGHD-M,isotype_proportion:IGHG,participant_label,past_exposure,sex,study_name
0,Healthy/Background,Healthy/Background,0.874587,0.020735,0.853852,M64-075,48.0,Healthy/Background,Healthy/Background,Healthy/Background,,Healthy/Background - CMV+,Asian,0.11651,0.816248,0.067242,BFI-0003124,False,F,Healthy-StanfordBloodCenter_included-in-resequ...
1,Healthy/Background,Healthy/Background,0.980914,0.002441,0.978474,M64-033,63.0,Healthy/Background,Healthy/Background,Healthy/Background,,Healthy/Background - CMV+,Caucasian,0.055499,0.925711,0.01879,BFI-0003082,False,M,Healthy-StanfordBloodCenter_included-in-resequ...
2,Covid19,Covid19,0.780389,0.039428,0.74096,M369-S001,73.0,Covid19,Covid19,Covid19,ICU,Covid19 - Sero-positive (ICU),,0.198567,0.57721,0.224223,BFI-0007450,False,F,Covid19-buffycoat
3,Covid19,Covid19,0.89738,0.018198,0.879182,M371-S024,77.0,Covid19,Covid19,Covid19,Admit,Covid19 - Sero-positive (Admit),,0.107579,0.60189,0.290531,BFI-0007483,False,F,Covid19-buffycoat
4,Healthy/Background,Healthy/Background,0.959176,0.004688,0.954488,M64-044,22.0,Healthy/Background,Healthy/Background,Healthy/Background,,Healthy/Background - CMV+,Asian,0.052188,0.87793,0.069881,BFI-0003093,False,M,Healthy-StanfordBloodCenter_included-in-resequ...
5,Healthy/Background,Healthy/Background,0.978962,0.001424,0.977538,M64-065,26.0,Healthy/Background,Healthy/Background,Healthy/Background,,Healthy/Background - CMV-,Asian,0.112299,0.837525,0.050176,BFI-0003114,False,M,Healthy-StanfordBloodCenter_included-in-resequ...
6,Healthy/Background,Healthy/Background,0.975458,0.002597,0.97286,M64-080,59.0,Healthy/Background,Healthy/Background,Healthy/Background,,Healthy/Background - CMV+,Caucasian,0.003738,0.939253,0.057008,BFI-0003129,False,M,Healthy-StanfordBloodCenter_included-in-resequ...
7,Healthy/Background,Healthy/Background,0.985126,0.001829,0.983296,M64-084,52.0,Healthy/Background,Healthy/Background,Healthy/Background,,Healthy/Background - CMV-,Caucasian,0.088505,0.829384,0.082111,BFI-0003133,False,M,Healthy-StanfordBloodCenter_included-in-resequ...
8,Healthy/Background,Healthy/Background,0.979864,0.00242,0.977444,M64-093,50.0,Healthy/Background,Healthy/Background,Healthy/Background,,Healthy/Background - CMV-,Caucasian,0.184208,0.731,0.084793,BFI-0003142,False,M,Healthy-StanfordBloodCenter_included-in-resequ...
9,Healthy/Background,Healthy/Background,0.828647,0.015167,0.81348,M64-038,61.0,Healthy/Background,Healthy/Background,Healthy/Background,,Healthy/Background - CMV-,Caucasian,0.05337,0.917168,0.029462,BFI-0003087,False,M,Healthy-StanfordBloodCenter_included-in-resequ...


In [9]:
# Load the list of rejected specimens (where at least one replicate failed QC).
# The file is read as a headerless table; the specimen labels are its first column.
rejected_specimens_path = (
    config.paths.base_output_dir_for_selected_cross_validation_strategy
    / "rejected_specimens_because_some_replicates_failed_qc.txt"
)
bad_specimens = pd.read_csv(
    rejected_specimens_path,
    header=None,
    # Always parse labels as strings. Without this, numeric-looking labels would be
    # type-inferred as numbers and the later .isin() match against the string
    # "specimen_label" column would silently match nothing.
    dtype=str,
)[0].to_numpy()
bad_specimens

array(['M64-012', 'M64-033', 'M64-035', 'M64-037', 'M64-043', 'M64-049',
       'M64-051', 'M64-052', 'M64-055', 'M64-057', 'M64-060', 'M64-063',
       'M64-064', 'M64-073', 'M64-075', 'M64-080', 'M64-097'],
      dtype=object)

In [10]:
# Drop the healthy samples that underwent replicate sequencing but failed,
# i.e. every specimen whose label appears in bad_specimens.
# See the other notebook mentioned above for details.
print(individual_classifications.shape)

is_rejected_specimen = individual_classifications["specimen_label"].isin(bad_specimens)
individual_classifications_filtered = individual_classifications.loc[
    ~is_rejected_specimen
]
individual_classifications_filtered.shape

(40, 20)


(23, 20)

In [11]:
# Of the remainder: what was the accuracy?
# For each true label, tally how often each label was predicted.
for y_true, grp in individual_classifications_filtered.groupby("y_true"):
    predicted_label_counts = grp["y_pred"].value_counts()
    # One print call: header line, the counts, then a blank separator line.
    print(
        f"For y_true={y_true}, predictions are:",
        predicted_label_counts,
        "",
        sep="\n",
    )

For y_true=Covid19, predictions are:
Covid19    10
Name: y_pred, dtype: int64

For y_true=Healthy/Background, predictions are:
Healthy/Background    13
Name: y_pred, dtype: int64

