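1. Take one real Covid19 patient. Extract any sequences that match known binders (allowing up to 15% mutation from the known binder).
2. Take one real healthy person. Eliminate any sequences that match known Covid binders (allowing up to 15% mutation from the known binder).
3. Mix the sequences from steps 1 and 2 at the desired signal-to-noise ratio, i.e. the fraction of sequences in the mixture that come from step 1.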

This is BCR only.


In [1]:
from typing import List
import numpy as np
import pandas as pd
import anndata
import gc
import shutil

import malid.external.genetools_scanpy_helpers
from malid import io
from malid import config, helpers, logger
from malid.datamodels import (
    GeneLocus,
    TargetObsColumnEnum,
    SampleWeightStrategy,
    healthy_label,
)

from malid.trained_model_wrappers import ConvergentClusterClassifier

In [2]:
# This simulation is BCR-only. `gene_locus` is a module-level global that is read
# by validate_passes_thresholds and by the fold-loading loop further down.
gene_locus = GeneLocus.BCR

In [3]:
# If we want to generate based on an older dataset version, we can swap it in like this.
# NOTE: this reassigns config.paths itself, so every later use of config.paths in this
# notebook (base_data_dir, simulated_data_dir) refers to the paths built here.
config.paths = config.make_paths(
    embedder=config.embedder, dataset_version="20220704_filter2"
)

In [4]:
# Display the directory under which the simulated datasets are written below.
config.paths.simulated_data_dir

PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20220704_filter2/embedded/unirep_fine_tuned/simulated_data')

In [5]:
## Get CoV-AbDab known binders
# TODO: switch to malid.interpretation provided loader

# Set the columns our code expects (fold_label, fold_id, v_gene, j_gene, cdr3_aa_sequence_trim_len, cdr3_seq_aa_q_trim).
# Only fold_label and fold_id are added here; the remaining columns are presumably
# already present in the filtered TSV - TODO confirm.
covabdab_df = pd.read_csv(
    config.paths.base_data_dir / "CoV-AbDab_310122.filtered.tsv", sep="\t"
).assign(fold_label="reference", fold_id=0)

# Cluster the reference sequences themselves first.
covabdab_df = ConvergentClusterClassifier._cluster_training_set(
    df=covabdab_df,
    sequence_identity_threshold=0.95,  # very strict - in order to just prevent near-exact dupes
)

# Make cluster centroids for clusters, weighed by number of clone members (number of unique VDJ sequences)
cluster_centroids_df = ConvergentClusterClassifier._get_cluster_centroids(
    clustered_df=covabdab_df
)
# Reshape as dict.
# This module-level dict is what simulate_for_fold passes to
# extract_sequences_from_specimen as the known-binder reference.
cluster_centroids_by_supergroup = (
    ConvergentClusterClassifier._wrap_cluster_centroids_as_dict_by_supergroup(
        cluster_centroids=cluster_centroids_df
    )
)

In [6]:
def extract_sequences_from_specimen(
    adata: anndata.AnnData,
    keep_known_binding: bool,
    known_binders_cluster_centroids_by_supergroup,
    # threshold for assigning our seqs (test seqs) to known binder clusters
    clustering_test_assignment_threshold=0.85,
) -> anndata.AnnData:
    """Split one specimen's sequences by whether they match a known-binder cluster.

    Given a specimen anndata, extract known binder sequences (+/- 15% mutation)
    if ``keep_known_binding``, or extract sequences that are not known binders
    (+/- 15% mutation) if not ``keep_known_binding``.

    Args:
        adata: Specimen anndata. Its ``obs`` must contain ``v_gene``, ``j_gene``,
            ``cdr3_aa_sequence_trim_len`` and ``cdr3_seq_aa_q_trim``.
        keep_known_binding: If True, return only sequences that were assigned to
            a known-binder cluster; if False, return only the unassigned ones.
        known_binders_cluster_centroids_by_supergroup: Reference centroids, passed
            through unchanged to
            ``ConvergentClusterClassifier.assign_sequences_to_cluster_centroids``.
        clustering_test_assignment_threshold: Passed through unchanged to the same
            method. The default of 0.85 is what the "15% mutation" above refers to.

    Returns:
        A view of ``adata`` restricted to the selected observations.
    """
    # Work on a copy so the caller's obs is never modified.
    test_df = adata.obs.copy()
    # group by v, j, len
    test_groups = test_df.groupby(
        ["v_gene", "j_gene", "cdr3_aa_sequence_trim_len"], observed=True
    )

    test_df[["cluster_id_within_clustering_group", "distance_to_nearest_centroid"]] = (
        test_groups["cdr3_seq_aa_q_trim"].transform(
            lambda test_sequences: ConvergentClusterClassifier.assign_sequences_to_cluster_centroids(
                test_sequences,
                known_binders_cluster_centroids_by_supergroup,
                clustering_test_assignment_threshold,
            )
        )
        # extract the series of tuples into two columns while preserving the index
        # .to_list() is an alternative here, which may be faster, and seems to guarantee index ordering but not sure?
        .apply(pd.Series)
    )

    # A missing cluster id means the sequence was not assigned to any known-binder cluster.
    # (No per-row "global cluster ID" is built here: only matched/unmatched status
    # is needed, and test_df is discarded when this function returns.)
    unmatched_seqs_bool_vector = test_df["cluster_id_within_clustering_group"].isna()
    keep_mask = (
        ~unmatched_seqs_bool_vector if keep_known_binding else unmatched_seqs_bool_vector
    )

    return adata[test_df[keep_mask].index]

In [7]:
def _sample_from_two_populations_to_achieve_mix_ratio(
    population_size_a: int, population_size_b: int, ratio: float
):
    """
    Return how many items to sample from population A and population B to achieve a desired mix ratio.
    We have two populations of size a,b, with a known target signal-to-noise ratio r
    Solve for x,y fractions of each population to sample, given ax/(ax+by) = r (and x,y > 0)
    Also we want to use as much of possible of the available datasets.
    So figure out which dataset is the limiting factor given the ratio; take as much as possible from there.
    """
    # if taking all of a, i.e. x=1, then r(ax+by)=r(a+by)=a.
    x = 1
    y = population_size_a / population_size_b * (1 - ratio) / ratio
    if y > 1:
        # this means that using all of a would require using more than 100% of b - which is impossible.
        # so b is the limiting factor. can't take all of a; let's take all of b.
        # if taking all of b, i.e. y=1, then we have (1-r)(ax+by) = (1-r)(ax+b) = b
        y = 1
        x = population_size_b / population_size_a * ratio / (1 - ratio)
    # round and return how much to sample from each population
    return int(round(x * population_size_a)), int(round(y * population_size_b))


def make_simulated_patient(
    signal_to_noise_ratio: float,
    disease_adata: anndata.AnnData,
    healthy_adata: anndata.AnnData,
) -> anndata.AnnData:
    """Build one synthetic disease patient from a disease specimen and a healthy specimen.

    The disease input is expected to hold a real patient's known-binding
    sequences, and the healthy input a real donor's sequences that do not match
    known binders. Both are copied, relabeled with a shared combined
    participant/specimen label, tagged with ``is_actually_disease``, randomly
    subsampled (without replacement) to hit the requested mix, and concatenated.

    Args:
        signal_to_noise_ratio: Desired fraction of disease sequences in the mixture.
        disease_adata: Sequences that provide the "signal".
        healthy_adata: Sequences that provide the "noise".

    Returns:
        The concatenated anndata: sampled disease rows first, then healthy rows.

    Raises:
        ValueError: If the computed sample sizes exceed what is available.
    """
    # The synthetic patient's identity is "<disease label>_<healthy label>".
    new_specimen_label = f"{disease_adata.obs['specimen_label'].iloc[0]}_{healthy_adata.obs['specimen_label'].iloc[0]}"
    new_participant_label = f"{disease_adata.obs['participant_label'].iloc[0]}_{healthy_adata.obs['participant_label'].iloc[0]}"

    # Copy each source and stamp it with its true origin plus the shared labels.
    labeled_parts = {}
    for is_disease, source_adata in ((True, disease_adata), (False, healthy_adata)):
        part = source_adata.copy()
        part.obs["is_actually_disease"] = is_disease
        part.obs["participant_label"] = new_participant_label
        part.obs["specimen_label"] = new_specimen_label
        labeled_parts[is_disease] = part
    disease_part = labeled_parts[True]
    healthy_part = labeled_parts[False]

    # Work out how many rows to draw from each side.
    available_disease = disease_part.shape[0]
    available_healthy = healthy_part.shape[0]
    n_disease, n_healthy = _sample_from_two_populations_to_achieve_mix_ratio(
        population_size_a=available_disease,
        population_size_b=available_healthy,
        ratio=signal_to_noise_ratio,
    )
    if n_disease > available_disease or n_healthy > available_healthy:
        # this should not happen
        raise ValueError(
            f"Not enough healthy or disease sequences to achieve desired signal-to-noise ratio {signal_to_noise_ratio}."
        )
    effective_ratio = n_disease / (n_disease + n_healthy)
    logger.info(
        f"Combining {new_participant_label}, {new_specimen_label}: effective signal-to-noise ratio {effective_ratio} ({n_disease} disease and {n_healthy} healthy sequences); desired ratio {signal_to_noise_ratio}."
    )

    # Draw the disease subsample first, then the healthy one, and stack them.
    sampled_disease = disease_part[
        np.random.choice(a=disease_part.obs_names, size=n_disease, replace=False)
    ]
    sampled_healthy = healthy_part[
        np.random.choice(a=healthy_part.obs_names, size=n_healthy, replace=False)
    ]
    return anndata.concat([sampled_disease, sampled_healthy])

In [8]:
def validate_passes_thresholds(
    selected_seqs: anndata.AnnData,
    min_number_of_sequences_per_specimen: int,
) -> bool:
    """Check whether a specimen's selected sequences are usable for simulation.

    Two conditions must hold:
      1. The set of ``isotype_supergroup`` values present in ``selected_seqs.obs``
         equals ``helpers.isotype_groups_kept`` for the module-level ``gene_locus``.
      2. There are at least ``min_number_of_sequences_per_specimen`` sequences.

    A per-isotype minimum (``n_sequences_per_specimen_per_isotype``) is
    intentionally not enforced here.

    Args:
        selected_seqs: Sequences extracted from one specimen.
        min_number_of_sequences_per_specimen: Minimum total number of sequences.

    Returns:
        True if both conditions hold, otherwise False.
    """
    isotypes_present = set(selected_seqs.obs["isotype_supergroup"].unique())
    isotypes_required = set(helpers.isotype_groups_kept[gene_locus])
    if isotypes_present != isotypes_required:
        # Missing (or unexpected) isotypes.
        return False

    # Enough sequences overall?
    return bool(selected_seqs.shape[0] >= min_number_of_sequences_per_specimen)

In [9]:
def _select_specimens_passing_thresholds(
    adata_fold: anndata.AnnData,
    candidate_specimens,
    n_specimens_wanted: int,
    keep_known_binding: bool,
    specimen_kind: str,
    already_sampled_participants: set,
    known_binders_cluster_centroids_by_supergroup,
    min_number_of_sequences_per_specimen: int,
) -> List[anndata.AnnData]:
    """Pick up to ``n_specimens_wanted`` specimens whose extracted sequences pass thresholds.

    Args:
        adata_fold: All sequences of the fold.
        candidate_specimens: Iterable of ``(disease, specimen_label, participant_label)``
            rows, tried in order.
        n_specimens_wanted: Stop as soon as this many specimens were selected.
        keep_known_binding: Forwarded to ``extract_sequences_from_specimen``.
        specimen_kind: ``"disease"`` or ``"healthy"``; only used in log messages.
        already_sampled_participants: Participants that must not be reused.
            Mutated in place: every selected participant is added.
        known_binders_cluster_centroids_by_supergroup: Forwarded to
            ``extract_sequences_from_specimen``.
        min_number_of_sequences_per_specimen: Forwarded to ``validate_passes_thresholds``.

    Returns:
        The extracted sequences of each selected specimen, in selection order.
        May contain fewer than ``n_specimens_wanted`` entries.
    """
    selected: List[anndata.AnnData] = []
    for _, specimen_label, source_participant_label in candidate_specimens:
        if len(selected) == n_specimens_wanted:
            # have enough
            break
        if source_participant_label in already_sampled_participants:
            # already used another specimen from this patient
            logger.warning(
                f"Skipping {specimen_kind} specimen {specimen_label} because we already included another sample from {source_participant_label}."
            )
            continue
        # sample from a real patient / donor
        extracted_seqs: anndata.AnnData = extract_sequences_from_specimen(
            adata_fold[adata_fold.obs["specimen_label"] == specimen_label],
            keep_known_binding=keep_known_binding,
            known_binders_cluster_centroids_by_supergroup=known_binders_cluster_centroids_by_supergroup,
        )
        if validate_passes_thresholds(
            selected_seqs=extracted_seqs,
            min_number_of_sequences_per_specimen=min_number_of_sequences_per_specimen,
        ):
            selected.append(extracted_seqs)
            already_sampled_participants.add(source_participant_label)
            logger.info(
                f"Adding {specimen_kind} specimen {specimen_label} from {source_participant_label}."
            )
        else:
            logger.warning(
                f"{specimen_kind.capitalize()} specimen {specimen_label} from {source_participant_label} did not pass thresholds."
            )
    return selected


def simulate_for_fold(
    adata_fold: anndata.AnnData,
    signal_to_noise_ratio: float,
    n_disease_specimens: int,
    n_healthy_specimens: int,
    min_number_of_sequences_per_specimen: int,
) -> anndata.AnnData:
    """Create the simulated dataset for one fold.

    Selects ``n_disease_specimens`` Covid19 specimens (keeping only their
    known-binder sequences) and ``n_disease_specimens + n_healthy_specimens``
    healthy specimens (keeping only their non-binder sequences), never using a
    participant twice. Each disease specimen is mixed with one healthy specimen
    via ``make_simulated_patient``; the remaining healthy specimens are passed
    through as healthy controls.

    Reads the module-level ``cluster_centroids_by_supergroup`` as the
    known-binder reference.

    Args:
        adata_fold: All sequences of the fold. ``obs`` must contain ``disease``,
            ``specimen_label`` and ``participant_label``.
        signal_to_noise_ratio: Desired disease fraction in each synthetic patient.
        n_disease_specimens: Number of synthetic disease patients to create.
        n_healthy_specimens: Number of healthy controls to include.
        min_number_of_sequences_per_specimen: Minimum number of extracted
            sequences a specimen needs in order to be used.

    Returns:
        One anndata holding all synthetic patients and healthy controls, built
        from ``.raw`` and without ``obsm``.

    Raises:
        KeyError: If the fold has no "Covid19" or no healthy group.
        ValueError: If not enough specimens pass the thresholds.
    """
    # Shared across the disease and healthy selections so that no participant is used twice.
    already_sampled_participants = set()

    # Map each disease label to an array of unique (disease, specimen_label, participant_label) rows.
    participants_and_specimens_by_disease = {
        disease: grp.values
        for disease, grp in adata_fold.obs[
            ["disease", "specimen_label", "participant_label"]
        ]
        .drop_duplicates()
        .groupby("disease")
    }

    selected_disease_samples = _select_specimens_passing_thresholds(
        adata_fold=adata_fold,
        candidate_specimens=participants_and_specimens_by_disease["Covid19"],
        n_specimens_wanted=n_disease_specimens,
        keep_known_binding=True,
        specimen_kind="disease",
        already_sampled_participants=already_sampled_participants,
        known_binders_cluster_centroids_by_supergroup=cluster_centroids_by_supergroup,
        min_number_of_sequences_per_specimen=min_number_of_sequences_per_specimen,
    )
    selected_healthy_samples = _select_specimens_passing_thresholds(
        adata_fold=adata_fold,
        candidate_specimens=participants_and_specimens_by_disease[healthy_label],
        # need one per synthetic disease patient and one per healthy control we create
        n_specimens_wanted=n_disease_specimens + n_healthy_specimens,
        keep_known_binding=False,
        specimen_kind="healthy",
        already_sampled_participants=already_sampled_participants,
        known_binders_cluster_centroids_by_supergroup=cluster_centroids_by_supergroup,
        min_number_of_sequences_per_specimen=min_number_of_sequences_per_specimen,
    )

    if len(selected_disease_samples) != n_disease_specimens:
        raise ValueError("Did not select enough disease specimens")
    if len(selected_healthy_samples) != n_disease_specimens + n_healthy_specimens:
        raise ValueError("Did not select enough healthy specimens")

    # make synthetic mixtures for disease patients
    # (pop() consumes from the end of each list, so each selected specimen is used exactly once)
    returned_specimens: List[anndata.AnnData] = []
    for _ in range(n_disease_specimens):
        disease_specimen, healthy_specimen = (
            selected_disease_samples.pop(),
            selected_healthy_samples.pop(),
        )
        synthetic_specimen: anndata.AnnData = make_simulated_patient(
            signal_to_noise_ratio, disease_specimen, healthy_specimen
        )
        returned_specimens.append(synthetic_specimen)
    # the healthy specimens that are left over become the healthy controls
    for _ in range(n_healthy_specimens):
        returned_specimens.append(selected_healthy_samples.pop())

    # package up as an anndata
    # undo any scaling
    final_anndata = anndata.concat(returned_specimens).raw.to_adata()
    final_anndata.obs_names_make_unique()

    # remove unused labels, if these variables are Categoricals
    for label_column in ("participant_label", "specimen_label"):
        final_anndata.obs[label_column] = (
            final_anndata.obs[label_column]
            .astype("category")
            .cat.remove_unused_categories()
        )

    # no need to pass old PCA info along
    del final_anndata.obsm

    return final_anndata

In [10]:
# Number of PCA components to compute and export for every simulated dataset.
pca_n_comps = 10

# Fractions of disease (known-binder) sequences to simulate in each synthetic patient.
signal_to_noise_ratios = [0.05, 0.10, 0.25]

# One output directory per signal-to-noise ratio.
output_dirs = {
    signal_to_noise_ratio: config.paths.simulated_data_dir
    / f"scaled_anndatas_{signal_to_noise_ratio:0.2f}"
    for signal_to_noise_ratio in signal_to_noise_ratios
}
for output_dir_anndatas in output_dirs.values():
    # Clear out and remove folder if it already exists
    if output_dir_anndatas.exists():
        if not output_dir_anndatas.is_dir():
            raise ValueError(
                f"Output directory {output_dir_anndatas} already exists but is not a directory."
            )
        # WARNING: deletes all previously generated outputs for this ratio.
        shutil.rmtree(output_dir_anndatas)

    # exist_ok=False: the directory must not exist at this point (it was just removed above).
    output_dir_anndatas.mkdir(parents=True, exist_ok=False)
    print(output_dir_anndatas)


# Main generation loop: for every cross-validation fold and fold label, build one
# simulated dataset per signal-to-noise ratio, then scale, PCA-transform and export it.
for fold_id in config.cross_validation_fold_ids:
    # These transformations will be fit on train_smaller set and applied to others
    # so they start as None and then will be replaced.
    # They are tracked separately per signal-to-noise ratio and reset for every fold.
    scale_transformers = {ratio: None for ratio in signal_to_noise_ratios}
    pca_transformers = {ratio: None for ratio in signal_to_noise_ratios}

    # NOTE: "train_smaller" comes first, so the transformers stored above have already
    # been replaced by the time "validation" and "test" are processed.
    for fold_label in ["train_smaller", "validation", "test"]:
        if fold_id == -1 and fold_label == "test":
            # skip: global fold does not have a test set
            continue

        # Load the real data for this fold once; it is reused for every ratio below.
        adata_fold = io.load_fold_embeddings(
            fold_id=fold_id,
            fold_label=fold_label,
            gene_locus=gene_locus,
            target_obs_column=TargetObsColumnEnum.disease,
            sample_weight_strategy=SampleWeightStrategy.ISOTYPE_USAGE,
            load_isotype_counts_per_specimen=False,
        )
        for signal_to_noise_ratio in signal_to_noise_ratios:
            # vary fraction disease specific

            # TODO: we can also pull selected_disease_samples,selected_healthy_samples out to be shared between signal_to_noise_ratios
            # (but would need to change pop())
            adata = simulate_for_fold(
                adata_fold=adata_fold,
                signal_to_noise_ratio=signal_to_noise_ratio,
                n_disease_specimens=5,
                n_healthy_specimens=5,
                # n_sequences_per_specimen_per_isotype=100, # removed because expect mostly IgG from Covid patients
                min_number_of_sequences_per_specimen=10,  # TODO: 100
            )

            # Now scale, PCA, and export the data
            # Use transformers if available (starts as None)
            # Each helper returns (adata, transformer); the returned transformer is stored
            # back into the dict so later fold labels of this fold receive it.
            (
                adata,
                scale_transformers[signal_to_noise_ratio],
            ) = malid.external.genetools_scanpy_helpers.scale_anndata(
                adata,
                scale_transformer=scale_transformers[signal_to_noise_ratio],
                inplace=True,
                set_raw=True,
            )
            (
                adata,
                pca_transformers[signal_to_noise_ratio],
            ) = malid.external.genetools_scanpy_helpers.pca_anndata(
                adata,
                pca_transformer=pca_transformers[signal_to_noise_ratio],
                n_components=pca_n_comps,
                inplace=True,
            )
            # Sanity check on the PCA output stored in obsm["X_pca"].
            if adata.obsm["X_pca"].shape[1] != pca_n_comps:
                raise ValueError(
                    "PCA did not produce the expected number of components"
                )

            # Reduce disk space usage by removing unnecessary obs columns
            # Dropped: every column that is not listed in uns["original_obs_columns"]
            # (except is_actually_disease), plus the explicit list below.
            # If uns has no "original_obs_columns" key, the default [] means every column
            # other than is_actually_disease is dropped; the asserts below would catch that.
            # errors="ignore" tolerates listed columns that are not present.
            adata.obs.drop(
                columns=list(
                    set(adata.obs.columns)
                    - set(adata.uns.get("original_obs_columns", []))
                    # do not delete this column
                    - {"is_actually_disease"}
                )
                + [
                    "num_reads",
                    "total_clone_num_reads",
                    "num_clone_members",
                    "cdr1_seq_aa_q_trim",
                    "cdr2_seq_aa_q_trim",
                    "extracted_isotype",
                    "igh_or_tcrb_clone_id",
                    "cdr3_aa_sequence_trim_len",
                    "disease_subtype",
                ],
                errors="ignore",
                inplace=True,
            )
            # Sanity check: make sure we did not drop these columns
            assert "disease" in adata.obs.columns
            assert "is_actually_disease" in adata.obs.columns

            # Also remove any uns keys that were added after the original read-from-disk step within load_fold_embeddings
            # (the key set is materialized before the loop, so deleting while iterating is safe)
            for key in set(adata.uns.keys()) - set(
                adata.uns.get("original_uns_keys", [])
            ):
                del adata.uns[key]

            # Also remove large string index
            # this is a RangeIndex, but after reading back in, these will become strings automatically (ImplicitModificationWarning: Transforming to str index.)
            adata.obs_names = range(adata.shape[0])

            # Save some space on this field too
            adata.obs["v_mut"] = adata.obs["v_mut"].astype(np.float32)

            # Write anndata and CSVs to disk
            # Files written per fold/label: .h5ad (full anndata), .obs.tsv.gz (obs table),
            # .X.tsv.gz (X matrix), .pca.h5ad and .X_pca.tsv.gz (PCA-only versions).
            output_dir_anndatas = output_dirs[signal_to_noise_ratio]
            fname_out = output_dir_anndatas / f"fold.{fold_id}.{fold_label}.h5ad"
            logger.info(f"Fold {fold_id}-{fold_label}, {gene_locus} -> {fname_out}")
            adata.write(fname_out, compression="gzip")
            adata.obs.to_csv(
                output_dir_anndatas / f"fold.{fold_id}.{fold_label}.obs.tsv.gz",
                index=None,
                sep="\t",
            )
            # np.savetxt gzip-compresses automatically because the filename ends in .gz.
            np.savetxt(
                output_dir_anndatas / f"fold.{fold_id}.{fold_label}.X.tsv.gz",
                adata.X,
                fmt="%0.4f",
                delimiter="\t",
            )

            # Replace .X with X_pca
            # (a new anndata is built; obs and uns are carried over)
            adata = anndata.AnnData(
                X=adata.obsm["X_pca"],
                obs=adata.obs,
                uns=adata.uns,
            )
            # Writing out the anndata again is unnecessary - already have X_pca in original
            adata.write(
                output_dir_anndatas / f"fold.{fold_id}.{fold_label}.pca.h5ad",
                compression="gzip",
            )
            np.savetxt(
                output_dir_anndatas / f"fold.{fold_id}.{fold_label}.X_pca.tsv.gz",
                adata.X,
                fmt="%0.4f",
                delimiter="\t",
            )

            # Free the simulated dataset before building the next one.
            del adata
            gc.collect()

        # after finishing all desired signal to noise ratios, delete cached dataset
        io.clear_cached_fold_embeddings()
        gc.collect()

2022-08-31 01:14:30,556 - malid.external.scratch_cache - INFO - Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20220704_filter2/embedded/unirep_fine_tuned/anndatas_scaled/BCR/fold.0.train_smaller.h5ad -> /srv/scratch/maximz/cache/de2b6a86e81c33b78d6afeb74df304e81e419dd0a0a9e3f3ffe4dfa2.0.train_smaller.h5ad


/users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20220704_filter2/embedded/unirep_fine_tuned/simulated_data/scaled_anndatas_0.05
/users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20220704_filter2/embedded/unirep_fine_tuned/simulated_data/scaled_anndatas_0.10
/users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20220704_filter2/embedded/unirep_fine_tuned/simulated_data/scaled_anndatas_0.25
Only considering the two last: ['.train_smaller', '.h5ad'].
Only considering the two last: ['.train_smaller', '.h5ad'].


2022-08-31 01:15:22,357 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M369-S001 from BFI-0007450.
2022-08-31 01:15:23,605 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M371-S001 from BFI-0007480.
2022-08-31 01:15:24,833 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M371-S023 from BFI-0007482.
2022-08-31 01:15:25,994 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M371-S008 from BFI-0007483.
2022-08-31 01:15:26,581 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M371-S009 from BFI-0007484.
2022-08-31 01:15:28,586 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding healthy specimen M124-S012 from BFI-0002852.
2022-08-31 01:15:29,550 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding healthy specimen M124-S038 from BFI-0002866.
2022-08-31 01

Only considering the two last: ['.validation', '.h5ad'].
Only considering the two last: ['.validation', '.h5ad'].


2022-08-31 01:51:19,620 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M371-S017 from BFI-0007453.
2022-08-31 01:51:24,271 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M418-S230 from BFI-0009036.
2022-08-31 01:51:25,865 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M418-S139 from BFI-0009076.
2022-08-31 01:51:26,575 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M418-S021 from BFI-0009139.
2022-08-31 01:51:27,862 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M418-S027 from BFI-0009144.
2022-08-31 01:51:28,693 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding healthy specimen M124-S040 from BFI-0002868.
2022-08-31 01:51:30,935 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding healthy specimen M64-002 from BFI-0003051.
2022-08-31 01:5

Only considering the two last: ['.test', '.h5ad'].
Only considering the two last: ['.test', '.h5ad'].


2022-08-31 02:39:32,711 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M369-S007 from BFI-0007455.
2022-08-31 02:39:35,287 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M371-S028 from BFI-0007486.
2022-08-31 02:39:38,915 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M418-S144 from BFI-0009080.
2022-08-31 02:39:43,050 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M418-S193 from BFI-0009121.
2022-08-31 02:39:43,790 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M418-S001 from BFI-0009122.
2022-08-31 02:39:44,645 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding healthy specimen M124-S014 from BFI-0000234.
2022-08-31 02:39:45,445 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding healthy specimen M124-S042 from BFI-0002850.
2022-08-31 02

Only considering the two last: ['.train_smaller', '.h5ad'].
Only considering the two last: ['.train_smaller', '.h5ad'].


2022-08-31 03:07:11,812 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M371-S017 from BFI-0007453.
2022-08-31 03:07:14,371 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M371-S023 from BFI-0007482.
2022-08-31 03:07:15,035 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M371-S011 from BFI-0007485.
2022-08-31 03:07:15,706 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M371-S028 from BFI-0007486.
2022-08-31 03:07:17,388 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M418-S230 from BFI-0009036.
2022-08-31 03:07:18,295 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding healthy specimen M124-S042 from BFI-0002850.
2022-08-31 03:07:19,134 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding healthy specimen M124-S012 from BFI-0002852.
2022-08-31 03

Only considering the two last: ['.validation', '.h5ad'].
Only considering the two last: ['.validation', '.h5ad'].


2022-08-31 03:33:36,606 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M369-S007 from BFI-0007455.
2022-08-31 03:33:39,692 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M371-S001 from BFI-0007480.
2022-08-31 03:33:41,255 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M371-S008 from BFI-0007483.
2022-08-31 03:33:42,779 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M418-S163 from BFI-0009094.
2022-08-31 03:33:44,484 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M418-S001 from BFI-0009122.
2022-08-31 03:33:45,477 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding healthy specimen M124-S014 from BFI-0000234.
2022-08-31 03:33:46,394 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding healthy specimen M124-S041 from BFI-0002851.
2022-08-31 03

Only considering the two last: ['.test', '.h5ad'].
Only considering the two last: ['.test', '.h5ad'].


2022-08-31 03:56:19,102 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M369-S001 from BFI-0007450.
2022-08-31 03:56:20,760 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M371-S009 from BFI-0007484.
2022-08-31 03:56:22,617 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M418-S094 from BFI-0009047.
2022-08-31 03:56:23,163 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M418-S095 from BFI-0009048.
2022-08-31 03:56:24,054 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M418-S139 from BFI-0009076.
2022-08-31 03:56:24,770 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding healthy specimen M124-S038 from BFI-0002866.
2022-08-31 03:56:25,573 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding healthy specimen M124-S040 from BFI-0002868.
2022-08-31 03

Only considering the two last: ['.train_smaller', '.h5ad'].
Only considering the two last: ['.train_smaller', '.h5ad'].


2022-08-31 04:45:22,373 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M369-S001 from BFI-0007450.
2022-08-31 04:45:26,127 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M369-S007 from BFI-0007455.
2022-08-31 04:45:27,251 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M418-S094 from BFI-0009047.
2022-08-31 04:45:27,905 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M418-S095 from BFI-0009048.
2022-08-31 04:45:29,642 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M418-S139 from BFI-0009076.
2022-08-31 04:45:30,328 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding healthy specimen M124-S042 from BFI-0002850.
2022-08-31 04:45:30,809 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding healthy specimen M124-S037 from BFI-0002861.
2022-08-31 04

Only considering the two last: ['.validation', '.h5ad'].
Only considering the two last: ['.validation', '.h5ad'].


2022-08-31 05:08:25,335 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M371-S009 from BFI-0007484.
2022-08-31 05:08:27,975 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M371-S028 from BFI-0007486.
2022-08-31 05:08:31,496 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M418-S144 from BFI-0009080.
2022-08-31 05:08:34,506 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M418-S001 from BFI-0009122.
2022-08-31 05:08:35,204 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M418-S014 from BFI-0009132.
2022-08-31 05:08:36,066 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding healthy specimen M124-S014 from BFI-0000234.
2022-08-31 05:08:37,056 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding healthy specimen M124-S041 from BFI-0002851.
2022-08-31 05

Only considering the two last: ['.test', '.h5ad'].
Only considering the two last: ['.test', '.h5ad'].


2022-08-31 05:52:38,023 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M371-S017 from BFI-0007453.
2022-08-31 05:52:40,026 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M371-S001 from BFI-0007480.
2022-08-31 05:52:40,773 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M371-S023 from BFI-0007482.
2022-08-31 05:52:41,600 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M371-S008 from BFI-0007483.
2022-08-31 05:52:42,241 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding disease specimen M371-S011 from BFI-0007485.
2022-08-31 05:52:43,011 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding healthy specimen M124-S012 from BFI-0002852.
2022-08-31 05:52:44,023 - generate_simulation_datasets.known_binders_only.ipynb - INFO - Adding healthy specimen M124-S039 from BFI-0002867.
2022-08-31 05