# Analyze convergent clustering model performance on validation set

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline
import seaborn as sns
import genetools
from IPython.display import display

In [3]:
from malid import config, logger
from malid.external import model_evaluation
from malid.datamodels import (
    combine_classification_option_names,
)
from malid.trained_model_wrappers import ConvergentClusterClassifier

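# Analyze: summarize cross-validation performance and chosen p-value thresholds for each gene locus and classification target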

In [5]:
# Models whose chosen p-value threshold and per-class cluster counts are printed for each fold.
# Defined once here because the list does not change between loop iterations.
models_to_inspect = ["lasso_multiclass", "rf_multiclass", "linearsvm_ovr"]

# For every (gene locus, classification target) combination: load the saved experiment set,
# drop the global fold, export per-model reports, then print the p-value threshold chosen by
# each inspected model in each cross-validation fold.
for gene_locus in config.gene_loci_used:
    for target_obs_col in config.classification_targets:
        models_base_dir = ConvergentClusterClassifier._get_model_base_dir(
            gene_locus=gene_locus, target_obs_column=target_obs_col
        )  # should already exist

        output_base_dir = (
            config.paths.convergent_clusters_output_dir
            / gene_locus.name
            / combine_classification_option_names(target_obs_col)
        )  # might not yet exist
        output_base_dir.mkdir(parents=True, exist_ok=True)  # create if needed

        # Same file stem is used for the model inputs and for the exported results.
        model_output_prefix = models_base_dir / "train_smaller_model"
        results_output_prefix = output_base_dir / "train_smaller_model"

        # A failure for one combination is logged and the loop moves on to the next one.
        try:
            logger.info(
                f"{gene_locus}, {target_obs_col} from {model_output_prefix} to {results_output_prefix}"
            )

            ## Load and summarize
            experiment_set = model_evaluation.ExperimentSet.load_from_disk(
                output_prefix=model_output_prefix
            )

            # Remove global fold (we trained global fold model, but now get evaluation scores on cross-validation folds only)
            # TODO: make kdict support: del self.model_outputs[:, fold_id]
            # Snapshot the keys before deleting: we must not remove entries from model_outputs
            # while still iterating over a key collection that may be backed by it.
            global_fold_keys = list(experiment_set.model_outputs[:, -1].keys())
            for key in global_fold_keys:
                logger.debug(f"Removing {key} (global fold)")
                del experiment_set.model_outputs[key]

            experiment_set_global_performance = experiment_set.summarize()
            experiment_set_global_performance.export_all_models(
                func_generate_classification_report_fname=lambda model_name: f"{results_output_prefix}.classification_report.{model_name}.txt",
                func_generate_confusion_matrix_fname=lambda model_name: f"{results_output_prefix}.confusion_matrix.{model_name}.png",
                dpi=72,
            )
            combined_stats = (
                experiment_set_global_performance.get_model_comparison_stats()
            )
            combined_stats.to_csv(
                f"{results_output_prefix}.compare_model_scores.tsv",
                sep="\t",
            )
            print(gene_locus, target_obs_col)
            display(combined_stats)

            # Which p values were chosen (varies by locus, model, and fold)? How many disease-associated sequences found?
            for fold_id in config.cross_validation_fold_ids:
                for model_name in models_to_inspect:
                    clf = ConvergentClusterClassifier(
                        fold_id=fold_id,
                        model_name=model_name,
                        fold_label_train="train_smaller",
                        gene_locus=gene_locus,
                        target_obs_column=target_obs_col,
                    )
                    p_value = clf.p_value_threshold
                    clusters = clf.cluster_centroids_with_class_specific_p_values
                    feature_names = clf.feature_names_in_
                    # Per feature column, count the clusters whose p-value is at or below the chosen threshold.
                    print(
                        f"{gene_locus}, fold {fold_id}, {target_obs_col}, {model_name}: best p value = {p_value}. Number of disease associated clusters: {(clusters[feature_names] <= p_value).sum().to_dict()}"
                    )
                print()  # blank line between folds

        except Exception as err:
            logger.exception(f"{gene_locus}, {target_obs_col} failed with error: {err}")

2023-01-07 03:53:02,592 - analyze_convergent_clustering_models.ipynb - INFO - GeneLocus.BCR, TargetObsColumnEnum.disease from /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/convergent_clusters/BCR/disease/train_smaller_model to /users/maximz/code/boyd-immune-repertoire-classification/out/convergent_clusters/BCR/disease/train_smaller_model


GeneLocus.BCR TargetObsColumnEnum.disease


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,Accuracy per fold with abstention,MCC per fold with abstention,...,au-PRC (macro OvO) per fold with abstention,Accuracy global with abstention,MCC global with abstention,Unknown/abstention proportion global with abstention,Abstention label global with abstention,sample_size,n_abstentions,sample_size including abstentions,abstention_rate,missing_classes
lasso_multiclass,0.912 +/- 0.010 (in 3 folds),0.923 +/- 0.009 (in 3 folds),0.917 +/- 0.004 (in 3 folds),0.930 +/- 0.004 (in 3 folds),0.750 +/- 0.010 (in 3 folds),0.641 +/- 0.021 (in 3 folds),0.75,0.637,0.743 +/- 0.019 (in 3 folds),0.633 +/- 0.027 (in 3 folds),...,0.931 +/- 0.005 (in 2 folds),0.743,0.628,0.009,Unknown,320,3,323,0.009288,False
linearsvm_ovr,0.912 +/- 0.004 (in 3 folds),0.922 +/- 0.004 (in 3 folds),0.919 +/- 0.007 (in 3 folds),0.931 +/- 0.006 (in 3 folds),0.754 +/- 0.012 (in 3 folds),0.647 +/- 0.017 (in 3 folds),0.754,0.643,0.749 +/- 0.005 (in 3 folds),0.642 +/- 0.008 (in 3 folds),...,0.934 +/- 0.004 (in 2 folds),0.749,0.638,0.006,Unknown,321,2,323,0.006192,False
ridge_cv,0.903 +/- 0.002 (in 3 folds),0.915 +/- 0.003 (in 3 folds),0.905 +/- 0.006 (in 3 folds),0.920 +/- 0.002 (in 3 folds),0.563 +/- 0.077 (in 3 folds),0.337 +/- 0.138 (in 3 folds),0.562,0.344,0.561 +/- 0.078 (in 3 folds),0.335 +/- 0.138 (in 3 folds),...,0.920 +/- 0.003 (in 2 folds),0.56,0.342,0.003,Unknown,322,1,323,0.003096,False
lasso_cv,0.897 +/- 0.017 (in 3 folds),0.907 +/- 0.018 (in 3 folds),0.903 +/- 0.012 (in 3 folds),0.916 +/- 0.012 (in 3 folds),0.532 +/- 0.035 (in 3 folds),0.284 +/- 0.068 (in 3 folds),0.531,0.287,0.524 +/- 0.041 (in 3 folds),0.266 +/- 0.089 (in 3 folds),...,0.921 +/- 0.000 (in 1 folds),0.523,0.268,0.015,Unknown,318,5,323,0.01548,True
rf_multiclass,0.889 +/- 0.024 (in 3 folds),0.896 +/- 0.025 (in 3 folds),0.885 +/- 0.024 (in 3 folds),0.894 +/- 0.023 (in 3 folds),0.740 +/- 0.022 (in 3 folds),0.616 +/- 0.034 (in 3 folds),0.74,0.613,0.731 +/- 0.032 (in 3 folds),0.606 +/- 0.046 (in 3 folds),...,0.888 +/- 0.000 (in 1 folds),0.731,0.602,0.012,Unknown,319,4,323,0.012384,False
xgboost,0.882 +/- 0.009 (in 3 folds),0.887 +/- 0.009 (in 3 folds),0.883 +/- 0.009 (in 3 folds),0.892 +/- 0.008 (in 3 folds),0.721 +/- 0.007 (in 3 folds),0.587 +/- 0.004 (in 3 folds),0.721,0.583,0.712 +/- 0.010 (in 3 folds),0.577 +/- 0.010 (in 3 folds),...,0.901 +/- 0.000 (in 1 folds),0.712,0.573,0.012,Unknown,319,4,323,0.012384,False
elasticnet_cv,0.878 +/- 0.023 (in 3 folds),0.891 +/- 0.019 (in 3 folds),0.891 +/- 0.020 (in 3 folds),0.906 +/- 0.017 (in 3 folds),0.590 +/- 0.129 (in 3 folds),0.370 +/- 0.219 (in 3 folds),0.589,0.386,0.587 +/- 0.133 (in 3 folds),0.368 +/- 0.223 (in 3 folds),...,0.906 +/- 0.024 (in 2 folds),0.585,0.382,0.006,Unknown,321,2,323,0.006192,False
dummy_stratified,0.536 +/- 0.032 (in 3 folds),0.543 +/- 0.036 (in 3 folds),0.529 +/- 0.020 (in 3 folds),0.535 +/- 0.024 (in 3 folds),0.378 +/- 0.047 (in 3 folds),0.066 +/- 0.060 (in 3 folds),0.377,0.064,0.366 +/- 0.040 (in 3 folds),0.064 +/- 0.055 (in 3 folds),...,,0.365,0.063,0.031,Unknown,313,10,323,0.03096,False
dummy_most_frequent,0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.467 +/- 0.016 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.466,0.0,0.452 +/- 0.010 (in 3 folds),0.028 +/- 0.020 (in 3 folds),...,,0.452,0.025,0.031,Unknown,313,10,323,0.03096,True


GeneLocus.BCR, fold 0, TargetObsColumnEnum.disease, lasso_multiclass: best p value = 0.0005. Number of disease associated clusters: {'Covid19': 137, 'HIV': 151, 'Healthy/Background': 20, 'Lupus': 23}
GeneLocus.BCR, fold 0, TargetObsColumnEnum.disease, rf_multiclass: best p value = 0.005. Number of disease associated clusters: {'Covid19': 354, 'HIV': 365, 'Healthy/Background': 102, 'Lupus': 182}
GeneLocus.BCR, fold 0, TargetObsColumnEnum.disease, linearsvm_ovr: best p value = 0.001. Number of disease associated clusters: {'Covid19': 144, 'HIV': 172, 'Healthy/Background': 35, 'Lupus': 41}

GeneLocus.BCR, fold 1, TargetObsColumnEnum.disease, lasso_multiclass: best p value = 0.005. Number of disease associated clusters: {'Covid19': 301, 'HIV': 212, 'Healthy/Background': 157, 'Lupus': 291}


2023-01-07 03:53:07,843 - analyze_convergent_clustering_models.ipynb - INFO - GeneLocus.BCR, TargetObsColumnEnum.disease_all_demographics_present from /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/convergent_clusters/BCR/disease_all_demographics_present/train_smaller_model to /users/maximz/code/boyd-immune-repertoire-classification/out/convergent_clusters/BCR/disease_all_demographics_present/train_smaller_model


GeneLocus.BCR, fold 1, TargetObsColumnEnum.disease, rf_multiclass: best p value = 0.005. Number of disease associated clusters: {'Covid19': 301, 'HIV': 212, 'Healthy/Background': 157, 'Lupus': 291}
GeneLocus.BCR, fold 1, TargetObsColumnEnum.disease, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'Covid19': 351, 'HIV': 552, 'Healthy/Background': 301, 'Lupus': 709}

GeneLocus.BCR, fold 2, TargetObsColumnEnum.disease, lasso_multiclass: best p value = 0.005. Number of disease associated clusters: {'Covid19': 149, 'HIV': 221, 'Healthy/Background': 147, 'Lupus': 414}
GeneLocus.BCR, fold 2, TargetObsColumnEnum.disease, rf_multiclass: best p value = 0.0005. Number of disease associated clusters: {'Covid19': 58, 'HIV': 89, 'Healthy/Background': 28, 'Lupus': 63}
GeneLocus.BCR, fold 2, TargetObsColumnEnum.disease, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'Covid19': 187, 'HIV': 594, 'Healthy/Background': 284, 'Lupus': 940}



GeneLocus.BCR TargetObsColumnEnum.disease_all_demographics_present


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,Accuracy per fold with abstention,MCC per fold with abstention,...,au-PRC (macro OvO) per fold with abstention,Accuracy global with abstention,MCC global with abstention,Unknown/abstention proportion global with abstention,Abstention label global with abstention,sample_size,n_abstentions,sample_size including abstentions,abstention_rate,missing_classes
linearsvm_ovr,0.914 +/- 0.004 (in 3 folds),0.924 +/- 0.006 (in 3 folds),0.921 +/- 0.006 (in 3 folds),0.931 +/- 0.009 (in 3 folds),0.746 +/- 0.026 (in 3 folds),0.620 +/- 0.030 (in 3 folds),0.746,0.62,0.741 +/- 0.029 (in 3 folds),0.614 +/- 0.030 (in 3 folds),...,0.923 +/- 0.000 (in 1 folds),0.74,0.614,0.007,Unknown,283,2,285,0.007018,False
lasso_multiclass,0.911 +/- 0.014 (in 3 folds),0.921 +/- 0.013 (in 3 folds),0.920 +/- 0.014 (in 3 folds),0.930 +/- 0.014 (in 3 folds),0.738 +/- 0.028 (in 3 folds),0.622 +/- 0.043 (in 3 folds),0.738,0.618,0.730 +/- 0.028 (in 3 folds),0.613 +/- 0.042 (in 3 folds),...,,0.73,0.609,0.011,Unknown,282,3,285,0.010526,False
ridge_cv,0.900 +/- 0.015 (in 3 folds),0.912 +/- 0.017 (in 3 folds),0.902 +/- 0.025 (in 3 folds),0.916 +/- 0.027 (in 3 folds),0.534 +/- 0.111 (in 3 folds),0.171 +/- 0.296 (in 3 folds),0.535,0.292,0.508 +/- 0.120 (in 3 folds),0.193 +/- 0.256 (in 3 folds),...,,0.509,0.229,0.049,Unknown,271,14,285,0.049123,False
xgboost,0.890 +/- 0.015 (in 3 folds),0.894 +/- 0.021 (in 3 folds),0.899 +/- 0.017 (in 3 folds),0.907 +/- 0.024 (in 3 folds),0.728 +/- 0.016 (in 3 folds),0.593 +/- 0.040 (in 3 folds),0.728,0.592,0.712 +/- 0.006 (in 3 folds),0.574 +/- 0.022 (in 3 folds),...,0.880 +/- 0.000 (in 1 folds),0.712,0.573,0.021,Unknown,279,6,285,0.021053,False
lasso_cv,0.884 +/- 0.031 (in 3 folds),0.898 +/- 0.030 (in 3 folds),0.888 +/- 0.032 (in 3 folds),0.905 +/- 0.032 (in 3 folds),0.588 +/- 0.142 (in 3 folds),0.380 +/- 0.206 (in 3 folds),0.587,0.376,0.584 +/- 0.144 (in 3 folds),0.367 +/- 0.212 (in 3 folds),...,0.868 +/- 0.000 (in 1 folds),0.582,0.367,0.007,Unknown,283,2,285,0.007018,False
rf_multiclass,0.878 +/- 0.018 (in 3 folds),0.885 +/- 0.022 (in 3 folds),0.883 +/- 0.025 (in 3 folds),0.893 +/- 0.028 (in 3 folds),0.739 +/- 0.019 (in 3 folds),0.612 +/- 0.042 (in 3 folds),0.739,0.612,0.726 +/- 0.021 (in 3 folds),0.596 +/- 0.032 (in 3 folds),...,0.890 +/- 0.000 (in 1 folds),0.726,0.594,0.018,Unknown,280,5,285,0.017544,False
elasticnet_cv,0.878 +/- 0.013 (in 3 folds),0.891 +/- 0.009 (in 3 folds),0.896 +/- 0.015 (in 3 folds),0.911 +/- 0.012 (in 3 folds),0.610 +/- 0.143 (in 3 folds),0.422 +/- 0.212 (in 3 folds),0.611,0.431,0.601 +/- 0.150 (in 3 folds),0.391 +/- 0.247 (in 3 folds),...,0.897 +/- 0.000 (in 1 folds),0.6,0.405,0.018,Unknown,280,5,285,0.017544,False
dummy_stratified,0.549 +/- 0.026 (in 3 folds),0.547 +/- 0.016 (in 3 folds),0.533 +/- 0.013 (in 3 folds),0.534 +/- 0.009 (in 3 folds),0.414 +/- 0.059 (in 3 folds),0.110 +/- 0.066 (in 3 folds),0.41,0.105,0.393 +/- 0.023 (in 3 folds),0.104 +/- 0.055 (in 3 folds),...,0.529 +/- 0.007 (in 2 folds),0.393,0.102,0.042,Unknown,273,12,285,0.042105,False
dummy_most_frequent,0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.474 +/- 0.043 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.472,0.0,0.442 +/- 0.013 (in 3 folds),0.035 +/- 0.020 (in 3 folds),...,,0.442,0.029,0.063,Unknown,267,18,285,0.063158,True


GeneLocus.BCR, fold 0, TargetObsColumnEnum.disease_all_demographics_present, lasso_multiclass: best p value = 0.005. Number of disease associated clusters: {'Covid19': 323, 'HIV': 286, 'Healthy/Background': 99, 'Lupus': 124}
GeneLocus.BCR, fold 0, TargetObsColumnEnum.disease_all_demographics_present, rf_multiclass: best p value = 0.001. Number of disease associated clusters: {'Covid19': 158, 'HIV': 143, 'Healthy/Background': 33, 'Lupus': 25}
GeneLocus.BCR, fold 0, TargetObsColumnEnum.disease_all_demographics_present, linearsvm_ovr: best p value = 0.005. Number of disease associated clusters: {'Covid19': 323, 'HIV': 286, 'Healthy/Background': 99, 'Lupus': 124}

GeneLocus.BCR, fold 1, TargetObsColumnEnum.disease_all_demographics_present, lasso_multiclass: best p value = 0.005. Number of disease associated clusters: {'Covid19': 147, 'HIV': 175, 'Healthy/Background': 144, 'Lupus': 209}


GeneLocus.BCR, fold 1, TargetObsColumnEnum.disease_all_demographics_present, rf_multiclass: best p value = 0.01. Number of disease associated clusters: {'Covid19': 702, 'HIV': 499, 'Healthy/Background': 215, 'Lupus': 708}


2023-01-07 03:53:12,587 - analyze_convergent_clustering_models.ipynb - INFO - GeneLocus.BCR, TargetObsColumnEnum.covid_vs_healthy from /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/convergent_clusters/BCR/covid_vs_healthy/train_smaller_model to /users/maximz/code/boyd-immune-repertoire-classification/out/convergent_clusters/BCR/covid_vs_healthy/train_smaller_model


GeneLocus.BCR, fold 1, TargetObsColumnEnum.disease_all_demographics_present, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'Covid19': 702, 'HIV': 499, 'Healthy/Background': 215, 'Lupus': 708}

GeneLocus.BCR, fold 2, TargetObsColumnEnum.disease_all_demographics_present, lasso_multiclass: best p value = 0.005. Number of disease associated clusters: {'Covid19': 64, 'HIV': 186, 'Healthy/Background': 124, 'Lupus': 267}
GeneLocus.BCR, fold 2, TargetObsColumnEnum.disease_all_demographics_present, rf_multiclass: best p value = 0.005. Number of disease associated clusters: {'Covid19': 64, 'HIV': 186, 'Healthy/Background': 124, 'Lupus': 267}
GeneLocus.BCR, fold 2, TargetObsColumnEnum.disease_all_demographics_present, linearsvm_ovr: best p value = 0.005. Number of disease associated clusters: {'Covid19': 64, 'HIV': 186, 'Healthy/Background': 124, 'Lupus': 267}



GeneLocus.BCR TargetObsColumnEnum.covid_vs_healthy


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,ROC-AUC (weighted OvO) per fold with abstention,ROC-AUC (macro OvO) per fold with abstention,...,Unknown/abstention proportion per fold with abstention,Accuracy global with abstention,MCC global with abstention,Unknown/abstention proportion global with abstention,Abstention label global with abstention,sample_size,n_abstentions,sample_size including abstentions,abstention_rate,missing_classes
linearsvm_ovr,0.994 +/- 0.001 (in 3 folds),0.994 +/- 0.001 (in 3 folds),0.998 +/- 0.000 (in 3 folds),0.998 +/- 0.000 (in 3 folds),0.963 +/- 0.018 (in 3 folds),0.903 +/- 0.041 (in 3 folds),0.963,0.898,0.994 +/- 0.000 (in 1 folds),0.994 +/- 0.000 (in 1 folds),...,0.016 +/- 0.000 (in 2 folds),0.953,0.873,0.01,Unknown,189,2,191,0.010471,False
ridge_cv,0.992 +/- 0.004 (in 3 folds),0.992 +/- 0.004 (in 3 folds),0.998 +/- 0.001 (in 3 folds),0.998 +/- 0.001 (in 3 folds),0.789 +/- 0.013 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.789,0.0,,,...,0.031 +/- 0.016 (in 3 folds),0.764,0.061,0.031,Unknown,185,6,191,0.031414,True
lasso_multiclass,0.992 +/- 0.003 (in 3 folds),0.992 +/- 0.003 (in 3 folds),0.998 +/- 0.000 (in 3 folds),0.998 +/- 0.000 (in 3 folds),0.963 +/- 0.018 (in 3 folds),0.897 +/- 0.048 (in 3 folds),0.963,0.892,0.993 +/- 0.000 (in 1 folds),0.993 +/- 0.000 (in 1 folds),...,0.016 +/- 0.000 (in 2 folds),0.953,0.866,0.01,Unknown,189,2,191,0.010471,False
elasticnet_cv,0.990 +/- 0.008 (in 3 folds),0.990 +/- 0.008 (in 3 folds),0.997 +/- 0.002 (in 3 folds),0.997 +/- 0.002 (in 3 folds),0.803 +/- 0.017 (in 3 folds),0.275 +/- 0.057 (in 3 folds),0.803,0.279,0.986 +/- 0.004 (in 2 folds),0.986 +/- 0.004 (in 2 folds),...,0.048 +/- 0.000 (in 1 folds),0.791,0.22,0.016,Unknown,188,3,191,0.015707,False
rf_multiclass,0.989 +/- 0.004 (in 3 folds),0.989 +/- 0.004 (in 3 folds),0.997 +/- 0.001 (in 3 folds),0.997 +/- 0.001 (in 3 folds),0.947 +/- 0.025 (in 3 folds),0.853 +/- 0.064 (in 3 folds),0.947,0.851,0.989 +/- 0.000 (in 1 folds),0.989 +/- 0.000 (in 1 folds),...,0.016 +/- 0.000 (in 2 folds),0.937,0.828,0.01,Unknown,189,2,191,0.010471,False
lasso_cv,0.988 +/- 0.010 (in 3 folds),0.988 +/- 0.010 (in 3 folds),0.996 +/- 0.003 (in 3 folds),0.996 +/- 0.003 (in 3 folds),0.867 +/- 0.043 (in 3 folds),0.559 +/- 0.185 (in 3 folds),0.868,0.578,0.989 +/- 0.000 (in 1 folds),0.989 +/- 0.000 (in 1 folds),...,0.016 +/- 0.000 (in 2 folds),0.859,0.549,0.01,Unknown,189,2,191,0.010471,False
xgboost,0.981 +/- 0.020 (in 3 folds),0.981 +/- 0.020 (in 3 folds),0.993 +/- 0.006 (in 3 folds),0.993 +/- 0.006 (in 3 folds),0.958 +/- 0.033 (in 3 folds),0.869 +/- 0.110 (in 3 folds),0.957,0.872,0.975 +/- 0.023 (in 2 folds),0.975 +/- 0.023 (in 2 folds),...,0.063 +/- 0.000 (in 1 folds),0.937,0.817,0.021,Unknown,187,4,191,0.020942,False
dummy_most_frequent,0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.789 +/- 0.013 (in 3 folds),0.789 +/- 0.013 (in 3 folds),0.789 +/- 0.013 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.789,0.0,,,...,0.031 +/- 0.016 (in 3 folds),0.764,0.061,0.031,Unknown,185,6,191,0.031414,True
dummy_stratified,0.428 +/- 0.020 (in 3 folds),0.428 +/- 0.020 (in 3 folds),0.764 +/- 0.021 (in 3 folds),0.764 +/- 0.021 (in 3 folds),0.629 +/- 0.033 (in 3 folds),-0.150 +/- 0.034 (in 3 folds),0.628,-0.151,,,...,0.042 +/- 0.010 (in 3 folds),0.602,-0.121,0.042,Unknown,183,8,191,0.041885,False


GeneLocus.BCR, fold 0, TargetObsColumnEnum.covid_vs_healthy, lasso_multiclass: best p value = 0.005. Number of disease associated clusters: {'Covid19': 172, 'Healthy/Background': 145}
GeneLocus.BCR, fold 0, TargetObsColumnEnum.covid_vs_healthy, rf_multiclass: best p value = 0.01. Number of disease associated clusters: {'Covid19': 222, 'Healthy/Background': 204}
GeneLocus.BCR, fold 0, TargetObsColumnEnum.covid_vs_healthy, linearsvm_ovr: best p value = 0.005. Number of disease associated clusters: {'Covid19': 172, 'Healthy/Background': 145}

GeneLocus.BCR, fold 1, TargetObsColumnEnum.covid_vs_healthy, lasso_multiclass: best p value = 0.001. Number of disease associated clusters: {'Covid19': 60, 'Healthy/Background': 112}


GeneLocus.BCR, fold 1, TargetObsColumnEnum.covid_vs_healthy, rf_multiclass: best p value = 0.001. Number of disease associated clusters: {'Covid19': 60, 'Healthy/Background': 112}


2023-01-07 03:53:15,581 - analyze_convergent_clustering_models.ipynb - INFO - GeneLocus.BCR, TargetObsColumnEnum.hiv_vs_healthy from /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/convergent_clusters/BCR/hiv_vs_healthy/train_smaller_model to /users/maximz/code/boyd-immune-repertoire-classification/out/convergent_clusters/BCR/hiv_vs_healthy/train_smaller_model


GeneLocus.BCR, fold 1, TargetObsColumnEnum.covid_vs_healthy, linearsvm_ovr: best p value = 0.001. Number of disease associated clusters: {'Covid19': 60, 'Healthy/Background': 112}

GeneLocus.BCR, fold 2, TargetObsColumnEnum.covid_vs_healthy, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'Covid19': 102, 'Healthy/Background': 370}
GeneLocus.BCR, fold 2, TargetObsColumnEnum.covid_vs_healthy, rf_multiclass: best p value = 0.01. Number of disease associated clusters: {'Covid19': 102, 'Healthy/Background': 370}
GeneLocus.BCR, fold 2, TargetObsColumnEnum.covid_vs_healthy, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'Covid19': 102, 'Healthy/Background': 370}



GeneLocus.BCR TargetObsColumnEnum.hiv_vs_healthy


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,Accuracy per fold with abstention,MCC per fold with abstention,...,Abstention label global with abstention,sample_size,n_abstentions,sample_size including abstentions,abstention_rate,missing_classes,ROC-AUC (weighted OvO) per fold with abstention,ROC-AUC (macro OvO) per fold with abstention,au-PRC (weighted OvO) per fold with abstention,au-PRC (macro OvO) per fold with abstention
elasticnet_cv,0.975 +/- 0.008 (in 3 folds),0.975 +/- 0.008 (in 3 folds),0.987 +/- 0.006 (in 3 folds),0.987 +/- 0.006 (in 3 folds),0.811 +/- 0.087 (in 3 folds),0.539 +/- 0.236 (in 3 folds),0.808,0.556,0.752 +/- 0.066 (in 3 folds),0.386 +/- 0.237 (in 3 folds),...,Unknown,198,15,213,0.070423,False,,,,
ridge_cv,0.973 +/- 0.005 (in 3 folds),0.973 +/- 0.005 (in 3 folds),0.988 +/- 0.003 (in 3 folds),0.988 +/- 0.003 (in 3 folds),0.701 +/- 0.006 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.701,0.0,0.695 +/- 0.014 (in 3 folds),0.012 +/- 0.021 (in 3 folds),...,Unknown,211,2,213,0.00939,True,0.972 +/- 0.007 (in 2 folds),0.972 +/- 0.007 (in 2 folds),0.988 +/- 0.004 (in 2 folds),0.988 +/- 0.004 (in 2 folds)
linearsvm_ovr,0.971 +/- 0.009 (in 3 folds),0.971 +/- 0.009 (in 3 folds),0.986 +/- 0.006 (in 3 folds),0.986 +/- 0.006 (in 3 folds),0.924 +/- 0.015 (in 3 folds),0.821 +/- 0.034 (in 3 folds),0.924,0.816,0.911 +/- 0.020 (in 3 folds),0.795 +/- 0.037 (in 3 folds),...,Unknown,210,3,213,0.014085,False,0.979 +/- 0.000 (in 1 folds),0.979 +/- 0.000 (in 1 folds),0.991 +/- 0.000 (in 1 folds),0.991 +/- 0.000 (in 1 folds)
lasso_cv,0.969 +/- 0.015 (in 3 folds),0.969 +/- 0.015 (in 3 folds),0.985 +/- 0.009 (in 3 folds),0.985 +/- 0.009 (in 3 folds),0.887 +/- 0.009 (in 3 folds),0.736 +/- 0.016 (in 3 folds),0.887,0.735,0.850 +/- 0.032 (in 3 folds),0.649 +/- 0.072 (in 3 folds),...,Unknown,204,9,213,0.042254,False,,,,
lasso_multiclass,0.969 +/- 0.006 (in 3 folds),0.969 +/- 0.006 (in 3 folds),0.985 +/- 0.005 (in 3 folds),0.985 +/- 0.005 (in 3 folds),0.933 +/- 0.016 (in 3 folds),0.839 +/- 0.041 (in 3 folds),0.933,0.839,0.911 +/- 0.020 (in 3 folds),0.793 +/- 0.049 (in 3 folds),...,Unknown,208,5,213,0.023474,False,,,,
xgboost,0.955 +/- 0.005 (in 3 folds),0.955 +/- 0.005 (in 3 folds),0.976 +/- 0.006 (in 3 folds),0.976 +/- 0.006 (in 3 folds),0.918 +/- 0.029 (in 3 folds),0.804 +/- 0.073 (in 3 folds),0.918,0.803,0.897 +/- 0.031 (in 3 folds),0.759 +/- 0.076 (in 3 folds),...,Unknown,208,5,213,0.023474,False,,,,
rf_multiclass,0.932 +/- 0.010 (in 3 folds),0.932 +/- 0.010 (in 3 folds),0.961 +/- 0.002 (in 3 folds),0.961 +/- 0.002 (in 3 folds),0.914 +/- 0.029 (in 3 folds),0.795 +/- 0.068 (in 3 folds),0.914,0.791,0.897 +/- 0.018 (in 3 folds),0.759 +/- 0.045 (in 3 folds),...,Unknown,209,4,213,0.018779,False,0.922 +/- 0.000 (in 1 folds),0.922 +/- 0.000 (in 1 folds),0.962 +/- 0.000 (in 1 folds),0.962 +/- 0.000 (in 1 folds)
dummy_most_frequent,0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.701 +/- 0.006 (in 3 folds),0.701 +/- 0.006 (in 3 folds),0.701 +/- 0.006 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.701,0.0,0.695 +/- 0.014 (in 3 folds),0.012 +/- 0.021 (in 3 folds),...,Unknown,211,2,213,0.00939,True,0.500 +/- 0.000 (in 2 folds),0.500 +/- 0.000 (in 2 folds),0.702 +/- 0.009 (in 2 folds),0.702 +/- 0.009 (in 2 folds)
dummy_stratified,0.493 +/- 0.051 (in 3 folds),0.493 +/- 0.051 (in 3 folds),0.664 +/- 0.029 (in 3 folds),0.664 +/- 0.029 (in 3 folds),0.556 +/- 0.039 (in 3 folds),-0.016 +/- 0.105 (in 3 folds),0.556,-0.017,0.469 +/- 0.015 (in 3 folds),-0.038 +/- 0.091 (in 3 folds),...,Unknown,180,33,213,0.15493,False,,,,


GeneLocus.BCR, fold 0, TargetObsColumnEnum.hiv_vs_healthy, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'HIV': 337, 'Healthy/Background': 62}
GeneLocus.BCR, fold 0, TargetObsColumnEnum.hiv_vs_healthy, rf_multiclass: best p value = 0.05. Number of disease associated clusters: {'HIV': 1298, 'Healthy/Background': 311}
GeneLocus.BCR, fold 0, TargetObsColumnEnum.hiv_vs_healthy, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'HIV': 337, 'Healthy/Background': 62}

GeneLocus.BCR, fold 1, TargetObsColumnEnum.hiv_vs_healthy, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'HIV': 213, 'Healthy/Background': 147}


2023-01-07 03:53:18,537 - analyze_convergent_clustering_models.ipynb - INFO - GeneLocus.BCR, TargetObsColumnEnum.lupus_vs_healthy from /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/convergent_clusters/BCR/lupus_vs_healthy/train_smaller_model to /users/maximz/code/boyd-immune-repertoire-classification/out/convergent_clusters/BCR/lupus_vs_healthy/train_smaller_model


GeneLocus.BCR, fold 1, TargetObsColumnEnum.hiv_vs_healthy, rf_multiclass: best p value = 0.01. Number of disease associated clusters: {'HIV': 213, 'Healthy/Background': 147}
GeneLocus.BCR, fold 1, TargetObsColumnEnum.hiv_vs_healthy, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'HIV': 213, 'Healthy/Background': 147}

GeneLocus.BCR, fold 2, TargetObsColumnEnum.hiv_vs_healthy, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'HIV': 216, 'Healthy/Background': 124}
GeneLocus.BCR, fold 2, TargetObsColumnEnum.hiv_vs_healthy, rf_multiclass: best p value = 0.01. Number of disease associated clusters: {'HIV': 216, 'Healthy/Background': 124}
GeneLocus.BCR, fold 2, TargetObsColumnEnum.hiv_vs_healthy, linearsvm_ovr: best p value = 0.05. Number of disease associated clusters: {'HIV': 885, 'Healthy/Background': 491}



GeneLocus.BCR TargetObsColumnEnum.lupus_vs_healthy


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,Accuracy per fold with abstention,MCC per fold with abstention,...,Abstention label global with abstention,sample_size,n_abstentions,sample_size including abstentions,abstention_rate,missing_classes,ROC-AUC (weighted OvO) per fold with abstention,ROC-AUC (macro OvO) per fold with abstention,au-PRC (weighted OvO) per fold with abstention,au-PRC (macro OvO) per fold with abstention
elasticnet_cv,0.863 +/- 0.029 (in 3 folds),0.863 +/- 0.029 (in 3 folds),0.739 +/- 0.118 (in 3 folds),0.739 +/- 0.118 (in 3 folds),0.747 +/- 0.046 (in 3 folds),0.389 +/- 0.152 (in 3 folds),0.747,0.387,0.541 +/- 0.136 (in 3 folds),0.206 +/- 0.136 (in 3 folds),...,Unknown,158,59,217,0.271889,False,,,,
linearsvm_ovr,0.863 +/- 0.016 (in 3 folds),0.863 +/- 0.016 (in 3 folds),0.780 +/- 0.027 (in 3 folds),0.780 +/- 0.027 (in 3 folds),0.821 +/- 0.060 (in 3 folds),0.581 +/- 0.141 (in 3 folds),0.821,0.576,0.802 +/- 0.035 (in 3 folds),0.547 +/- 0.098 (in 3 folds),...,Unknown,212,5,217,0.023041,False,0.853 +/- 0.000 (in 1 folds),0.853 +/- 0.000 (in 1 folds),0.787 +/- 0.000 (in 1 folds),0.787 +/- 0.000 (in 1 folds)
lasso_cv,0.862 +/- 0.035 (in 3 folds),0.862 +/- 0.035 (in 3 folds),0.766 +/- 0.075 (in 3 folds),0.766 +/- 0.075 (in 3 folds),0.764 +/- 0.031 (in 3 folds),0.436 +/- 0.111 (in 3 folds),0.762,0.424,0.650 +/- 0.043 (in 3 folds),0.259 +/- 0.090 (in 3 folds),...,Unknown,185,32,217,0.147465,False,,,,
lasso_multiclass,0.859 +/- 0.022 (in 3 folds),0.859 +/- 0.022 (in 3 folds),0.771 +/- 0.036 (in 3 folds),0.771 +/- 0.036 (in 3 folds),0.821 +/- 0.060 (in 3 folds),0.581 +/- 0.141 (in 3 folds),0.821,0.576,0.802 +/- 0.035 (in 3 folds),0.547 +/- 0.098 (in 3 folds),...,Unknown,212,5,217,0.023041,False,0.849 +/- 0.000 (in 1 folds),0.849 +/- 0.000 (in 1 folds),0.773 +/- 0.000 (in 1 folds),0.773 +/- 0.000 (in 1 folds)
rf_multiclass,0.829 +/- 0.034 (in 3 folds),0.829 +/- 0.034 (in 3 folds),0.759 +/- 0.071 (in 3 folds),0.759 +/- 0.071 (in 3 folds),0.808 +/- 0.059 (in 3 folds),0.557 +/- 0.143 (in 3 folds),0.809,0.549,0.723 +/- 0.112 (in 3 folds),0.452 +/- 0.128 (in 3 folds),...,Unknown,194,23,217,0.105991,False,,,,
xgboost,0.818 +/- 0.037 (in 3 folds),0.818 +/- 0.037 (in 3 folds),0.735 +/- 0.103 (in 3 folds),0.735 +/- 0.103 (in 3 folds),0.828 +/- 0.038 (in 3 folds),0.604 +/- 0.080 (in 3 folds),0.831,0.602,0.723 +/- 0.112 (in 3 folds),0.469 +/- 0.120 (in 3 folds),...,Unknown,189,28,217,0.129032,False,,,,
ridge_cv,0.746 +/- 0.219 (in 3 folds),0.746 +/- 0.219 (in 3 folds),0.538 +/- 0.244 (in 3 folds),0.538 +/- 0.244 (in 3 folds),0.709 +/- 0.019 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.707,0.0,0.323 +/- 0.057 (in 3 folds),0.016 +/- 0.037 (in 3 folds),...,Unknown,99,118,217,0.543779,True,,,,
dummy_stratified,0.541 +/- 0.093 (in 3 folds),0.541 +/- 0.093 (in 3 folds),0.348 +/- 0.045 (in 3 folds),0.348 +/- 0.045 (in 3 folds),0.550 +/- 0.075 (in 3 folds),0.072 +/- 0.167 (in 3 folds),0.536,0.046,0.272 +/- 0.046 (in 3 folds),0.019 +/- 0.060 (in 3 folds),...,Unknown,110,107,217,0.493088,False,,,,
dummy_most_frequent,0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.291 +/- 0.019 (in 3 folds),0.291 +/- 0.019 (in 3 folds),0.709 +/- 0.019 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.707,0.0,0.323 +/- 0.057 (in 3 folds),0.016 +/- 0.037 (in 3 folds),...,Unknown,99,118,217,0.543779,True,,,,


GeneLocus.BCR, fold 0, TargetObsColumnEnum.lupus_vs_healthy, lasso_multiclass: best p value = 0.05. Number of disease associated clusters: {'Healthy/Background': 75, 'Lupus': 902}
GeneLocus.BCR, fold 0, TargetObsColumnEnum.lupus_vs_healthy, rf_multiclass: best p value = 0.005. Number of disease associated clusters: {'Healthy/Background': 8, 'Lupus': 61}
GeneLocus.BCR, fold 0, TargetObsColumnEnum.lupus_vs_healthy, linearsvm_ovr: best p value = 0.05. Number of disease associated clusters: {'Healthy/Background': 75, 'Lupus': 902}

GeneLocus.BCR, fold 1, TargetObsColumnEnum.lupus_vs_healthy, lasso_multiclass: best p value = 0.05. Number of disease associated clusters: {'Healthy/Background': 93, 'Lupus': 1135}


2023-01-07 03:53:21,551 - analyze_convergent_clustering_models.ipynb - INFO - GeneLocus.BCR, TargetObsColumnEnum.ethnicity_condensed_healthy_only from /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/convergent_clusters/BCR/ethnicity_condensed_healthy_only/train_smaller_model to /users/maximz/code/boyd-immune-repertoire-classification/out/convergent_clusters/BCR/ethnicity_condensed_healthy_only/train_smaller_model


GeneLocus.BCR, fold 1, TargetObsColumnEnum.lupus_vs_healthy, rf_multiclass: best p value = 0.05. Number of disease associated clusters: {'Healthy/Background': 93, 'Lupus': 1135}
GeneLocus.BCR, fold 1, TargetObsColumnEnum.lupus_vs_healthy, linearsvm_ovr: best p value = 0.05. Number of disease associated clusters: {'Healthy/Background': 93, 'Lupus': 1135}

GeneLocus.BCR, fold 2, TargetObsColumnEnum.lupus_vs_healthy, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'Healthy/Background': 12, 'Lupus': 280}
GeneLocus.BCR, fold 2, TargetObsColumnEnum.lupus_vs_healthy, rf_multiclass: best p value = 0.01. Number of disease associated clusters: {'Healthy/Background': 12, 'Lupus': 280}
GeneLocus.BCR, fold 2, TargetObsColumnEnum.lupus_vs_healthy, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'Healthy/Background': 12, 'Lupus': 280}



GeneLocus.BCR TargetObsColumnEnum.ethnicity_condensed_healthy_only


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,Accuracy per fold with abstention,MCC per fold with abstention,...,Abstention label global with abstention,sample_size,n_abstentions,sample_size including abstentions,abstention_rate,missing_classes,ROC-AUC (weighted OvO) per fold with abstention,ROC-AUC (macro OvO) per fold with abstention,au-PRC (weighted OvO) per fold with abstention,au-PRC (macro OvO) per fold with abstention
elasticnet_cv,0.654 +/- 0.053 (in 3 folds),0.654 +/- 0.033 (in 3 folds),0.671 +/- 0.069 (in 3 folds),0.683 +/- 0.041 (in 3 folds),0.568 +/- 0.108 (in 3 folds),0.148 +/- 0.257 (in 3 folds),0.565,0.261,0.463 +/- 0.074 (in 3 folds),0.194 +/- 0.115 (in 3 folds),...,Unknown,108,24,132,0.181818,True,,,,
lasso_cv,0.633 +/- 0.068 (in 3 folds),0.621 +/- 0.056 (in 3 folds),0.655 +/- 0.085 (in 3 folds),0.651 +/- 0.062 (in 3 folds),0.519 +/- 0.120 (in 3 folds),0.069 +/- 0.119 (in 3 folds),0.513,0.15,0.437 +/- 0.056 (in 3 folds),0.131 +/- 0.051 (in 3 folds),...,Unknown,113,19,132,0.143939,True,,,,
rf_multiclass,0.628 +/- 0.041 (in 3 folds),0.622 +/- 0.029 (in 3 folds),0.636 +/- 0.023 (in 3 folds),0.618 +/- 0.014 (in 3 folds),0.579 +/- 0.026 (in 3 folds),0.295 +/- 0.101 (in 3 folds),0.581,0.318,0.543 +/- 0.061 (in 3 folds),0.257 +/- 0.071 (in 3 folds),...,Unknown,124,8,132,0.060606,True,0.634 +/- 0.000 (in 1 folds),0.634 +/- 0.000 (in 1 folds),0.619 +/- 0.000 (in 1 folds),0.605 +/- 0.000 (in 1 folds)
ridge_cv,0.618 +/- 0.107 (in 3 folds),0.636 +/- 0.120 (in 3 folds),0.621 +/- 0.117 (in 3 folds),0.647 +/- 0.132 (in 3 folds),0.529 +/- 0.111 (in 3 folds),0.059 +/- 0.101 (in 3 folds),0.528,0.115,0.430 +/- 0.061 (in 3 folds),0.129 +/- 0.048 (in 3 folds),...,Unknown,108,24,132,0.181818,True,,,,
xgboost,0.595 +/- 0.039 (in 3 folds),0.578 +/- 0.037 (in 3 folds),0.641 +/- 0.055 (in 3 folds),0.626 +/- 0.046 (in 3 folds),0.573 +/- 0.008 (in 3 folds),0.285 +/- 0.143 (in 3 folds),0.573,0.307,0.536 +/- 0.042 (in 3 folds),0.248 +/- 0.116 (in 3 folds),...,Unknown,124,8,132,0.060606,False,0.576 +/- 0.000 (in 1 folds),0.578 +/- 0.000 (in 1 folds),0.588 +/- 0.000 (in 1 folds),0.589 +/- 0.000 (in 1 folds)
lasso_multiclass,0.555 +/- 0.048 (in 3 folds),0.529 +/- 0.033 (in 3 folds),0.656 +/- 0.034 (in 3 folds),0.635 +/- 0.029 (in 3 folds),0.591 +/- 0.068 (in 3 folds),0.331 +/- 0.024 (in 3 folds),0.59,0.346,0.521 +/- 0.040 (in 3 folds),0.263 +/- 0.030 (in 3 folds),...,Unknown,117,15,132,0.113636,True,,,,
dummy_stratified,0.554 +/- 0.056 (in 3 folds),0.532 +/- 0.048 (in 3 folds),0.531 +/- 0.036 (in 3 folds),0.521 +/- 0.024 (in 3 folds),0.518 +/- 0.136 (in 3 folds),0.169 +/- 0.226 (in 3 folds),0.505,0.138,0.385 +/- 0.038 (in 3 folds),0.136 +/- 0.142 (in 3 folds),...,Unknown,101,31,132,0.234848,True,,,,
linearsvm_ovr,0.550 +/- 0.067 (in 3 folds),0.523 +/- 0.061 (in 3 folds),0.655 +/- 0.036 (in 3 folds),0.634 +/- 0.032 (in 3 folds),0.609 +/- 0.045 (in 3 folds),0.369 +/- 0.121 (in 3 folds),0.607,0.374,0.538 +/- 0.005 (in 3 folds),0.294 +/- 0.087 (in 3 folds),...,Unknown,117,15,132,0.113636,True,,,,
dummy_most_frequent,0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.521 +/- 0.118 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.519,0.0,0.423 +/- 0.067 (in 3 folds),0.114 +/- 0.025 (in 3 folds),...,Unknown,108,24,132,0.181818,True,,,,


GeneLocus.BCR, fold 0, TargetObsColumnEnum.ethnicity_condensed_healthy_only, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'African': 22, 'Asian': 87, 'Caucasian': 23, 'Hispanic/Latino': 34}
GeneLocus.BCR, fold 0, TargetObsColumnEnum.ethnicity_condensed_healthy_only, rf_multiclass: best p value = 0.01. Number of disease associated clusters: {'African': 22, 'Asian': 87, 'Caucasian': 23, 'Hispanic/Latino': 34}
GeneLocus.BCR, fold 0, TargetObsColumnEnum.ethnicity_condensed_healthy_only, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'African': 22, 'Asian': 87, 'Caucasian': 23, 'Hispanic/Latino': 34}



GeneLocus.BCR, fold 1, TargetObsColumnEnum.ethnicity_condensed_healthy_only, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'African': 19, 'Asian': 122, 'Caucasian': 19, 'Hispanic/Latino': 240}
GeneLocus.BCR, fold 1, TargetObsColumnEnum.ethnicity_condensed_healthy_only, rf_multiclass: best p value = 0.05. Number of disease associated clusters: {'African': 516, 'Asian': 2584, 'Caucasian': 163, 'Hispanic/Latino': 207999}
GeneLocus.BCR, fold 1, TargetObsColumnEnum.ethnicity_condensed_healthy_only, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'African': 19, 'Asian': 122, 'Caucasian': 19, 'Hispanic/Latino': 240}



2023-01-07 03:53:26,362 - analyze_convergent_clustering_models.ipynb - INFO - GeneLocus.BCR, TargetObsColumnEnum.age_group_healthy_only from /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/convergent_clusters/BCR/age_group_healthy_only/train_smaller_model to /users/maximz/code/boyd-immune-repertoire-classification/out/convergent_clusters/BCR/age_group_healthy_only/train_smaller_model


GeneLocus.BCR, fold 2, TargetObsColumnEnum.ethnicity_condensed_healthy_only, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'African': 3, 'Asian': 51, 'Caucasian': 68, 'Hispanic/Latino': 222}
GeneLocus.BCR, fold 2, TargetObsColumnEnum.ethnicity_condensed_healthy_only, rf_multiclass: best p value = 0.01. Number of disease associated clusters: {'African': 3, 'Asian': 51, 'Caucasian': 68, 'Hispanic/Latino': 222}
GeneLocus.BCR, fold 2, TargetObsColumnEnum.ethnicity_condensed_healthy_only, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'African': 3, 'Asian': 51, 'Caucasian': 68, 'Hispanic/Latino': 222}



















































































































































































































































































































































































































































































































































































































































GeneLocus.BCR TargetObsColumnEnum.age_group_healthy_only


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,Accuracy per fold with abstention,MCC per fold with abstention,...,Abstention label global with abstention,sample_size,n_abstentions,sample_size including abstentions,abstention_rate,missing_classes,ROC-AUC (weighted OvO) per fold with abstention,ROC-AUC (macro OvO) per fold with abstention,au-PRC (weighted OvO) per fold with abstention,au-PRC (macro OvO) per fold with abstention
linearsvm_ovr,0.537 +/- 0.058 (in 3 folds),0.534 +/- 0.056 (in 3 folds),0.626 +/- 0.063 (in 3 folds),0.626 +/- 0.062 (in 3 folds),0.215 +/- 0.030 (in 3 folds),0.066 +/- 0.016 (in 3 folds),0.215,0.053,0.212 +/- 0.028 (in 3 folds),0.066 +/- 0.017 (in 3 folds),...,Unknown,130,2,132,0.015152,True,0.603 +/- 0.000 (in 1 folds),0.594 +/- 0.000 (in 1 folds),0.699 +/- 0.000 (in 1 folds),0.695 +/- 0.000 (in 1 folds)
dummy_stratified,0.536 +/- 0.026 (in 3 folds),0.535 +/- 0.030 (in 3 folds),0.549 +/- 0.018 (in 3 folds),0.549 +/- 0.018 (in 3 folds),0.222 +/- 0.043 (in 3 folds),0.080 +/- 0.049 (in 3 folds),0.22,0.065,0.206 +/- 0.034 (in 3 folds),0.068 +/- 0.040 (in 3 folds),...,Unknown,123,9,132,0.068182,True,0.507 +/- 0.000 (in 1 folds),0.501 +/- 0.000 (in 1 folds),0.529 +/- 0.000 (in 1 folds),0.529 +/- 0.000 (in 1 folds)
lasso_cv,0.533 +/- 0.041 (in 3 folds),0.530 +/- 0.040 (in 3 folds),0.614 +/- 0.045 (in 3 folds),0.614 +/- 0.043 (in 3 folds),0.214 +/- 0.020 (in 3 folds),0.108 +/- 0.038 (in 3 folds),0.213,0.06,0.205 +/- 0.013 (in 3 folds),0.095 +/- 0.031 (in 3 folds),...,Unknown,127,5,132,0.037879,True,,,,
lasso_multiclass,0.529 +/- 0.030 (in 3 folds),0.523 +/- 0.036 (in 3 folds),0.628 +/- 0.036 (in 3 folds),0.624 +/- 0.041 (in 3 folds),0.241 +/- 0.053 (in 3 folds),0.110 +/- 0.051 (in 3 folds),0.227,0.088,0.188 +/- 0.039 (in 3 folds),0.084 +/- 0.016 (in 3 folds),...,Unknown,110,22,132,0.166667,False,0.562 +/- 0.000 (in 1 folds),0.565 +/- 0.000 (in 1 folds),0.650 +/- 0.000 (in 1 folds),0.651 +/- 0.000 (in 1 folds)
xgboost,0.525 +/- 0.053 (in 3 folds),0.523 +/- 0.049 (in 3 folds),0.584 +/- 0.074 (in 3 folds),0.582 +/- 0.068 (in 3 folds),0.235 +/- 0.011 (in 3 folds),0.007 +/- 0.132 (in 3 folds),0.239,0.092,0.168 +/- 0.107 (in 3 folds),0.052 +/- 0.048 (in 3 folds),...,Unknown,92,40,132,0.30303,True,,,,
rf_multiclass,0.517 +/- 0.039 (in 3 folds),0.517 +/- 0.045 (in 3 folds),0.584 +/- 0.019 (in 3 folds),0.585 +/- 0.023 (in 3 folds),0.228 +/- 0.036 (in 3 folds),0.076 +/- 0.018 (in 3 folds),0.229,0.069,0.226 +/- 0.037 (in 3 folds),0.076 +/- 0.018 (in 3 folds),...,Unknown,131,1,132,0.007576,True,0.535 +/- 0.031 (in 2 folds),0.539 +/- 0.033 (in 2 folds),0.587 +/- 0.026 (in 2 folds),0.591 +/- 0.028 (in 2 folds)
dummy_most_frequent,0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.372 +/- 0.176 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.241,0.131,0.101 +/- 0.064 (in 3 folds),0.048 +/- 0.043 (in 3 folds),...,Unknown,54,78,132,0.590909,True,0.500 +/- 0.000 (in 1 folds),0.500 +/- 0.000 (in 1 folds),0.500 +/- 0.000 (in 1 folds),0.500 +/- 0.000 (in 1 folds)
elasticnet_cv,0.366 +/- 0.319 (in 3 folds),0.365 +/- 0.318 (in 3 folds),0.560 +/- 0.131 (in 3 folds),0.562 +/- 0.130 (in 3 folds),0.305 +/- 0.169 (in 3 folds),0.088 +/- 0.076 (in 3 folds),0.221,0.06,0.146 +/- 0.089 (in 3 folds),0.096 +/- 0.030 (in 3 folds),...,Unknown,86,46,132,0.348485,True,,,,
ridge_cv,0.355 +/- 0.310 (in 3 folds),0.354 +/- 0.309 (in 3 folds),0.530 +/- 0.128 (in 3 folds),0.530 +/- 0.124 (in 3 folds),0.372 +/- 0.176 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.241,0.131,0.101 +/- 0.064 (in 3 folds),0.048 +/- 0.043 (in 3 folds),...,Unknown,54,78,132,0.590909,True,0.576 +/- 0.000 (in 1 folds),0.563 +/- 0.000 (in 1 folds),0.670 +/- 0.000 (in 1 folds),0.663 +/- 0.000 (in 1 folds)


GeneLocus.BCR, fold 0, TargetObsColumnEnum.age_group_healthy_only, lasso_multiclass: best p value = 0.001. Number of disease associated clusters: {'20-30': 2, '30-40': 1, '40-50': 8, '50-60': 4, '60-70': 8, '70-80': 0, '<20': 4}
GeneLocus.BCR, fold 0, TargetObsColumnEnum.age_group_healthy_only, rf_multiclass: best p value = 0.05. Number of disease associated clusters: {'20-30': 251, '30-40': 180, '40-50': 2150, '50-60': 568, '60-70': 809, '70-80': 55095, '<20': 1521}
GeneLocus.BCR, fold 0, TargetObsColumnEnum.age_group_healthy_only, linearsvm_ovr: best p value = 0.05. Number of disease associated clusters: {'20-30': 251, '30-40': 180, '40-50': 2150, '50-60': 568, '60-70': 809, '70-80': 55095, '<20': 1521}

GeneLocus.BCR, fold 1, TargetObsColumnEnum.age_group_healthy_only, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'20-30': 55, '30-40': 35, '40-50': 61, '50-60': 106, '60-70': 36, '<20': 15}


2023-01-07 03:53:35,013 - analyze_convergent_clustering_models.ipynb - INFO - GeneLocus.BCR, TargetObsColumnEnum.age_group_binary_healthy_only from /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/convergent_clusters/BCR/age_group_binary_healthy_only/train_smaller_model to /users/maximz/code/boyd-immune-repertoire-classification/out/convergent_clusters/BCR/age_group_binary_healthy_only/train_smaller_model


GeneLocus.BCR, fold 1, TargetObsColumnEnum.age_group_healthy_only, rf_multiclass: best p value = 0.01. Number of disease associated clusters: {'20-30': 55, '30-40': 35, '40-50': 61, '50-60': 106, '60-70': 36, '<20': 15}
GeneLocus.BCR, fold 1, TargetObsColumnEnum.age_group_healthy_only, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'20-30': 55, '30-40': 35, '40-50': 61, '50-60': 106, '60-70': 36, '<20': 15}

GeneLocus.BCR, fold 2, TargetObsColumnEnum.age_group_healthy_only, lasso_multiclass: best p value = 0.05. Number of disease associated clusters: {'20-30': 1201, '30-40': 78, '40-50': 1451, '50-60': 1211, '60-70': 2923, '70-80': 5845, '<20': 892}
GeneLocus.BCR, fold 2, TargetObsColumnEnum.age_group_healthy_only, rf_multiclass: best p value = 0.05. Number of disease associated clusters: {'20-30': 1201, '30-40': 78, '40-50': 1451, '50-60': 1211, '60-70': 2923, '70-80': 5845, '<20': 892}
GeneLocus.BCR, fold 2, TargetObsColumnEnum.age_group_healthy_only, lin

GeneLocus.BCR TargetObsColumnEnum.age_group_binary_healthy_only


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,Accuracy per fold with abstention,MCC per fold with abstention,...,Abstention label global with abstention,sample_size,n_abstentions,sample_size including abstentions,abstention_rate,missing_classes,ROC-AUC (weighted OvO) per fold with abstention,ROC-AUC (macro OvO) per fold with abstention,au-PRC (weighted OvO) per fold with abstention,au-PRC (macro OvO) per fold with abstention
xgboost,0.667 +/- 0.110 (in 3 folds),0.667 +/- 0.110 (in 3 folds),0.778 +/- 0.105 (in 3 folds),0.778 +/- 0.105 (in 3 folds),0.679 +/- 0.057 (in 3 folds),0.314 +/- 0.119 (in 3 folds),0.673,0.264,0.513 +/- 0.150 (in 3 folds),0.205 +/- 0.036 (in 3 folds),...,Unknown,101,31,132,0.234848,False,0.563 +/- 0.000 (in 1 folds),0.563 +/- 0.000 (in 1 folds),0.657 +/- 0.000 (in 1 folds),0.657 +/- 0.000 (in 1 folds)
rf_multiclass,0.616 +/- 0.156 (in 3 folds),0.616 +/- 0.156 (in 3 folds),0.756 +/- 0.062 (in 3 folds),0.756 +/- 0.062 (in 3 folds),0.674 +/- 0.065 (in 3 folds),0.252 +/- 0.196 (in 3 folds),0.664,0.212,0.554 +/- 0.156 (in 3 folds),0.168 +/- 0.095 (in 3 folds),...,Unknown,110,22,132,0.166667,False,0.543 +/- 0.129 (in 2 folds),0.543 +/- 0.129 (in 2 folds),0.720 +/- 0.009 (in 2 folds),0.720 +/- 0.009 (in 2 folds)
lasso_cv,0.608 +/- 0.106 (in 3 folds),0.608 +/- 0.106 (in 3 folds),0.736 +/- 0.129 (in 3 folds),0.736 +/- 0.129 (in 3 folds),0.538 +/- 0.116 (in 3 folds),0.115 +/- 0.099 (in 3 folds),0.565,0.11,0.367 +/- 0.190 (in 3 folds),0.080 +/- 0.043 (in 3 folds),...,Unknown,85,47,132,0.356061,False,,,,
lasso_multiclass,0.597 +/- 0.115 (in 3 folds),0.597 +/- 0.115 (in 3 folds),0.749 +/- 0.105 (in 3 folds),0.749 +/- 0.105 (in 3 folds),0.640 +/- 0.051 (in 3 folds),0.233 +/- 0.106 (in 3 folds),0.64,0.206,0.550 +/- 0.084 (in 3 folds),0.175 +/- 0.055 (in 3 folds),...,Unknown,114,18,132,0.136364,False,0.466 +/- 0.000 (in 1 folds),0.466 +/- 0.000 (in 1 folds),0.637 +/- 0.000 (in 1 folds),0.637 +/- 0.000 (in 1 folds)
linearsvm_ovr,0.596 +/- 0.117 (in 3 folds),0.596 +/- 0.117 (in 3 folds),0.747 +/- 0.108 (in 3 folds),0.747 +/- 0.108 (in 3 folds),0.647 +/- 0.051 (in 3 folds),0.249 +/- 0.087 (in 3 folds),0.649,0.221,0.557 +/- 0.094 (in 3 folds),0.191 +/- 0.047 (in 3 folds),...,Unknown,114,18,132,0.136364,False,0.461 +/- 0.000 (in 1 folds),0.461 +/- 0.000 (in 1 folds),0.631 +/- 0.000 (in 1 folds),0.631 +/- 0.000 (in 1 folds)
dummy_stratified,0.584 +/- 0.016 (in 3 folds),0.584 +/- 0.016 (in 3 folds),0.728 +/- 0.120 (in 3 folds),0.728 +/- 0.120 (in 3 folds),0.502 +/- 0.148 (in 3 folds),0.168 +/- 0.030 (in 3 folds),0.573,0.119,0.315 +/- 0.281 (in 3 folds),0.070 +/- 0.073 (in 3 folds),...,Unknown,75,57,132,0.431818,False,0.568 +/- 0.000 (in 1 folds),0.568 +/- 0.000 (in 1 folds),0.664 +/- 0.000 (in 1 folds),0.664 +/- 0.000 (in 1 folds)
elasticnet_cv,0.514 +/- 0.039 (in 3 folds),0.514 +/- 0.039 (in 3 folds),0.638 +/- 0.118 (in 3 folds),0.638 +/- 0.118 (in 3 folds),0.433 +/- 0.019 (in 3 folds),0.089 +/- 0.155 (in 3 folds),0.435,0.202,0.208 +/- 0.094 (in 3 folds),0.086 +/- 0.081 (in 3 folds),...,Unknown,62,70,132,0.530303,False,,,,
dummy_most_frequent,0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.613 +/- 0.067 (in 3 folds),0.613 +/- 0.067 (in 3 folds),0.387 +/- 0.067 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.371,0.0,0.176 +/- 0.038 (in 3 folds),0.051 +/- 0.021 (in 3 folds),...,Unknown,62,70,132,0.530303,True,,,,
ridge_cv,0.495 +/- 0.009 (in 3 folds),0.495 +/- 0.009 (in 3 folds),0.610 +/- 0.071 (in 3 folds),0.610 +/- 0.071 (in 3 folds),0.387 +/- 0.067 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.371,0.0,0.176 +/- 0.038 (in 3 folds),0.051 +/- 0.021 (in 3 folds),...,Unknown,62,70,132,0.530303,True,,,,


GeneLocus.BCR, fold 0, TargetObsColumnEnum.age_group_binary_healthy_only, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'50+': 27, 'under 50': 5}
GeneLocus.BCR, fold 0, TargetObsColumnEnum.age_group_binary_healthy_only, rf_multiclass: best p value = 0.05. Number of disease associated clusters: {'50+': 311, 'under 50': 38}
GeneLocus.BCR, fold 0, TargetObsColumnEnum.age_group_binary_healthy_only, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'50+': 27, 'under 50': 5}



GeneLocus.BCR, fold 1, TargetObsColumnEnum.age_group_binary_healthy_only, lasso_multiclass: best p value = 0.05. Number of disease associated clusters: {'50+': 739, 'under 50': 18}
GeneLocus.BCR, fold 1, TargetObsColumnEnum.age_group_binary_healthy_only, rf_multiclass: best p value = 0.05. Number of disease associated clusters: {'50+': 739, 'under 50': 18}
GeneLocus.BCR, fold 1, TargetObsColumnEnum.age_group_binary_healthy_only, linearsvm_ovr: best p value = 0.05. Number of disease associated clusters: {'50+': 739, 'under 50': 18}

GeneLocus.BCR, fold 2, TargetObsColumnEnum.age_group_binary_healthy_only, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'50+': 46, 'under 50': 4}
GeneLocus.BCR, fold 2, TargetObsColumnEnum.age_group_binary_healthy_only, rf_multiclass: best p value = 0.005. Number of disease associated clusters: {'50+': 19, 'under 50': 2}


2023-01-07 03:53:37,977 - analyze_convergent_clustering_models.ipynb - INFO - GeneLocus.BCR, TargetObsColumnEnum.age_group_pediatric_healthy_only from /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/convergent_clusters/BCR/age_group_pediatric_healthy_only/train_smaller_model to /users/maximz/code/boyd-immune-repertoire-classification/out/convergent_clusters/BCR/age_group_pediatric_healthy_only/train_smaller_model


GeneLocus.BCR, fold 2, TargetObsColumnEnum.age_group_binary_healthy_only, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'50+': 46, 'under 50': 4}





2023-01-07 03:53:39,708 - malid.external.model_evaluation - ERROR - Error in evaluating predict-proba-based metric rocauc: Only one class present in y_true. Probability-based score is not defined in that case.




2023-01-07 03:53:39,726 - malid.external.model_evaluation - ERROR - Error in evaluating predict-proba-based metric auprc: Only one class present in y_true. Probability-based score is not defined in that case.




2023-01-07 03:53:39,770 - malid.external.model_evaluation - ERROR - Error in evaluating predict-proba-based metric rocauc: Only one class present in y_true. Probability-based score is not defined in that case.




2023-01-07 03:53:39,779 - malid.external.model_evaluation - ERROR - Error in evaluating predict-proba-based metric auprc: Only one class present in y_true. Probability-based score is not defined in that case.


GeneLocus.BCR TargetObsColumnEnum.age_group_pediatric_healthy_only


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,Accuracy per fold with abstention,MCC per fold with abstention,Unknown/abstention proportion per fold with abstention,Accuracy global with abstention,MCC global with abstention,Unknown/abstention proportion global with abstention,Abstention label global with abstention,sample_size,n_abstentions,sample_size including abstentions,abstention_rate,missing_classes
ridge_cv,0.813 +/- 0.132 (in 3 folds),0.813 +/- 0.132 (in 3 folds),0.383 +/- 0.206 (in 3 folds),0.383 +/- 0.206 (in 3 folds),0.733 +/- 0.243 (in 3 folds),0.096 +/- 0.167 (in 3 folds),0.819,0.113,0.523 +/- 0.362 (in 3 folds),0.028 +/- 0.040 (in 3 folds),0.363 +/- 0.346 (in 3 folds),0.515,0.064,0.371,Unknown,83,49,132,0.371212,False
linearsvm_ovr,0.762 +/- 0.164 (in 3 folds),0.762 +/- 0.164 (in 3 folds),0.371 +/- 0.174 (in 3 folds),0.371 +/- 0.174 (in 3 folds),0.835 +/- 0.050 (in 3 folds),0.191 +/- 0.251 (in 3 folds),0.837,0.19,0.661 +/- 0.103 (in 3 folds),0.079 +/- 0.077 (in 3 folds),0.211 +/- 0.087 (in 3 folds),0.659,0.1,0.212,Unknown,104,28,132,0.212121,False
xgboost,0.727 +/- 0.017 (in 3 folds),0.727 +/- 0.017 (in 3 folds),0.262 +/- 0.072 (in 3 folds),0.262 +/- 0.072 (in 3 folds),0.815 +/- 0.050 (in 3 folds),0.116 +/- 0.152 (in 3 folds),0.821,0.17,0.587 +/- 0.151 (in 3 folds),0.057 +/- 0.057 (in 3 folds),0.284 +/- 0.142 (in 3 folds),0.591,0.078,0.28,Unknown,95,37,132,0.280303,False
lasso_cv,0.679 +/- 0.044 (in 3 folds),0.679 +/- 0.044 (in 3 folds),0.227 +/- 0.043 (in 3 folds),0.227 +/- 0.043 (in 3 folds),0.733 +/- 0.243 (in 3 folds),0.096 +/- 0.167 (in 3 folds),0.819,0.113,0.523 +/- 0.362 (in 3 folds),0.028 +/- 0.040 (in 3 folds),0.363 +/- 0.346 (in 3 folds),0.515,0.064,0.371,Unknown,83,49,132,0.371212,False
elasticnet_cv,0.679 +/- 0.044 (in 3 folds),0.679 +/- 0.044 (in 3 folds),0.227 +/- 0.043 (in 3 folds),0.227 +/- 0.043 (in 3 folds),0.733 +/- 0.243 (in 3 folds),0.096 +/- 0.167 (in 3 folds),0.819,0.113,0.523 +/- 0.362 (in 3 folds),0.028 +/- 0.040 (in 3 folds),0.363 +/- 0.346 (in 3 folds),0.515,0.064,0.371,Unknown,83,49,132,0.371212,False
lasso_multiclass,0.670 +/- 0.128 (in 2 folds),0.670 +/- 0.128 (in 2 folds),0.251 +/- 0.056 (in 2 folds),0.251 +/- 0.056 (in 2 folds),0.884 +/- 0.110 (in 3 folds),0.060 +/- 0.154 (in 3 folds),0.836,0.171,0.449 +/- 0.363 (in 3 folds),0.059 +/- 0.056 (in 3 folds),0.463 +/- 0.431 (in 3 folds),0.462,0.066,0.447,Unknown,73,59,132,0.44697,False
dummy_stratified,0.595 +/- 0.071 (in 3 folds),0.595 +/- 0.071 (in 3 folds),0.193 +/- 0.071 (in 3 folds),0.193 +/- 0.071 (in 3 folds),0.704 +/- 0.073 (in 3 folds),0.152 +/- 0.101 (in 3 folds),0.716,0.156,0.511 +/- 0.151 (in 3 folds),0.091 +/- 0.042 (in 3 folds),0.284 +/- 0.142 (in 3 folds),0.515,0.093,0.28,Unknown,95,37,132,0.280303,False
rf_multiclass,0.546 +/- 0.069 (in 3 folds),0.546 +/- 0.069 (in 3 folds),0.227 +/- 0.113 (in 3 folds),0.227 +/- 0.113 (in 3 folds),0.852 +/- 0.062 (in 3 folds),0.218 +/- 0.210 (in 3 folds),0.856,0.251,0.676 +/- 0.120 (in 3 folds),0.094 +/- 0.051 (in 3 folds),0.211 +/- 0.087 (in 3 folds),0.674,0.116,0.212,Unknown,104,28,132,0.212121,False
dummy_most_frequent,0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.147 +/- 0.043 (in 3 folds),0.147 +/- 0.043 (in 3 folds),0.853 +/- 0.043 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.856,0.0,0.675 +/- 0.106 (in 3 folds),0.012 +/- 0.025 (in 3 folds),0.211 +/- 0.087 (in 3 folds),0.674,0.02,0.212,Unknown,104,28,132,0.212121,True


GeneLocus.BCR, fold 0, TargetObsColumnEnum.age_group_pediatric_healthy_only, lasso_multiclass: best p value = 0.0005. Number of disease associated clusters: {'18+': 0, 'under 18': 3}
GeneLocus.BCR, fold 0, TargetObsColumnEnum.age_group_pediatric_healthy_only, rf_multiclass: best p value = 0.01. Number of disease associated clusters: {'18+': 3, 'under 18': 69}
GeneLocus.BCR, fold 0, TargetObsColumnEnum.age_group_pediatric_healthy_only, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'18+': 3, 'under 18': 69}

GeneLocus.BCR, fold 1, TargetObsColumnEnum.age_group_pediatric_healthy_only, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'18+': 10, 'under 18': 21}


GeneLocus.BCR, fold 1, TargetObsColumnEnum.age_group_pediatric_healthy_only, rf_multiclass: best p value = 0.01. Number of disease associated clusters: {'18+': 10, 'under 18': 21}
GeneLocus.BCR, fold 1, TargetObsColumnEnum.age_group_pediatric_healthy_only, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'18+': 10, 'under 18': 21}



2023-01-07 03:53:40,952 - analyze_convergent_clustering_models.ipynb - INFO - GeneLocus.BCR, TargetObsColumnEnum.sex_healthy_only from /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/convergent_clusters/BCR/sex_healthy_only/train_smaller_model to /users/maximz/code/boyd-immune-repertoire-classification/out/convergent_clusters/BCR/sex_healthy_only/train_smaller_model


GeneLocus.BCR, fold 2, TargetObsColumnEnum.age_group_pediatric_healthy_only, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'18+': 3, 'under 18': 57}
GeneLocus.BCR, fold 2, TargetObsColumnEnum.age_group_pediatric_healthy_only, rf_multiclass: best p value = 0.01. Number of disease associated clusters: {'18+': 3, 'under 18': 57}
GeneLocus.BCR, fold 2, TargetObsColumnEnum.age_group_pediatric_healthy_only, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'18+': 3, 'under 18': 57}



GeneLocus.BCR TargetObsColumnEnum.sex_healthy_only


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,Accuracy per fold with abstention,MCC per fold with abstention,Unknown/abstention proportion per fold with abstention,Accuracy global with abstention,MCC global with abstention,Unknown/abstention proportion global with abstention,Abstention label global with abstention,sample_size,n_abstentions,sample_size including abstentions,abstention_rate,missing_classes
dummy_stratified,0.564 +/- 0.053 (in 3 folds),0.564 +/- 0.053 (in 3 folds),0.514 +/- 0.230 (in 3 folds),0.514 +/- 0.230 (in 3 folds),0.491 +/- 0.101 (in 3 folds),0.135 +/- 0.103 (in 3 folds),0.491,-0.007,0.193 +/- 0.184 (in 3 folds),0.031 +/- 0.073 (in 3 folds),0.605 +/- 0.392 (in 3 folds),0.197,-0.009,0.598,Unknown,53,79,132,0.598485,False
rf_multiclass,0.560 +/- 0.029 (in 3 folds),0.560 +/- 0.029 (in 3 folds),0.568 +/- 0.192 (in 3 folds),0.568 +/- 0.192 (in 3 folds),0.537 +/- 0.059 (in 3 folds),0.194 +/- 0.069 (in 3 folds),0.541,0.078,0.452 +/- 0.130 (in 3 folds),0.128 +/- 0.053 (in 3 folds),0.169 +/- 0.151 (in 3 folds),0.447,0.048,0.174,Unknown,109,23,132,0.174242,False
elasticnet_cv,0.556 +/- 0.069 (in 3 folds),0.556 +/- 0.069 (in 3 folds),0.579 +/- 0.179 (in 3 folds),0.579 +/- 0.179 (in 3 folds),0.537 +/- 0.047 (in 3 folds),0.191 +/- 0.099 (in 3 folds),0.541,0.08,0.451 +/- 0.118 (in 3 folds),0.123 +/- 0.047 (in 3 folds),0.169 +/- 0.151 (in 3 folds),0.447,0.054,0.174,Unknown,109,23,132,0.174242,False
linearsvm_ovr,0.555 +/- 0.063 (in 3 folds),0.555 +/- 0.063 (in 3 folds),0.537 +/- 0.157 (in 3 folds),0.537 +/- 0.157 (in 3 folds),0.580 +/- 0.009 (in 3 folds),0.173 +/- 0.064 (in 3 folds),0.581,0.15,0.546 +/- 0.041 (in 3 folds),0.156 +/- 0.062 (in 3 folds),0.059 +/- 0.062 (in 3 folds),0.545,0.134,0.061,Unknown,124,8,132,0.060606,False
lasso_multiclass,0.554 +/- 0.058 (in 3 folds),0.554 +/- 0.058 (in 3 folds),0.536 +/- 0.148 (in 3 folds),0.536 +/- 0.148 (in 3 folds),0.556 +/- 0.027 (in 3 folds),0.123 +/- 0.099 (in 3 folds),0.556,0.098,0.523 +/- 0.029 (in 3 folds),0.110 +/- 0.084 (in 3 folds),0.059 +/- 0.062 (in 3 folds),0.523,0.087,0.061,Unknown,124,8,132,0.060606,False
lasso_cv,0.547 +/- 0.084 (in 3 folds),0.547 +/- 0.084 (in 3 folds),0.581 +/- 0.187 (in 3 folds),0.581 +/- 0.187 (in 3 folds),0.521 +/- 0.130 (in 3 folds),0.240 +/- 0.079 (in 3 folds),0.49,-0.014,0.376 +/- 0.087 (in 3 folds),0.147 +/- 0.021 (in 3 folds),0.238 +/- 0.267 (in 3 folds),0.379,0.002,0.227,Unknown,102,30,132,0.227273,False
ridge_cv,0.533 +/- 0.084 (in 3 folds),0.533 +/- 0.084 (in 3 folds),0.581 +/- 0.187 (in 3 folds),0.581 +/- 0.187 (in 3 folds),0.471 +/- 0.120 (in 3 folds),0.102 +/- 0.176 (in 3 folds),0.451,-0.1,0.346 +/- 0.113 (in 3 folds),0.092 +/- 0.064 (in 3 folds),0.238 +/- 0.267 (in 3 folds),0.348,-0.046,0.227,Unknown,102,30,132,0.227273,False
xgboost,0.515 +/- 0.065 (in 3 folds),0.515 +/- 0.065 (in 3 folds),0.633 +/- 0.165 (in 3 folds),0.633 +/- 0.165 (in 3 folds),0.609 +/- 0.050 (in 3 folds),0.108 +/- 0.101 (in 3 folds),0.59,0.191,0.356 +/- 0.216 (in 3 folds),0.136 +/- 0.039 (in 3 folds),0.395 +/- 0.391 (in 3 folds),0.348,0.09,0.409,Unknown,78,54,132,0.409091,False
dummy_most_frequent,0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.628 +/- 0.088 (in 3 folds),0.628 +/- 0.088 (in 3 folds),0.628 +/- 0.088 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.632,0.0,0.273 +/- 0.159 (in 3 folds),0.120 +/- 0.060 (in 3 folds),0.565 +/- 0.226 (in 3 folds),0.273,0.148,0.568,Unknown,57,75,132,0.568182,True


GeneLocus.BCR, fold 0, TargetObsColumnEnum.sex_healthy_only, lasso_multiclass: best p value = 0.05. Number of disease associated clusters: {'F': 109, 'M': 152}
GeneLocus.BCR, fold 0, TargetObsColumnEnum.sex_healthy_only, rf_multiclass: best p value = 0.05. Number of disease associated clusters: {'F': 109, 'M': 152}
GeneLocus.BCR, fold 0, TargetObsColumnEnum.sex_healthy_only, linearsvm_ovr: best p value = 0.05. Number of disease associated clusters: {'F': 109, 'M': 152}

GeneLocus.BCR, fold 1, TargetObsColumnEnum.sex_healthy_only, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'F': 3, 'M': 81}


GeneLocus.BCR, fold 1, TargetObsColumnEnum.sex_healthy_only, rf_multiclass: best p value = 0.005. Number of disease associated clusters: {'F': 1, 'M': 25}


2023-01-07 03:53:44,001 - analyze_convergent_clustering_models.ipynb - INFO - GeneLocus.TCR, TargetObsColumnEnum.disease from /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/convergent_clusters/TCR/disease/train_smaller_model to /users/maximz/code/boyd-immune-repertoire-classification/out/convergent_clusters/TCR/disease/train_smaller_model


GeneLocus.BCR, fold 1, TargetObsColumnEnum.sex_healthy_only, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'F': 3, 'M': 81}

GeneLocus.BCR, fold 2, TargetObsColumnEnum.sex_healthy_only, lasso_multiclass: best p value = 0.05. Number of disease associated clusters: {'F': 34, 'M': 492}
GeneLocus.BCR, fold 2, TargetObsColumnEnum.sex_healthy_only, rf_multiclass: best p value = 0.01. Number of disease associated clusters: {'F': 1, 'M': 53}
GeneLocus.BCR, fold 2, TargetObsColumnEnum.sex_healthy_only, linearsvm_ovr: best p value = 0.05. Number of disease associated clusters: {'F': 34, 'M': 492}



GeneLocus.TCR TargetObsColumnEnum.disease


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,sample_size,n_abstentions,sample_size including abstentions,abstention_rate,missing_classes
linearsvm_ovr,0.923 +/- 0.009 (in 3 folds),0.926 +/- 0.007 (in 3 folds),0.920 +/- 0.013 (in 3 folds),0.927 +/- 0.010 (in 3 folds),0.755 +/- 0.028 (in 3 folds),0.637 +/- 0.048 (in 3 folds),0.755,0.635,274,0,274,0.0,False
lasso_multiclass,0.903 +/- 0.017 (in 3 folds),0.903 +/- 0.024 (in 3 folds),0.901 +/- 0.016 (in 3 folds),0.905 +/- 0.022 (in 3 folds),0.766 +/- 0.031 (in 3 folds),0.665 +/- 0.040 (in 3 folds),0.766,0.662,274,0,274,0.0,False
ridge_cv,0.894 +/- 0.018 (in 3 folds),0.902 +/- 0.018 (in 3 folds),0.879 +/- 0.017 (in 3 folds),0.891 +/- 0.017 (in 3 folds),0.536 +/- 0.069 (in 3 folds),0.209 +/- 0.195 (in 3 folds),0.536,0.253,274,0,274,0.0,False
xgboost,0.891 +/- 0.011 (in 3 folds),0.886 +/- 0.013 (in 3 folds),0.889 +/- 0.011 (in 3 folds),0.890 +/- 0.012 (in 3 folds),0.730 +/- 0.034 (in 3 folds),0.607 +/- 0.050 (in 3 folds),0.73,0.597,274,0,274,0.0,False
lasso_cv,0.881 +/- 0.019 (in 3 folds),0.887 +/- 0.012 (in 3 folds),0.874 +/- 0.017 (in 3 folds),0.884 +/- 0.015 (in 3 folds),0.632 +/- 0.063 (in 3 folds),0.432 +/- 0.105 (in 3 folds),0.631,0.433,274,0,274,0.0,False
rf_multiclass,0.880 +/- 0.018 (in 3 folds),0.882 +/- 0.021 (in 3 folds),0.880 +/- 0.019 (in 3 folds),0.886 +/- 0.020 (in 3 folds),0.715 +/- 0.034 (in 3 folds),0.583 +/- 0.059 (in 3 folds),0.715,0.57,274,0,274,0.0,False
elasticnet_cv,0.863 +/- 0.057 (in 3 folds),0.863 +/- 0.064 (in 3 folds),0.851 +/- 0.054 (in 3 folds),0.856 +/- 0.060 (in 3 folds),0.598 +/- 0.058 (in 3 folds),0.370 +/- 0.106 (in 3 folds),0.599,0.37,274,0,274,0.0,False
dummy_stratified,0.502 +/- 0.017 (in 3 folds),0.509 +/- 0.012 (in 3 folds),0.512 +/- 0.006 (in 3 folds),0.516 +/- 0.005 (in 3 folds),0.336 +/- 0.029 (in 3 folds),-0.003 +/- 0.046 (in 3 folds),0.336,-0.003,274,0,274,0.0,False
dummy_most_frequent,0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.474 +/- 0.003 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.474,0.0,274,0,274,0.0,True


GeneLocus.TCR, fold 0, TargetObsColumnEnum.disease, lasso_multiclass: best p value = 0.0005. Number of disease associated clusters: {'Covid19': 2, 'HIV': 11, 'Healthy/Background': 207, 'Lupus': 221}
GeneLocus.TCR, fold 0, TargetObsColumnEnum.disease, rf_multiclass: best p value = 0.0005. Number of disease associated clusters: {'Covid19': 2, 'HIV': 11, 'Healthy/Background': 207, 'Lupus': 221}
GeneLocus.TCR, fold 0, TargetObsColumnEnum.disease, linearsvm_ovr: best p value = 0.005. Number of disease associated clusters: {'Covid19': 18, 'HIV': 161, 'Healthy/Background': 903, 'Lupus': 948}

GeneLocus.TCR, fold 1, TargetObsColumnEnum.disease, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'Covid19': 144, 'HIV': 215, 'Healthy/Background': 1212, 'Lupus': 1458}


2023-01-07 03:53:47,625 - analyze_convergent_clustering_models.ipynb - INFO - GeneLocus.TCR, TargetObsColumnEnum.disease_all_demographics_present from /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/convergent_clusters/TCR/disease_all_demographics_present/train_smaller_model to /users/maximz/code/boyd-immune-repertoire-classification/out/convergent_clusters/TCR/disease_all_demographics_present/train_smaller_model


GeneLocus.TCR, fold 1, TargetObsColumnEnum.disease, rf_multiclass: best p value = 0.01. Number of disease associated clusters: {'Covid19': 144, 'HIV': 215, 'Healthy/Background': 1212, 'Lupus': 1458}
GeneLocus.TCR, fold 1, TargetObsColumnEnum.disease, linearsvm_ovr: best p value = 0.005. Number of disease associated clusters: {'Covid19': 126, 'HIV': 176, 'Healthy/Background': 876, 'Lupus': 1227}

GeneLocus.TCR, fold 2, TargetObsColumnEnum.disease, lasso_multiclass: best p value = 0.005. Number of disease associated clusters: {'Covid19': 66, 'HIV': 122, 'Healthy/Background': 931, 'Lupus': 1163}
GeneLocus.TCR, fold 2, TargetObsColumnEnum.disease, rf_multiclass: best p value = 0.001. Number of disease associated clusters: {'Covid19': 3, 'HIV': 31, 'Healthy/Background': 364, 'Lupus': 285}
GeneLocus.TCR, fold 2, TargetObsColumnEnum.disease, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'Covid19': 85, 'HIV': 162, 'Healthy/Background': 1282, 'Lupus': 1389}



GeneLocus.TCR TargetObsColumnEnum.disease_all_demographics_present


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,ROC-AUC (weighted OvO) per fold with abstention,ROC-AUC (macro OvO) per fold with abstention,...,Unknown/abstention proportion per fold with abstention,Accuracy global with abstention,MCC global with abstention,Unknown/abstention proportion global with abstention,Abstention label global with abstention,sample_size,n_abstentions,sample_size including abstentions,abstention_rate,missing_classes
linearsvm_ovr,0.907 +/- 0.009 (in 3 folds),0.906 +/- 0.016 (in 3 folds),0.889 +/- 0.006 (in 3 folds),0.892 +/- 0.002 (in 3 folds),0.749 +/- 0.028 (in 3 folds),0.639 +/- 0.055 (in 3 folds),0.749,0.636,0.912 +/- 0.003 (in 2 folds),0.915 +/- 0.002 (in 2 folds),...,0.013 +/- 0.000 (in 1 folds),0.746,0.633,0.004,Unknown,239,1,240,0.004167,False
lasso_multiclass,0.906 +/- 0.012 (in 3 folds),0.905 +/- 0.017 (in 3 folds),0.897 +/- 0.011 (in 3 folds),0.899 +/- 0.010 (in 3 folds),0.761 +/- 0.016 (in 3 folds),0.671 +/- 0.033 (in 3 folds),0.762,0.668,0.912 +/- 0.010 (in 2 folds),0.914 +/- 0.007 (in 2 folds),...,0.013 +/- 0.000 (in 1 folds),0.758,0.664,0.004,Unknown,239,1,240,0.004167,False
ridge_cv,0.889 +/- 0.008 (in 3 folds),0.892 +/- 0.013 (in 3 folds),0.874 +/- 0.010 (in 3 folds),0.881 +/- 0.016 (in 3 folds),0.547 +/- 0.115 (in 3 folds),0.174 +/- 0.301 (in 3 folds),0.548,0.286,0.893 +/- 0.003 (in 2 folds),0.899 +/- 0.000 (in 2 folds),...,0.013 +/- 0.000 (in 1 folds),0.546,0.284,0.004,Unknown,239,1,240,0.004167,False
xgboost,0.885 +/- 0.024 (in 3 folds),0.884 +/- 0.024 (in 3 folds),0.878 +/- 0.033 (in 3 folds),0.882 +/- 0.032 (in 3 folds),0.758 +/- 0.031 (in 3 folds),0.643 +/- 0.046 (in 3 folds),0.758,0.638,,,...,,,,,,240,0,240,0.0,False
rf_multiclass,0.885 +/- 0.014 (in 3 folds),0.882 +/- 0.008 (in 3 folds),0.881 +/- 0.016 (in 3 folds),0.879 +/- 0.009 (in 3 folds),0.716 +/- 0.022 (in 3 folds),0.573 +/- 0.032 (in 3 folds),0.715,0.571,0.886 +/- 0.019 (in 2 folds),0.884 +/- 0.011 (in 2 folds),...,0.013 +/- 0.000 (in 1 folds),0.713,0.568,0.004,Unknown,239,1,240,0.004167,False
elasticnet_cv,0.882 +/- 0.016 (in 3 folds),0.879 +/- 0.021 (in 3 folds),0.853 +/- 0.029 (in 3 folds),0.856 +/- 0.028 (in 3 folds),0.603 +/- 0.102 (in 3 folds),0.371 +/- 0.180 (in 3 folds),0.603,0.374,0.886 +/- 0.020 (in 2 folds),0.887 +/- 0.021 (in 2 folds),...,0.013 +/- 0.000 (in 1 folds),0.6,0.372,0.004,Unknown,239,1,240,0.004167,False
lasso_cv,0.876 +/- 0.027 (in 3 folds),0.876 +/- 0.035 (in 3 folds),0.864 +/- 0.030 (in 3 folds),0.870 +/- 0.033 (in 3 folds),0.615 +/- 0.049 (in 3 folds),0.388 +/- 0.104 (in 3 folds),0.615,0.392,0.892 +/- 0.004 (in 2 folds),0.896 +/- 0.003 (in 2 folds),...,0.013 +/- 0.000 (in 1 folds),0.613,0.39,0.004,Unknown,239,1,240,0.004167,False
dummy_stratified,0.531 +/- 0.035 (in 3 folds),0.523 +/- 0.032 (in 3 folds),0.529 +/- 0.023 (in 3 folds),0.526 +/- 0.020 (in 3 folds),0.405 +/- 0.062 (in 3 folds),0.088 +/- 0.087 (in 3 folds),0.404,0.086,,,...,,,,,,240,0,240,0.0,False
dummy_most_frequent,0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.481 +/- 0.013 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.481,0.0,0.500 +/- 0.000 (in 2 folds),0.500 +/- 0.000 (in 2 folds),...,0.013 +/- 0.000 (in 1 folds),0.479,0.027,0.004,Unknown,239,1,240,0.004167,True


GeneLocus.TCR, fold 0, TargetObsColumnEnum.disease_all_demographics_present, lasso_multiclass: best p value = 0.0005. Number of disease associated clusters: {'Covid19': 2, 'HIV': 10, 'Healthy/Background': 149, 'Lupus': 126}
GeneLocus.TCR, fold 0, TargetObsColumnEnum.disease_all_demographics_present, rf_multiclass: best p value = 0.0005. Number of disease associated clusters: {'Covid19': 2, 'HIV': 10, 'Healthy/Background': 149, 'Lupus': 126}
GeneLocus.TCR, fold 0, TargetObsColumnEnum.disease_all_demographics_present, linearsvm_ovr: best p value = 0.0005. Number of disease associated clusters: {'Covid19': 2, 'HIV': 10, 'Healthy/Background': 149, 'Lupus': 126}

GeneLocus.TCR, fold 1, TargetObsColumnEnum.disease_all_demographics_present, lasso_multiclass: best p value = 0.001. Number of disease associated clusters: {'Covid19': 3, 'HIV': 28, 'Healthy/Background': 225, 'Lupus': 296}


GeneLocus.TCR, fold 1, TargetObsColumnEnum.disease_all_demographics_present, rf_multiclass: best p value = 0.001. Number of disease associated clusters: {'Covid19': 3, 'HIV': 28, 'Healthy/Background': 225, 'Lupus': 296}


2023-01-07 03:53:52,082 - analyze_convergent_clustering_models.ipynb - INFO - GeneLocus.TCR, TargetObsColumnEnum.covid_vs_healthy from /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/convergent_clusters/TCR/covid_vs_healthy/train_smaller_model to /users/maximz/code/boyd-immune-repertoire-classification/out/convergent_clusters/TCR/covid_vs_healthy/train_smaller_model


GeneLocus.TCR, fold 1, TargetObsColumnEnum.disease_all_demographics_present, linearsvm_ovr: best p value = 0.001. Number of disease associated clusters: {'Covid19': 3, 'HIV': 28, 'Healthy/Background': 225, 'Lupus': 296}

GeneLocus.TCR, fold 2, TargetObsColumnEnum.disease_all_demographics_present, lasso_multiclass: best p value = 0.005. Number of disease associated clusters: {'Covid19': 4, 'HIV': 112, 'Healthy/Background': 694, 'Lupus': 570}
GeneLocus.TCR, fold 2, TargetObsColumnEnum.disease_all_demographics_present, rf_multiclass: best p value = 0.005. Number of disease associated clusters: {'Covid19': 4, 'HIV': 112, 'Healthy/Background': 694, 'Lupus': 570}
GeneLocus.TCR, fold 2, TargetObsColumnEnum.disease_all_demographics_present, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'Covid19': 7, 'HIV': 135, 'Healthy/Background': 1170, 'Lupus': 1485}



GeneLocus.TCR TargetObsColumnEnum.covid_vs_healthy


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,sample_size,n_abstentions,sample_size including abstentions,abstention_rate,missing_classes
linearsvm_ovr,0.964 +/- 0.039 (in 3 folds),0.964 +/- 0.039 (in 3 folds),0.991 +/- 0.010 (in 3 folds),0.991 +/- 0.010 (in 3 folds),0.911 +/- 0.063 (in 3 folds),0.791 +/- 0.107 (in 3 folds),0.911,0.772,168,0,168,0.0,False
lasso_multiclass,0.964 +/- 0.039 (in 3 folds),0.964 +/- 0.039 (in 3 folds),0.990 +/- 0.010 (in 3 folds),0.990 +/- 0.010 (in 3 folds),0.923 +/- 0.044 (in 3 folds),0.799 +/- 0.095 (in 3 folds),0.923,0.791,168,0,168,0.0,False
lasso_cv,0.962 +/- 0.040 (in 3 folds),0.962 +/- 0.040 (in 3 folds),0.990 +/- 0.010 (in 3 folds),0.990 +/- 0.010 (in 3 folds),0.815 +/- 0.028 (in 3 folds),0.382 +/- 0.148 (in 3 folds),0.815,0.385,168,0,168,0.0,False
xgboost,0.962 +/- 0.029 (in 3 folds),0.962 +/- 0.029 (in 3 folds),0.987 +/- 0.008 (in 3 folds),0.987 +/- 0.008 (in 3 folds),0.905 +/- 0.019 (in 3 folds),0.736 +/- 0.046 (in 3 folds),0.905,0.719,168,0,168,0.0,False
elasticnet_cv,0.960 +/- 0.040 (in 3 folds),0.960 +/- 0.040 (in 3 folds),0.990 +/- 0.010 (in 3 folds),0.990 +/- 0.010 (in 3 folds),0.815 +/- 0.041 (in 3 folds),0.328 +/- 0.288 (in 3 folds),0.815,0.385,168,0,168,0.0,False
ridge_cv,0.947 +/- 0.052 (in 3 folds),0.947 +/- 0.052 (in 3 folds),0.987 +/- 0.014 (in 3 folds),0.987 +/- 0.014 (in 3 folds),0.774 +/- 0.007 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.774,0.0,168,0,168,0.0,True
rf_multiclass,0.943 +/- 0.049 (in 3 folds),0.943 +/- 0.049 (in 3 folds),0.975 +/- 0.021 (in 3 folds),0.975 +/- 0.021 (in 3 folds),0.911 +/- 0.030 (in 3 folds),0.756 +/- 0.076 (in 3 folds),0.911,0.735,168,0,168,0.0,False
dummy_most_frequent,0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.774 +/- 0.007 (in 3 folds),0.774 +/- 0.007 (in 3 folds),0.774 +/- 0.007 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.774,0.0,168,0,168,0.0,True
dummy_stratified,0.411 +/- 0.029 (in 3 folds),0.411 +/- 0.029 (in 3 folds),0.744 +/- 0.017 (in 3 folds),0.744 +/- 0.017 (in 3 folds),0.578 +/- 0.023 (in 3 folds),-0.174 +/- 0.060 (in 3 folds),0.577,-0.175,168,0,168,0.0,False


GeneLocus.TCR, fold 0, TargetObsColumnEnum.covid_vs_healthy, lasso_multiclass: best p value = 0.0005. Number of disease associated clusters: {'Covid19': 2, 'Healthy/Background': 275}
GeneLocus.TCR, fold 0, TargetObsColumnEnum.covid_vs_healthy, rf_multiclass: best p value = 0.001. Number of disease associated clusters: {'Covid19': 2, 'Healthy/Background': 350}
GeneLocus.TCR, fold 0, TargetObsColumnEnum.covid_vs_healthy, linearsvm_ovr: best p value = 0.0005. Number of disease associated clusters: {'Covid19': 2, 'Healthy/Background': 275}

GeneLocus.TCR, fold 1, TargetObsColumnEnum.covid_vs_healthy, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'Covid19': 11, 'Healthy/Background': 994}


GeneLocus.TCR, fold 1, TargetObsColumnEnum.covid_vs_healthy, rf_multiclass: best p value = 0.01. Number of disease associated clusters: {'Covid19': 11, 'Healthy/Background': 994}
GeneLocus.TCR, fold 1, TargetObsColumnEnum.covid_vs_healthy, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'Covid19': 11, 'Healthy/Background': 994}

2023-01-07 03:53:54,651 - analyze_convergent_clustering_models.ipynb - INFO - GeneLocus.TCR, TargetObsColumnEnum.hiv_vs_healthy from /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/convergent_clusters/TCR/hiv_vs_healthy/train_smaller_model to /users/maximz/code/boyd-immune-repertoire-classification/out/convergent_clusters/TCR/hiv_vs_healthy/train_smaller_model




GeneLocus.TCR, fold 2, TargetObsColumnEnum.covid_vs_healthy, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'Covid19': 5, 'Healthy/Background': 924}
GeneLocus.TCR, fold 2, TargetObsColumnEnum.covid_vs_healthy, rf_multiclass: best p value = 0.01. Number of disease associated clusters: {'Covid19': 5, 'Healthy/Background': 924}
GeneLocus.TCR, fold 2, TargetObsColumnEnum.covid_vs_healthy, linearsvm_ovr: best p value = 0.001. Number of disease associated clusters: {'Covid19': 1, 'Healthy/Background': 431}



GeneLocus.TCR TargetObsColumnEnum.hiv_vs_healthy


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,sample_size,n_abstentions,sample_size including abstentions,abstention_rate,missing_classes
linearsvm_ovr,0.881 +/- 0.023 (in 3 folds),0.881 +/- 0.023 (in 3 folds),0.942 +/- 0.009 (in 3 folds),0.942 +/- 0.009 (in 3 folds),0.783 +/- 0.017 (in 3 folds),0.529 +/- 0.028 (in 3 folds),0.784,0.5,194,0,194,0.0,False
lasso_multiclass,0.873 +/- 0.023 (in 3 folds),0.873 +/- 0.023 (in 3 folds),0.939 +/- 0.007 (in 3 folds),0.939 +/- 0.007 (in 3 folds),0.768 +/- 0.017 (in 3 folds),0.511 +/- 0.015 (in 3 folds),0.768,0.506,194,0,194,0.0,False
lasso_cv,0.857 +/- 0.011 (in 3 folds),0.857 +/- 0.011 (in 3 folds),0.934 +/- 0.004 (in 3 folds),0.934 +/- 0.004 (in 3 folds),0.773 +/- 0.017 (in 3 folds),0.465 +/- 0.061 (in 3 folds),0.773,0.462,194,0,194,0.0,False
elasticnet_cv,0.854 +/- 0.012 (in 3 folds),0.854 +/- 0.012 (in 3 folds),0.933 +/- 0.005 (in 3 folds),0.933 +/- 0.005 (in 3 folds),0.768 +/- 0.016 (in 3 folds),0.451 +/- 0.051 (in 3 folds),0.768,0.448,194,0,194,0.0,False
ridge_cv,0.853 +/- 0.045 (in 3 folds),0.853 +/- 0.045 (in 3 folds),0.930 +/- 0.016 (in 3 folds),0.930 +/- 0.016 (in 3 folds),0.696 +/- 0.047 (in 3 folds),0.132 +/- 0.228 (in 3 folds),0.696,0.217,194,0,194,0.0,False
xgboost,0.853 +/- 0.013 (in 3 folds),0.853 +/- 0.013 (in 3 folds),0.924 +/- 0.008 (in 3 folds),0.924 +/- 0.008 (in 3 folds),0.778 +/- 0.022 (in 3 folds),0.500 +/- 0.071 (in 3 folds),0.778,0.497,194,0,194,0.0,False
rf_multiclass,0.839 +/- 0.004 (in 3 folds),0.839 +/- 0.004 (in 3 folds),0.904 +/- 0.011 (in 3 folds),0.904 +/- 0.011 (in 3 folds),0.778 +/- 0.016 (in 3 folds),0.523 +/- 0.076 (in 3 folds),0.778,0.519,194,0,194,0.0,False
dummy_most_frequent,0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.670 +/- 0.008 (in 3 folds),0.670 +/- 0.008 (in 3 folds),0.670 +/- 0.008 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.67,0.0,194,0,194,0.0,True
dummy_stratified,0.467 +/- 0.016 (in 3 folds),0.467 +/- 0.016 (in 3 folds),0.656 +/- 0.003 (in 3 folds),0.656 +/- 0.003 (in 3 folds),0.541 +/- 0.012 (in 3 folds),-0.068 +/- 0.032 (in 3 folds),0.541,-0.067,194,0,194,0.0,False


GeneLocus.TCR, fold 0, TargetObsColumnEnum.hiv_vs_healthy, lasso_multiclass: best p value = 0.0005. Number of disease associated clusters: {'HIV': 3, 'Healthy/Background': 128}
GeneLocus.TCR, fold 0, TargetObsColumnEnum.hiv_vs_healthy, rf_multiclass: best p value = 0.001. Number of disease associated clusters: {'HIV': 6, 'Healthy/Background': 176}
GeneLocus.TCR, fold 0, TargetObsColumnEnum.hiv_vs_healthy, linearsvm_ovr: best p value = 0.001. Number of disease associated clusters: {'HIV': 6, 'Healthy/Background': 176}

GeneLocus.TCR, fold 1, TargetObsColumnEnum.hiv_vs_healthy, lasso_multiclass: best p value = 0.005. Number of disease associated clusters: {'HIV': 40, 'Healthy/Background': 434}
GeneLocus.TCR, fold 1, TargetObsColumnEnum.hiv_vs_healthy, rf_multiclass: best p value = 0.0005. Number of disease associated clusters: {'HIV': 7, 'Healthy/Background': 126}


GeneLocus.TCR, fold 1, TargetObsColumnEnum.hiv_vs_healthy, linearsvm_ovr: best p value = 0.005. Number of disease associated clusters: {'HIV': 40, 'Healthy/Background': 434}



2023-01-07 03:53:57,137 - analyze_convergent_clustering_models.ipynb - INFO - GeneLocus.TCR, TargetObsColumnEnum.lupus_vs_healthy from /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/convergent_clusters/TCR/lupus_vs_healthy/train_smaller_model to /users/maximz/code/boyd-immune-repertoire-classification/out/convergent_clusters/TCR/lupus_vs_healthy/train_smaller_model


GeneLocus.TCR, fold 2, TargetObsColumnEnum.hiv_vs_healthy, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'HIV': 47, 'Healthy/Background': 598}
GeneLocus.TCR, fold 2, TargetObsColumnEnum.hiv_vs_healthy, rf_multiclass: best p value = 0.005. Number of disease associated clusters: {'HIV': 31, 'Healthy/Background': 442}
GeneLocus.TCR, fold 2, TargetObsColumnEnum.hiv_vs_healthy, linearsvm_ovr: best p value = 0.05. Number of disease associated clusters: {'HIV': 1136, 'Healthy/Background': 2002}



GeneLocus.TCR TargetObsColumnEnum.lupus_vs_healthy


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,Accuracy per fold with abstention,MCC per fold with abstention,...,au-PRC (macro OvO) per fold with abstention,Accuracy global with abstention,MCC global with abstention,Unknown/abstention proportion global with abstention,Abstention label global with abstention,sample_size,n_abstentions,sample_size including abstentions,abstention_rate,missing_classes
elasticnet_cv,0.951 +/- 0.033 (in 3 folds),0.951 +/- 0.033 (in 3 folds),0.892 +/- 0.058 (in 3 folds),0.892 +/- 0.058 (in 3 folds),0.788 +/- 0.020 (in 3 folds),0.249 +/- 0.219 (in 3 folds),0.788,0.309,0.779 +/- 0.021 (in 3 folds),0.264 +/- 0.127 (in 3 folds),...,0.825 +/- 0.000 (in 1 folds),0.779,0.275,0.012,Unknown,170,2,172,0.011628,False
ridge_cv,0.945 +/- 0.029 (in 3 folds),0.945 +/- 0.029 (in 3 folds),0.875 +/- 0.054 (in 3 folds),0.875 +/- 0.054 (in 3 folds),0.759 +/- 0.008 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.759,0.0,0.750 +/- 0.008 (in 3 folds),0.027 +/- 0.081 (in 3 folds),...,0.824 +/- 0.000 (in 1 folds),0.75,0.032,0.012,Unknown,170,2,172,0.011628,True
linearsvm_ovr,0.925 +/- 0.080 (in 3 folds),0.925 +/- 0.080 (in 3 folds),0.862 +/- 0.117 (in 3 folds),0.862 +/- 0.117 (in 3 folds),0.898 +/- 0.047 (in 3 folds),0.724 +/- 0.122 (in 3 folds),0.899,0.72,0.878 +/- 0.053 (in 3 folds),0.681 +/- 0.125 (in 3 folds),...,,0.878,0.675,0.023,Unknown,168,4,172,0.023256,False
lasso_multiclass,0.925 +/- 0.080 (in 3 folds),0.925 +/- 0.080 (in 3 folds),0.862 +/- 0.117 (in 3 folds),0.862 +/- 0.117 (in 3 folds),0.892 +/- 0.038 (in 3 folds),0.708 +/- 0.096 (in 3 folds),0.893,0.701,0.872 +/- 0.045 (in 3 folds),0.665 +/- 0.100 (in 3 folds),...,,0.872,0.657,0.023,Unknown,168,4,172,0.023256,False
lasso_cv,0.919 +/- 0.021 (in 3 folds),0.919 +/- 0.021 (in 3 folds),0.825 +/- 0.047 (in 3 folds),0.825 +/- 0.047 (in 3 folds),0.840 +/- 0.061 (in 3 folds),0.507 +/- 0.220 (in 3 folds),0.84,0.531,0.825 +/- 0.048 (in 3 folds),0.474 +/- 0.170 (in 3 folds),...,0.803 +/- 0.000 (in 1 folds),0.826,0.482,0.017,Unknown,169,3,172,0.017442,False
xgboost,0.918 +/- 0.033 (in 3 folds),0.918 +/- 0.033 (in 3 folds),0.829 +/- 0.051 (in 3 folds),0.829 +/- 0.051 (in 3 folds),0.887 +/- 0.027 (in 3 folds),0.684 +/- 0.067 (in 3 folds),0.887,0.68,0.866 +/- 0.021 (in 3 folds),0.639 +/- 0.044 (in 3 folds),...,,0.866,0.635,0.023,Unknown,168,4,172,0.023256,False
rf_multiclass,0.909 +/- 0.055 (in 3 folds),0.909 +/- 0.055 (in 3 folds),0.827 +/- 0.096 (in 3 folds),0.827 +/- 0.096 (in 3 folds),0.869 +/- 0.041 (in 3 folds),0.646 +/- 0.086 (in 3 folds),0.869,0.63,0.849 +/- 0.037 (in 3 folds),0.605 +/- 0.072 (in 3 folds),...,,0.849,0.589,0.023,Unknown,168,4,172,0.023256,False
dummy_stratified,0.509 +/- 0.080 (in 3 folds),0.509 +/- 0.080 (in 3 folds),0.311 +/- 0.082 (in 3 folds),0.311 +/- 0.082 (in 3 folds),0.586 +/- 0.025 (in 3 folds),0.016 +/- 0.155 (in 3 folds),0.581,-0.001,0.418 +/- 0.121 (in 3 folds),-0.031 +/- 0.092 (in 3 folds),...,,0.419,-0.028,0.279,Unknown,124,48,172,0.27907,False
dummy_most_frequent,0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.241 +/- 0.008 (in 3 folds),0.241 +/- 0.008 (in 3 folds),0.759 +/- 0.008 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.759,0.0,0.750 +/- 0.008 (in 3 folds),0.027 +/- 0.081 (in 3 folds),...,0.246 +/- 0.000 (in 1 folds),0.75,0.032,0.012,Unknown,170,2,172,0.011628,True


GeneLocus.TCR, fold 0, TargetObsColumnEnum.lupus_vs_healthy, lasso_multiclass: best p value = 0.005. Number of disease associated clusters: {'Healthy/Background': 20, 'Lupus': 231}


GeneLocus.TCR, fold 0, TargetObsColumnEnum.lupus_vs_healthy, rf_multiclass: best p value = 0.005. Number of disease associated clusters: {'Healthy/Background': 20, 'Lupus': 231}
GeneLocus.TCR, fold 0, TargetObsColumnEnum.lupus_vs_healthy, linearsvm_ovr: best p value = 0.005. Number of disease associated clusters: {'Healthy/Background': 20, 'Lupus': 231}

GeneLocus.TCR, fold 1, TargetObsColumnEnum.lupus_vs_healthy, lasso_multiclass: best p value = 0.001. Number of disease associated clusters: {'Healthy/Background': 2, 'Lupus': 116}
GeneLocus.TCR, fold 1, TargetObsColumnEnum.lupus_vs_healthy, rf_multiclass: best p value = 0.005. Number of disease associated clusters: {'Healthy/Background': 19, 'Lupus': 316}
GeneLocus.TCR, fold 1, TargetObsColumnEnum.lupus_vs_healthy, linearsvm_ovr: best p value = 0.001. Number of disease associated clusters: {'Healthy/Background': 2, 'Lupus': 116}

GeneLocus.TCR, fold 2, TargetObsColumnEnum.lupus_vs_healthy, lasso_multiclass: best p value = 0.01. Number 

2023-01-07 03:54:00,493 - analyze_convergent_clustering_models.ipynb - INFO - GeneLocus.TCR, TargetObsColumnEnum.ethnicity_condensed_healthy_only from /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/convergent_clusters/TCR/ethnicity_condensed_healthy_only/train_smaller_model to /users/maximz/code/boyd-immune-repertoire-classification/out/convergent_clusters/TCR/ethnicity_condensed_healthy_only/train_smaller_model


GeneLocus.TCR, fold 2, TargetObsColumnEnum.lupus_vs_healthy, rf_multiclass: best p value = 0.005. Number of disease associated clusters: {'Healthy/Background': 20, 'Lupus': 279}
GeneLocus.TCR, fold 2, TargetObsColumnEnum.lupus_vs_healthy, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'Healthy/Background': 40, 'Lupus': 346}



GeneLocus.TCR TargetObsColumnEnum.ethnicity_condensed_healthy_only


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,Accuracy per fold with abstention,MCC per fold with abstention,...,au-PRC (macro OvO) per fold with abstention,Accuracy global with abstention,MCC global with abstention,Unknown/abstention proportion global with abstention,Abstention label global with abstention,sample_size,n_abstentions,sample_size including abstentions,abstention_rate,missing_classes
xgboost,0.702 +/- 0.041 (in 3 folds),0.690 +/- 0.038 (in 3 folds),0.709 +/- 0.024 (in 3 folds),0.707 +/- 0.009 (in 3 folds),0.601 +/- 0.097 (in 3 folds),0.357 +/- 0.084 (in 3 folds),0.598,0.345,0.556 +/- 0.053 (in 3 folds),0.296 +/- 0.028 (in 3 folds),...,0.703 +/- 0.000 (in 1 folds),0.557,0.273,0.07,Unknown,107,8,115,0.069565,False
linearsvm_ovr,0.692 +/- 0.052 (in 3 folds),0.697 +/- 0.058 (in 3 folds),0.716 +/- 0.018 (in 3 folds),0.725 +/- 0.040 (in 3 folds),0.583 +/- 0.078 (in 3 folds),0.331 +/- 0.086 (in 3 folds),0.584,0.338,0.573 +/- 0.082 (in 3 folds),0.295 +/- 0.029 (in 3 folds),...,0.727 +/- 0.056 (in 2 folds),0.574,0.292,0.017,Unknown,113,2,115,0.017391,True
lasso_multiclass,0.673 +/- 0.018 (in 3 folds),0.662 +/- 0.012 (in 3 folds),0.684 +/- 0.015 (in 3 folds),0.673 +/- 0.010 (in 3 folds),0.600 +/- 0.090 (in 3 folds),0.371 +/- 0.103 (in 3 folds),0.602,0.382,0.590 +/- 0.092 (in 3 folds),0.337 +/- 0.064 (in 3 folds),...,0.675 +/- 0.013 (in 2 folds),0.591,0.336,0.017,Unknown,113,2,115,0.017391,True
lasso_cv,0.622 +/- 0.073 (in 3 folds),0.612 +/- 0.068 (in 3 folds),0.669 +/- 0.052 (in 3 folds),0.663 +/- 0.048 (in 3 folds),0.541 +/- 0.133 (in 3 folds),0.273 +/- 0.104 (in 3 folds),0.546,0.263,0.512 +/- 0.157 (in 3 folds),0.168 +/- 0.186 (in 3 folds),...,0.688 +/- 0.000 (in 1 folds),0.513,0.154,0.061,Unknown,108,7,115,0.06087,True
rf_multiclass,0.612 +/- 0.089 (in 3 folds),0.609 +/- 0.066 (in 3 folds),0.640 +/- 0.048 (in 3 folds),0.629 +/- 0.038 (in 3 folds),0.597 +/- 0.145 (in 3 folds),0.381 +/- 0.173 (in 3 folds),0.591,0.365,0.564 +/- 0.089 (in 3 folds),0.319 +/- 0.068 (in 3 folds),...,0.610 +/- 0.027 (in 2 folds),0.565,0.284,0.043,Unknown,110,5,115,0.043478,True
elasticnet_cv,0.596 +/- 0.061 (in 3 folds),0.591 +/- 0.072 (in 3 folds),0.656 +/- 0.046 (in 3 folds),0.659 +/- 0.067 (in 3 folds),0.529 +/- 0.122 (in 3 folds),0.196 +/- 0.172 (in 3 folds),0.532,0.221,0.512 +/- 0.134 (in 3 folds),0.136 +/- 0.167 (in 3 folds),...,0.685 +/- 0.071 (in 2 folds),0.513,0.128,0.035,Unknown,111,4,115,0.034783,True
ridge_cv,0.575 +/- 0.068 (in 3 folds),0.559 +/- 0.051 (in 3 folds),0.617 +/- 0.103 (in 3 folds),0.608 +/- 0.095 (in 3 folds),0.504 +/- 0.099 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.5,0.0,0.478 +/- 0.054 (in 3 folds),0.002 +/- 0.003 (in 3 folds),...,0.572 +/- 0.102 (in 2 folds),0.478,-0.018,0.043,Unknown,110,5,115,0.043478,True
dummy_stratified,0.504 +/- 0.027 (in 3 folds),0.501 +/- 0.025 (in 3 folds),0.515 +/- 0.017 (in 3 folds),0.512 +/- 0.014 (in 3 folds),0.424 +/- 0.042 (in 3 folds),0.030 +/- 0.067 (in 3 folds),0.429,0.038,0.392 +/- 0.096 (in 3 folds),0.009 +/- 0.053 (in 3 folds),...,0.504 +/- 0.008 (in 2 folds),0.391,0.019,0.087,Unknown,105,10,115,0.086957,True
dummy_most_frequent,0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.504 +/- 0.099 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.5,0.0,0.478 +/- 0.054 (in 3 folds),0.002 +/- 0.003 (in 3 folds),...,0.500 +/- 0.000 (in 2 folds),0.478,-0.018,0.043,Unknown,110,5,115,0.043478,True


GeneLocus.TCR, fold 0, TargetObsColumnEnum.ethnicity_condensed_healthy_only, lasso_multiclass: best p value = 0.005. Number of disease associated clusters: {'African': 220, 'Asian': 166, 'Caucasian': 11, 'Hispanic/Latino': 123}
GeneLocus.TCR, fold 0, TargetObsColumnEnum.ethnicity_condensed_healthy_only, rf_multiclass: best p value = 0.005. Number of disease associated clusters: {'African': 220, 'Asian': 166, 'Caucasian': 11, 'Hispanic/Latino': 123}


GeneLocus.TCR, fold 0, TargetObsColumnEnum.ethnicity_condensed_healthy_only, linearsvm_ovr: best p value = 0.005. Number of disease associated clusters: {'African': 220, 'Asian': 166, 'Caucasian': 11, 'Hispanic/Latino': 123}



GeneLocus.TCR, fold 1, TargetObsColumnEnum.ethnicity_condensed_healthy_only, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'African': 129, 'Asian': 496, 'Caucasian': 26, 'Hispanic/Latino': 202}
GeneLocus.TCR, fold 1, TargetObsColumnEnum.ethnicity_condensed_healthy_only, rf_multiclass: best p value = 0.001. Number of disease associated clusters: {'African': 7, 'Asian': 33, 'Caucasian': 2, 'Hispanic/Latino': 5}


GeneLocus.TCR, fold 1, TargetObsColumnEnum.ethnicity_condensed_healthy_only, linearsvm_ovr: best p value = 0.005. Number of disease associated clusters: {'African': 126, 'Asian': 109, 'Caucasian': 11, 'Hispanic/Latino': 141}



2023-01-07 03:54:05,156 - analyze_convergent_clustering_models.ipynb - INFO - GeneLocus.TCR, TargetObsColumnEnum.age_group_healthy_only from /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/convergent_clusters/TCR/age_group_healthy_only/train_smaller_model to /users/maximz/code/boyd-immune-repertoire-classification/out/convergent_clusters/TCR/age_group_healthy_only/train_smaller_model


GeneLocus.TCR, fold 2, TargetObsColumnEnum.ethnicity_condensed_healthy_only, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'African': 177, 'Asian': 171, 'Caucasian': 85, 'Hispanic/Latino': 187}
GeneLocus.TCR, fold 2, TargetObsColumnEnum.ethnicity_condensed_healthy_only, rf_multiclass: best p value = 0.05. Number of disease associated clusters: {'African': 2966, 'Asian': 4195, 'Caucasian': 626, 'Hispanic/Latino': 66760}
GeneLocus.TCR, fold 2, TargetObsColumnEnum.ethnicity_condensed_healthy_only, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'African': 177, 'Asian': 171, 'Caucasian': 85, 'Hispanic/Latino': 187}



































































































































































































































































































GeneLocus.TCR TargetObsColumnEnum.age_group_healthy_only


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,Accuracy per fold with abstention,MCC per fold with abstention,...,au-PRC (macro OvO) per fold with abstention,Accuracy global with abstention,MCC global with abstention,Unknown/abstention proportion global with abstention,Abstention label global with abstention,sample_size,n_abstentions,sample_size including abstentions,abstention_rate,missing_classes
lasso_multiclass,0.624 +/- 0.021 (in 3 folds),0.604 +/- 0.014 (in 3 folds),0.691 +/- 0.012 (in 3 folds),0.675 +/- 0.013 (in 3 folds),0.324 +/- 0.035 (in 3 folds),0.250 +/- 0.044 (in 3 folds),0.325,0.194,0.322 +/- 0.039 (in 3 folds),0.248 +/- 0.043 (in 3 folds),...,0.680 +/- 0.013 (in 2 folds),0.322,0.193,0.009,Unknown,114,1,115,0.008696,True
lasso_cv,0.619 +/- 0.035 (in 3 folds),0.604 +/- 0.031 (in 3 folds),0.692 +/- 0.014 (in 3 folds),0.681 +/- 0.011 (in 3 folds),0.298 +/- 0.053 (in 3 folds),0.253 +/- 0.029 (in 3 folds),0.298,0.184,0.296 +/- 0.057 (in 3 folds),0.249 +/- 0.034 (in 3 folds),...,0.684 +/- 0.013 (in 2 folds),0.296,0.183,0.009,Unknown,114,1,115,0.008696,True
rf_multiclass,0.615 +/- 0.026 (in 3 folds),0.598 +/- 0.030 (in 3 folds),0.657 +/- 0.011 (in 3 folds),0.644 +/- 0.015 (in 3 folds),0.324 +/- 0.009 (in 3 folds),0.230 +/- 0.005 (in 3 folds),0.325,0.18,0.322 +/- 0.013 (in 3 folds),0.228 +/- 0.006 (in 3 folds),...,0.650 +/- 0.016 (in 2 folds),0.322,0.179,0.009,Unknown,114,1,115,0.008696,True
xgboost,0.606 +/- 0.062 (in 3 folds),0.581 +/- 0.065 (in 3 folds),0.660 +/- 0.051 (in 3 folds),0.646 +/- 0.060 (in 3 folds),0.357 +/- 0.035 (in 3 folds),0.228 +/- 0.062 (in 3 folds),0.355,0.217,0.331 +/- 0.022 (in 3 folds),0.206 +/- 0.035 (in 3 folds),...,0.713 +/- 0.000 (in 1 folds),0.33,0.2,0.07,Unknown,107,8,115,0.069565,True
linearsvm_ovr,0.604 +/- 0.010 (in 3 folds),0.580 +/- 0.022 (in 3 folds),0.664 +/- 0.031 (in 3 folds),0.646 +/- 0.040 (in 3 folds),0.355 +/- 0.040 (in 3 folds),0.240 +/- 0.050 (in 3 folds),0.355,0.213,0.339 +/- 0.018 (in 3 folds),0.226 +/- 0.036 (in 3 folds),...,0.666 +/- 0.000 (in 1 folds),0.339,0.204,0.043,Unknown,110,5,115,0.043478,True
ridge_cv,0.599 +/- 0.056 (in 3 folds),0.570 +/- 0.044 (in 3 folds),0.659 +/- 0.017 (in 3 folds),0.639 +/- 0.015 (in 3 folds),0.253 +/- 0.061 (in 3 folds),0.109 +/- 0.116 (in 3 folds),0.25,0.077,0.235 +/- 0.049 (in 3 folds),0.138 +/- 0.082 (in 3 folds),...,0.625 +/- 0.000 (in 1 folds),0.235,0.078,0.061,Unknown,108,7,115,0.06087,True
elasticnet_cv,0.598 +/- 0.010 (in 3 folds),0.580 +/- 0.015 (in 3 folds),0.661 +/- 0.023 (in 3 folds),0.648 +/- 0.028 (in 3 folds),0.281 +/- 0.084 (in 3 folds),0.248 +/- 0.092 (in 3 folds),0.281,0.147,0.279 +/- 0.086 (in 3 folds),0.246 +/- 0.096 (in 3 folds),...,0.632 +/- 0.008 (in 2 folds),0.278,0.144,0.009,Unknown,114,1,115,0.008696,True
dummy_stratified,0.526 +/- 0.011 (in 3 folds),0.527 +/- 0.009 (in 3 folds),0.539 +/- 0.005 (in 3 folds),0.540 +/- 0.004 (in 3 folds),0.213 +/- 0.028 (in 3 folds),0.054 +/- 0.033 (in 3 folds),0.212,0.047,0.209 +/- 0.032 (in 3 folds),0.056 +/- 0.031 (in 3 folds),...,0.538 +/- 0.004 (in 2 folds),0.209,0.048,0.017,Unknown,113,2,115,0.017391,False
dummy_most_frequent,0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.229 +/- 0.064 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.226,0.035,0.208 +/- 0.042 (in 3 folds),0.042 +/- 0.043 (in 3 folds),...,0.500 +/- 0.000 (in 1 folds),0.209,0.032,0.078,Unknown,106,9,115,0.078261,True


GeneLocus.TCR, fold 0, TargetObsColumnEnum.age_group_healthy_only, lasso_multiclass: best p value = 0.05. Number of disease associated clusters: {'20-30': 3023, '30-40': 916, '40-50': 4285, '50-60': 730, '60-70': 1319, '70-80': 49604, '<20': 5331}
GeneLocus.TCR, fold 0, TargetObsColumnEnum.age_group_healthy_only, rf_multiclass: best p value = 0.001. Number of disease associated clusters: {'20-30': 27, '30-40': 6, '40-50': 11, '50-60': 2, '60-70': 6, '70-80': 21, '<20': 359}


GeneLocus.TCR, fold 0, TargetObsColumnEnum.age_group_healthy_only, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'20-30': 202, '30-40': 379, '40-50': 164, '50-60': 67, '60-70': 599, '70-80': 80, '<20': 1123}



GeneLocus.TCR, fold 1, TargetObsColumnEnum.age_group_healthy_only, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'20-30': 94, '30-40': 63, '40-50': 346, '50-60': 227, '60-70': 112, '<20': 1273}
GeneLocus.TCR, fold 1, TargetObsColumnEnum.age_group_healthy_only, rf_multiclass: best p value = 0.01. Number of disease associated clusters: {'20-30': 94, '30-40': 63, '40-50': 346, '50-60': 227, '60-70': 112, '<20': 1273}
GeneLocus.TCR, fold 1, TargetObsColumnEnum.age_group_healthy_only, linearsvm_ovr: best p value = 0.001. Number of disease associated clusters: {'20-30': 0, '30-40': 2, '40-50': 17, '50-60': 6, '60-70': 2, '<20': 160}

GeneLocus.TCR, fold 2, TargetObsColumnEnum.age_group_healthy_only, lasso_multiclass: best p value = 0.005. Number of disease associated clusters: {'20-30': 17, '30-40': 126, '40-50': 206, '50-60': 58, '60-70': 28, '<20': 554}


2023-01-07 03:54:14,068 - analyze_convergent_clustering_models.ipynb - INFO - GeneLocus.TCR, TargetObsColumnEnum.age_group_binary_healthy_only from /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/convergent_clusters/TCR/age_group_binary_healthy_only/train_smaller_model to /users/maximz/code/boyd-immune-repertoire-classification/out/convergent_clusters/TCR/age_group_binary_healthy_only/train_smaller_model


GeneLocus.TCR, fold 2, TargetObsColumnEnum.age_group_healthy_only, rf_multiclass: best p value = 0.005. Number of disease associated clusters: {'20-30': 17, '30-40': 126, '40-50': 206, '50-60': 58, '60-70': 28, '<20': 554}
GeneLocus.TCR, fold 2, TargetObsColumnEnum.age_group_healthy_only, linearsvm_ovr: best p value = 0.005. Number of disease associated clusters: {'20-30': 17, '30-40': 126, '40-50': 206, '50-60': 58, '60-70': 28, '<20': 554}





2023-01-07 03:54:14,859 - malid.external.model_evaluation - ERROR - Error in evaluating predict-proba-based metric rocauc: Only one class present in y_true. Probability-based score is not defined in that case.




2023-01-07 03:54:14,880 - malid.external.model_evaluation - ERROR - Error in evaluating predict-proba-based metric auprc: Only one class present in y_true. Probability-based score is not defined in that case.




2023-01-07 03:54:14,925 - malid.external.model_evaluation - ERROR - Error in evaluating predict-proba-based metric rocauc: Only one class present in y_true. Probability-based score is not defined in that case.




2023-01-07 03:54:14,934 - malid.external.model_evaluation - ERROR - Error in evaluating predict-proba-based metric auprc: Only one class present in y_true. Probability-based score is not defined in that case.




2023-01-07 03:54:15,100 - malid.external.model_evaluation - ERROR - Error in evaluating predict-proba-based metric rocauc: Only one class present in y_true. Probability-based score is not defined in that case.




2023-01-07 03:54:15,120 - malid.external.model_evaluation - ERROR - Error in evaluating predict-proba-based metric auprc: Only one class present in y_true. Probability-based score is not defined in that case.




2023-01-07 03:54:15,162 - malid.external.model_evaluation - ERROR - Error in evaluating predict-proba-based metric rocauc: Only one class present in y_true. Probability-based score is not defined in that case.




2023-01-07 03:54:15,172 - malid.external.model_evaluation - ERROR - Error in evaluating predict-proba-based metric auprc: Only one class present in y_true. Probability-based score is not defined in that case.




2023-01-07 03:54:15,538 - malid.external.model_evaluation - ERROR - Error in evaluating predict-proba-based metric rocauc: Only one class present in y_true. Probability-based score is not defined in that case.




2023-01-07 03:54:15,559 - malid.external.model_evaluation - ERROR - Error in evaluating predict-proba-based metric auprc: Only one class present in y_true. Probability-based score is not defined in that case.




2023-01-07 03:54:15,601 - malid.external.model_evaluation - ERROR - Error in evaluating predict-proba-based metric rocauc: Only one class present in y_true. Probability-based score is not defined in that case.




2023-01-07 03:54:15,610 - malid.external.model_evaluation - ERROR - Error in evaluating predict-proba-based metric auprc: Only one class present in y_true. Probability-based score is not defined in that case.




2023-01-07 03:54:15,780 - malid.external.model_evaluation - ERROR - Error in evaluating predict-proba-based metric rocauc: Only one class present in y_true. Probability-based score is not defined in that case.




2023-01-07 03:54:15,800 - malid.external.model_evaluation - ERROR - Error in evaluating predict-proba-based metric auprc: Only one class present in y_true. Probability-based score is not defined in that case.




2023-01-07 03:54:15,843 - malid.external.model_evaluation - ERROR - Error in evaluating predict-proba-based metric rocauc: Only one class present in y_true. Probability-based score is not defined in that case.




2023-01-07 03:54:15,862 - malid.external.model_evaluation - ERROR - Error in evaluating predict-proba-based metric auprc: Only one class present in y_true. Probability-based score is not defined in that case.




2023-01-07 03:54:16,209 - malid.external.model_evaluation - ERROR - Error in evaluating predict-proba-based metric rocauc: Only one class present in y_true. Probability-based score is not defined in that case.




2023-01-07 03:54:16,224 - malid.external.model_evaluation - ERROR - Error in evaluating predict-proba-based metric auprc: Only one class present in y_true. Probability-based score is not defined in that case.




2023-01-07 03:54:16,264 - malid.external.model_evaluation - ERROR - Error in evaluating predict-proba-based metric rocauc: Only one class present in y_true. Probability-based score is not defined in that case.




2023-01-07 03:54:16,273 - malid.external.model_evaluation - ERROR - Error in evaluating predict-proba-based metric auprc: Only one class present in y_true. Probability-based score is not defined in that case.


GeneLocus.TCR TargetObsColumnEnum.age_group_binary_healthy_only


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,Accuracy per fold with abstention,MCC per fold with abstention,...,Abstention label global with abstention,sample_size,n_abstentions,sample_size including abstentions,abstention_rate,missing_classes,ROC-AUC (weighted OvO) per fold with abstention,ROC-AUC (macro OvO) per fold with abstention,au-PRC (weighted OvO) per fold with abstention,au-PRC (macro OvO) per fold with abstention
lasso_multiclass,0.755 +/- 0.061 (in 3 folds),0.755 +/- 0.061 (in 3 folds),0.871 +/- 0.049 (in 3 folds),0.871 +/- 0.049 (in 3 folds),0.697 +/- 0.036 (in 3 folds),0.339 +/- 0.076 (in 3 folds),0.696,0.331,0.679 +/- 0.033 (in 3 folds),0.320 +/- 0.058 (in 3 folds),...,Unknown,112,3,115,0.026087,False,0.801 +/- 0.000 (in 1 folds),0.801 +/- 0.000 (in 1 folds),0.926 +/- 0.000 (in 1 folds),0.926 +/- 0.000 (in 1 folds)
linearsvm_ovr,0.743 +/- 0.055 (in 3 folds),0.743 +/- 0.055 (in 3 folds),0.864 +/- 0.029 (in 3 folds),0.864 +/- 0.029 (in 3 folds),0.706 +/- 0.050 (in 3 folds),0.334 +/- 0.134 (in 3 folds),0.705,0.34,0.687 +/- 0.041 (in 3 folds),0.313 +/- 0.110 (in 3 folds),...,Unknown,112,3,115,0.026087,False,0.745 +/- 0.000 (in 1 folds),0.745 +/- 0.000 (in 1 folds),0.887 +/- 0.000 (in 1 folds),0.887 +/- 0.000 (in 1 folds)
elasticnet_cv,0.701 +/- 0.136 (in 2 folds),0.701 +/- 0.136 (in 2 folds),0.820 +/- 0.076 (in 2 folds),0.820 +/- 0.076 (in 2 folds),0.781 +/- 0.190 (in 3 folds),0.101 +/- 0.174 (in 3 folds),0.714,0.256,0.518 +/- 0.191 (in 3 folds),0.160 +/- 0.142 (in 3 folds),...,Unknown,84,31,115,0.269565,False,,,,
ridge_cv,0.690 +/- 0.117 (in 2 folds),0.690 +/- 0.117 (in 2 folds),0.811 +/- 0.054 (in 2 folds),0.811 +/- 0.054 (in 2 folds),0.754 +/- 0.216 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.679,0.0,0.492 +/- 0.171 (in 3 folds),0.078 +/- 0.116 (in 3 folds),...,Unknown,84,31,115,0.269565,True,,,,
lasso_cv,0.688 +/- 0.119 (in 2 folds),0.688 +/- 0.119 (in 2 folds),0.807 +/- 0.058 (in 2 folds),0.807 +/- 0.058 (in 2 folds),0.808 +/- 0.172 (in 3 folds),0.204 +/- 0.253 (in 3 folds),0.75,0.38,0.544 +/- 0.219 (in 3 folds),0.247 +/- 0.176 (in 3 folds),...,Unknown,84,31,115,0.269565,False,,,,
xgboost,0.647 +/- 0.024 (in 3 folds),0.647 +/- 0.024 (in 3 folds),0.749 +/- 0.070 (in 3 folds),0.749 +/- 0.070 (in 3 folds),0.688 +/- 0.063 (in 3 folds),0.285 +/- 0.118 (in 3 folds),0.688,0.292,0.671 +/- 0.075 (in 3 folds),0.270 +/- 0.123 (in 3 folds),...,Unknown,112,3,115,0.026087,False,0.673 +/- 0.000 (in 1 folds),0.673 +/- 0.000 (in 1 folds),0.824 +/- 0.000 (in 1 folds),0.824 +/- 0.000 (in 1 folds)
rf_multiclass,0.626 +/- 0.063 (in 3 folds),0.626 +/- 0.063 (in 3 folds),0.760 +/- 0.045 (in 3 folds),0.760 +/- 0.045 (in 3 folds),0.679 +/- 0.065 (in 3 folds),0.233 +/- 0.148 (in 3 folds),0.679,0.238,0.662 +/- 0.070 (in 3 folds),0.216 +/- 0.140 (in 3 folds),...,Unknown,112,3,115,0.026087,False,0.556 +/- 0.000 (in 1 folds),0.556 +/- 0.000 (in 1 folds),0.781 +/- 0.000 (in 1 folds),0.781 +/- 0.000 (in 1 folds)
dummy_stratified,0.585 +/- 0.027 (in 2 folds),0.585 +/- 0.027 (in 2 folds),0.671 +/- 0.066 (in 2 folds),0.671 +/- 0.066 (in 2 folds),0.713 +/- 0.172 (in 3 folds),0.114 +/- 0.104 (in 3 folds),0.651,0.206,0.483 +/- 0.184 (in 3 folds),0.161 +/- 0.021 (in 3 folds),...,Unknown,86,29,115,0.252174,False,0.567 +/- 0.000 (in 1 folds),0.567 +/- 0.000 (in 1 folds),0.625 +/- 0.000 (in 1 folds),0.625 +/- 0.000 (in 1 folds)
dummy_most_frequent,0.500 +/- 0.000 (in 2 folds),0.500 +/- 0.000 (in 2 folds),0.631 +/- 0.051 (in 2 folds),0.631 +/- 0.051 (in 2 folds),0.754 +/- 0.216 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.679,0.0,0.492 +/- 0.171 (in 3 folds),0.078 +/- 0.116 (in 3 folds),...,Unknown,84,31,115,0.269565,True,,,,


GeneLocus.TCR, fold 0, TargetObsColumnEnum.age_group_binary_healthy_only, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'50+': 53, 'under 50': 123}
GeneLocus.TCR, fold 0, TargetObsColumnEnum.age_group_binary_healthy_only, rf_multiclass: best p value = 0.05. Number of disease associated clusters: {'50+': 359, 'under 50': 823}
GeneLocus.TCR, fold 0, TargetObsColumnEnum.age_group_binary_healthy_only, linearsvm_ovr: best p value = 0.05. Number of disease associated clusters: {'50+': 359, 'under 50': 823}



GeneLocus.TCR, fold 1, TargetObsColumnEnum.age_group_binary_healthy_only, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'50+': 67, 'under 50': 24}


GeneLocus.TCR, fold 1, TargetObsColumnEnum.age_group_binary_healthy_only, rf_multiclass: best p value = 0.01. Number of disease associated clusters: {'50+': 67, 'under 50': 24}
GeneLocus.TCR, fold 1, TargetObsColumnEnum.age_group_binary_healthy_only, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'50+': 67, 'under 50': 24}

GeneLocus.TCR, fold 2, TargetObsColumnEnum.age_group_binary_healthy_only, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'50+': 50, 'under 50': 67}
GeneLocus.TCR, fold 2, TargetObsColumnEnum.age_group_binary_healthy_only, rf_multiclass: best p value = 0.01. Number of disease associated clusters: {'50+': 50, 'under 50': 67}


2023-01-07 03:54:17,267 - analyze_convergent_clustering_models.ipynb - INFO - GeneLocus.TCR, TargetObsColumnEnum.age_group_pediatric_healthy_only from /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/convergent_clusters/TCR/age_group_pediatric_healthy_only/train_smaller_model to /users/maximz/code/boyd-immune-repertoire-classification/out/convergent_clusters/TCR/age_group_pediatric_healthy_only/train_smaller_model


GeneLocus.TCR, fold 2, TargetObsColumnEnum.age_group_binary_healthy_only, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'50+': 50, 'under 50': 67}



GeneLocus.TCR TargetObsColumnEnum.age_group_pediatric_healthy_only


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,sample_size,n_abstentions,...,MCC per fold with abstention,Unknown/abstention proportion per fold with abstention,ROC-AUC (weighted OvO) per fold with abstention,ROC-AUC (macro OvO) per fold with abstention,au-PRC (weighted OvO) per fold with abstention,au-PRC (macro OvO) per fold with abstention,Accuracy global with abstention,MCC global with abstention,Unknown/abstention proportion global with abstention,Abstention label global with abstention
linearsvm_ovr,0.954 +/- 0.024 (in 3 folds),0.954 +/- 0.024 (in 3 folds),0.885 +/- 0.042 (in 3 folds),0.885 +/- 0.042 (in 3 folds),0.948 +/- 0.044 (in 3 folds),0.825 +/- 0.113 (in 3 folds),0.947,0.81,114,1,...,0.790 +/- 0.102 (in 3 folds),0.026 +/- 0.000 (in 1 folds),0.939 +/- 0.002 (in 2 folds),0.939 +/- 0.002 (in 2 folds),0.865 +/- 0.034 (in 2 folds),0.865 +/- 0.034 (in 2 folds),0.939,0.78,0.009,Unknown
lasso_multiclass,0.953 +/- 0.041 (in 3 folds),0.953 +/- 0.041 (in 3 folds),0.898 +/- 0.097 (in 3 folds),0.898 +/- 0.097 (in 3 folds),0.939 +/- 0.058 (in 3 folds),0.796 +/- 0.162 (in 3 folds),0.938,0.775,113,2,...,0.736 +/- 0.148 (in 3 folds),0.051 +/- 0.000 (in 1 folds),0.930 +/- 0.011 (in 2 folds),0.930 +/- 0.011 (in 2 folds),0.847 +/- 0.059 (in 2 folds),0.847 +/- 0.059 (in 2 folds),0.922,0.719,0.017,Unknown
xgboost,0.939 +/- 0.032 (in 3 folds),0.939 +/- 0.032 (in 3 folds),0.832 +/- 0.078 (in 3 folds),0.832 +/- 0.078 (in 3 folds),0.939 +/- 0.059 (in 3 folds),0.796 +/- 0.162 (in 3 folds),0.939,0.776,114,1,...,0.762 +/- 0.145 (in 3 folds),0.026 +/- 0.000 (in 1 folds),0.921 +/- 0.007 (in 2 folds),0.921 +/- 0.007 (in 2 folds),0.802 +/- 0.082 (in 2 folds),0.802 +/- 0.082 (in 2 folds),0.93,0.746,0.009,Unknown
lasso_cv,0.936 +/- 0.006 (in 3 folds),0.936 +/- 0.006 (in 3 folds),0.863 +/- 0.024 (in 3 folds),0.863 +/- 0.024 (in 3 folds),0.826 +/- 0.052 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.826,0.0,115,0,...,,,,,,,,,,
elasticnet_cv,0.936 +/- 0.006 (in 3 folds),0.936 +/- 0.006 (in 3 folds),0.863 +/- 0.024 (in 3 folds),0.863 +/- 0.024 (in 3 folds),0.826 +/- 0.052 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.826,0.0,115,0,...,,,,,,,,,,
ridge_cv,0.895 +/- 0.077 (in 3 folds),0.895 +/- 0.077 (in 3 folds),0.790 +/- 0.131 (in 3 folds),0.790 +/- 0.131 (in 3 folds),0.826 +/- 0.052 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.826,0.0,115,0,...,,,,,,,,,,
rf_multiclass,0.877 +/- 0.052 (in 3 folds),0.877 +/- 0.052 (in 3 folds),0.786 +/- 0.100 (in 3 folds),0.786 +/- 0.100 (in 3 folds),0.939 +/- 0.059 (in 3 folds),0.796 +/- 0.162 (in 3 folds),0.939,0.776,114,1,...,0.762 +/- 0.145 (in 3 folds),0.026 +/- 0.000 (in 1 folds),0.866 +/- 0.068 (in 2 folds),0.866 +/- 0.068 (in 2 folds),0.766 +/- 0.133 (in 2 folds),0.766 +/- 0.133 (in 2 folds),0.93,0.746,0.009,Unknown
dummy_most_frequent,0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.174 +/- 0.052 (in 3 folds),0.174 +/- 0.052 (in 3 folds),0.826 +/- 0.052 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.826,0.0,115,0,...,,,,,,,,,,
dummy_stratified,0.425 +/- 0.106 (in 3 folds),0.425 +/- 0.106 (in 3 folds),0.171 +/- 0.046 (in 3 folds),0.171 +/- 0.046 (in 3 folds),0.633 +/- 0.115 (in 3 folds),-0.127 +/- 0.165 (in 3 folds),0.635,-0.138,115,0,...,,,,,,,,,,


GeneLocus.TCR, fold 0, TargetObsColumnEnum.age_group_pediatric_healthy_only, lasso_multiclass: best p value = 0.0005. Number of disease associated clusters: {'18+': 0, 'under 18': 316}
GeneLocus.TCR, fold 0, TargetObsColumnEnum.age_group_pediatric_healthy_only, rf_multiclass: best p value = 0.005. Number of disease associated clusters: {'18+': 0, 'under 18': 877}
GeneLocus.TCR, fold 0, TargetObsColumnEnum.age_group_pediatric_healthy_only, linearsvm_ovr: best p value = 0.0005. Number of disease associated clusters: {'18+': 0, 'under 18': 316}



GeneLocus.TCR, fold 1, TargetObsColumnEnum.age_group_pediatric_healthy_only, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'18+': 0, 'under 18': 1127}


GeneLocus.TCR, fold 1, TargetObsColumnEnum.age_group_pediatric_healthy_only, rf_multiclass: best p value = 0.005. Number of disease associated clusters: {'18+': 0, 'under 18': 1042}
GeneLocus.TCR, fold 1, TargetObsColumnEnum.age_group_pediatric_healthy_only, linearsvm_ovr: best p value = 0.005. Number of disease associated clusters: {'18+': 0, 'under 18': 1042}

GeneLocus.TCR, fold 2, TargetObsColumnEnum.age_group_pediatric_healthy_only, lasso_multiclass: best p value = 0.001. Number of disease associated clusters: {'18+': 0, 'under 18': 298}
GeneLocus.TCR, fold 2, TargetObsColumnEnum.age_group_pediatric_healthy_only, rf_multiclass: best p value = 0.01. Number of disease associated clusters: {'18+': 0, 'under 18': 1348}


2023-01-07 03:54:19,751 - analyze_convergent_clustering_models.ipynb - INFO - GeneLocus.TCR, TargetObsColumnEnum.sex_healthy_only from /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/convergent_clusters/TCR/sex_healthy_only/train_smaller_model to /users/maximz/code/boyd-immune-repertoire-classification/out/convergent_clusters/TCR/sex_healthy_only/train_smaller_model


GeneLocus.TCR, fold 2, TargetObsColumnEnum.age_group_pediatric_healthy_only, linearsvm_ovr: best p value = 0.005. Number of disease associated clusters: {'18+': 0, 'under 18': 1171}



GeneLocus.TCR TargetObsColumnEnum.sex_healthy_only


Unnamed: 0,ROC-AUC (weighted OvO) per fold,ROC-AUC (macro OvO) per fold,au-PRC (weighted OvO) per fold,au-PRC (macro OvO) per fold,Accuracy per fold,MCC per fold,Accuracy global,MCC global,Accuracy per fold with abstention,MCC per fold with abstention,...,Abstention label global with abstention,sample_size,n_abstentions,sample_size including abstentions,abstention_rate,missing_classes,ROC-AUC (weighted OvO) per fold with abstention,ROC-AUC (macro OvO) per fold with abstention,au-PRC (weighted OvO) per fold with abstention,au-PRC (macro OvO) per fold with abstention
linearsvm_ovr,0.598 +/- 0.035 (in 3 folds),0.598 +/- 0.035 (in 3 folds),0.591 +/- 0.133 (in 3 folds),0.591 +/- 0.133 (in 3 folds),0.576 +/- 0.132 (in 3 folds),0.192 +/- 0.117 (in 3 folds),0.577,0.162,0.556 +/- 0.129 (in 3 folds),0.179 +/- 0.102 (in 3 folds),...,Unknown,111,4,115,0.034783,False,,,,
dummy_stratified,0.598 +/- 0.029 (in 3 folds),0.598 +/- 0.029 (in 3 folds),0.565 +/- 0.170 (in 3 folds),0.565 +/- 0.170 (in 3 folds),0.575 +/- 0.051 (in 3 folds),0.191 +/- 0.064 (in 3 folds),0.574,0.149,0.540 +/- 0.054 (in 3 folds),0.169 +/- 0.059 (in 3 folds),...,Unknown,108,7,115,0.06087,False,,,,
lasso_multiclass,0.594 +/- 0.052 (in 3 folds),0.594 +/- 0.052 (in 3 folds),0.602 +/- 0.132 (in 3 folds),0.602 +/- 0.132 (in 3 folds),0.543 +/- 0.085 (in 3 folds),0.160 +/- 0.070 (in 3 folds),0.541,0.078,0.514 +/- 0.068 (in 3 folds),0.145 +/- 0.050 (in 3 folds),...,Unknown,109,6,115,0.052174,False,,,,
ridge_cv,0.587 +/- 0.064 (in 3 folds),0.587 +/- 0.064 (in 3 folds),0.638 +/- 0.099 (in 3 folds),0.638 +/- 0.099 (in 3 folds),0.393 +/- 0.084 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.393,-0.234,0.384 +/- 0.092 (in 3 folds),0.046 +/- 0.041 (in 3 folds),...,Unknown,112,3,115,0.026087,False,0.567 +/- 0.000 (in 1 folds),0.567 +/- 0.000 (in 1 folds),0.630 +/- 0.000 (in 1 folds),0.630 +/- 0.000 (in 1 folds)
lasso_cv,0.533 +/- 0.044 (in 3 folds),0.533 +/- 0.044 (in 3 folds),0.551 +/- 0.186 (in 3 folds),0.551 +/- 0.186 (in 3 folds),0.523 +/- 0.015 (in 3 folds),0.176 +/- 0.111 (in 3 folds),0.523,0.045,0.495 +/- 0.015 (in 3 folds),0.144 +/- 0.117 (in 3 folds),...,Unknown,109,6,115,0.052174,False,,,,
rf_multiclass,0.524 +/- 0.035 (in 3 folds),0.524 +/- 0.035 (in 3 folds),0.540 +/- 0.143 (in 3 folds),0.540 +/- 0.143 (in 3 folds),0.528 +/- 0.078 (in 3 folds),0.148 +/- 0.066 (in 3 folds),0.548,0.101,0.401 +/- 0.199 (in 3 folds),0.112 +/- 0.088 (in 3 folds),...,Unknown,84,31,115,0.269565,False,,,,
elasticnet_cv,0.516 +/- 0.040 (in 3 folds),0.516 +/- 0.040 (in 3 folds),0.557 +/- 0.172 (in 3 folds),0.557 +/- 0.172 (in 3 folds),0.537 +/- 0.066 (in 3 folds),0.096 +/- 0.111 (in 3 folds),0.536,0.071,0.513 +/- 0.045 (in 3 folds),0.084 +/- 0.096 (in 3 folds),...,Unknown,110,5,115,0.043478,False,0.480 +/- 0.000 (in 1 folds),0.480 +/- 0.000 (in 1 folds),0.510 +/- 0.000 (in 1 folds),0.510 +/- 0.000 (in 1 folds)
xgboost,0.514 +/- 0.035 (in 3 folds),0.514 +/- 0.035 (in 3 folds),0.530 +/- 0.154 (in 3 folds),0.530 +/- 0.154 (in 3 folds),0.547 +/- 0.035 (in 3 folds),0.109 +/- 0.033 (in 3 folds),0.546,0.091,0.513 +/- 0.026 (in 3 folds),0.094 +/- 0.029 (in 3 folds),...,Unknown,108,7,115,0.06087,False,,,,
dummy_most_frequent,0.500 +/- 0.000 (in 3 folds),0.500 +/- 0.000 (in 3 folds),0.510 +/- 0.155 (in 3 folds),0.510 +/- 0.155 (in 3 folds),0.393 +/- 0.084 (in 3 folds),0.000 +/- 0.000 (in 3 folds),0.393,-0.234,0.384 +/- 0.092 (in 3 folds),0.046 +/- 0.041 (in 3 folds),...,Unknown,112,3,115,0.026087,False,0.500 +/- 0.000 (in 1 folds),0.500 +/- 0.000 (in 1 folds),0.486 +/- 0.000 (in 1 folds),0.486 +/- 0.000 (in 1 folds)


GeneLocus.TCR, fold 0, TargetObsColumnEnum.sex_healthy_only, lasso_multiclass: best p value = 0.01. Number of disease associated clusters: {'F': 17, 'M': 42}
GeneLocus.TCR, fold 0, TargetObsColumnEnum.sex_healthy_only, rf_multiclass: best p value = 0.005. Number of disease associated clusters: {'F': 11, 'M': 10}
GeneLocus.TCR, fold 0, TargetObsColumnEnum.sex_healthy_only, linearsvm_ovr: best p value = 0.01. Number of disease associated clusters: {'F': 17, 'M': 42}

GeneLocus.TCR, fold 1, TargetObsColumnEnum.sex_healthy_only, lasso_multiclass: best p value = 0.05. Number of disease associated clusters: {'F': 361, 'M': 431}
GeneLocus.TCR, fold 1, TargetObsColumnEnum.sex_healthy_only, rf_multiclass: best p value = 0.05. Number of disease associated clusters: {'F': 361, 'M': 431}


GeneLocus.TCR, fold 1, TargetObsColumnEnum.sex_healthy_only, linearsvm_ovr: best p value = 0.05. Number of disease associated clusters: {'F': 361, 'M': 431}

GeneLocus.TCR, fold 2, TargetObsColumnEnum.sex_healthy_only, lasso_multiclass: best p value = 0.005. Number of disease associated clusters: {'F': 3, 'M': 47}
GeneLocus.TCR, fold 2, TargetObsColumnEnum.sex_healthy_only, rf_multiclass: best p value = 0.0005. Number of disease associated clusters: {'F': 0, 'M': 4}
GeneLocus.TCR, fold 2, TargetObsColumnEnum.sex_healthy_only, linearsvm_ovr: best p value = 0.05. Number of disease associated clusters: {'F': 153, 'M': 638}

