# Likelihood ratio test when removing one demographic covariate at a time from the metamodel with sequence features and demographic features

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.base
import sklearn.metrics
from scipy.stats.distributions import chi2

In [2]:
from malid.trained_model_wrappers import BlendingMetamodel
from malid import io
from malid.datamodels import GeneLocus, TargetObsColumnEnum
import crosseval

In [3]:
# Load a previously trained BlendingMetamodel from disk; the keyword arguments
# below select which saved model is read.
clf = BlendingMetamodel.from_disk(
    fold_id=-1,
    # Load elastic net model because this had the highest cross validation test set performance in our evaluations for this metamodel flavor
    metamodel_name="elasticnet_cv",
    base_model_train_fold_name="train_smaller",
    metamodel_fold_label_train="validation",
    # Combined BCR + TCR metamodel (the two loci are OR-ed together)
    gene_locus=GeneLocus.BCR | GeneLocus.TCR,
    target_obs_column=TargetObsColumnEnum.disease_all_demographics_present,
    metamodel_flavor="with_demographics_columns",
)
# Bare expression: render the loaded model as the cell output
clf

In [4]:
# Show the concrete class of the loaded model object
type(clf)

malid.trained_model_wrappers.blending_metamodel.BlendingMetamodel

In [5]:
# Display the wrapped estimator; later cells clone and refit this object
# (its `.steps` are indexed there, so it is pipeline-like)
clf._inner

In [6]:
# Load data and make features
# The validation-fold embeddings are read once per locus, BCR first and then TCR,
# with otherwise identical arguments.
adata_bcr, adata_tcr = (
    io.load_fold_embeddings(
        fold_id=-1,
        fold_label="validation",
        gene_locus=locus,
        target_obs_column=TargetObsColumnEnum.disease_all_demographics_present,
    )
    for locus in (GeneLocus.BCR, GeneLocus.TCR)
)
# The metamodel builds its feature matrix from a dict of per-locus anndatas
features = clf.featurize({GeneLocus.BCR: adata_bcr, GeneLocus.TCR: adata_tcr})

{"message": "Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20231027/in_house_peak_disease_timepoints/embedded/esm2_cdr3/anndatas_scaled/BCR/fold.-1.validation.h5ad -> /srv/scratch/maximz/cache/798442e496404b3dae573dc1dc9879f7a842e7d0ea2e552e5b501a99.-1.validation.h5ad", "time": "2024-08-10T23:46:03.311151"}


Only considering the two last: ['.validation', '.h5ad'].
Only considering the two last: ['.validation', '.h5ad'].


{"message": "Reading network file from local machine cache: /users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20231027/in_house_peak_disease_timepoints/embedded/esm2_cdr3/anndatas_scaled/TCR/fold.-1.validation.h5ad -> /srv/scratch/maximz/cache/d73139e50107490f30240a6c96d5a14e2bb34ee53b298c87cfe6b974.-1.validation.h5ad", "time": "2024-08-10T23:47:48.431629"}


Only considering the two last: ['.validation', '.h5ad'].
Only considering the two last: ['.validation', '.h5ad'].


{"message": "Metamodel featurization with data keys dict_keys([<GeneLocus.BCR: 1>, <GeneLocus.TCR: 2>]) and gene_locus GeneLocus.BCR|TCR: dropping specimens from GeneLocus.BCR anndata: {'M281redo-S042', 'M281redo-S003', 'M404-S011', 'M281redo-S045', 'M281redo-S065', 'M281redo-S037', 'M281redo-S047', 'M281redo-S066', 'M281redo-S054', 'M281redo-S046', 'M281redo-S011', 'M281redo-S035', 'M281redo-S028', 'M281redo-S026', 'M281redo-S062', 'M281redo-S027', 'M281redo-S039', 'M281redo-S063', 'M281redo-S002', 'M281redo-S001'}", "time": "2024-08-10T23:50:55.064911"}
{"message": "Number of VJGeneSpecificSequenceModelRollupClassifier featurization matrix N/As due to specimens not having any sequences with particular V/J gene pairs: 3236 / 58080 = 5.57%", "time": "2024-08-10T23:56:03.520751"}
{"message": "Number of VJGeneSpecificSequenceModelRollupClassifier featurization matrix N/As due to specimens not having any sequences with particular V/J gene pairs: 0 / 13552 = 0.00%", "time": "2024-08-11T00:

In [7]:
# Preview the featurized matrix (rows are indexed by specimen_label)
features.X

Unnamed: 0_level_0,BCR:repertoire_stats:Covid19,BCR:repertoire_stats:HIV,BCR:repertoire_stats:Healthy/Background,BCR:repertoire_stats:Lupus,BCR:convergent_cluster_model:Covid19,BCR:convergent_cluster_model:HIV,BCR:convergent_cluster_model:Healthy/Background,BCR:convergent_cluster_model:Lupus,BCR:sequence_model:Covid19,BCR:sequence_model:HIV,...,interaction|TCR:sequence_model:Healthy/Background|demographics:ethnicity_condensed_Asian,interaction|TCR:sequence_model:Healthy/Background|demographics:ethnicity_condensed_African,interaction|TCR:sequence_model:Healthy/Background|demographics:ethnicity_condensed_Hispanic/Latino,interaction|TCR:sequence_model:Healthy/Background|demographics:ethnicity_condensed_Caucasian,interaction|TCR:sequence_model:Lupus|demographics:age,interaction|TCR:sequence_model:Lupus|demographics:sex_F,interaction|TCR:sequence_model:Lupus|demographics:ethnicity_condensed_Asian,interaction|TCR:sequence_model:Lupus|demographics:ethnicity_condensed_African,interaction|TCR:sequence_model:Lupus|demographics:ethnicity_condensed_Hispanic/Latino,interaction|TCR:sequence_model:Lupus|demographics:ethnicity_condensed_Caucasian
specimen_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
M418-S054,0.817439,0.000023,0.000490,0.182049,0.738322,0.048219,0.140620,0.072839,0.58,0.02,...,0.46,0.00,0.0,0.00,7.37,0.00,0.11,0.00,0.0,0.00
M456-S008,0.267772,0.011225,0.180243,0.540760,0.050087,0.224689,0.184959,0.540265,0.00,0.10,...,0.00,0.35,0.0,0.00,4.41,0.09,0.00,0.09,0.0,0.00
M456-S009,0.064413,0.049449,0.045609,0.840529,0.054670,0.171795,0.220632,0.552903,0.01,0.07,...,0.00,0.44,0.0,0.00,14.70,0.35,0.00,0.35,0.0,0.00
M464-S031,0.071249,0.004770,0.898645,0.025337,0.229246,0.111376,0.451881,0.207496,0.01,0.21,...,0.30,0.00,0.0,0.00,1.54,0.14,0.14,0.00,0.0,0.00
M124-S055,0.000193,0.998025,0.001107,0.000675,0.252200,0.258253,0.286789,0.202757,0.07,0.77,...,0.00,0.24,0.0,0.00,2.86,0.00,0.00,0.13,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
M64-031,0.012513,0.000858,0.983899,0.002730,0.163230,0.101347,0.535676,0.199748,0.05,0.01,...,0.00,0.00,0.0,0.55,1.10,0.02,0.00,0.00,0.0,0.02
M64-097,0.064282,0.000663,0.922980,0.012076,0.008261,0.020645,0.900852,0.070241,0.01,0.01,...,0.00,0.00,0.0,0.99,0.00,0.00,0.00,0.00,0.0,0.00
M64-072,0.005384,0.000797,0.976841,0.016978,0.114882,0.082880,0.621070,0.181168,0.01,0.03,...,0.00,0.00,0.0,0.81,6.00,0.00,0.00,0.00,0.0,0.10
M111-S030,0.004991,0.909756,0.029952,0.055301,0.222280,0.233147,0.359539,0.185035,0.01,0.54,...,0.00,0.51,0.0,0.00,0.42,0.02,0.00,0.02,0.0,0.00


In [8]:
# Work on copies so the featurization result itself is never modified
X, y = features.X.copy(), features.y.copy()

In [9]:
# Display the working copy of the feature matrix
X

Unnamed: 0_level_0,BCR:repertoire_stats:Covid19,BCR:repertoire_stats:HIV,BCR:repertoire_stats:Healthy/Background,BCR:repertoire_stats:Lupus,BCR:convergent_cluster_model:Covid19,BCR:convergent_cluster_model:HIV,BCR:convergent_cluster_model:Healthy/Background,BCR:convergent_cluster_model:Lupus,BCR:sequence_model:Covid19,BCR:sequence_model:HIV,...,interaction|TCR:sequence_model:Healthy/Background|demographics:ethnicity_condensed_Asian,interaction|TCR:sequence_model:Healthy/Background|demographics:ethnicity_condensed_African,interaction|TCR:sequence_model:Healthy/Background|demographics:ethnicity_condensed_Hispanic/Latino,interaction|TCR:sequence_model:Healthy/Background|demographics:ethnicity_condensed_Caucasian,interaction|TCR:sequence_model:Lupus|demographics:age,interaction|TCR:sequence_model:Lupus|demographics:sex_F,interaction|TCR:sequence_model:Lupus|demographics:ethnicity_condensed_Asian,interaction|TCR:sequence_model:Lupus|demographics:ethnicity_condensed_African,interaction|TCR:sequence_model:Lupus|demographics:ethnicity_condensed_Hispanic/Latino,interaction|TCR:sequence_model:Lupus|demographics:ethnicity_condensed_Caucasian
specimen_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
M418-S054,0.817439,0.000023,0.000490,0.182049,0.738322,0.048219,0.140620,0.072839,0.58,0.02,...,0.46,0.00,0.0,0.00,7.37,0.00,0.11,0.00,0.0,0.00
M456-S008,0.267772,0.011225,0.180243,0.540760,0.050087,0.224689,0.184959,0.540265,0.00,0.10,...,0.00,0.35,0.0,0.00,4.41,0.09,0.00,0.09,0.0,0.00
M456-S009,0.064413,0.049449,0.045609,0.840529,0.054670,0.171795,0.220632,0.552903,0.01,0.07,...,0.00,0.44,0.0,0.00,14.70,0.35,0.00,0.35,0.0,0.00
M464-S031,0.071249,0.004770,0.898645,0.025337,0.229246,0.111376,0.451881,0.207496,0.01,0.21,...,0.30,0.00,0.0,0.00,1.54,0.14,0.14,0.00,0.0,0.00
M124-S055,0.000193,0.998025,0.001107,0.000675,0.252200,0.258253,0.286789,0.202757,0.07,0.77,...,0.00,0.24,0.0,0.00,2.86,0.00,0.00,0.13,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
M64-031,0.012513,0.000858,0.983899,0.002730,0.163230,0.101347,0.535676,0.199748,0.05,0.01,...,0.00,0.00,0.0,0.55,1.10,0.02,0.00,0.00,0.0,0.02
M64-097,0.064282,0.000663,0.922980,0.012076,0.008261,0.020645,0.900852,0.070241,0.01,0.01,...,0.00,0.00,0.0,0.99,0.00,0.00,0.00,0.00,0.0,0.00
M64-072,0.005384,0.000797,0.976841,0.016978,0.114882,0.082880,0.621070,0.181168,0.01,0.03,...,0.00,0.00,0.0,0.81,6.00,0.00,0.00,0.00,0.0,0.10
M111-S030,0.004991,0.909756,0.029952,0.055301,0.222280,0.233147,0.359539,0.185035,0.01,0.54,...,0.00,0.51,0.0,0.00,0.42,0.02,0.00,0.02,0.0,0.00


In [10]:
# Display the label array copied from features.y
y

array(['Covid19', 'Lupus', 'Lupus', 'Healthy/Background', 'HIV',
       'Healthy/Background', 'Healthy/Background', 'Lupus',
       'Healthy/Background', 'Lupus', 'HIV', 'Healthy/Background',
       'Healthy/Background', 'Healthy/Background', 'HIV',
       'Healthy/Background', 'HIV', 'HIV', 'Healthy/Background',
       'Healthy/Background', 'Healthy/Background', 'Healthy/Background',
       'Healthy/Background', 'HIV', 'Healthy/Background', 'Lupus',
       'Healthy/Background', 'Covid19', 'Covid19', 'Healthy/Background',
       'HIV', 'Healthy/Background', 'HIV', 'Lupus', 'Lupus',
       'Healthy/Background', 'HIV', 'Lupus', 'HIV', 'Healthy/Background',
       'HIV', 'Healthy/Background', 'Lupus', 'HIV', 'Healthy/Background',
       'Healthy/Background', 'Lupus', 'Lupus', 'Covid19',
       'Healthy/Background', 'HIV', 'Lupus', 'Healthy/Background',
       'Healthy/Background', 'Healthy/Background', 'Covid19', 'HIV',
       'HIV', 'Lupus', 'Healthy/Background', 'HIV', 'HIV', 'Lupus', '

In [11]:
# Participant label for each row of the metadata table; passed as `train_groups`
# whenever a model is (re)fit below.
groups = features.metadata["participant_label"].values.copy()
# Bare expression: show the array
groups

array(['BFI-0009169', 'BFI-0010067', 'BFI-0010068', 'BFI-0010230',
       'BFI-0003774', 'BFI-0010228', 'BFI-0003159', 'BFI-0009827',
       'BFI-0003713', 'BFI-0009816', 'BFI-0000258', 'BFI-0003156',
       'BFI-0003052', 'BFI-0010218', 'BFI-0003476', 'BFI-0003130',
       'BFI-0000254', 'BFI-0000255', 'BFI-0003118', 'BFI-0003123',
       'BFI-0003163', 'BFI-0003147', 'BFI-0003077', 'BFI-0003451',
       'BFI-0003122', 'BFI-0009824', 'BFI-0003091', 'BFI-0009056',
       'BFI-0009154', 'BFI-0003161', 'BFI-0002877', 'BFI-0003145',
       'BFI-0003477', 'BFI-0010016', 'BFI-0009804', 'BFI-0003727',
       'BFI-0002855', 'BFI-0010049', 'BFI-0003481', 'BFI-0010200',
       'BFI-0000258', 'BFI-0003092', 'BFI-0009820', 'BFI-0003482',
       'BFI-0003137', 'BFI-0003100', 'BFI-0009830', 'BFI-0010050',
       'BFI-0009159', 'BFI-0003062', 'BFI-0003473', 'BFI-0009837',
       'BFI-0010201', 'BFI-0002861', 'BFI-0003088', 'BFI-0009131',
       'BFI-0003736', 'BFI-0003471', 'BFI-0010027', 'BFI-00102

In [12]:
def clone_and_refit_with_same_lambda(clf, X, y, groups, feature_to_drop=None):
    """Refit an unfitted copy of `clf` at the regularization strength `clf` predicts with.

    Parameters
    ----------
    clf : fitted pipeline-like object exposing `.steps`; its last step must carry
        `_lambda_for_prediction_`.
    X : pandas DataFrame of features.
    y : labels, forwarded unchanged to the fit.
    groups : group labels, forwarded as `train_groups`.
    feature_to_drop : column label or list of labels removed from `X` before
        fitting; `None` (default) keeps every column.

    Returns
    -------
    (refit classifier, lambda used, feature matrix actually fitted on)

    Raises
    ------
    AssertionError if the refit's `lambda_path_` is not exactly the single requested lambda.
    """
    # Feature matrix for the refit: X itself unless columns were requested for removal
    X_fit = X if feature_to_drop is None else X.drop(feature_to_drop, axis="columns")

    # Unfitted copy; the original classifier is left as it was
    refit = sklearn.base.clone(clf)

    # Pin the copy's final step to the original's lambda: a one-element lambda path,
    # and the CV-related settings (internal_cv, n_splits) turned off.
    desired_lambda = clf.steps[-1][1]._lambda_for_prediction_
    final_step = refit.steps[-1][1]
    final_step._inner.lambda_path = np.array([desired_lambda])
    final_step.internal_cv = None
    final_step._inner.n_splits = 0
    final_step._inner.n_lambda = 1

    # Fit through crosseval so that groups are properly passed to the final step of the pipeline
    refit, _ = crosseval.train_classifier(
        clf=refit, X_train=X_fit, y_train=y, train_groups=groups
    )

    # Post-condition: exactly one lambda was fit, and it is the requested one
    fitted_final_step = refit.steps[-1][1]
    assert len(fitted_final_step.lambda_path_) == 1
    assert np.allclose(desired_lambda, fitted_final_step.lambda_path_)

    return refit, desired_lambda, X_fit

In [13]:
def log_likelihood(model, X, y, chosen_lambda):
    """Total (un-normalized) log-likelihood of the labels `y` under a fitted multinomial model.

    Class probabilities are requested from `model.predict_proba` at `chosen_lambda`
    (passed as `lamb`). The result is the negated, summed log loss, so values
    closer to zero indicate a better fit. Columns of the probability matrix are
    matched to labels through `model.classes_`.
    """
    class_probabilities = model.predict_proba(X, lamb=chosen_lambda)
    # normalize=False gives the sum over samples instead of the mean
    summed_log_loss = sklearn.metrics.log_loss(
        y_true=y,
        y_pred=class_probabilities,
        labels=model.classes_,
        normalize=False,
        sample_weight=None,
    )
    return -1 * summed_log_loss

In [14]:
# sanity check: should be (near-)identical outputs if refit with same data
# No columns are dropped here: an unfitted clone of clf._inner is refit on the
# full feature matrix at the loaded model's lambda.
clf_tmp, chosen_lambda, _ = clone_and_refit_with_same_lambda(
    clf._inner, X, y, groups, feature_to_drop=None
)
# The refit's class probabilities must agree with the loaded model's to within atol=1e-3
assert np.allclose(
    clf_tmp.predict_proba(X, lamb=chosen_lambda), clf.predict_proba(X), atol=1e-3
)



In [15]:
# Remove interaction terms between sequence features and demographic features.
# Interaction columns are identified by their "interaction" name prefix; everything else is kept.
X_trim = X[X.columns[~X.columns.str.startswith("interaction")]].copy()
# Show the remaining column names
X_trim.columns

Index(['BCR:repertoire_stats:Covid19', 'BCR:repertoire_stats:HIV',
       'BCR:repertoire_stats:Healthy/Background', 'BCR:repertoire_stats:Lupus',
       'BCR:convergent_cluster_model:Covid19',
       'BCR:convergent_cluster_model:HIV',
       'BCR:convergent_cluster_model:Healthy/Background',
       'BCR:convergent_cluster_model:Lupus', 'BCR:sequence_model:Covid19',
       'BCR:sequence_model:HIV', 'BCR:sequence_model:Healthy/Background',
       'BCR:sequence_model:Lupus', 'TCR:repertoire_stats:Covid19',
       'TCR:repertoire_stats:HIV', 'TCR:repertoire_stats:Healthy/Background',
       'TCR:repertoire_stats:Lupus', 'TCR:convergent_cluster_model:Covid19',
       'TCR:convergent_cluster_model:HIV',
       'TCR:convergent_cluster_model:Healthy/Background',
       'TCR:convergent_cluster_model:Lupus', 'TCR:sequence_model:Covid19',
       'TCR:sequence_model:HIV', 'TCR:sequence_model:Healthy/Background',
       'TCR:sequence_model:Lupus', 'demographics:age', 'demographics:sex_F',
       

In [16]:
# (rows, columns) of the feature matrix without interaction columns
X_trim.shape

(120, 30)

In [17]:
# Preview the feature matrix without interaction columns
X_trim

Unnamed: 0_level_0,BCR:repertoire_stats:Covid19,BCR:repertoire_stats:HIV,BCR:repertoire_stats:Healthy/Background,BCR:repertoire_stats:Lupus,BCR:convergent_cluster_model:Covid19,BCR:convergent_cluster_model:HIV,BCR:convergent_cluster_model:Healthy/Background,BCR:convergent_cluster_model:Lupus,BCR:sequence_model:Covid19,BCR:sequence_model:HIV,...,TCR:sequence_model:Covid19,TCR:sequence_model:HIV,TCR:sequence_model:Healthy/Background,TCR:sequence_model:Lupus,demographics:age,demographics:sex_F,demographics:ethnicity_condensed_Asian,demographics:ethnicity_condensed_African,demographics:ethnicity_condensed_Hispanic/Latino,demographics:ethnicity_condensed_Caucasian
specimen_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
M418-S054,0.817439,0.000023,0.000490,0.182049,0.738322,0.048219,0.140620,0.072839,0.58,0.02,...,0.19,0.04,0.46,0.11,67.0,0,1,0,0,0
M456-S008,0.267772,0.011225,0.180243,0.540760,0.050087,0.224689,0.184959,0.540265,0.00,0.10,...,0.31,0.06,0.35,0.09,49.0,1,0,1,0,0
M456-S009,0.064413,0.049449,0.045609,0.840529,0.054670,0.171795,0.220632,0.552903,0.01,0.07,...,0.03,0.20,0.44,0.35,42.0,1,0,1,0,0
M464-S031,0.071249,0.004770,0.898645,0.025337,0.229246,0.111376,0.451881,0.207496,0.01,0.21,...,0.48,0.00,0.30,0.14,11.0,1,1,0,0,0
M124-S055,0.000193,0.998025,0.001107,0.000675,0.252200,0.258253,0.286789,0.202757,0.07,0.77,...,0.13,0.49,0.24,0.13,22.0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
M64-031,0.012513,0.000858,0.983899,0.002730,0.163230,0.101347,0.535676,0.199748,0.05,0.01,...,0.44,0.02,0.55,0.02,55.0,1,0,0,0,1
M64-097,0.064282,0.000663,0.922980,0.012076,0.008261,0.020645,0.900852,0.070241,0.01,0.01,...,0.01,0.00,0.99,0.00,26.0,1,0,0,0,1
M64-072,0.005384,0.000797,0.976841,0.016978,0.114882,0.082880,0.621070,0.181168,0.01,0.03,...,0.18,0.00,0.81,0.10,60.0,0,0,0,0,1
M111-S030,0.004991,0.909756,0.029952,0.055301,0.222280,0.233147,0.359539,0.185035,0.01,0.54,...,0.04,0.45,0.51,0.02,21.0,1,0,1,0,0


In [18]:
# Number of rows per label in y (class balance of the data the models are fit on)
pd.Series(y).value_counts()

Healthy/Background    56
HIV                   32
Lupus                 21
Covid19               11
dtype: int64

In [19]:
# Train original full model, without interaction terms
# An unfitted clone of clf._inner is fit on X_trim; this is the "full" model that
# the reduced models on X_trim are compared against.
# NOTE(review): no lambda is pinned for this fit, so presumably the estimator
# selects it itself (e.g. via its internal CV) -- confirm.
clf_orig, _ = crosseval.train_classifier(
    clf=sklearn.base.clone(clf._inner), X_train=X_trim, y_train=y, train_groups=groups
)
# Bare expression: render the fitted model
clf_orig

In [22]:
# Likelihood ratio tests on the model without interaction terms: drop age, sex, or
# ancestry, refit at the full model's lambda each time, and compare log likelihoods
# against clf_orig.
results = []
for to_drop in [
    ["demographics:age"],
    ["demographics:sex_F"],
    # ancestry is a set of dummy variables, so its columns are removed together
    [
        column
        for column in X_trim.columns
        if column.startswith("demographics:ethnicity_condensed")
    ],
]:
    reduced_model, chosen_lambda, X_reduced = clone_and_refit_with_same_lambda(
        clf_orig, X_trim, y, groups, feature_to_drop=to_drop
    )
    # Confirm every requested column is really absent from the refit's inputs
    for f in to_drop:
        assert f not in X_reduced.columns

    # Both models are scored at the same lambda, on the data they were fit on
    ll_full = log_likelihood(clf_orig, X_trim, y, chosen_lambda)
    ll_reduced = log_likelihood(reduced_model, X_reduced, y, chosen_lambda)

    # Likelihood ratio statistic
    likelihood_ratio = 2 * (ll_full - ll_reduced)
    # Degrees of freedom are taken to be the number of removed feature columns.
    # NOTE(review): the model is multinomial (4 classes in y), so each removed
    # column carries one coefficient per class; a classical LRT would use
    # len(to_drop) * (n_classes - 1). The fits are also elastic-net penalized,
    # so the chi-squared reference distribution is only approximate.
    # Confirm that dof = len(to_drop) is intended.
    dof = len(to_drop)
    # Upper-tail probability of the chi-squared distribution at the statistic
    p_value = chi2.sf(likelihood_ratio, df=dof)

    results.append(
        {
            "p_value": p_value,
            "likelihood_ratio": likelihood_ratio,
            "to_drop": to_drop,
            "ll_full": ll_full,
            "ll_reduced": ll_reduced,
            "dof": dof,
        }
    )


pd.DataFrame(results)



Unnamed: 0,p_value,likelihood_ratio,to_drop,ll_full,ll_reduced,dof
0,0.054278,3.704124,[demographics:age],-19.290238,-21.1423,1
1,0.067151,3.351327,[demographics:sex_F],-19.290238,-20.965901,1
2,0.986162,0.352708,"[demographics:ethnicity_condensed_Asian, demog...",-19.290238,-19.466592,4


In [29]:
# Try again, but now keep the interaction terms. Drop each demographic feature along with all the associated interaction terms.
results_with_interaction_terms = []
for to_drop in [
    # Substring match, so each demographic column AND every interaction column
    # built from it is selected.
    [column for column in X.columns if "demographics:age" in column],
    [column for column in X.columns if "demographics:sex_F" in column],
    [column for column in X.columns if "demographics:ethnicity_condensed" in column],
]:
    reduced_model, chosen_lambda, X_reduced = clone_and_refit_with_same_lambda(
        clf._inner, X, y, groups, feature_to_drop=to_drop
    )
    # Confirm every requested column is really absent from the refit's inputs
    for f in to_drop:
        assert f not in X_reduced.columns

    # Here the full-model likelihood comes from the disk-loaded `clf` itself (not a
    # refit); the reduced model is a refit clone of clf._inner at the same lambda.
    ll_full = log_likelihood(clf, X, y, chosen_lambda)
    ll_reduced = log_likelihood(reduced_model, X_reduced, y, chosen_lambda)

    # Likelihood ratio statistic
    likelihood_ratio = 2 * (ll_full - ll_reduced)
    # Degrees of freedom are taken to be the number of removed feature columns.
    # NOTE(review): the model is multinomial (4 classes in y), so each removed
    # column carries one coefficient per class; a classical LRT would use
    # len(to_drop) * (n_classes - 1). The fits are also elastic-net penalized,
    # so the chi-squared reference distribution is only approximate.
    # Confirm that dof = len(to_drop) is intended.
    dof = len(to_drop)
    # Upper-tail probability of the chi-squared distribution at the statistic
    p_value = chi2.sf(likelihood_ratio, df=dof)

    results_with_interaction_terms.append(
        {
            "p_value": p_value,
            "likelihood_ratio": likelihood_ratio,
            "to_drop": to_drop,
            "ll_full": ll_full,
            "ll_reduced": ll_reduced,
            "dof": dof,
        }
    )


pd.DataFrame(results_with_interaction_terms)



Unnamed: 0,p_value,likelihood_ratio,to_drop,ll_full,ll_reduced,dof
0,1.0,0.240088,"[demographics:age, interaction|BCR:repertoire_...",-1.372914,-1.492958,25
1,1.0,0.604577,"[demographics:sex_F, interaction|BCR:repertoir...",-1.372914,-1.675202,25
2,1.0,1.541819,"[demographics:ethnicity_condensed_Asian, demog...",-1.372914,-2.143823,100
