In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
import os

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score

# Train Simple Models to Predict the Compound

We train a simple logistic regression model to measure how well the model is able to predict the compound belonging to a profile.

## 1 Data Reading

In [2]:
# data_path = "../../data/jump/"
data_path = "/mnt/d/Uni/Master_2.Semester/mp/jump/"

In [3]:
data_file = f"{data_path}jump_target2_spherized.h5ad"
if not os.path.exists(data_file):
    raise FileNotFoundError("Please download the JUMP data and run pycytominer on it!")

In [12]:
adata = sc.read_h5ad(data_file)
adata



AnnData object with n_obs × n_vars = 64464 × 558
    obs: 'Metadata_JCP2022', 'Metadata_InChIKey', 'Metadata_InChI', 'Metadata_Source', 'Metadata_Plate', 'Metadata_Well', 'Metadata_PlateType', 'Metadata_WellType', 'Metadata_Microscope_Name', 'Metadata_Widefield_vs_Confocal', 'Metadata_Excitation_Type', 'Metadata_Objective_NA', 'Metadata_N_Brightfield_Planes_Min', 'Metadata_N_Brightfield_Planes_Max', 'Metadata_Sites_Per_Well', 'Metadata_Filter_Configuration', 'Metadata_Excitation_Low_DNA', 'Metadata_Excitation_Low_ER', 'Metadata_Excitation_Low_RNA', 'Metadata_Excitation_Low_AGP', 'Metadata_Excitation_Low_Mito', 'Metadata_Excitation_High_DNA', 'Metadata_Excitation_High_ER', 'Metadata_Excitation_High_RNA', 'Metadata_Excitation_High_AGP', 'Metadata_Excitation_High_Mito', 'Metadata_Emission_Low_DNA', 'Metadata_Emission_Low_ER', 'Metadata_Emission_Low_RNA', 'Metadata_Emission_Low_AGP', 'Metadata_Emission_Low_Mito', 'Metadata_Emission_High_DNA', 'Metadata_Emission_High_ER', 'Metadata_Emission

**Low-Level Integrations**

In [28]:
harmony_low = sc.read_h5ad(f"{data_path}harmony_low.h5ad")
scanorama_low = sc.read_h5ad(f"{data_path}scanorama_low.h5ad")
scvi_low = sc.read_h5ad(f"{data_path}scvi_low.h5ad")
scanvi_low = sc.read_h5ad(f"{data_path}scanvi_low.h5ad")
scgen_low = sc.read_h5ad(f"{data_path}scgen_low.h5ad")

**High-Level Integrations**

In [29]:
harmony_high = sc.read_h5ad(f"{data_path}harmony_high.h5ad")
scanorama_high = sc.read_h5ad(f"{data_path}scanorama_high.h5ad")
scvi_high = sc.read_h5ad(f"{data_path}scvi_high.h5ad")
scanvi_high = sc.read_h5ad(f"{data_path}scanvi_high.h5ad")
scgen_high = sc.read_h5ad(f"{data_path}scgen_high.h5ad")
gaushanvi_high = sc.read_h5ad(f"{data_path}gaushanvi_high.h5ad")

**Direct Intgerations**

In [16]:
harmony = sc.read_h5ad(f"{data_path}harmony.h5ad")
scanorama = sc.read_h5ad(f"{data_path}scanorama.h5ad")
scvi = sc.read_h5ad(f"{data_path}scvi.h5ad")
scanvi = sc.read_h5ad(f"{data_path}scanvi.h5ad")
scgen = sc.read_h5ad(f"{data_path}scgen.h5ad")
gaushvi = sc.read_h5ad(f"{data_path}gaushvi.h5ad")
gaushanvi = sc.read_h5ad(f"{data_path}gaushanvi.h5ad")

## 2 Predicting Compounds with Logistic Regression

In [8]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=68)

**Unintegrated Data**

Per source and overall

In [20]:
# Per source
sources = adata.obs.Metadata_Source.unique()
accuracies = []

for source in sources:
    print(f"Starting with {source}")
    adata_source = adata[adata.obs.Metadata_Source==source].copy()
    train, test = [x for x in sss.split(adata_source.X, adata_source.obs.Metadata_JCP2022)][0]
    x_train = adata_source.X[train]
    y_train = adata_source.obs.Metadata_JCP2022[train]
    x_test = adata_source.X[test]
    y_test = adata_source.obs.Metadata_JCP2022[test]
    model_unintegrated_source = LogisticRegression(random_state=52).fit(x_train, y_train)
    y_test_pred = model_unintegrated_source.predict(x_test)
    accuracies.append(balanced_accuracy_score(y_test, y_test_pred))

Starting with source_10


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_11


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_13


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_3


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_4


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_6


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_7


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_8


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_9


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
for s, a in zip(sources, accuracies):
    print(f"{s}: balanced accuracy {a}")
print(f"Mean: {np.mean(accuracies)}")

source_10: balanced accuracy 0.02580917978268971
source_11: balanced accuracy 0.045732155997056656
source_13: balanced accuracy 0.025305551068755722
source_2: balanced accuracy 0.022118579095323283
source_3: balanced accuracy 0.11097889072847682
source_4: balanced accuracy 0.09651261096237847
source_5: balanced accuracy 0.06832157674856729
source_6: balanced accuracy 0.0758988468334918
source_7: balanced accuracy 0.030831493745401027
source_8: balanced accuracy 0.023103815169478646
source_9: balanced accuracy 0.3235229311733025
Mean: 0.07710323920953836


In [25]:
# Overall
train, test = [x for x in sss.split(adata.X, adata.obs.Metadata_JCP2022)][0]
x_train = adata.X[train]
y_train = adata.obs.Metadata_JCP2022[train]
x_test = adata.X[test]
y_test = adata.obs.Metadata_JCP2022[test]

In [26]:
model_unintegrated_overall = LogisticRegression(random_state=52).fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
y_test_pred = model_unintegrated_overall.predict(x_test)
print("Balanced Accuracy: " + str(balanced_accuracy_score(y_test, y_test_pred)))

Balanced Accuracy: 0.07905590727379076


**Low-Level Integrations**

Per source

In [33]:
# Harmony
sources = harmony_low.obs.Metadata_Source.unique()
accuracies = []

for source in sources:
    print(f"Starting with {source}")
    adata_source = harmony_low[harmony_low.obs.Metadata_Source==source].copy()
    train, test = [x for x in sss.split(adata_source.obsm["X_emb"], adata_source.obs.Metadata_JCP2022)][0]
    x_train = adata_source.obsm["X_emb"][train]
    y_train = adata_source.obs.Metadata_JCP2022[train]
    x_test = adata_source.obsm["X_emb"][test]
    y_test = adata_source.obs.Metadata_JCP2022[test]
    model_integrated_source = LogisticRegression(random_state=52).fit(x_train, y_train)
    y_test_pred = model_integrated_source.predict(x_test)
    accuracies.append(balanced_accuracy_score(y_test, y_test_pred))

Starting with source_10


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_11


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_13


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_3


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_4


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_6


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_7


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_8


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_9


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [34]:
print("Harmony low level integration:")
for s, a in zip(sources, accuracies):
    print(f"{s}: balanced accuracy {a}")
print(f"Mean: {np.mean(accuracies)}")

Harmony low level integration:
source_10: balanced accuracy 0.04410710702101431
source_11: balanced accuracy 0.08870493009565857
source_13: balanced accuracy 0.06444839282829913
source_2: balanced accuracy 0.05813953488372093
source_3: balanced accuracy 0.22845612582781458
source_4: balanced accuracy 0.19911817199755766
source_5: balanced accuracy 0.08227080651849296
source_6: balanced accuracy 0.17787136849228985
source_7: balanced accuracy 0.0455849889624724
source_8: balanced accuracy 0.01937984496124031
source_9: balanced accuracy 0.1699078587059653
Mean: 0.107089920935866


In [35]:
# Scanorama
sources = scanorama_low.obs.Metadata_Source.unique()
accuracies = []

for source in sources:
    print(f"Starting with {source}")
    adata_source = scanorama_low[scanorama_low.obs.Metadata_Source==source].copy()
    train, test = [x for x in sss.split(adata_source.obsm["X_emb"], adata_source.obs.Metadata_JCP2022)][0]
    x_train = adata_source.obsm["X_emb"][train]
    y_train = adata_source.obs.Metadata_JCP2022[train]
    x_test = adata_source.obsm["X_emb"][test]
    y_test = adata_source.obs.Metadata_JCP2022[test]
    model_integrated_source = LogisticRegression(random_state=52).fit(x_train, y_train)
    y_test_pred = model_integrated_source.predict(x_test)
    accuracies.append(balanced_accuracy_score(y_test, y_test_pred))

Starting with source_10
Starting with source_11
Starting with source_13
Starting with source_2
Starting with source_3
Starting with source_4
Starting with source_5
Starting with source_6
Starting with source_7
Starting with source_8
Starting with source_9


In [36]:
print("Scanorama low level integration:")
for s, a in zip(sources, accuracies):
    print(f"{s}: balanced accuracy {a}")
print(f"Mean: {np.mean(accuracies)}")

Scanorama low level integration:
source_10: balanced accuracy 0.0033112582781456954
source_11: balanced accuracy 0.023178807947019868
source_13: balanced accuracy 0.005518763796909492
source_2: balanced accuracy 0.026578073089700997
source_3: balanced accuracy 0.06688741721854305
source_4: balanced accuracy 0.03780353200883002
source_5: balanced accuracy 0.03377483443708609
source_6: balanced accuracy 0.055439406390486785
source_7: balanced accuracy 0.017660044150110375
source_8: balanced accuracy 0.003875968992248062
source_9: balanced accuracy 0.030054577102277893
Mean: 0.02764388031012348


In [37]:
# scGen
sources = scgen_low.obs.Metadata_Source.unique()
accuracies = []

for source in sources:
    print(f"Starting with {source}")
    adata_source = scgen_low[scgen_low.obs.Metadata_Source==source].copy()
    train, test = [x for x in sss.split(adata_source.obsm["corrected_latent"], adata_source.obs.Metadata_JCP2022)][0]
    x_train = adata_source.obsm["corrected_latent"][train]
    y_train = adata_source.obs.Metadata_JCP2022[train]
    x_test = adata_source.obsm["corrected_latent"][test]
    y_test = adata_source.obs.Metadata_JCP2022[test]
    model_integrated_source = LogisticRegression(random_state=52).fit(x_train, y_train)
    y_test_pred = model_integrated_source.predict(x_test)
    accuracies.append(balanced_accuracy_score(y_test, y_test_pred))

Starting with source_10


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_11


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_13


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_3


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_4


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_6


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_7


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_8


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_9


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [38]:
print("scGen low level integration:")
for s, a in zip(sources, accuracies):
    print(f"{s}: balanced accuracy {a}")
print(f"Mean: {np.mean(accuracies)}")

scGen low level integration:
source_10: balanced accuracy 0.5
source_11: balanced accuracy 0.3079470198675497
source_13: balanced accuracy 0.18543046357615894
source_2: balanced accuracy 0.26578073089701
source_3: balanced accuracy 0.9079470198675497
source_4: balanced accuracy 0.937430526200429
source_5: balanced accuracy 0.5927152317880795
source_6: balanced accuracy 0.5820456217807211
source_7: balanced accuracy 0.4713024282560706
source_8: balanced accuracy 0.17054263565891473
source_9: balanced accuracy 0.024462318147583408
Mean: 0.4496003632763697


In [39]:
# scVI
sources = scvi_low.obs.Metadata_Source.unique()
accuracies = []

for source in sources:
    print(f"Starting with {source}")
    adata_source = scvi_low[scvi_low.obs.Metadata_Source==source].copy()
    train, test = [x for x in sss.split(adata_source.obsm["X_emb"], adata_source.obs.Metadata_JCP2022)][0]
    x_train = adata_source.obsm["X_emb"][train]
    y_train = adata_source.obs.Metadata_JCP2022[train]
    x_test = adata_source.obsm["X_emb"][test]
    y_test = adata_source.obs.Metadata_JCP2022[test]
    model_integrated_source = LogisticRegression(random_state=52).fit(x_train, y_train)
    y_test_pred = model_integrated_source.predict(x_test)
    accuracies.append(balanced_accuracy_score(y_test, y_test_pred))

Starting with source_10
Starting with source_11
Starting with source_13
Starting with source_2
Starting with source_3
Starting with source_4
Starting with source_5
Starting with source_6
Starting with source_7
Starting with source_8
Starting with source_9


In [40]:
print("scVI low level integration:")
for s, a in zip(sources, accuracies):
    print(f"{s}: balanced accuracy {a}")
print(f"Mean: {np.mean(accuracies)}")

scVI low level integration:
source_10: balanced accuracy 0.0033112582781456954
source_11: balanced accuracy 0.0033112582781456954
source_13: balanced accuracy 0.0033112582781456954
source_2: balanced accuracy 0.0033222591362126247
source_3: balanced accuracy 0.0033112582781456954
source_4: balanced accuracy 0.0033112582781456954
source_5: balanced accuracy 0.0033112582781456954
source_6: balanced accuracy 0.0033112582781456954
source_7: balanced accuracy 0.0033112582781456954
source_8: balanced accuracy 0.003875968992248062
source_9: balanced accuracy 0.0033112582781456954
Mean: 0.0033635956937974497


In [41]:
# scanVI
sources = scanvi_low.obs.Metadata_Source.unique()
accuracies = []

for source in sources:
    print(f"Starting with {source}")
    adata_source = scanvi_low[scanvi_low.obs.Metadata_Source==source].copy()
    train, test = [x for x in sss.split(adata_source.obsm["X_emb"], adata_source.obs.Metadata_JCP2022)][0]
    x_train = adata_source.obsm["X_emb"][train]
    y_train = adata_source.obs.Metadata_JCP2022[train]
    x_test = adata_source.obsm["X_emb"][test]
    y_test = adata_source.obs.Metadata_JCP2022[test]
    model_integrated_source = LogisticRegression(random_state=52).fit(x_train, y_train)
    y_test_pred = model_integrated_source.predict(x_test)
    accuracies.append(balanced_accuracy_score(y_test, y_test_pred))

Starting with source_2
Starting with source_3


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_4


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Starting with source_6
Starting with source_7
Starting with source_8




Starting with source_9
Starting with source_10
Starting with source_11
Starting with source_13


In [42]:
print("scanVI low level integration:")
for s, a in zip(sources, accuracies):
    print(f"{s}: balanced accuracy {a}")
print(f"Mean: {np.mean(accuracies)}")

scanVI low level integration:
source_2: balanced accuracy 0.054817275747508304
source_3: balanced accuracy 0.252317880794702
source_4: balanced accuracy 0.1979580573951435
source_5: balanced accuracy 0.12891832229580574
source_6: balanced accuracy 0.19641280353200882
source_7: balanced accuracy 0.14183222958057393
source_8: balanced accuracy 0.05426356589147287
source_9: balanced accuracy 0.08736203090507726
source_10: balanced accuracy 0.04856512141280353
source_11: balanced accuracy 0.10651214128035319
source_13: balanced accuracy 0.08057395143487858
Mean: 0.12268485275184798


**High-Level Integrations**

Overall

In [43]:
# Harmony
train, test = [x for x in sss.split(harmony_high.obsm["X_emb"], harmony_high.obs.Metadata_JCP2022)][0]
x_train = harmony_high.obsm["X_emb"][train]
y_train = harmony_high.obs.Metadata_JCP2022[train]
x_test = harmony_high.obsm["X_emb"][test]
y_test = harmony_high.obs.Metadata_JCP2022[test]

model_integrated_overall = LogisticRegression(random_state=52).fit(x_train, y_train)
y_test_pred = model_integrated_overall.predict(x_test)
print("Harmony: balanced Accuracy = " + str(balanced_accuracy_score(y_test, y_test_pred)))

Harmony: balanced Accuracy = 0.03930124612340201


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [44]:
# Scanorama
train, test = [x for x in sss.split(scanorama_high.obsm["X_emb"], scanorama_high.obs.Metadata_JCP2022)][0]
x_train = scanorama_high.obsm["X_emb"][train]
y_train = scanorama_high.obs.Metadata_JCP2022[train]
x_test = scanorama_high.obsm["X_emb"][test]
y_test = scanorama_high.obs.Metadata_JCP2022[test]

model_integrated_overall = LogisticRegression(random_state=52).fit(x_train, y_train)
y_test_pred = model_integrated_overall.predict(x_test)
print("Scanorama: balanced Accuracy = " + str(balanced_accuracy_score(y_test, y_test_pred)))

Scanorama: balanced Accuracy = 0.02424547419206523


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [46]:
# scGen
train, test = [x for x in sss.split(scgen_high.obsm["corrected_latent"], scgen_high.obs.Metadata_JCP2022)][0]
x_train = scgen_high.obsm["corrected_latent"][train]
y_train = scgen_high.obs.Metadata_JCP2022[train]
x_test = scgen_high.obsm["corrected_latent"][test]
y_test = scgen_high.obs.Metadata_JCP2022[test]

model_integrated_overall = LogisticRegression(random_state=52).fit(x_train, y_train)
y_test_pred = model_integrated_overall.predict(x_test)
print("scGen: balanced Accuracy = " + str(balanced_accuracy_score(y_test, y_test_pred)))

scGen: balanced Accuracy = 0.009589011770837354


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [47]:
# scVI
train, test = [x for x in sss.split(scvi_high.obsm["X_emb"], scvi_high.obs.Metadata_JCP2022)][0]
x_train = scvi_high.obsm["X_emb"][train]
y_train = scvi_high.obs.Metadata_JCP2022[train]
x_test = scvi_high.obsm["X_emb"][test]
y_test = scvi_high.obs.Metadata_JCP2022[test]

model_integrated_overall = LogisticRegression(random_state=52).fit(x_train, y_train)
y_test_pred = model_integrated_overall.predict(x_test)
print("scVI: balanced Accuracy = " + str(balanced_accuracy_score(y_test, y_test_pred)))

scVI: balanced Accuracy = 0.0033112582781456954


In [48]:
# scanVI
train, test = [x for x in sss.split(scanvi_high.obsm["X_emb"], scanvi_high.obs.Metadata_JCP2022)][0]
x_train = scanvi_high.obsm["X_emb"][train]
y_train = scanvi_high.obs.Metadata_JCP2022[train]
x_test = scanvi_high.obsm["X_emb"][test]
y_test = scanvi_high.obs.Metadata_JCP2022[test]

model_integrated_overall = LogisticRegression(random_state=52).fit(x_train, y_train)
y_test_pred = model_integrated_overall.predict(x_test)
print("scanVI: balanced Accuracy = " + str(balanced_accuracy_score(y_test, y_test_pred)))

scanVI: balanced Accuracy = 0.10691896502030832


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
# gaushANVI
train, test = [x for x in sss.split(gaushanvi_high.obsm["X_emb"], gaushanvi_high.obs.Metadata_JCP2022)][0]
x_train = gaushanvi_high.obsm["X_emb"][train]
y_train = gaushanvi_high.obs.Metadata_JCP2022[train]
x_test = gaushanvi_high.obsm["X_emb"][test]
y_test = gaushanvi_high.obs.Metadata_JCP2022[test]

model_integrated_overall = LogisticRegression(random_state=52).fit(x_train, y_train)
y_test_pred = model_integrated_overall.predict(x_test)
print("gaushANVI: balanced Accuracy = " + str(balanced_accuracy_score(y_test, y_test_pred)))

gaushANVI: balanced Accuracy = 0.011259220781681756


**Direct Integrations**

Overall

In [51]:
# Harmony
train, test = [x for x in sss.split(harmony.obsm["X_emb"], harmony.obs.Metadata_JCP2022)][0]
x_train = harmony.obsm["X_emb"][train]
y_train = harmony.obs.Metadata_JCP2022[train]
x_test = harmony.obsm["X_emb"][test]
y_test = harmony.obs.Metadata_JCP2022[test]

model_integrated_overall = LogisticRegression(random_state=52).fit(x_train, y_train)
y_test_pred = model_integrated_overall.predict(x_test)
print("Harmony: balanced Accuracy = " + str(balanced_accuracy_score(y_test, y_test_pred)))

Harmony: balanced Accuracy = 0.019557492102904327


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [52]:
# Scanorama
train, test = [x for x in sss.split(scanorama.obsm["X_emb"], scanorama.obs.Metadata_JCP2022)][0]
x_train = scanorama.obsm["X_emb"][train]
y_train = scanorama.obs.Metadata_JCP2022[train]
x_test = scanorama.obsm["X_emb"][test]
y_test = scanorama.obs.Metadata_JCP2022[test]

model_integrated_overall = LogisticRegression(random_state=52).fit(x_train, y_train)
y_test_pred = model_integrated_overall.predict(x_test)
print("Scanorama: balanced Accuracy = " + str(balanced_accuracy_score(y_test, y_test_pred)))

Scanorama: balanced Accuracy = 0.014330616370151534


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [53]:
# scGen
train, test = [x for x in sss.split(scgen.obsm["corrected_latent"], scgen.obs.Metadata_JCP2022)][0]
x_train = scgen.obsm["corrected_latent"][train]
y_train = scgen.obs.Metadata_JCP2022[train]
x_test = scgen.obsm["corrected_latent"][test]
y_test = scgen.obs.Metadata_JCP2022[test]

model_integrated_overall = LogisticRegression(random_state=52).fit(x_train, y_train)
y_test_pred = model_integrated_overall.predict(x_test)
print("scGen: balanced Accuracy = " + str(balanced_accuracy_score(y_test, y_test_pred)))

scGen: balanced Accuracy = 0.023557416396751487


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [54]:
# scVI
train, test = [x for x in sss.split(scvi.obsm["X_emb"], scvi.obs.Metadata_JCP2022)][0]
x_train = scvi.obsm["X_emb"][train]
y_train = scvi.obs.Metadata_JCP2022[train]
x_test = scvi.obsm["X_emb"][test]
y_test = scvi.obs.Metadata_JCP2022[test]

model_integrated_overall = LogisticRegression(random_state=52).fit(x_train, y_train)
y_test_pred = model_integrated_overall.predict(x_test)
print("scVI: balanced Accuracy = " + str(balanced_accuracy_score(y_test, y_test_pred)))

scVI: balanced Accuracy = 0.0033112582781456954


In [55]:
# scanVI
train, test = [x for x in sss.split(scanvi.obsm["X_emb"], scanvi.obs.Metadata_JCP2022)][0]
x_train = scanvi.obsm["X_emb"][train]
y_train = scanvi.obs.Metadata_JCP2022[train]
x_test = scanvi.obsm["X_emb"][test]
y_test = scanvi.obs.Metadata_JCP2022[test]

model_integrated_overall = LogisticRegression(random_state=52).fit(x_train, y_train)
y_test_pred = model_integrated_overall.predict(x_test)
print("scanVI: balanced Accuracy = " + str(balanced_accuracy_score(y_test, y_test_pred)))

scanVI: balanced Accuracy = 0.05313143563189186


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
# gaushVI
gaushvi.obsm["X_emb"] = gaushvi.X
train, test = [x for x in sss.split(gaushvi.obsm["X_emb"], gaushvi.obs.Metadata_JCP2022)][0]
x_train = gaushvi.obsm["X_emb"][train]
y_train = gaushvi.obs.Metadata_JCP2022[train]
x_test = gaushvi.obsm["X_emb"][test]
y_test = gaushvi.obs.Metadata_JCP2022[test]

model_integrated_overall = LogisticRegression(random_state=52).fit(x_train, y_train)
y_test_pred = model_integrated_overall.predict(x_test)
print("gaushVI: balanced Accuracy = " + str(balanced_accuracy_score(y_test, y_test_pred)))

gaushVI: balanced Accuracy = 0.008975810198045973


In [12]:
# gaushANVI
train, test = [x for x in sss.split(gaushanvi.obsm["X_emb"], gaushanvi.obs.Metadata_JCP2022)][0]
x_train = gaushanvi.obsm["X_emb"][train]
y_train = gaushanvi.obs.Metadata_JCP2022[train]
x_test = gaushanvi.obsm["X_emb"][test]
y_test = gaushanvi.obs.Metadata_JCP2022[test]

model_integrated_overall = LogisticRegression(random_state=52).fit(x_train, y_train)
y_test_pred = model_integrated_overall.predict(x_test)
print("gaushANVI: balanced Accuracy = " + str(balanced_accuracy_score(y_test, y_test_pred)))

gaushVI: balanced Accuracy = 0.0064666814969857765
