In [1]:
import os
import glob
import sys
import re
import gc

import numpy as np
import scanpy as sc
import scanpy.external as sce
import anndata as ad
import geosketch
import pandas as pd

import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib as mpl
import squidpy as sq
import torch
from lightning.pytorch import seed_everything
import scvi
import cell2location
from cell2location.models import RegressionModel
from cell2location.utils.filtering import filter_genes

seed_everything(0)
### You are using a CUDA device ('NVIDIA GeForce RTX 3060 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. 
### For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
torch.set_float32_matmul_precision('high')

PYTORCH_CUDA_ALLOC_CONF= {'expandable_segments': True}

  from .autonotebook import tqdm as notebook_tqdm
  doc = func(self, args[0].__doc__, *args[1:], **kwargs)
  doc = func(self, args[0].__doc__, *args[1:], **kwargs)
Seed set to 0


In [2]:
results_folder = '../../data/visium/cell2location/'

# create paths and names to results folders for reference regression and cell2location models
ref_run_name = f'{results_folder}/reference_signatures'
run_name = f'{results_folder}/cell2location_map'

In [3]:
adata_file = f"{ref_run_name}/sc.h5ad"
scrna = sc.read_h5ad(adata_file)
mod = cell2location.models.RegressionModel.load(f"{ref_run_name}", scrna)

[34mINFO    [0m File ..[35m/../data/visium/cell2location/[0m[35m/[0m[95mreference_signatures[0m\model.pt already downloaded                    


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\Paula\miniconda3\envs\rapids\Lib\site-packages\lightning\pytorch\trainer\configuration_validator.py:68: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\Paula\miniconda3\envs\rapids\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Epoch 1/73:   1%|▏         | 1/73 [00:01<02:11,  1.82s/it, v_num=1]

`Trainer.fit` stopped: `max_steps=1` reached.


Epoch 1/73:   1%|▏         | 1/73 [00:01<02:12,  1.83s/it, v_num=1]


In [4]:
# In this section, we export the estimated cell abundance (summary of the posterior distribution).
scrna = mod.export_posterior(scrna)

Sampling local variables, batch:   0%|          | 0/2135 [00:00<?, ?it/s]
Sampling global variables, sample: 100%|██████████| 999/999 [00:09<00:00, 108.05it/s]


In [5]:
# export estimated expression in each cluster
if 'means_per_cluster_mu_fg' in scrna.varm.keys():
    inf_aver = scrna.varm['means_per_cluster_mu_fg'][[f'means_per_cluster_mu_fg_{i}'
                                    for i in scrna.uns['mod']['factor_names']]].copy()
else:
    inf_aver = scrna.var[[f'means_per_cluster_mu_fg_{i}'
                                    for i in scrna.uns['mod']['factor_names']]].copy()
inf_aver.columns = scrna.uns['mod']['factor_names']
inf_aver.iloc[0:5, 0:5]

Unnamed: 0,Bas Inf,Bas KC,Bulge,CCR7+ DC,CD1C+ DC
A1BG,0.720542,0.042461,1.215222,2.504237,1.758144
A1BG-AS1,0.067954,0.000573,0.112629,0.1335,0.067919
A2M,0.020403,0.001543,0.128286,2.543839,0.70841
A2M-AS1,0.004637,0.001752,0.043178,0.027449,0.022011
A2ML1,0.009888,0.005184,0.001393,0.02259,0.002169


In [6]:
visium = sc.read_h5ad("../../data/visium/seurat_objects/pan-skin_merged.visium_data.harmony_integrated.anndata.h5ad")
visium.X = visium.layers['counts']

visium.var['SYMBOL'] = visium.var.index.values.tolist()

# find mitochondria-encoded (MT) genes
visium.var['MT_gene'] = [gene.startswith('MT-') for gene in visium.var['SYMBOL']]

# remove MT genes for spatial mapping
visium = visium[:, ~visium.var['MT_gene'].values]


# find shared genes and subset both anndata and reference signatures
intersect = np.intersect1d(visium.var_names, inf_aver.index)
visium = visium[:, intersect]
inf_aver = inf_aver.loc[intersect, :]

In [7]:
torch.cuda.empty_cache()
gc.collect()

31848

In [8]:
visium
np.shape(inf_aver)

(14757, 41)

In [9]:
sample_ids = visium.obs['sample_id'].unique().to_list()
res_list = []
sample_ids

  sample_ids = visium.obs['sample_id'].unique().to_list()


['ThraneJID2023_P03_NS_S01_R01_V10F24-006_A1',
 'ThraneJID2023_P03_NS_S01_R02_V10F24-006_B1',
 'ThraneJID2023_P10_NS_S01_R01_V10F24-007_A1',
 'ThraneJID2023_P10_NS_S01_R02_V10F24-007_B1',
 'ThraneJID2023_P10_NS_S02_R01_V10F24-005_A1',
 'ThraneJID2023_P10_NS_S02_R02_V10F24-005_B1',
 'JiCell2020_SCC_T20_V1',
 'JiCell2020_SCC_T20_V2',
 'JiCell2020_SCC_T28_V3',
 'JiCell2020_SCC_T28_V4',
 'BergenstrahleNatBiotech2022_SCC_T10_V10F24.015_A1',
 'BergenstrahleNatBiotech2022_SCC_T10_V10F24.015_B1',
 'BergenstrahleNatBiotech2022_SCC_T10_V10F24.015_C1',
 'BergenstrahleNatBiotech2022_SCC_T10_V10F24.015_D1',
 'GanierPNAS2024_face_temple1a',
 'GanierPNAS2024_face_temple1b',
 'GanierPNAS2024_face_temple2a',
 'GanierPNAS2024_face_temple2b',
 'GanierPNAS2024_face_nose1a',
 'GanierPNAS2024_face_glabella1',
 'GanierPNAS2024_face_forehead1a',
 'GanierPNAS2024_face_cheek1',
 'GanierPNAS2024_body_back1a',
 'GanierPNAS2024_body_inguinal1a',
 'GanierPNAS2024_body_abdomen1b',
 'GanierPNAS2024_body_thigh1b',
 'G

In [10]:
for sample_id in sample_ids:
    adata_file = f"{run_name}/{sample_id}/sp.h5ad"

    # if it already ran, load in the results, 
    # if not, run it and save the results
    if os.path.exists(adata_file):
        sample_obj = sc.read_h5ad(adata_file)
    else: 
        sample_obj = visium[visium.obs['sample_id'] == sample_id]
        cell2location.models.Cell2location.setup_anndata(adata=sample_obj, batch_key="sample_id")
        # create and train the model
        mod = cell2location.models.Cell2location(sample_obj, cell_state_df=inf_aver, N_cells_per_location=10, detection_alpha=20)

        mod.train(max_epochs=30000, batch_size=None, train_size=1)

        # In this section, we export the estimated cell abundance (summary of the posterior distribution).
        sample_obj = mod.export_posterior(sample_obj, sample_kwargs={'num_samples': 1000, 'batch_size': mod.adata.n_obs})

        # Save model
        mod.save(f'{run_name}/{sample_id}', overwrite=True)

        # Save anndata object with results
        sample_obj.write(adata_file)

        torch.cuda.empty_cache()
        gc.collect()
        

    # add 5% quantile, representing confident cell abundance, 'at least this amount is present',
    # to adata.obs with nice names for plotting
    sample_obj.obs[sample_obj.uns['mod']['factor_names']] = sample_obj.obsm['q05_cell_abundance_w_sf']
    sample_res = sample_obj.obs[sample_obj.uns['mod']['factor_names']]
    sample_res.to_csv(f"{run_name}/{sample_id}.cell2location_results.csv")
    res_list.append(sample_res)

    del(sample_obj)
    print(sample_id)
    torch.cuda.empty_cache()
    gc.collect()

ThraneJID2023_P03_NS_S01_R01_V10F24-006_A1
ThraneJID2023_P03_NS_S01_R02_V10F24-006_B1
ThraneJID2023_P10_NS_S01_R01_V10F24-007_A1
ThraneJID2023_P10_NS_S01_R02_V10F24-007_B1
ThraneJID2023_P10_NS_S02_R01_V10F24-005_A1
ThraneJID2023_P10_NS_S02_R02_V10F24-005_B1
JiCell2020_SCC_T20_V1
JiCell2020_SCC_T20_V2
JiCell2020_SCC_T28_V3
JiCell2020_SCC_T28_V4
BergenstrahleNatBiotech2022_SCC_T10_V10F24.015_A1
BergenstrahleNatBiotech2022_SCC_T10_V10F24.015_B1
BergenstrahleNatBiotech2022_SCC_T10_V10F24.015_C1
BergenstrahleNatBiotech2022_SCC_T10_V10F24.015_D1
GanierPNAS2024_face_temple1a
GanierPNAS2024_face_temple1b
GanierPNAS2024_face_temple2a
GanierPNAS2024_face_temple2b
GanierPNAS2024_face_nose1a
GanierPNAS2024_face_glabella1
GanierPNAS2024_face_forehead1a
GanierPNAS2024_face_cheek1
GanierPNAS2024_body_back1a
GanierPNAS2024_body_inguinal1a
GanierPNAS2024_body_abdomen1b
GanierPNAS2024_body_thigh1b
GanierPNAS2024_body_pubis1
GanierPNAS2024_body_inguinal2
GanierPNAS2024_bcc_face_cheek1
GanierPNAS2024_bcc_

  adata.obs["_indices"] = np.arange(adata.n_obs).astype("int64")
  _verify_and_correct_data_format(adata, self.attr_name, self.attr_key)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\Paula\miniconda3\envs\rapids\Lib\site-packages\lightning\pytorch\trainer\configuration_validator.py:68: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\Paula\miniconda3\envs\rapids\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
c:\Users\Paula\miniconda3\envs\rapids\Lib\site-packages\lightning\pytorch\loops\fit_loop.py:310: The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_st

Epoch 30000/30000: 100%|██████████| 30000/30000 [1:04:40<00:00,  8.15it/s, v_num=1, elbo_train=1.91e+7]

`Trainer.fit` stopped: `max_epochs=30000` reached.


Epoch 30000/30000: 100%|██████████| 30000/30000 [1:04:40<00:00,  7.73it/s, v_num=1, elbo_train=1.91e+7]
Sampling local variables, batch: 100%|██████████| 1/1 [00:22<00:00, 22.77s/it]
Sampling global variables, sample: 100%|██████████| 999/999 [00:21<00:00, 46.81it/s]
YuImmunity2024_S07


  adata.obs["_indices"] = np.arange(adata.n_obs).astype("int64")
  _verify_and_correct_data_format(adata, self.attr_name, self.attr_key)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\Paula\miniconda3\envs\rapids\Lib\site-packages\lightning\pytorch\trainer\configuration_validator.py:68: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\Paula\miniconda3\envs\rapids\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
c:\Users\Paula\miniconda3\envs\rapids\Lib\site-packages\lightning\pytorch\loops\fit_loop.py:310: The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_st

Epoch 30000/30000: 100%|██████████| 30000/30000 [1:15:10<00:00,  6.58it/s, v_num=1, elbo_train=2.96e+7]

`Trainer.fit` stopped: `max_epochs=30000` reached.


Epoch 30000/30000: 100%|██████████| 30000/30000 [1:15:10<00:00,  6.65it/s, v_num=1, elbo_train=2.96e+7]
Sampling local variables, batch: 100%|██████████| 1/1 [00:25<00:00, 25.50s/it]
Sampling global variables, sample: 100%|██████████| 999/999 [00:23<00:00, 42.43it/s]
YuImmunity2024_S08


  adata.obs["_indices"] = np.arange(adata.n_obs).astype("int64")
  _verify_and_correct_data_format(adata, self.attr_name, self.attr_key)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\Paula\miniconda3\envs\rapids\Lib\site-packages\lightning\pytorch\trainer\configuration_validator.py:68: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\Paula\miniconda3\envs\rapids\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
c:\Users\Paula\miniconda3\envs\rapids\Lib\site-packages\lightning\pytorch\loops\fit_loop.py:310: The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_st

Epoch 30000/30000: 100%|██████████| 30000/30000 [53:28<00:00,  9.47it/s, v_num=1, elbo_train=1.86e+7]

`Trainer.fit` stopped: `max_epochs=30000` reached.


Epoch 30000/30000: 100%|██████████| 30000/30000 [53:28<00:00,  9.35it/s, v_num=1, elbo_train=1.86e+7]
Sampling local variables, batch: 100%|██████████| 1/1 [00:19<00:00, 19.56s/it]
Sampling global variables, sample: 100%|██████████| 999/999 [00:19<00:00, 50.92it/s]
YuImmunity2024_S09A


  adata.obs["_indices"] = np.arange(adata.n_obs).astype("int64")
  _verify_and_correct_data_format(adata, self.attr_name, self.attr_key)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\Paula\miniconda3\envs\rapids\Lib\site-packages\lightning\pytorch\trainer\configuration_validator.py:68: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\Paula\miniconda3\envs\rapids\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
c:\Users\Paula\miniconda3\envs\rapids\Lib\site-packages\lightning\pytorch\loops\fit_loop.py:310: The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_st

Epoch 30000/30000: 100%|██████████| 30000/30000 [38:09<00:00, 13.41it/s, v_num=1, elbo_train=3.71e+6]

`Trainer.fit` stopped: `max_epochs=30000` reached.


Epoch 30000/30000: 100%|██████████| 30000/30000 [38:09<00:00, 13.10it/s, v_num=1, elbo_train=3.71e+6]
Sampling local variables, batch: 100%|██████████| 1/1 [00:16<00:00, 16.45s/it]
Sampling global variables, sample: 100%|██████████| 999/999 [00:19<00:00, 50.85it/s]
YuImmunity2024_S09B


  adata.obs["_indices"] = np.arange(adata.n_obs).astype("int64")
  _verify_and_correct_data_format(adata, self.attr_name, self.attr_key)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\Paula\miniconda3\envs\rapids\Lib\site-packages\lightning\pytorch\trainer\configuration_validator.py:68: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\Paula\miniconda3\envs\rapids\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
c:\Users\Paula\miniconda3\envs\rapids\Lib\site-packages\lightning\pytorch\loops\fit_loop.py:310: The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_st

Epoch 30000/30000: 100%|██████████| 30000/30000 [35:11<00:00, 13.77it/s, v_num=1, elbo_train=4.1e+6] 

`Trainer.fit` stopped: `max_epochs=30000` reached.


Epoch 30000/30000: 100%|██████████| 30000/30000 [35:11<00:00, 14.21it/s, v_num=1, elbo_train=4.1e+6]
Sampling local variables, batch: 100%|██████████| 1/1 [00:16<00:00, 16.87s/it]
Sampling global variables, sample: 100%|██████████| 999/999 [00:19<00:00, 51.47it/s]
YuImmunity2024_S09C


  adata.obs["_indices"] = np.arange(adata.n_obs).astype("int64")
  _verify_and_correct_data_format(adata, self.attr_name, self.attr_key)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\Paula\miniconda3\envs\rapids\Lib\site-packages\lightning\pytorch\trainer\configuration_validator.py:68: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\Paula\miniconda3\envs\rapids\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
c:\Users\Paula\miniconda3\envs\rapids\Lib\site-packages\lightning\pytorch\loops\fit_loop.py:310: The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_st

Epoch 30000/30000: 100%|██████████| 30000/30000 [6:13:19<00:00,  1.38it/s, v_num=1, elbo_train=2.49e+7]  

`Trainer.fit` stopped: `max_epochs=30000` reached.


Epoch 30000/30000: 100%|██████████| 30000/30000 [6:13:19<00:00,  1.34it/s, v_num=1, elbo_train=2.49e+7]
Sampling local variables, batch: 100%|██████████| 1/1 [00:37<00:00, 37.52s/it]
Sampling global variables, sample: 100%|██████████| 999/999 [00:34<00:00, 28.59it/s]
YuImmunity2024_S10


In [11]:
resultdf = pd.concat(res_list)

In [12]:
resultdf

Unnamed: 0,Bas Inf,Bas KC,Bulge,CCR7+ DC,CD1C+ DC,CD4+ Th,CD4+ Treg,CD8+ Tc,CLEC9A+ DC,Cyc Imm,...,Perivasc Fib II,Plasma,Retic Fib I,Retic Fib II,Retic Fib III,SM,Schwann,Spn KC I,Spn KC II,VEC
ThraneJID2023_P03_NS_S01_R01_V10F24-006_A1_AAACACCAATAACTGC-1,0.608897,0.019046,0.010796,0.016994,0.023833,0.028840,0.059538,0.048298,0.003970,0.006901,...,0.005849,0.069608,0.002936,0.001953,0.009131,0.003980,0.524795,0.023798,0.024073,0.004590
ThraneJID2023_P03_NS_S01_R01_V10F24-006_A1_AAACAGCTTTCAGAAG-1,0.014334,0.388248,0.008618,0.001075,0.001028,0.003609,0.007012,0.006508,0.000614,0.000572,...,0.002662,0.003343,0.000938,0.000775,0.001614,0.002354,0.004404,0.023475,0.454441,0.001576
ThraneJID2023_P03_NS_S01_R01_V10F24-006_A1_AAACAGGGTCTATATT-1,0.070747,0.013731,0.035779,0.004913,0.004792,0.014591,0.027719,0.074422,0.002258,0.001960,...,0.021677,0.009039,0.001808,0.001203,0.003102,0.003391,0.021440,0.025541,0.120024,0.007185
ThraneJID2023_P03_NS_S01_R01_V10F24-006_A1_AAACCGGGTAGGTACC-1,0.095425,0.030911,0.022439,0.002628,0.000934,0.009033,0.017656,0.016608,0.001406,0.004755,...,0.004334,0.012646,0.000969,0.001024,0.002938,0.002732,0.022539,0.049080,0.253083,0.003828
ThraneJID2023_P03_NS_S01_R01_V10F24-006_A1_AAACCGTTCGTCCAGG-1,0.008654,0.022087,1.617743,0.206912,0.013423,0.247254,0.057658,0.081761,0.118520,0.012834,...,0.003121,0.116905,0.001940,0.004032,0.005537,0.045996,0.719325,0.003972,0.004818,0.003681
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YuImmunity2024_S10_TGTTGGCCAATATGGC-1,0.005119,0.003250,0.005055,0.031177,0.028287,0.054460,0.113133,0.243289,0.082789,0.101572,...,0.015585,2.634163,1.825134,0.346062,0.028821,0.024680,0.170519,0.000546,0.000168,0.031180
YuImmunity2024_S10_TGTTGGCCGGATTGGT-1,0.173728,2.305793,0.066211,3.356087,1.134739,1.086575,4.874635,12.851641,0.684524,3.756535,...,0.684959,27.445328,0.254051,0.085717,0.206918,0.183625,1.406260,0.320293,1.956749,0.110780
YuImmunity2024_S10_TGTTGGCCTGTAGCGG-1,0.005289,0.005957,0.013421,0.106241,0.022086,0.037266,0.154563,0.573018,0.029490,0.066105,...,0.044285,0.154408,0.025477,0.004159,0.247889,0.485817,0.667077,0.002188,0.001816,0.816927
YuImmunity2024_S10_TGTTGGTGCGGAATCA-1,0.007177,0.006228,0.026209,0.157332,0.183768,0.329302,0.248603,0.474865,0.409052,0.310032,...,0.044304,1.830205,6.717916,1.321997,0.151459,0.181582,0.658200,0.001860,0.000410,0.122279


In [13]:
resultdf.to_csv("../../data/visium/cell2location/cell2locationresultsmerged.csv")