In [1]:
import numpy as np
import pandas as pd
import scanpy as sc

from sklearn.metrics import mean_squared_error

import os, warnings 
warnings.filterwarnings('ignore') 
import stan

## Loading scRNA dataset
The Processed scRNA-seq data from [this paper](https://www.nature.com/articles/s41588-021-00911-1) is available through the Gene Expression Omnibus under accession number [GSE176078](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE176078).

In [2]:
fname = 'results_breast/scRNA.h5ad'
if os.path.isfile(fname):
    adata_scRNA = sc.read_h5ad(fname)
else:
    path = 'data/Breast_Wu/Wu_etal_2021_BRCA_scRNASeq/'
    adata_scRNA = sc.read_mtx(path+"count_matrix_sparse.mtx").transpose()
    adata_scRNA.obs = pd.read_csv(path+"metadata.csv", index_col=0)
    adata_scRNA.var_names = pd.read_csv(path+"count_matrix_genes.tsv", index_col=0, header=None).index.to_list()
    adata_scRNA

    os.makedirs('results_breast', exist_ok=True)
    adata_scRNA.write_h5ad(fname)

adata_scRNA

AnnData object with n_obs × n_vars = 100064 × 29733
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mito', 'subtype', 'celltype_subset', 'celltype_minor', 'celltype_major'

In [3]:
adatas = dict()
sample_list = ['CID4290A', 'CID4465', 'CID4535', 'CID44971']
for sample in sample_list:
    adatas[sample] = adata_scRNA[adata_scRNA.obs['orig.ident']==sample]
adatas

{'CID4290A': View of AnnData object with n_obs × n_vars = 5789 × 29733
     obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mito', 'subtype', 'celltype_subset', 'celltype_minor', 'celltype_major',
 'CID4465': View of AnnData object with n_obs × n_vars = 1564 × 29733
     obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mito', 'subtype', 'celltype_subset', 'celltype_minor', 'celltype_major',
 'CID4535': View of AnnData object with n_obs × n_vars = 3961 × 29733
     obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mito', 'subtype', 'celltype_subset', 'celltype_minor', 'celltype_major',
 'CID44971': View of AnnData object with n_obs × n_vars = 7986 × 29733
     obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mito', 'subtype', 'celltype_subset', 'celltype_minor', 'celltype_major'}

In [4]:
for sample in sample_list:
    sc.pp.filter_genes(adatas[sample], min_cells=3)
    sc.pp.filter_cells(adatas[sample], min_genes=200)

    adatas[sample].layers['raw'] = adatas[sample].X
    adatas[sample].obs['ncounts'] = adatas[sample].to_df('raw').T.sum()

In [5]:
del adata_scRNA

## Running STAN

In [7]:
for sample in sample_list:
    print(sample)
    adatas[sample] = stan.add_gene_tf_matrix(adatas[sample],
                                            min_cells_proportion = 0.2,
                                            min_tfs_per_gene= 5,
                                            min_genes_per_tf= 10,
                                            gene_tf_source="hTFtarget",
                                            tf_list="humantfs",
                                            source_dir="resources/")
    
    sc.pp.normalize_total(adatas[sample])
    adatas[sample].layers['scaled'] = np.sqrt(adatas[sample].to_df())

In [27]:
from pathlib import Path
path = Path('results_breast/scran_ridge')
if not os.path.exists(path):
    os.makedirs(path)
    
def ridge_wrap(adata, lam_range=[1e-3, 1e3], n_steps=4, use_pixel=True): 
    cor_list = []
    alpha_list = []
    adata.obsm['tfa_ridge'] = pd.DataFrame(index = adata.to_df().index,
                                                  columns = adata.varm['gene_tf'].columns)
    ridge_model = stan.Ridge(adata, layer='scaled')
    for spot in adata.to_df().index:
        ridge_model.update_spot(spot)
        ridge_model.fit(n_steps=4, stages=1, grid_search_params={'lam':lam_range})
        alpha_list.append(ridge_model.params['lam'])
        cor = ridge_model.evaluate(fold=-1)
        cor_list.append(cor)
        adata.obsm['tfa_ridge'].loc[spot,:] =  ridge_model.W_concat.T

    adata.obs['param_ridge'] = alpha_list
    adata.obs['pred_cor_ridge'] = cor_list
    print("Spot-wise correlation:" + str(round(np.nanmedian(cor), 4)))
    return adata

In [28]:
for sample in ['CID4465', 'CID4290A', 'CID4535', 'CID44971']:
    adata = adatas[sample].copy()
    stan.assign_folds(adata, n_folds=10, random_seed=0)
    adata = ridge_wrap(adata)
    adata.obs['pred_cor_ridge'].to_csv(path / ('adata_'+sample+'_pred_cor_ridge.csv'))
    adata.obsm['tfa_ridge'].to_csv(path / ('adata_'+sample+'_tfa_ridge.csv'))

Spot-wise correlation:0.1249
Spot-wise correlation:0.3497
Spot-wise correlation:0.2454
Spot-wise correlation:0.2813


In [31]:
for cfile in ['pred_cor_ridge', 'tfa_ridge']:
    old_name = f'{path}/adata_CID4290A_{cfile}.csv'
    new_name = f'{path}/adata_CID4290_{cfile}.csv'
    os.rename(old_name, new_name)

## Deconvolution (Optional)
The doconvolution results are provided in the directory `results_breast`.

In [2]:
from GraphST import GraphST
from GraphST.preprocess import filter_with_overlap_gene
from GraphST.utils import project_cell_to_spot

### scRNA data
Download the supplementary table of [this paper](https://www.nature.com/articles/s41588-021-00911-1#Sec39).

In [3]:
adata_sc = sc.read_h5ad("results_breast/scRNA.h5ad")
genes = pd.read_excel("resources/41467_2021_26271_MOESM16_ESM.xlsx", header=None)[0].to_list()
genes_for_stsc = np.intersect1d(adata_sc.var_names, genes)

In [4]:
adata_sc_list = dict()
adata_sc_list['CID4290'] = adata_sc[adata_sc.obs["orig.ident"] == 'CID4290A', genes_for_stsc]
adata_sc_list['CID4535'] = adata_sc[adata_sc.obs["orig.ident"] == 'CID4535', genes_for_stsc]
adata_sc_list['CID4465'] = adata_sc[adata_sc.obs["orig.ident"] == 'CID4465', genes_for_stsc]
adata_sc_list['CID44971'] = adata_sc[adata_sc.obs["orig.ident"] == 'CID44971', genes_for_stsc]

In [5]:
sample_list = adata_sc_list.keys()
for sample in sample_list:
    adata_sc_list[sample].var_names_make_unique()
    GraphST.preprocess(adata_sc_list[sample])

### ST data

In [9]:
adata_list = dict()
sample_list = ['CID4290', 'CID4465', 'CID4535', 'CID44971']
for sample_id in sample_list:
    print(sample_id)
    adata_list[sample_id] = stan.read_breast_wu("data/Breast_Wu/{}.h5ad".format(sample_id))

CID4290
CID4465
CID4535
CID44971


In [10]:
for sample in sample_list:
    GraphST.preprocess(adata_list[sample])
    GraphST.construct_interaction(adata_list[sample])
    GraphST.add_contrastive_label(adata_list[sample])

### Finding overlap genes between ST and reference data

In [11]:
adata_sc_prep = dict()
adata_prep = dict()
for sample in sample_list:
    adata_sc = adata_sc_list[sample]
    adata = adata_list[sample]
    adata, adata_sc = filter_with_overlap_gene(adata, adata_sc)
    adata_sc_prep[sample] = adata_sc
    adata_prep[sample] = adata
    GraphST.get_feature(adata_prep[sample])

Number of overlap genes: 805
Number of overlap genes: 897
Number of overlap genes: 921
Number of overlap genes: 878


### Implementing GraphST for cell type deconvolution

In [14]:
import torch
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

adata_sc_trained = dict()
adata_trained = dict()
model_trained = dict()

for sample in sample_list:
    adata_sc = adata_sc_prep[sample]
    adata = adata_prep[sample]
    
    # Train model
    model = GraphST.GraphST(adata, adata_sc, epochs=1200, random_seed=50, device=device, deconvolution=True)
    adata, adata_sc = model.train_map()
    
    adata_sc_trained[sample] = adata_sc
    adata_trained[sample] = adata
    model_trained[sample] = model

Begin to train ST data...


100%|█████████████████████████████████████████████████████████████████████████████████| 1200/1200 [01:42<00:00, 11.65it/s]


Optimization finished for ST data!
Begin to train scRNA data...


100%|█████████████████████████████████████████████████████████████████████████████████| 1200/1200 [00:40<00:00, 29.71it/s]


Optimization finished for cell representation learning!
Begin to learn mapping matrix...


100%|█████████████████████████████████████████████████████████████████████████████████| 1200/1200 [03:29<00:00,  5.73it/s]


Mapping matrix learning finished!
Begin to train ST data...


100%|█████████████████████████████████████████████████████████████████████████████████| 1200/1200 [00:37<00:00, 31.63it/s]


Optimization finished for ST data!
Begin to train scRNA data...


100%|████████████████████████████████████████████████████████████████████████████████| 1200/1200 [00:10<00:00, 111.06it/s]


Optimization finished for cell representation learning!
Begin to learn mapping matrix...


100%|█████████████████████████████████████████████████████████████████████████████████| 1200/1200 [00:43<00:00, 27.73it/s]


Mapping matrix learning finished!
Begin to train ST data...


100%|█████████████████████████████████████████████████████████████████████████████████| 1200/1200 [00:30<00:00, 39.64it/s]


Optimization finished for ST data!
Begin to train scRNA data...


100%|█████████████████████████████████████████████████████████████████████████████████| 1200/1200 [00:28<00:00, 41.51it/s]


Optimization finished for cell representation learning!
Begin to learn mapping matrix...


100%|█████████████████████████████████████████████████████████████████████████████████| 1200/1200 [00:54<00:00, 21.97it/s]


Mapping matrix learning finished!
Begin to train ST data...


100%|█████████████████████████████████████████████████████████████████████████████████| 1200/1200 [00:34<00:00, 34.40it/s]


Optimization finished for ST data!
Begin to train scRNA data...


100%|█████████████████████████████████████████████████████████████████████████████████| 1200/1200 [00:54<00:00, 22.06it/s]


Optimization finished for cell representation learning!
Begin to learn mapping matrix...


100%|█████████████████████████████████████████████████████████████████████████████████| 1200/1200 [01:42<00:00, 11.75it/s]

Mapping matrix learning finished!





In [None]:
for celltype_name in ['celltype_major', 'celltype_minor', 'celltype_subset']:
    path = Path(f'results_breast/{celltype_name}')
    if not os.path.exists(path):
        os.makedirs(path)

for celltype_name in ['celltype_major', 'celltype_minor', 'celltype_subset']:
    for sample in sample_list:
        adata_sc_trained[sample].obs['cell_type'] = adata_sc_trained[sample].obs[celltype_name]
        project_cell_to_spot(adata_trained[sample], adata_sc_trained[sample], retain_percent=0.15)
        celltypes = adata_sc_trained[sample].obs[celltype_name].unique()
        df = pd.DataFrame(index=adata_trained[sample].obs.index,
                         columns = celltypes)
        for celltype in celltypes:
            df[celltype] = adata_trained[sample].obs[celltype]
        df.to_csv('results_breast/{}/{}.csv'.format(celltype_name, sample))