In [1]:
import os
import sys

sys.path.append("../")
from Attune_function6_22 import *

# Explicit imports come after the star import so these aliases cannot be shadowed by it.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc

In [2]:
# Select an available GPU on a multi-GPU computer; skip this cell entirely to run on the CPU.
# CUDA_VISIBLE_DEVICES is exported *before* TensorFlow is imported so the restriction is in
# place before TensorFlow gets a chance to initialise any CUDA device.
# NOTE(review): if the star import in the first cell already imports TensorFlow, the variable
# is still set before the first device query below — confirm nothing queries devices earlier.
GPU_ID = '5'  # index of the physical GPU to expose; adjust for your machine
os.environ["CUDA_VISIBLE_DEVICES"] = GPU_ID
import tensorflow as tf

# Enable memory growth on every visible GPU.
gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

# train

In [None]:
# TFRecord directories for the two modalities.
RNA_tf_path = '../10x_pretrain/tfrecord/RNA/'
ATAC_tf_path = '../10x_pretrain/tfrecord/ATAC/'

# Weight locations. Presumably `saved_weight_path` holds the pre-trained weights that are
# loaded and `weight_path` is where the regulatory weights are written — confirm against
# attune_train_regulatory.
weight_path = './weight/regulatory/'
saved_weight_path = '../10x_pretrain/weight/pretrain/'

# Attention mask file passed to the model.
mask_path = './attention_mask_zero_dist_1200kb.csv.npz'

# Hyper-parameters forwarded to the trainer as `super_parameters`.
dict_ = dict(
    batch_size=4,
    epoch_pretrain=20,
    epoch_transformer=10,
    lr=1e-4,
    drop_rate=0.1,
)

attune_train_regulatory(
    RNA_tf_path,
    ATAC_tf_path,
    weight_path,
    saved_weight_path,
    mask_path,
    super_parameters=dict_,
)

# test

In [None]:
# Inputs for inference: TFRecord directories, attention mask, and the two sets of weights.
RNA_tf_path = '../10x_pretrain/tfrecord/RNA/'
ATAC_tf_path = '../10x_pretrain/tfrecord/ATAC/'
mask_path = './attention_mask_zero_dist_1200kb.csv.npz'
weight_path_pretrain = '../10x_pretrain/weight/pretrain/'
weight_path_transformer = './weight/regulatory/'
n_cells = 900  # forwarded as `n_cells_for_sample`
dict_ = {'batch_size': 5, 'epoch_pretrain': 20, 'epoch_transformer': 10, 'lr': 1e-4, 'drop_rate': 0.1}

RNA_id_all, cross_attention_1, cross_attention_2 = attune_test_regulatory(
    RNA_tf_path=RNA_tf_path,
    ATAC_tf_path=ATAC_tf_path,
    mask_path=mask_path,
    saved_weight_path_pretrain=weight_path_pretrain,
    saved_weight_path=weight_path_transformer,
    n_cells_for_sample=n_cells,
    super_parameters=dict_,
)
print('infer finished')

# Persist the raw outputs so the post-processing cells can run without re-inferring.
f = {'cross_attention_1': cross_attention_1,
     'cross_attention_2': cross_attention_2,
     'RNA_id': RNA_id_all, }
saved_result_path = './result'
# exist_ok=True replaces the check-then-create pattern, so an existing directory is not an error.
os.makedirs(saved_result_path, exist_ok=True)
np.savez_compressed(os.path.join(saved_result_path, 'result_crossattention.npz'), **f)

# output cross_attention.csv

##### loading Mask
The original cross-attention matrix has dimensions n (genes) x m (peaks). To accelerate the computation and reduce memory usage, we load the genomic distance file and multiply it element-wise with the cross-attention matrix.

* pbmc_window.mtx: genomic distance within 1200kb ("window" in GLUE experiment)
* pbmc_gene.list: ordered genes list in window.mtx
* pbmc_peak.list: ordered peaks list in window.mtx
* result_crossattention.npz: cross-attention matrix generated by Attune

In [3]:
# Gene and peak lists are read without a header row; column 0 holds the names
# (accessed later as wgene[0] / wpeak[0]).
# NOTE(review): `pd` is not imported explicitly in this notebook; presumably it is
# re-exported by the star import in the first cell — confirm.
wgene = pd.read_csv('./pbmc_gene.list', header=None)
wpeak = pd.read_csv('./pbmc_peak.list', header=None)

# Genomic-distance window matrix; its rows/columns are expected to follow the
# order of the gene and peak lists above.
window = sc.read('./pbmc_window.mtx')

In [4]:
saved_result_path = './result'

# np.load on an .npz archive returns an NpzFile that keeps the archive open;
# the context manager closes it once the arrays have been read.
with np.load(saved_result_path + '/result_crossattention.npz') as f:
    RNA_id = f['RNA_id']
    cross_attention_1 = f['cross_attention_1']
    cross_attention_2 = f['cross_attention_2']

# Stack the entries along the first axis into one 2-D array per attention output.
cross_attention_1_array = np.vstack(cross_attention_1)
cross_attention_2_array = np.vstack(cross_attention_2)

To ensure consistency, only cell types common to the 10x Multiome and PCHi-C datasets (T cells, B cells and monocytes) are used when comparing the different methods. The preprocessed files are used to attach information such as feature names and cell types to the cross-attention matrix.
* rna_hvg.h5ad: preprocessed RNA file
* atac_wox_filter.h5ad: preprocessed ATAC file

In [5]:
# Preprocessed AnnData objects that supply feature names and cell-type labels.
adata_rna = sc.read('../10x_pretrain/data/rna_hvg.h5ad')
adata_atac = sc.read('../10x_pretrain/data/atac_wox_filter.h5ad')

# Index both modalities with the same RNA_id selector so they cover the same cells
# (assumes RNA and ATAC share observation names — TODO confirm).
adata_rna_1, adata_atac_1 = adata_rna[RNA_id], adata_atac[RNA_id]

# One cell-type label per selected cell, as a plain Python list.
celltype_list = adata_rna_1.obs['celltype'].tolist()

In [6]:
# Cell types retained for the comparison.
used_cts = {
    "CD4 Naive", "CD4 TCM", "CD4 TEM", "CD8 Naive", "CD8 TEM_1", "CD8 TEM_2",
    "CD14 Mono", "CD16 Mono", "Memory B", "Naive B"
}

# Boolean row selector, computed once and reused for both attention arrays
# (dtype=bool keeps it a valid mask even when celltype_list is empty).
used_ct_mask = np.array([ct in used_cts for ct in celltype_list], dtype=bool)

# Combine the two attention arrays over the selected cells.
cross_attention = cross_attention_2_array[used_ct_mask].T @ cross_attention_1_array[used_ct_mask]

# Drop the first row and first column (presumably the CLS token, per the "wocls" name — confirm).
cross_attention_wocls = cross_attention[1:, 1:]

In [21]:
# Label the attention matrix (rows: RNA features, columns: ATAC features) and reorder it
# to match the gene / peak order of the window matrix.
df = pd.DataFrame(cross_attention_wocls, columns=adata_atac_1.var_names, index=adata_rna_1.var_names)
df = df.loc[wgene[0].values, :]
df = df.loc[:, wpeak[0].values]

# Element-wise product with the sparse window: only gene-peak pairs stored in the window
# can be non-zero. Reading the COO triplets directly avoids melting a wide frame that
# would materialise every gene x peak pair before filtering.
masked_attention = window.X.multiply(df.values).tocoo()

# Keep strictly positive scores only.
positive = masked_attention.data > 0
gene_idx = masked_attention.row[positive]
peak_idx = masked_attention.col[positive]
scores = masked_attention.data[positive]

# Sort by peak first, then by gene, i.e. the same column-by-column order that
# pd.melt produced, so the CSV rows keep their previous ordering.
order = np.lexsort((gene_idx, peak_idx))

# Column names 'gene', 'variable', 'value' match the former pd.melt output.
df_new_f = pd.DataFrame({
    'gene': df.index.values[gene_idx[order]],
    'variable': df.columns.values[peak_idx[order]],
    'value': scores[order],
})
df_new_f.to_csv(saved_result_path + '/cross_attention.csv', index=False)