# Topnet生成gene  embedding（DLPFC数据）

## 类似的，先用模拟数据把code跑通

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
import networkx as nx

# 假设 expression_matrix 是 DLPFC 数据集的基因表达矩阵
# 行表示基因，列表示样本
expression_matrix = np.random.rand(1000, 20)  # 1000个基因，20个样本
gene_names = [f"Gene_{i}" for i in range(expression_matrix.shape[0])]

# 计算基因之间的相关系数，构建共表达网络
correlation_matrix = np.corrcoef(expression_matrix)

# 构建网络，基于相关系数阈值（如0.8）构建边
threshold = 0.8  # 这个参数可以在后期进行修改
G = nx.Graph()
for i in range(correlation_matrix.shape[0]):
    for j in range(i+1, correlation_matrix.shape[1]):
        if correlation_matrix[i, j] > threshold:
            G.add_edge(gene_names[i], gene_names[j], weight=correlation_matrix[i, j])


In [3]:
import torch
import torch.nn as nn
import networkx as nx
from sklearn.preprocessing import normalize

# TopNet模型类
class TopNet(nn.Module):
    def __init__(self, num_genes, num_topics):
        super(TopNet, self).__init__()
        self.num_topics = num_topics
        self.fc = nn.Linear(num_genes, num_topics)
    
    def forward(self, x, gene_network):
        # 结合基因网络进行消息传递和主题分布学习
        # 注意：此处为简化的TopNet实现（在DLPFC数据集上，会定义更标准、完整的Topnet模型）
        topic_distribution = torch.softmax(self.fc(x), dim=1)
        return topic_distribution

# 初始化TopNet模型
num_genes = expression_matrix.shape[0]
num_topics = 10  # 假设有10个潜在主题（同样的，可以后期调整）
topnet_model = TopNet(num_genes=num_genes, num_topics=num_topics)

# 生成基因的嵌入
# 将输入从 (1000x20) 转换为 (20x1000)，以适应线性层的输入
expression_tensor = expression_tensor.T  # 转置操作，变为 (20, 1000)
# 然后传入模型
gene_embeddings = topnet_model(expression_tensor, G)

# gene_embeddings 就是生成的基因嵌入

In [4]:
#gene_embeddings

tensor([[0.0971, 0.1021, 0.0941, 0.0991, 0.0993, 0.1043, 0.1044, 0.0990, 0.1046,
         0.0959],
        [0.0993, 0.1036, 0.0934, 0.0982, 0.1005, 0.1040, 0.1008, 0.0993, 0.1045,
         0.0964],
        [0.0997, 0.1019, 0.0949, 0.0954, 0.0999, 0.1041, 0.1043, 0.0972, 0.1030,
         0.0997],
        [0.0955, 0.1012, 0.0934, 0.1013, 0.0997, 0.1067, 0.1007, 0.0983, 0.1046,
         0.0987],
        [0.1013, 0.1003, 0.0944, 0.0947, 0.1004, 0.1018, 0.1046, 0.0974, 0.1063,
         0.0986],
        [0.0980, 0.1053, 0.0930, 0.0966, 0.0978, 0.1024, 0.1037, 0.0964, 0.1072,
         0.0995],
        [0.0983, 0.1020, 0.0952, 0.0988, 0.0975, 0.1031, 0.1021, 0.1003, 0.1075,
         0.0952],
        [0.0994, 0.1021, 0.0969, 0.0961, 0.1020, 0.1043, 0.1008, 0.0974, 0.1040,
         0.0970],
        [0.0958, 0.1020, 0.0958, 0.0972, 0.0986, 0.1062, 0.1044, 0.1014, 0.1044,
         0.0942],
        [0.0976, 0.0999, 0.0967, 0.0957, 0.1010, 0.1063, 0.1037, 0.0976, 0.1042,
         0.0973],
        [0

## 使用DLPFC数据以及标准的Topnet模型生成gene embedding

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
import networkx as nx
import anndata
import scanpy as sc
from sklearn.preprocessing import normalize

# 定义标准的TopNet模型
class TopNet(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(TopNet, self).__init__()
        # 图卷积层
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        
        # 全连接层，用于生成主题分布
        self.fc1 = nn.Linear(hidden_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, edge_index):
        # 图卷积部分
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        
        # 主题分布部分
        x = F.relu(self.fc1(x))
        topic_distribution = torch.softmax(self.fc2(x), dim=1)
        
        return topic_distribution

# 读取数据
input_dir = '/Users/lee/Documents/bio/STAGATE_pyG_main/Dataset/DLPFC/151673'
adata = sc.read_visium(path=input_dir, count_file = 'filtered_feature_bc_matrix.h5')
adata.var_names_make_unique()

# 获取基因表达矩阵
expression_matrix = adata.X.toarray()  # 将稀疏矩阵转换为密集矩阵
expression_matrix = normalize(expression_matrix, norm='l1', axis=1)  # 归一化，方法l1也可以变成其他的


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


In [14]:
adata

AnnData object with n_obs × n_vars = 3639 × 33538
    obs: 'in_tissue', 'array_row', 'array_col'
    var: 'gene_ids', 'feature_types', 'genome'
    uns: 'spatial'
    obsm: 'spatial'

In [16]:
expression_matrix.shape

(3639, 33538)

In [19]:
expression_matrix.shape[0]  # 这里取的是gene数3639，而不是细胞数33538

3639

In [20]:
# 获取基因数量
num_genes = expression_matrix.shape[0]
print(num_genes)

# 构建基因网络，确保节点数量与基因数量一致
G = nx.gnm_random_graph(num_genes, num_genes * 2)  # num_genes确保生成的网络大小匹配
edge_index = torch.tensor(list(G.edges), dtype=torch.long).t().contiguous()

# 确保 edge_index 中的索引范围在 [0, num_genes-1] 之间
valid_mask = (edge_index[0] < num_genes) & (edge_index[1] < num_genes)
edge_index = edge_index[:, valid_mask]


3639


In [24]:
# 将基因表达矩阵转换为tensor
expression_tensor = torch.tensor(expression_matrix, dtype=torch.float32)

# 定义TopNet模型
input_dim = expression_tensor.shape[1]
hidden_dim = 64  # 隐藏层维度，可以调整
output_dim = 10  # 生成的主题数，同样可以调整
topnet_model = TopNet(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)

# 将数据输入模型
gene_embeddings = topnet_model(expression_tensor, edge_index)

# 输出gene_embeddings
print("Gene embeddings shape:", gene_embeddings.shape)

Gene embeddings shape: torch.Size([3639, 10])


# 保存gene_embeddings到csv

In [25]:
import pandas as pd
gene_embeddings_df = pd.DataFrame(gene_embeddings.detach().numpy())
output_csv_path = "/Users/lee/Documents/bio/Topic_model/gene_embeddings_Topnet"
#gene_embeddings_df.to_csv(output_csv_path, index=False)
print(f"Gene embeddings saved to {output_csv_path}")

Gene embeddings saved to /Users/lee/Documents/bio/Topic_model/gene_embeddings_Topnet
