# 利用PLDA、DLPFC数据生成gene embedding

## 先用模拟数据进行演示

In [2]:
import gensim
from gensim import corpora
from sklearn.preprocessing import normalize
import numpy as np

# 假设你有一个基因表达矩阵 expression_matrix
# 行代表基因，列代表样本

# 1. 准备数据：将表达矩阵转换为LDA输入格式
expression_matrix = np.random.rand(100, 20)  # 假设有100个基因和20个样本
normalized_data = normalize(expression_matrix, norm='l1', axis=1)
gene_corpus = [list(enumerate(gene)) for gene in normalized_data]

# 2. 创建字典
gene_dict = corpora.Dictionary([[str(i) for i in range(expression_matrix.shape[1])]])

# 3. 训练PLDA模型
num_topics = 10  # 假设有10个潜在生物学主题
lda_model = gensim.models.LdaModel(corpus=gene_corpus, id2word=gene_dict, num_topics=num_topics)

# 4. 获取基因嵌入（每个基因的主题分布）
gene_embeddings = []
for i in range(expression_matrix.shape[0]):
    bow = gene_corpus[i]
    topic_distribution = lda_model.get_document_topics(bow, minimum_probability=0)
    embedding = np.array([prob for _, prob in topic_distribution])
    gene_embeddings.append(embedding)

gene_embeddings = np.array(gene_embeddings)  # 这就是生成的基因嵌入

In [3]:
#gene_embeddings

array([[0.05000716, 0.05001754, 0.05001611, 0.05001943, 0.0500151 ,
        0.05000886, 0.05000826, 0.05001519, 0.5498769 , 0.05001539],
       [0.05000685, 0.05001654, 0.05001421, 0.05001934, 0.05001611,
        0.05001112, 0.05000754, 0.05001194, 0.5498819 , 0.05001445],
       [0.05000766, 0.05002069, 0.05001615, 0.05002092, 0.05001592,
        0.05000978, 0.05000959, 0.05001508, 0.5498688 , 0.05001539],
       [0.05000787, 0.05002069, 0.05001695, 0.0500198 , 0.05001522,
        0.05001243, 0.05000892, 0.05001422, 0.5498681 , 0.05001583],
       [0.05000841, 0.05001913, 0.05001662, 0.05002087, 0.05001606,
        0.05001087, 0.05001013, 0.05001472, 0.5498673 , 0.05001596],
       [0.0500083 , 0.05001824, 0.05001632, 0.05001923, 0.05001519,
        0.05001162, 0.05000953, 0.05001496, 0.5498711 , 0.05001549],
       [0.05000699, 0.05002408, 0.05001657, 0.05001949, 0.05001636,
        0.05001088, 0.05000925, 0.05001296, 0.54986686, 0.05001659],
       [0.05000699, 0.05001777, 0.0500153

## 下面用DLPFC数据

In [8]:
import warnings
import pandas as pd
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
import os
import sys
import rpy2
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.preprocessing import normalize
from sklearn.decomposition import LatentDirichletAllocation as LDA
warnings.filterwarnings("ignore")

# 读取 h5ad 文件
input_dir = '/Users/lee/Documents/bio/STAGATE_pyG_main/Dataset/DLPFC/151673'
adata = sc.read_visium(path=input_dir, count_file = 'filtered_feature_bc_matrix.h5')
adata.var_names_make_unique()

# 获取基因表达矩阵
# adata.X包含基因表达数据，按列为基因，按行为细胞
expression_matrix = adata.X.toarray()  # 将稀疏矩阵转换为密集矩阵

# 数据标准化（也可以选择 'l2' 或 'max' 归一化）
expression_matrix = normalize(expression_matrix, norm='l1', axis=1)

# 使用 PLDA 生成主题分布 (gene embedding)
num_topics = 10  # 这个参数需要具体情况具体修改
plda_model = LDA(n_components=num_topics, random_state=42)
gene_embeddings = plda_model.fit_transform(expression_matrix)

# 输出 gene_embeddings，形状为 (n_samples, n_topics)
print("Gene embeddings shape:", gene_embeddings.shape)

Gene embeddings shape: (3639, 10)


In [9]:
gene_embeddings

array([[0.4839599 , 0.05733778, 0.05733778, ..., 0.05733778, 0.05733778,
        0.05733778],
       [0.47432202, 0.05840867, 0.05840867, ..., 0.05840867, 0.05840867,
        0.05840867],
       [0.47550187, 0.05827757, 0.05827757, ..., 0.05827757, 0.05827757,
        0.05827757],
       ...,
       [0.49028713, 0.05663477, 0.05663477, ..., 0.05663477, 0.05663477,
        0.05663477],
       [0.47766438, 0.05803728, 0.05803728, ..., 0.05803728, 0.05803728,
        0.05803728],
       [0.4794875 , 0.05783472, 0.05783472, ..., 0.05783472, 0.05783472,
        0.05783472]], dtype=float32)

## 写出最终结果

In [11]:
import pandas as pd

# 创建一个 DataFrame
gene_embeddings_df = pd.DataFrame(gene_embeddings)

# 将结果保存为 CSV 文件
output_csv_path = "/Users/lee/Documents/bio/Topic_model/gene_embeddings_PLDA"
#gene_embeddings_df.to_csv(output_csv_path, index=False)

print(f"Gene embeddings saved to {output_csv_path}")

Gene embeddings saved to /Users/lee/Documents/bio/Topic_model/gene_embeddings_PLDA
