In [1]:
import scanpy as sc
import anndata as ad
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pertpy as pt
import omicverse as ov

In [None]:
def MergeSubAdata(adata, subAdata, obsName, subObsName):
    """Overlay a subset's annotation onto the parent object's annotation.

    Starts from ``adata.obs[obsName]`` and replaces the entries whose cell
    names occur in ``subAdata.obs_names`` with the matching values of
    ``subAdata.obs[subObsName]``.

    Returns a one-column DataFrame named ``obsName`` indexed by
    ``adata.obs_names``; neither input object is modified.
    """
    # Building the Series from plain lists drops any categorical dtype, so
    # labels that are not among the parent's categories can be written.
    merged = pd.Series(list(adata.obs[obsName]), index=adata.obs_names)
    refined = pd.Series(list(subAdata.obs[subObsName]), index=subAdata.obs_names)
    merged.loc[refined.index] = refined
    return pd.DataFrame(merged, columns=[obsName])

def marker(adata, groupby, method='wilcoxon', prefix=''):
    """Rank marker genes per group and save a dot plot of the result.

    Runs ``sc.tl.rank_genes_groups`` and ``sc.tl.dendrogram`` (on the
    ``'X_pca_harmony'`` representation) for ``groupby``, then draws
    ``sc.pl.rank_genes_groups_dotplot`` with ``save`` set to
    ``prefix + groupby + '.svg'``.

    Returns the same ``adata`` object that was passed in.
    """
    sc.tl.rank_genes_groups(adata, groupby=groupby, method=method)
    sc.tl.dendrogram(adata, groupby=groupby, use_rep='X_pca_harmony')
    figure_suffix = prefix + groupby + '.svg'
    sc.pl.rank_genes_groups_dotplot(adata, groupby=groupby, save=figure_suffix)
    return adata

def clu(adata, key_added="majorType-fix", n_neighbors=50, n_pcs=30, rep='X_pca_harmony', do_har=False, max_iter=20, do_scrublet=False, har_key='batch', resolution=1):
    """Optionally remove doublets and integrate batches, then cluster and plot.

    Parameters
    ----------
    adata : AnnData to process.
    key_added : obs column that receives the Leiden labels (also used in logs).
    n_neighbors, n_pcs, rep : forwarded to ``sc.pp.neighbors`` as
        ``n_neighbors``, ``n_pcs`` and ``use_rep``.
    do_har : run ``sc.external.pp.harmony_integrate`` first, provided that
        ``adata.obs[har_key]`` holds more than one distinct value.
    max_iter : forwarded to harmony as ``max_iter_harmony``.
    do_scrublet : run ``sc.external.pp.scrublet`` and keep only cells whose
        ``predicted_doublet`` flag is False.
    har_key : obs column identifying the batches.
    resolution : Leiden resolution.

    Returns
    -------
    The processed AnnData. When ``do_scrublet`` is True this is a filtered
    copy, so callers must use the return value.
    """
    if do_scrublet:
        n0 = adata.shape[0]
        print("{0} Cell number: {1}".format(key_added, n0))
        sc.external.pp.scrublet(adata)
        adata = adata[adata.obs['predicted_doublet']==False,:].copy()
        print("{0} Cells retained after scrublet, {1} cells removed.".format(adata.shape[0], n0-adata.shape[0]))
    else:
        print("Ignoring processing doublet cells...")

    # Count the batch values actually present: unlike `.cat.categories`, this
    # works for non-categorical columns and ignores unused categories.
    if do_har and adata.obs[har_key].nunique() > 1:
        sc.external.pp.harmony_integrate(adata, key=har_key,max_iter_harmony=max_iter)
    else:
        print("Evaluating neighbors only...")

    # The neighborhood graph is computed the same way whether or not harmony ran.
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs, use_rep=rep)

    # Embed, cluster, and show the clusters on the UMAP.
    sc.tl.umap(adata)
    sc.tl.leiden(adata, resolution=resolution, key_added=key_added)
    sc.pl.umap(adata, color=key_added, legend_fontoutline=True, palette=sc.pl.palettes.default_20, legend_loc="on data")
    return adata

In [2]:
# Load the AnnData object that every later cell works from.
H5AD_PATH = 'adata-epi-tumor.h5ad'
adata = sc.read_h5ad(H5AD_PATH)

In [None]:
# Clinical label per sample (batch). Batches not listed fall back to 'Unknown'
# (the fallback was previously misspelled 'Unkonwn').
clinical_by_batch = {
    **dict.fromkeys(['2T', '12T', '15T', '16T', '19T'], 'SD'),
    **dict.fromkeys(['1T', '8T', '11T', '13T', '17T'], 'PR'),
    '9T': 'PCR',
}
adata.obs['clinical'] = (
    adata.obs['batch'].astype(str).map(clinical_by_batch).fillna('Unknown')
)

In [None]:
# Chemotherapy label per sample (batch). Batches not listed fall back to
# 'Unknown' (the fallback was previously misspelled 'Unkonwn').
chemotherapy_by_batch = {
    **dict.fromkeys(['2T', '11T', '12T', '16T'], 'Ia'),
    **dict.fromkeys(['8T', '13T', '15T'], 'Ib/IIa'),
    **dict.fromkeys(['1T', '17T', '19T'], 'IIa/IIb'),
    '9T': 'IIIa/IIIb',
}
adata.obs['chemotherapy'] = (
    adata.obs['batch'].astype(str).map(chemotherapy_by_batch).fillna('Unknown')
)

In [None]:
# MKI67 expression distribution for each value of obs['treat'].
sc.pl.violin(
    adata,
    keys=['MKI67'],
    groupby='treat',
)

In [3]:
# Cells whose 'treat' label is 'After'. An explicit copy is taken instead of a
# view of `adata`, because later cells add obs columns to this object and run
# scanpy tools that write into it.
adata_epi = adata[adata.obs['treat'].isin(['After']), :].copy()

In [4]:
# 'effect' label per batch. Any batch not listed here ends up as 'better',
# because that is the fallback value.
effect_by_batch_after = {
    '13T': 'better', '19T': 'better',
    '11T': 'worse', '12T': 'worse', '16T': 'worse',
}
adata_epi.obs['effect'] = (
    adata_epi.obs['batch'].astype(str).map(effect_by_batch_after).fillna('better')
)

In [None]:
# Cells whose 'treat' label is 'Before'. An explicit copy is taken instead of a
# view of `adata`, because later cells add obs columns to this object and run
# scanpy tools that write into it.
adata_before = adata[adata.obs['treat'].isin(['Before']), :].copy()

In [None]:
# 'effect' label per batch. Any batch not listed here ends up as 'better',
# because that is the fallback value.
effect_by_batch_before = {
    '1T': 'better', '8T': 'better', '9T': 'better', '15T': 'better', '17T': 'better',
    '2T': 'worse',
}
adata_before.obs['effect'] = (
    adata_before.obs['batch'].astype(str).map(effect_by_batch_before).fillna('better')
)

In [None]:
# MKI67 expression distribution for each 'effect' group (after-treatment cells).
sc.pl.violin(
    adata_epi,
    keys=['MKI67'],
    groupby='effect',
)

In [None]:
# Cluster the after-treatment cells (harmony requested, no doublet removal),
# then rank marker genes for the resulting clusters.
after_cluster_key = 'leiden-epi-after'
adata_epi = clu(
    adata_epi,
    after_cluster_key,
    resolution=0.8,
    do_scrublet=False,
    do_har=True,
)
adata_epi = marker(adata_epi, after_cluster_key)

In [None]:
# Cluster the before-treatment cells (harmony requested, no doublet removal).
adata_before = clu(
    adata_before,
    'leiden-epi-before',
    resolution=0.5,
    do_scrublet=False,
    do_har=True,
)

In [None]:
# Dot plot of selected marker genes across the groups of the stored
# rank_genes_groups result; saved with the suffix '-EMT-marker.svg'.
sc.pl.rank_genes_groups_dotplot(
    adata_epi,
    var_names=['EPCAM', 'KRT8', 'KRT14', 'VIM', 'PDGFRA', 'AQP1'],
    save='-EMT-marker.svg',
)

In [None]:
# Dot plot of selected marker genes per 'leiden-epi-before' cluster.
# NOTE(review): no cell shown here runs rank_genes_groups on `adata_before`
# (only `clu` is called on it) -- confirm that the stored ranking, if any,
# belongs to this grouping.
sc.pl.rank_genes_groups_dotplot(
    adata_before,
    groupby='leiden-epi-before',
    var_names=['EPCAM', 'KRT8', 'KRT14', 'VIM', 'CD44', 'CD24'],
    save='-EMT-marker-Before.svg',
)

In [None]:
# Map each 'leiden-epi-after' cluster to an EMT state; clusters not listed are
# labelled 'Unknown'. The cluster ids are hard-coded, so they must match the
# clustering produced above.
emt_by_cluster_after = {
    **dict.fromkeys(['1', '2', '3'], 'Early hybrid'),
    **dict.fromkeys(['4', '6'], 'Late hybrid'),
    **dict.fromkeys(['0', '5'], 'Epithelial'),
}
adata_epi.obs['emt-type'] = (
    adata_epi.obs['leiden-epi-after'].astype(str).map(emt_by_cluster_after).fillna('Unknown')
)
sc.pl.umap(adata_epi, color=['emt-type'], palette='tab20', save='-EMT.svg')

In [None]:
# Map each 'leiden-epi-before' cluster to an EMT state; clusters not listed are
# labelled 'Unknown'. The cluster ids are hard-coded, so they must match the
# clustering produced above.
emt_by_cluster_before = {
    **dict.fromkeys(['2', '4', '6'], 'Early hybrid'),
    **dict.fromkeys(['7', '3', '8', '9'], 'Late hybrid'),
    **dict.fromkeys(['0', '1', '5', '10'], 'Epithelial'),
}
adata_before.obs['emt-type'] = (
    adata_before.obs['leiden-epi-before'].astype(str).map(emt_by_cluster_before).fillna('Unknown')
)
sc.pl.umap(adata_before, color=['emt-type'], palette='tab20', save='-EMT-Before.svg')

In [None]:
import pandas as pd

# Number of cells of each emt-type in each batch (after-treatment cells).
emt_counts_after = adata_epi.obs.groupby(["batch", "emt-type"]).size()
counts = emt_counts_after.unstack(fill_value=0)

# Keep only the two hybrid states.
counts = counts[["Early hybrid", "Late hybrid"]]

# Early hybrid / Late hybrid per batch.
counts["Early_Late_ratio"] = counts["Early hybrid"] / counts["Late hybrid"]

print(counts)

In [None]:
import pandas as pd

# Number of cells of each emt-type in each batch (before-treatment cells).
emt_counts_before = adata_before.obs.groupby(["batch", "emt-type"]).size()
counts = emt_counts_before.unstack(fill_value=0)

# Keep only the two hybrid states.
counts = counts[["Early hybrid", "Late hybrid"]]

# Early hybrid / Late hybrid per batch.
counts["Early_Late_ratio"] = counts["Early hybrid"] / counts["Late hybrid"]

print(counts)

In [None]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.api.types import is_numeric_dtype

# Per-cell annotations needed for the per-sample summary (after-treatment cells).
df = adata_epi.obs[['emt-type', 'clinical', 'IHC', 'batch']].copy()

# --- 1) Aggregate per batch (= sample): Early/Late counts, clinical/IHC by mode ---
ct = (
    df.groupby('batch')['emt-type']
      .value_counts()
      .unstack(fill_value=0)
)

# Guarantee both count columns exist even when a type is absent everywhere.
for col in ['Early hybrid', 'Late hybrid']:
    if col not in ct.columns:
        ct[col] = 0

# Early / Late ratio, NaN where Late == 0. Kept as a Series indexed by batch so
# it is attached to the sample table by label rather than by row position.
ratio = (ct['Early hybrid'] / ct['Late hybrid']).where(ct['Late hybrid'] > 0)

def mode_or_nan(s):
    """Return the most frequent non-null value of ``s``, or NaN if there is none."""
    s = s.dropna()
    return s.mode().iloc[0] if len(s) else np.nan

# Sample-level clinical / IHC attributes (most frequent value within the batch).
sample_meta = (
    df.groupby('batch')
      .agg(
          clinical=('clinical', mode_or_nan),
          # clinical=('chemotherapy', mode_or_nan),
          IHC=('IHC', mode_or_nan)
      )
)

# Sample-level table; samples without a computable ratio are dropped.
g = sample_meta.copy()
g['ratio_early_late'] = ratio
g = g.dropna(subset=['ratio_early_late']).reset_index()

# log10 of the ratio; a ratio of 0 yields -inf, which is removed here.
g['log10_ratio'] = np.log10(g['ratio_early_late'])
g = g.replace([np.inf, -np.inf], np.nan).dropna(subset=['log10_ratio'])

# --- 2) IHC subtype -> marker shape (keys must match color_map) ---
marker_map = {
    'HR+HER2-': 'o',   # circle
    'TNBC': 's',       # square
    'HR+HER2+': '*',   # star
    'HER2+': 'D'       # diamond (key was previously misspelled 'HE2+')
}

# --- IHC subtype -> color ---
color_map = {
    'HR+HER2-': '#4dbbd5',
    'TNBC': '#00a087',
    'HR+HER2+': '#3c5488',
    'HER2+': '#91d1c2'
}

# --- 3) Batch id -> label shown next to each point ---
batch_map = {
    '11T': 'BCYL032T', '12T': 'BCYK372T',
    '13T': 'BCYC822T', '16T': 'BCYD452T', '19T': 'BCYY602T'
}
g['batch_label'] = g['batch'].map(lambda x: batch_map.get(str(x), str(x)))

# --- 4) X axis: `clinical` may be numeric or categorical ---
clinical = g['clinical']
order = ["SD", "PR", "PCR"]
# order = ["Ia", "Ib/IIa", "IIa/IIb", "IIIa/IIIb"]
# Try a numeric interpretation first (also recognizes numeric strings).
clinical_num = pd.to_numeric(clinical, errors='coerce')

# Treat as continuous if already numeric or if >= 90% of values parse as numbers.
if is_numeric_dtype(clinical) or (clinical_num.notna().mean() >= 0.9):
    g['x'] = clinical_num.astype(float)
    xtick_locs, xtick_labels = None, None
else:
    # Categorical: positions follow the custom `order`.
    cats = pd.Categorical(clinical.astype(str),
                          categories=order,
                          ordered=True)
    x_codes = cats.codes.astype(float)
    # Values outside `order` get code -1; turn them into NaN so they are not
    # drawn at an unlabeled x = -1 position.
    x_codes[x_codes < 0] = np.nan
    g['x'] = x_codes
    xtick_locs = np.arange(len(cats.categories), dtype=float)
    xtick_labels = list(cats.categories)

# --- 5) Plot ---
plt.figure(figsize=(8, 5), dpi=120)

# Reference line at log10(ratio) = 0, i.e. Early == Late.
plt.axhline(0, color='gray', linestyle='--', linewidth=1)

# One scatter series per IHC subtype, each point annotated with its batch label.
# The names `marker_style` / `points` are used so that neither the `marker`
# function nor the scanpy module alias `sc` is overwritten by this cell.
handles, labels = [], []
for ihc, marker_style in marker_map.items():
    mask = (g['IHC'] == ihc)
    if not mask.any():
        continue
    points = plt.scatter(
        g.loc[mask, 'x'],
        g.loc[mask, 'log10_ratio'],
        marker=marker_style,
        s=80,
        c=color_map[ihc],
        edgecolors='black',
        linewidths=0.5,
        alpha=0.9,
        label=ihc
    )
    handles.append(points)
    labels.append(ihc)

    # Put the label slightly above the point so it does not cover it.
    for xi, yi, txt in zip(g.loc[mask, 'x'], g.loc[mask, 'log10_ratio'], g.loc[mask, 'batch_label']):
        if pd.isna(xi) or pd.isna(yi):
            continue
        plt.annotate(
            txt,
            (xi, yi),
            xytext=(0, 8), textcoords='offset points',
            ha='center', va='bottom', fontsize=9
        )

# Axes, legend, labels. The plotted quantity is the log10 ratio.
plt.xlabel('Clinical response')
plt.ylabel('log10(Early hybrid cells / Late hybrid cells)')

if xtick_locs is not None:
    plt.xticks(xtick_locs, xtick_labels)

if handles:
    plt.legend(handles, labels, title='IHC', frameon=True)

plt.tight_layout()
plt.savefig("figures/EMT-prop-After.svg", bbox_inches='tight')
plt.show()


In [None]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.api.types import is_numeric_dtype

# Per-cell annotations needed for the per-sample summary (before-treatment cells).
df = adata_before.obs[['emt-type', 'clinical', 'IHC', 'batch']].copy()

# --- 1) Aggregate per batch (= sample): Early/Late counts, clinical/IHC by mode ---
ct = (
    df.groupby('batch')['emt-type']
      .value_counts()
      .unstack(fill_value=0)
)

# Guarantee both count columns exist even when a type is absent everywhere.
for col in ['Early hybrid', 'Late hybrid']:
    if col not in ct.columns:
        ct[col] = 0

# Early / Late ratio, NaN where Late == 0. Kept as a Series indexed by batch so
# it is attached to the sample table by label rather than by row position.
ratio = (ct['Early hybrid'] / ct['Late hybrid']).where(ct['Late hybrid'] > 0)

def mode_or_nan(s):
    """Return the most frequent non-null value of ``s``, or NaN if there is none."""
    s = s.dropna()
    return s.mode().iloc[0] if len(s) else np.nan

# Sample-level clinical / IHC attributes (most frequent value within the batch).
sample_meta = (
    df.groupby('batch')
      .agg(
          clinical=('clinical', mode_or_nan),
          # clinical=('chemotherapy', mode_or_nan),
          IHC=('IHC', mode_or_nan)
      )
)

# Sample-level table; samples without a computable ratio are dropped.
g = sample_meta.copy()
g['ratio_early_late'] = ratio
g = g.dropna(subset=['ratio_early_late']).reset_index()

# log10 of the ratio; a ratio of 0 yields -inf, which is removed here.
g['log10_ratio'] = np.log10(g['ratio_early_late'])
g = g.replace([np.inf, -np.inf], np.nan).dropna(subset=['log10_ratio'])


# --- 2) IHC subtype -> marker shape ---
marker_map = {
    'HR+HER2-': 'o',   # circle
    'TNBC': 's',       # square
    'HR+HER2+': '*',   # star
    'HER2+': 'D'       # diamond
}

# --- IHC subtype -> color ---
color_map = {
    'HR+HER2-': '#4dbbd5',
    'TNBC': '#00a087',
    'HR+HER2+': '#3c5488',
    'HER2+': '#91d1c2'
}

# --- 3) Batch id -> label shown next to each point ---
batch_map = {
    '1T': 'BCYC021T', '2T': 'BCYD451T', '8T': 'BCYZ011T', '9T': 'BCYX921T',
    '15T': 'BCYS051T', '17T': 'BCYY651T'
}
g['batch_label'] = g['batch'].map(lambda x: batch_map.get(str(x), str(x)))

# --- 4) X axis: `clinical` may be numeric or categorical ---
clinical = g['clinical']
# Custom category order.
order = ["SD", "PR", "PCR"]
# order = ["Ia", "Ib/IIa", "IIa/IIb", "IIIa/IIIb"]
# Try a numeric interpretation first (also recognizes numeric strings).
clinical_num = pd.to_numeric(clinical, errors='coerce')

# Treat as continuous if already numeric or if >= 90% of values parse as numbers.
if is_numeric_dtype(clinical) or (clinical_num.notna().mean() >= 0.9):
    g['x'] = clinical_num.astype(float)
    xtick_locs, xtick_labels = None, None
else:
    # Categorical: positions follow the custom `order`.
    cats = pd.Categorical(clinical.astype(str),
                          categories=order,
                          ordered=True)
    x_codes = cats.codes.astype(float)
    # Values outside `order` get code -1; turn them into NaN so they are not
    # drawn at an unlabeled x = -1 position.
    x_codes[x_codes < 0] = np.nan
    g['x'] = x_codes
    xtick_locs = np.arange(len(cats.categories), dtype=float)
    xtick_labels = list(cats.categories)

# --- 5) Plot ---
plt.figure(figsize=(8, 5), dpi=120)

# Reference line at log10(ratio) = 0, i.e. Early == Late.
plt.axhline(0, color='gray', linestyle='--', linewidth=1)

# One scatter series per IHC subtype, each point annotated with its batch label.
# The names `marker_style` / `points` are used so that neither the `marker`
# function nor the scanpy module alias `sc` is overwritten by this cell.
handles, labels = [], []
for ihc, marker_style in marker_map.items():
    mask = (g['IHC'] == ihc)
    if not mask.any():
        continue
    points = plt.scatter(
        g.loc[mask, 'x'],
        g.loc[mask, 'log10_ratio'],
        marker=marker_style,
        s=80,
        c=color_map[ihc],
        edgecolors='black',
        linewidths=0.5,
        alpha=0.9,
        label=ihc
    )
    handles.append(points)
    labels.append(ihc)

    # Put the label slightly above the point so it does not cover it.
    for xi, yi, txt in zip(g.loc[mask, 'x'], g.loc[mask, 'log10_ratio'], g.loc[mask, 'batch_label']):
        if pd.isna(xi) or pd.isna(yi):
            continue
        plt.annotate(
            txt,
            (xi, yi),
            xytext=(0, 8), textcoords='offset points',
            ha='center', va='bottom', fontsize=9
        )

# Axes, legend, labels. The plotted quantity is the log10 ratio.
plt.xlabel('Clinical response')
plt.ylabel('log10(Early hybrid cells / Late hybrid cells)')

if xtick_locs is not None:
    plt.xticks(xtick_locs, xtick_labels)

if handles:
    plt.legend(handles, labels, title='IHC', frameon=True)

plt.tight_layout()
plt.savefig("figures/EMT-prop-Before.svg", bbox_inches='tight')
plt.show()


In [None]:
# Show the IHC annotation of the before-treatment cells from batch '17T'.
is_batch_17 = adata_before.obs['batch'] == '17T'
adata_17 = adata_before[is_batch_17, :]
adata_17.obs['IHC']

In [None]:
# Dot plot of the selected marker genes per 'emt-type', drawn into a 4x4-inch
# figure that is then saved as SVG.
emt_marker_genes = ['EPCAM', 'KRT8', 'KRT14', 'VIM', 'PDGFRA', 'AQP1']
fig, ax = plt.subplots(1, 1, figsize=(4, 4))
sc.pl.dotplot(
    adata_epi,
    var_names=emt_marker_genes,
    groupby='emt-type',
    cmap='Oranges',
    ax=ax,
)
ax.set_xticklabels(ax.get_xticklabels(), fontsize=8)
fig.savefig("figures/EMT-markers.svg", bbox_inches='tight')

In [None]:
# UMAP of the after-treatment cells colored by EMT state, effect, cluster and batch.
sc.pl.umap(
    adata_epi,
    color=['emt-type', 'effect', 'leiden-epi-after', 'batch'],
    palette='tab20',
)

In [None]:
# UMAP of the before-treatment cells colored by EMT state, effect, cluster and batch.
sc.pl.umap(
    adata_before,
    color=['emt-type', 'effect', 'leiden-epi-before', 'batch'],
    palette='Set3',
)

In [None]:
# UMAP of the after-treatment cells colored by expression of selected genes.
sc.pl.umap(
    adata_epi,
    color=['EPCAM', 'KRT8', 'KRT14', 'VIM', 'PDGFRA', 'AQP1'],
    palette='tab20',
)

In [None]:
# Cells per (batch, emt-type): one row per sample, one column per cell type.
cell_type_counts = adata_before.obs.groupby(['batch', 'emt-type']).size().unstack(fill_value=0)

# Total cells per sample, then each type's share within its sample.
sample_totals = cell_type_counts.sum(axis=1)
proportions = cell_type_counts.div(sample_totals, axis=0)

# Stacked bar chart: every cell type is drawn on top of the running total.
fig, ax = plt.subplots(figsize=(12, 8))
bottom = np.zeros(len(cell_type_counts.index))
for cell_type in proportions.columns:
    heights = proportions[cell_type]
    ax.bar(proportions.index, heights, bottom=bottom, label=cell_type)
    bottom += heights

# Title, axis labels, and a legend placed outside the axes.
ax.set_title('Proportion of Cell Types in Each Sample', fontsize=16)
ax.set_xlabel('Sample', fontsize=14)
ax.set_ylabel('Proportion', fontsize=14)
ax.legend(title='Cell Type', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

In [None]:
# Rank genes between the 'effect' groups of the after-treatment cells
# (Wilcoxon). This replaces any ranking previously stored on `adata_epi`.
sc.tl.rank_genes_groups(
    adata_epi,
    groupby='effect',
    method='wilcoxon',
)
# sc.pl.rank_genes_groups_dotplot(adata_epi, groupby = 'effect', save='-epi-effect.svg')

In [None]:
# After-treatment cells whose 'tumor-type' is not 'Basal Cancer'. Copied rather
# than kept as a view, because rank_genes_groups later writes its result into
# this object.
adata_epi_filter = adata_epi[adata_epi.obs['tumor-type']!='Basal Cancer', :].copy()

In [None]:
# Rank genes between the 'effect' groups within the non-basal subset (Wilcoxon).
sc.tl.rank_genes_groups(
    adata_epi_filter,
    groupby='effect',
    method='wilcoxon',
)
# sc.pl.rank_genes_groups_dotplot(adata_epi_filter, groupby = 'effect', save='-epi-effect-luminal.svg')

In [None]:
# After-treatment cells whose 'tumor-type' is 'Basal Cancer'. Copied rather
# than kept as a view, because rank_genes_groups later writes its result into
# this object.
adata_epi_basal = adata_epi[adata_epi.obs['tumor-type']=='Basal Cancer', :].copy()

In [None]:
# Rank genes between the 'effect' groups within the basal subset (Wilcoxon).
sc.tl.rank_genes_groups(
    adata_epi_basal,
    groupby='effect',
    method='wilcoxon',
)

In [None]:
# Dot plot of the ranked genes per 'effect' group in the basal subset; saved
# with the suffix '-epi-effect-basal.svg'.
sc.pl.rank_genes_groups_dotplot(
    adata_epi_basal,
    groupby='effect',
    save='-epi-effect-basal.svg',
)

#### 基因集打分

In [None]:
# Re-importing scanpy here rebinds `sc` to the module in case an earlier cell
# assigned something else to that name.
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Name of the obs column that holds the group labels.
group_col = 'effect'
# NOTE(review): this rebinds `adata` (until now the full dataset loaded from
# disk) to a copy of the after-treatment subset; earlier cells that read
# `adata` will behave differently if re-run after this cell.
adata = adata_epi.copy()

# 1. Gene lists per program, keyed "MP1" .. "MP12"; each one is scored
#    separately in the next cell.
MP_gene_sets = {
    "MP1": ['ACTA2','TAGLN','TPM2','ECRG4','CXCL14','MYLK','MYL9','CNN1','MYH11','SYT8',
            'KRT14','ACTG2','PPP1R14A','DKK3','MATN2','POSTN','APOE','COL17A1','A2M','SPARCL1',
            'MIR205HG','TGFB1I1','KCNMB1','KRT17','TP63','TNS4','SNCG','PDLIM7','LMOD1','MSRB3',
            'LTBP2','OXTR','MME','TSHZ2','PDGFA','CARMN','FGF1','ADAMTS5','EGR1','NEXN','IL17B',
            'CD200','GAS1','LIMS2'],
    
    "MP2": ['EGR1','FOS','JUN','IER2','JUNB','ZFP36','DUSP1','SOCS3','DNAJB1','FOSB',
            'EGR2','CCNL1','DDIT3','HES1','GADD45B','SERTAD1','PPP1R15A','H2AX','MYC','KLF10',
            'IER3','IRF1','SNHG12','CSRNP1','RHOB','MCL1','DNAJB4','AC011446.2','CLK1'],
    
    "MP3": ['KRT15','MMP7','GABRP','CCL28','SLPI','LTF','KIT','RARRES1','KRT23','SNORC',
            'ALDH1A3','PIGR','LCN2','PROM1','KRT6B','SLC34A2','CRABP1','KRT14','TTYH1','NDRG2',
            'SAA2','JUN','KRT17','FOLR1'],
    
    "MP4": ['TFF1','ANKRD30A','STC2','TFF3','SCUBE2','AGR2','PLAT','DCDC2','AFF3','AGR3',
            'PDK4','AREG','TCIM','LRP2','SLC40A1','FBP1','AZGP1','C15orf48','ERBB4','SLC7A2',
            'INPP4B','VSIG10L2','NEK10','BATF','GDF15','CITED1','ACADSB','LINC00993','MGP','XBP1',
            'DAPL1','VSIG2','UCP2','ZFP36'],
    
    "MP5": ['TFF1','TCIM','MIDEAS','GPRC5A','ZFP36','RAB11FIP1','DNAJB1','ISG15','JUNB',
            'KIAA0040','SLC39A6','IER2','BATF','C15orf48','GADD45B','NR4A1','PIP','LURAP1L',
            'CLDN4','AGR3','CD55','ELF3','IER3','CEBPD'],
    
    "MP6": ['COL14A1','TAGLN','MYLK','EGR1','APOE','MYL9','MYH11','TPM2','SPARCL1','A2M',
            'IL17RD','SEMA3C','LMOD1','MIR205HG','COL17A1','SFRP2'],
    
    "MP7": ['MUCL1','SERPINA1','PIP','ALDH3B2','HMGCS2','NUDT8'],
    
    "MP8": ['HLA-DRA','CD74','HLA-DRB1','WARS1','HLA-DPA1','HLA-DQB1','PSMB9','PLAAT4','C1S',
            'SOD2','TYMP','UBE2L6','IL32'],
    
    "MP9": ['PIP','SCGB1D2','SCGB2A2','SCGB1B2P','JUN','DUSP1','STC2','JUNB','ZFP36','FADS2',
            'FOSB','AZGP1','CCNL1','ESR1'],
    
    "MP10": ['SNORC','SLC12A2','NCALD','S100A1','MELTF','PPP1R1B','TESC','KCNN4','ACTR3B','PLA2R1',
             'MRAS','PADI2','LTF','ZG16B','IGFBP7','CRABP1','ROCR','PDK3','SLPI','FOS','KRT15',
             'PIP','CCDC71L','CCL28','TTYH1','NDRG2','FSCN1','FRZB','STAC2','IER2','ELF5','SFRP1',
             'PDCD4','GABRP','PALMD','JUNB','LCN2','CBR3','JUN','PHLDA1','EGR2','SLC34A2','SPARCL1',
             'CKB','CD44','AC007906.2','MYBPC1','SOCS3','PROM1','PDGFA','SYNM','EGR1','INSIG1',
             'FABP7','CALML5','VIM','ANO1','AC036108.2','TPD52L1','NUDT8','FGFBP1','SAA2','PYGB',
             'CRISPLD1','RCAN1','LMO4','TNS4','PIGR','DBI','DEPP1','ITM2A','SOD3','SCGB2A2',
             'HTRA1','CAV1','EGLN3','S100A9'],
    
    "MP11": ['CALML5','S100P','CRABP1','CDKN2A','IMPA2','STMN1','C20orf27','KRT15','IER2','WARS1',
             'PCNA','FADS2','PCSK1N','COTL1','CKS1B','CD74','H2AW','UNG','LY6D','PHGDH','HLA-DPA1',
             'S100A4','IL32','DNAJB1','TAP1','MCM7','PKP1','MGP','IFI27','C1S','DKC1','EIF4EBP1',
             'AZGP1','PSIP1','FGFBP1','C4orf48','HMGA1','JUNB','HAPLN3','S100A16','SEPTIN11',
             'MCM3','H2AZ1','KHDRBS3','OPTN','SCD','MMP7','CDK2AP1','COX6C','GABRP','HES1','KRT5',
             'POLD2','IER5L','STC2','DHCR24','ELF5','DBI','ASS1','PYCARD','CDC42EP4','GALNT6',
             'TAGLN2','HSPA1A','HSPA6','EHF','MIR205HG','HSPA1B','MCL1','NT5DC2','GSTP1','CAVIN3',
             'CD24'],
    
    "MP12": ['C2CD4B','ARRDC4','DHRS2','AC093001.1','DNAJA4','TFPI','MGP','APOD','MGST1','IFI27',
             'JUN','EGR1','FOS','IDH3A','CCND1','AGR2','THSD4','JUNB','SCGB2A2','COX6C','IER2',
             'YEATS4','KLHL42','RAB11FIP1','S100P','MDM2','TFF3','GADD45B','ARMT1','METRN','DUSP1',
             'GGH','CRABP2','EZR','ZFP36','ETFRF1','CCNL1','KRT19','S100A9','RASSF8','PPP1R15A',
             'MID1IP1','TMEM176B','SMIM22','SPDEF','IER3','KRT7','S100A14','SCGB1D2','VMP1',
             'SLC9A3R1','PAWR']
}


In [None]:
# Fewer valid genes than this triggers the "unstable score" warning.
MIN_VALID_GENES = 3

# Genes of each MP that are absent from adata.var_names.
missing_genes = {}

# Score every MP gene set on `adata`.
for mp_name, genes in MP_gene_sets.items():
    # Split the gene list into genes present in / absent from the expression matrix.
    valid_genes = [gene for gene in genes if gene in adata.var_names]
    absent_genes = [gene for gene in genes if gene not in adata.var_names]
    if absent_genes:
        # Record every MP that has at least one missing gene, so the report
        # below matches its heading.
        missing_genes[mp_name] = absent_genes

    if len(valid_genes) < MIN_VALID_GENES:
        print(f"[Warning] {mp_name} 有效基因少于{MIN_VALID_GENES}个，跳过或打分不稳定")

    if not valid_genes:
        # Nothing to score: keep the score column present (as NaN) so that
        # later cells selecting every '<MP>_score' column still work.
        adata.obs[f'{mp_name}_score'] = np.nan
        continue

    sc.tl.score_genes(adata, gene_list=valid_genes, score_name=f'{mp_name}_score', use_raw=False)

# Report the MPs that have genes missing from the data.
print("\n以下MP存在未找到的基因：")
for mp, genes in missing_genes.items():
    print(f"{mp}: {genes}")

In [None]:
# KRT5 / KRT14 / KRT17 expression per 'effect' group.
sc.pl.violin(
    adata_epi,
    keys=['KRT5', 'KRT14', 'KRT17'],
    groupby='effect',
)

In [None]:
# KRT5 / KRT14 / KRT17 expression per 'leiden-epi-after' cluster.
sc.pl.violin(
    adata_epi,
    keys=['KRT5', 'KRT14', 'KRT17'],
    groupby='leiden-epi-after',
)

In [None]:
# UMAP of the after-treatment cells colored by KRT5 / KRT14 / KRT17 expression.
sc.pl.umap(
    adata_epi,
    color=['KRT5', 'KRT14', 'KRT17'],
)

In [None]:
# UMAP colored by 'effect' with a fixed two-color palette; saved with the
# suffix '-effect-tumor-type.svg'.
effect_palette = ['#5495cf', '#db4743']
sc.pl.umap(
    adata_epi,
    color=['effect'],
    palette=effect_palette,
    save='-effect-tumor-type.svg',
)

In [None]:
# Names of all score columns, one per MP gene set.
score_cols = [f"{mp_name}_score" for mp_name in MP_gene_sets]

# One violin plot per MP score, split by the grouping column.
for col in score_cols:
    sc.pl.violin(
        adata,
        keys=col,
        groupby=group_col,
        stripplot=False,
        rotation=45,
        size=0.5,
        show=True,
    )


In [None]:
# Mean of every MP score within each group of `group_col`.
scores_by_group = adata.obs.groupby(group_col)[score_cols]
df_avg = scores_by_group.mean()
print("\n每组MP平均得分：", df_avg, sep="\n")

In [None]:
# Quick grouped bar chart: one cluster of bars per MP score, one bar per group.
mp_scores_by_program = df_avg.T
mp_scores_by_program.plot(kind='bar', figsize=(12, 6))
plt.title('Average MP Scores per Group')
plt.ylabel('Mean Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# 1) Rename the row index (group labels) to the labels shown in the legend.
label_map = {
    "better": "Better",
    "worse": "Worse"
}
df_plot = df_avg.rename(index=label_map)

# 2) Color assigned to each legend label.
color_map = {
    "Better": '#5495cf',
    "Worse": "#db4743"
}
bar_colors = [color_map[group_label] for group_label in df_plot.index]

# 3) Plot the transposed table so that each group becomes one legend entry;
#    colors follow the row order of df_plot.
ax = df_plot.T.plot(
    kind='bar',
    figsize=(10, 6),
    color=bar_colors,
    edgecolor='black'
)

# 4) Title, axis label and tick formatting.
plt.title('Average MP Scores per Group', fontsize=16, fontweight='bold')
plt.ylabel('Mean Score', fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.yticks(fontsize=12)

# 5) Hide the top and right spines.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

# 6) Legend with a frame but no visible frame edge.
leg = ax.legend(title='effect', frameon=True)
leg.get_frame().set_edgecolor('none')

# Grid lines on both axes.
# ax.grid(True, axis='y', linestyle='--', alpha=0.7)  # horizontal lines only
ax.grid(True, axis='both')

plt.tight_layout()
plt.savefig("figures/MP_scores_after_mean.svg", bbox_inches='tight')
plt.show()


#### 基因交集分析

In [None]:
# Top 100 ranked gene names per 'effect' group, taken from the ranking
# currently stored on `adata_epi`, followed by the genes common to both lists.
ranked_gene_names = pd.DataFrame(adata_epi.uns['rank_genes_groups']['names'])
better100 = ranked_gene_names['better'][:100]
worse100 = ranked_gene_names['worse'][:100]
set(better100) & set(worse100)

In [None]:
# whether over express in young breast cancer
from scipy.stats import mannwhitneyu
adata_better_normal = adata_epi[np.array(adata_epi.obs['cnv_status'] == 'Normal') * np.array(adata_epi.obs['effect'] == 'better'), :]
adata_better_tumor = adata_epi[np.array(adata_epi.obs['cnv_status'] == 'Tumor') * np.array(adata_epi.obs['effect'] == 'better'), :]
adata_worse_normal = adata_epi[np.array(adata_epi.obs['cnv_status'] == 'Normal') * np.array(adata_epi.obs['effect'] == 'worse'), :]
adata_worse_tumor = adata_epi[np.array(adata_epi.obs['cnv_status'] == 'Tumor') * np.array(adata_epi.obs['effect'] == 'worse'), :]

In [None]:
from supervenn import supervenn

In [None]:
# whether over express in young breast cancer
over_expr_better_tumor = []
for gene in better100[:100]:
  expr1 = adata_better_normal[:, gene].X.todense().A.flatten()
  expr2 = adata_better_tumor[:, gene].X.todense().A.flatten()
  over_expr = expr1.mean() < expr2.mean()
  stat, pval = mannwhitneyu(expr1, expr2)
  if pval <10e-10 and over_expr:
    over_expr_better_tumor.append(gene)
over_expr_better_tumor

In [None]:
# whether over express in young breast cancer
lack_expr_worse_tumor = []
for gene in better100:
  expr1 = adata_worse_normal[:, gene].X.todense().A.flatten()
  expr2 = adata_worse_tumor[:, gene].X.todense().A.flatten()
  lack_expr = expr1.mean() > expr2.mean()
  stat, pval = mannwhitneyu(expr1, expr2)
  if pval < 10e-10 and lack_expr:
    lack_expr_worse_tumor.append(gene)
lack_expr_worse_tumor

In [None]:
# Genes that pass both filters: higher in "better" tumor cells and lower in
# "worse" tumor cells.
valuable_gene = set(over_expr_better_tumor) & set(lack_expr_worse_tumor)
valuable_gene

In [None]:
import matplotlib.pyplot as plt
figpath = 'figure-9.6'

# The four gene collections to compare, keyed by the label shown in the plot.
gene_groups = {
    'Better-specific': better100,
    'Worse-specific': worse100,
    'Upregulated in better tumors': over_expr_better_tumor,
    'Downregulated in worse tumors': lack_expr_worse_tumor,
}
labels = list(gene_groups)
sets = [set(genes) for genes in gene_groups.values()]

fig, ax = plt.subplots(figsize=(4.5, 3.5))
with plt.style.context('bmh'):
  supervenn(
      sets,
      labels,
      widths_minmax_ratio=0.1,
      ax=ax,
      sets_ordering='minimize gaps',
      col_annotations_area_height=0.8,
      side_plot_width=0.5,
  )
# fig.savefig(f"{figpath}/supervenn.svg", bbox_inches='tight')

In [None]:
# Display the summary of the after-treatment AnnData object.
adata_epi

#### 差异基因分析

In [None]:
# Differential expression, 'better' vs 'worse', on the highly variable genes.
# Exactly one of the three inputs below should be active:
# adata_epi_highvar = adata_epi[:, adata_epi.var['highly_variable']].copy()  # all samples
adata_epi_highvar = adata_epi_filter[:, adata_epi_filter.var['highly_variable']].copy() # luminal-like samples
# adata_epi_highvar = adata_epi_basal[:, adata_epi_basal.var['highly_variable']].copy() # basal-like samples

# pyDEG receives the expression table transposed (genes as rows, cells as columns).
expression_by_gene = adata_epi_highvar.to_df().T
dds = ov.bulk.pyDEG(expression_by_gene)
dds.drop_duplicates_index()

# Cell names of the two groups: 'better' is the experimental group, 'worse' the control.
effect_labels = adata_epi_highvar.obs['effect']
better_groups = effect_labels[effect_labels == 'better'].index.tolist()
worse_groups = effect_labels[effect_labels == 'worse'].index.tolist()
deg_result = dds.deg_analysis(better_groups, worse_groups, method='ttest')

dds.foldchange_set(
    fc_threshold=-1,
    pval_threshold=0.05,
    logp_max=10,
)

In [None]:
# Display the differential-expression table.
# (Optional filter, disabled: keep only rows with a finite -log(pvalue).)
# deg_result = deg_result[np.isfinite(deg_result["-log(pvalue)"])]
deg_result 

In [None]:
# Keep only the genes whose 'sig' flag is not 'normal'.
is_significant = deg_result['sig'] != 'normal'
deg_result_filter = deg_result[is_significant]
deg_result_filter

In [None]:
# Write the filtered DEG table, including its index, to an Excel file.
deg_result_filter.to_excel(
    "after_deg.xlsx",
    index=True,
)

In [None]:
# Thresholds (adjustable). `q_cut` is not applied by the filter below, which
# relies on the precomputed 'sig' flag instead.
q_cut = 0.001
fc_cut = 2

# Significant genes: flagged by 'sig' and with |log2FC| >= fc_cut.
# sig = deg_result[(deg_result['qvalue'] <= q_cut) & (deg_result['log2FC'].abs() >= fc_cut)].copy()
is_flagged = deg_result['sig'] != 'normal'
is_large_change = deg_result['log2FC'].abs() >= fc_cut
sig = deg_result[is_flagged & is_large_change].copy()

up_sig = sig[sig['log2FC'] > 0]
down_sig = sig[sig['log2FC'] < 0]

# Up-regulated: top 5 by |log2FC| plus top 5 by -log(qvalue), merged and
# de-duplicated in order of first appearance.
up_by_fc = up_sig.nlargest(5, 'abs(log2FC)')
up_by_q  = up_sig.nlargest(5, '-log(qvalue)')
up_candidates = up_by_fc.index.tolist() + up_by_q.index.tolist()
up_genes = pd.Index(up_candidates).unique().tolist()

# Down-regulated: same selection.
down_by_fc = down_sig.nlargest(5, 'abs(log2FC)')
down_by_q  = down_sig.nlargest(5, '-log(qvalue)')
down_candidates = down_by_fc.index.tolist() + down_by_q.index.tolist()
down_genes = pd.Index(down_candidates).unique().tolist()

# Final list: up-regulated first, then down-regulated.
selected_genes = up_genes + down_genes
print(f"Selected {len(selected_genes)} genes "
      f"(up {len(up_genes)}, down {len(down_genes)}) for labeling.")

In [None]:
# Hand-picked genes to label on the volcano plot.
# NOTE(review): this assignment replaces the automatically selected
# `selected_genes`, and two later cells assign `selected_genes` again; under
# Run All the last assignment is the one that is used.

# Up-regulated genes
up_genes = [
    "LGALS7B",  # galectin family member; cell adhesion and immune-response regulation
    "APOE",  # apolipoprotein E; lipid transport and immune regulation, linked to tumor progression
    "WIF1",  # Wnt inhibitory factor 1; inhibits Wnt signaling, tumor-suppressive
    "GRP",  # gastrin-releasing peptide; regulates proliferation and neuroendocrine signaling
    "DPP4",  # dipeptidyl peptidase 4; immune regulation, metabolism, tumor-microenvironment remodeling
    "ID4",  # inhibitor of DNA binding 4; regulates transcription and cell differentiation
    "MYH11", # smooth-muscle myosin heavy chain 11; smooth-muscle contraction and cytoskeleton
    "ACTG2",  # smooth-muscle gamma-actin 2; cytoskeleton maintenance and contractile function
    "NTRK2",  # neurotrophin receptor TrkB; regulates cell survival, differentiation and signaling
    "FRZB" # secreted Frizzled-related protein; antagonizes Wnt signaling
]

# Down-regulated genes
down_genes = [
    "KRT19",
    "PVALB",
    "SCGB1B2P",
    "AC093001.1",
    "CPB1",  # carboxypeptidase B1; pancreatic digestive enzyme
    "CYP4F8",  # cytochrome P450 family member; arachidonic-acid metabolism and inflammation
    "FGFBP1",  # FGF-binding protein 1; regulates FGF signaling and angiogenesis
    "CGA",  # glycoprotein hormone alpha subunit; component of thyrotropin and gonadotropins
    "CXCL13",  # chemokine CXCL13; B-cell chemotaxis, immune response
    "GBP5",  # guanylate-binding protein 5; interferon-mediated anti-infection and immune response
    "SLC26A3",  # anion transporter; intestinal electrolyte absorption, linked to inflammatory bowel disease
    "MZB1",  # marginal zone B-cell-specific protein 1; B-cell differentiation and antibody secretion
    "IDO1",  # tryptophan-metabolizing enzyme; immunosuppressive, linked to drug resistance
    "KLHDC7B",  # Kelch-domain protein; function not fully established, some studies suggest a tumor link
    # "CDKN2A",
    "CLDN4",
    "PROM1",  # CD133; stem-cell marker, linked to drug resistance and relapse
    "VTCN1"   # B7-H4 immune checkpoint molecule; immunosuppressive
]
selected_genes = up_genes + down_genes


In [None]:
# Hand-picked genes to label on the volcano plot for the basal-like subset.
# Assigning `selected_genes` here replaces any earlier selection.

# Up-regulated genes
up_genes_basal = [
    "GFRA1",  # GDNF family receptor alpha 1; neurotrophic-factor signaling, cell survival and differentiation
    "LGALS7B",  # galectin family; cell adhesion and immune-response regulation
    "MUCL1",  # breast-associated mucin; highly expressed in breast cancer, may take part in tumorigenesis
    "EHF",  # ETS transcription factor family; epithelial differentiation and cancer progression
    "SCGB1D2",  # secretoglobin family member; mainly expressed in breast tissue, function not fully established
    "FAM78B",  # no annotation recorded
    "KIT",  # stem cell factor receptor; proliferation and differentiation, prognostic in some breast cancer subtypes
    "DCLK1",  # no annotation recorded
    "DPP4",  # no annotation recorded
    "IGKV2-30"  # no annotation recorded
]

# Down-regulated genes (each gene listed once; "RSAD2" previously appeared twice)
down_genes_basal = [
    "CXCL10",  # immune chemokine; recruits immune cells, downregulation may affect the immune microenvironment
    "APOL4",  # no annotation recorded
    "IGLV1-44",  # no annotation recorded
    "IFI6",  # interferon-stimulated gene; regulates apoptosis and the cell stress response
    "GBP5",  # no annotation recorded
    "IGKV1-6",  # no annotation recorded
    "RSAD2",  # antiviral immunity; downregulation may indicate weakened interferon signaling
    "OAS3",  # no annotation recorded
    "HLA-DQA2",  # no annotation recorded
    "IGLV2-11",  # no annotation recorded
    "CHI3L1", # inflammation and tissue remodeling, common in tumor progression; downregulation may reflect microenvironment changes
    "PDK4", # metabolic regulator of glucose metabolism and energy balance; downregulation may indicate metabolic reprogramming
]
selected_genes = up_genes_basal + down_genes_basal

In [None]:
# Hand-picked genes to label on the volcano plot for the luminal-like subset.
# Assigning `selected_genes` here replaces any earlier selection.

# Up-regulated genes
up_genes_luminal = [
    "AQP5",  # aquaporin 5; water transport, linked to tumor cell migration and invasion
    "S100B",  # calcium-binding protein; proliferation, differentiation and tumor progression
    "SCGB3A1",  # secretoglobin family member; expressed in breast tissue, possibly tumor-suppressive
    "SOX10",  # transcription factor; neural crest and breast development, breast cancer subtyping marker
    "FRZB",  # Frizzled-related protein; Wnt pathway inhibitor
    "NPY1R", # neuropeptide Y receptor 1; regulation of proliferation and angiogenesis
    "DLL1",  # Delta-like ligand 1; key Notch pathway molecule
    "SNORC",  # cartilage-associated protein; little functional data, may take part in extracellular-matrix regulation
    "AC007906.2",  # long non-coding RNA; function not yet clear
    "PTHLH"  # parathyroid hormone-related protein; bone remodeling and tumor metastasis
]

# Down-regulated genes
down_genes_luminal = [
    "CPB1",  # carboxypeptidase B1; pancreatic digestive enzyme, downregulation may indicate metabolic change
    "NPB",  # neuropeptide B; neuroendocrine signaling
    "AC093001.1",  # long non-coding RNA; function not yet clear
    "CSTA",  # cystatin A (cysteine protease inhibitor); proteolysis and apoptosis regulation
    "CYP4F8",  # cytochrome P450 family; arachidonic-acid metabolism
    "DSG3",  # desmoglein 3; cell adhesion, plays a role in tumor invasion
    "LINC01588",  # long non-coding RNA; function not yet clear
    "C12orf75",  # chromosome 12 open reading frame 75; function not yet clear
    "CHI3L1", # chitinase-3-like protein 1; inflammation and tumor progression
    "GUCY1A2"  # guanylate cyclase subunit alpha-2; NO signaling
]
selected_genes = up_genes_luminal + down_genes_luminal

In [None]:
import numpy as np
import pandas as pd

# 1) Work on a copy; genes are expected to be the index. If they are in the
#    first column instead, enable the next line.
# deg_result = deg_result.set_index(deg_result.columns[0])

df = deg_result.copy()

# 2) Force the plotted columns to numeric; non-numeric entries (e.g. the
#    string 'nan') become NaN.
for col in ['qvalue', 'log2FC']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 3) Floor q-values at 1e-50 so that -log10(qvalue) stays finite and <= 50.
# eps = np.nextafter(0, 1)
# df['qvalue'] = df['qvalue'].clip(lower=eps)
df['qvalue'] = df['qvalue'].clip(lower=10**-50)

# 4) Turn any remaining +/-inf into NaN and drop rows without usable coordinates.
df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=['qvalue', 'log2FC'])

# 5) Cap the precomputed -log columns, when present, at 50. Infinite values
#    were already replaced by NaN in step 4. Assigning the result back avoids
#    a chained in-place call on a column, which does not modify `df` under
#    pandas Copy-on-Write.
for col in ['-log(pvalue)', '-log(qvalue)']:
    if col in df.columns:
        df[col] = df[col].clip(upper=50)

# 6) Label only genes that survived cleaning and have finite coordinates.
plot_genes = []
for gene in selected_genes:
    if gene in df.index:
        qv = df.at[gene, 'qvalue']
        fc = df.at[gene, 'log2FC']
        if np.isfinite(qv) and np.isfinite(fc) and (qv > 0):
            plot_genes.append(gene)

print(f"After cleaning: {df.shape[0]} rows; labeling {len(plot_genes)} genes.")

# 7) Draw the volcano plot from the cleaned table, passing the raw q-value
#    column name.
import matplotlib.pyplot as plt
import omicverse as ov

fig, ax = plt.subplots(figsize=(5, 5), dpi=150)
grp1, grp2 = 'better', 'worse'

ov.pl.volcano(
    result=df,
    pval_name='qvalue',      # raw q-values; presumably -log10 is applied inside the function -- confirm
    fc_name='log2FC',
    pval_max=None,
    FC_max=None,
    figsize=(5, 5),
    title=f'Volcano: {grp1} vs {grp2}',
    titlefont={'weight': 'normal', 'size': 14},
    up_color='#e25d5d',
    down_color='#7388c1',
    normal_color='#d7d7d7',
    up_fontcolor='#e25d5d',
    down_fontcolor='#7388c1',
    normal_fontcolor='#d7d7d7',
    legend_bbox=(0.8, -0.2),
    legend_ncol=2,
    legend_fontsize=8,
    # thresholds
    pval_threshold=0.001,
    fc_max=1.5,               # note: lower-case `fc_max`, distinct from `FC_max` above
    # gene labels
    plot_genes=plot_genes,
    plot_genes_num=len(plot_genes),
    plot_genes_fontsize=5,
    ticks_fontsize=8,
    ax=ax
)

plt.tight_layout()
plt.savefig("figures/after_luminal_DEG.svg", bbox_inches='tight')
plt.show()


In [None]:
# Box plots of selected DEGs, comparing the "Better" (treatment) groups
# against the "Worse" (control) groups.
dds.plot_boxplot(genes=['IFI6','IFI27', 'CDKN2A', 'HLA-DRB1', 'CD74', 'AREG', 'TFF1', 'CXCL2', 'KIT', 'RDH10'],treatment_groups=better_groups,
                treatment_name ='Better',control_name='Worse',title = 'Better VS Worse Gene Expression',
                control_groups=worse_groups,figsize=(10,3),fontsize=12,
                 legend_bbox=(1,0.55))

# Grab the current figure (the one just drawn) and save it as SVG.
fig = plt.gcf()
fig.tight_layout()
fig.savefig('figures/epi-after-DEGs-Expression.svg', format='svg')

In [None]:
# Same box plot as the previous cell for a second gene panel; this figure is
# only displayed, the save call below is intentionally left commented out.
dds.plot_boxplot(genes=['NRG1', 'GAS1', 'ACTG2',  'CLU', 'PER1', 'CRISPLD1', 'ZBTB16', 'CDKN1A', 'CDKN2A'],treatment_groups=better_groups,
                treatment_name ='Better',control_name='Worse',title = 'Better VS Worse Gene Expression',
                control_groups=worse_groups,figsize=(10,3),fontsize=12,
                 legend_bbox=(1,0.55))

fig = plt.gcf()
fig.tight_layout()
# fig.savefig('figures/epi-after-DEGs-Expression.svg', format='svg')

In [None]:
# ====== Parameters (edit as needed) ======
topn = 30                 # total number of genes shown (down + up)
use_raw = False           # True -> read expression from adata.raw
layer = None              # name of a layer to use; None -> use .X
# Column (sample) order and grouping
worse_batches  = ["11T", "12T", "16T"]
better_batches = ["13T", "19T"]
# Display names for the x axis
batch_rename = {
    "11T": "BCYL032T",
    "12T": "BCYK372T",
    "13T": "BCYC822T",
    "16T": "BCYD452T",
    "19T": "BCYY602T",
}
# Colours
effect_palette = {"better": "#62CAD1", "worse": "#FD7C76"}

# ====== 0) Pick the top genes from the DE result (better vs worse) ======
import pandas as pd
import numpy as np
from scipy import sparse
import seaborn as sns
import matplotlib.pyplot as plt
import palettable
from scipy.cluster.hierarchy import linkage, leaves_list

# Take topn/2 genes up in "better" and fill the rest with genes up in "worse".
# Filtering on the sign of log2FC guarantees a gene cannot land in both lists
# (or be labelled with the wrong direction) when the table is short or one-sided.
up = (deg_result_filter.loc[deg_result_filter["log2FC"] > 0]
      .sort_values("log2FC", ascending=False)
      .head(topn // 2))            # up in better
down = (deg_result_filter.loc[deg_result_filter["log2FC"] < 0]
        .sort_values("log2FC", ascending=True)
        .head(topn - len(up)))     # up in worse (= down in better)

# Keep only cells from the batches of interest
keep_batches = worse_batches + better_batches
adata_sub = adata_epi[adata_epi.obs["batch"].isin(keep_batches)].copy()

# Keep only genes that exist in the object
var_names = (adata_sub.raw.var_names if use_raw else adata_sub.var_names)
down_genes = [g for g in down.index if g in var_names]
up_genes   = [g for g in up.index   if g in var_names]
assert len(down_genes) + len(up_genes) > 0, "No selected genes found in AnnData."

# ====== 1) Expression matrix -> per-batch pseudobulk (mean), genes x batches ======
if use_raw:
    X = adata_sub.raw[:, down_genes + up_genes].X
else:
    X = adata_sub[:, down_genes + up_genes].X if layer is None else adata_sub[:, down_genes + up_genes].layers[layer]
if sparse.issparse(X):
    X = X.toarray()

expr_gc = pd.DataFrame(X.T, index=down_genes + up_genes, columns=adata_sub.obs_names)
# Plain strings instead of a categorical: unused categories left over from the
# subsetting would otherwise be able to show up as extra groups.
cell_to_batch = adata_sub.obs["batch"].astype(str)
# groupby(..., axis=1) is deprecated in pandas; group the transposed
# (cells x genes) frame by batch and transpose back to genes x batches.
expr_gb = expr_gc.T.groupby(cell_to_batch).mean().T

# Column order: worse first, then better; rename for display
ordered_batches = [b for b in (worse_batches + better_batches) if b in expr_gb.columns]
expr_gb = expr_gb.loc[:, ordered_batches]
expr_gb = expr_gb.rename(columns=batch_rename)

# Top colour bar (batch -> effect -> colour). expr_gb.columns already holds the
# display names, in the same order as ordered_batches.
batch_to_effect = {**{b: "worse" for b in worse_batches}, **{b: "better" for b in better_batches}}
col_effect = pd.Series(index=ordered_batches, data=[batch_to_effect[b] for b in ordered_batches])
col_colors = pd.Series(index=expr_gb.columns, dtype=object)
for b in ordered_batches:
    # .get(b, b) mirrors DataFrame.rename, which leaves unmapped labels unchanged
    col_colors.loc[batch_rename.get(b, b)] = effect_palette[col_effect.loc[b]]

# ====== 2) Cluster within each block (down, then up) but keep the overall order "down -> up" ======
# Row-wise z-scores are used only to compute the clustering order;
# rows with zero variance are set to 0.
expr_mean = expr_gb.mean(axis=1)
expr_std  = expr_gb.std(axis=1).replace(0, np.nan)
expr_z_for_order = expr_gb.sub(expr_mean, axis=0).div(expr_std, axis=0).fillna(0.0)

# Cluster the down / up blocks separately and take the dendrogram leaf order
def cluster_order(mat):
    """Return the row labels of `mat` in average-linkage (correlation distance) leaf order."""
    if mat.shape[0] <= 2:
        return list(mat.index)  # too few rows to cluster; keep the given order
    Z = linkage(mat.values, method="average", metric="correlation")
    order_idx = leaves_list(Z)
    return list(mat.index[order_idx])

down_order = cluster_order(expr_z_for_order.loc[[g for g in down_genes if g in expr_gb.index]])
up_order   = cluster_order(expr_z_for_order.loc[[g for g in up_genes   if g in expr_gb.index]])

row_order = down_order + up_order
expr_gb_ordered = expr_gb.loc[row_order]

# (optional) row colour bar marking "worse_up" / "better_up"
row_class = pd.Series(index=expr_gb_ordered.index, dtype=object)
row_class.loc[down_order] = "worse_up"
row_class.loc[up_order]   = "better_up"
row_palette = {"better_up": effect_palette["better"], "worse_up": effect_palette["worse"]}
row_colors = row_class.map(row_palette)

# ====== 3) Plot: rows are not re-clustered (our order is kept) ======
g = sns.clustermap(
    expr_gb_ordered,
    cmap=palettable.colorbrewer.diverging.RdYlGn_3_r.mpl_colors,
    standard_scale=0,       # per-row min-max scaling to [0, 1] (this is NOT a z-score; use z_score=0 for that)
    figsize=(4.2, 8),
    row_cluster=False,      # keep the custom row order (down -> up, clustered within each block)
    col_cluster=False,      # keep the sample order
    col_colors=col_colors.values,  # top effect colour bar
    row_colors=row_colors.values,  # left gene-class colour bar (optional)
    xticklabels=True,
    yticklabels=True,
    dendrogram_ratio=(.12, .12),
)

# Cosmetics
g.ax_heatmap.yaxis.set_tick_params(labelsize=8)
g.ax_heatmap.xaxis.set_tick_params(labelsize=11)
plt.setp(g.ax_heatmap.get_xticklabels(), rotation=90, ha="right")

# Legend (effect)
from matplotlib.patches import Patch
handles_col = [Patch(facecolor=effect_palette[k], label=k) for k in ["better", "worse"]]
g.ax_col_dendrogram.legend(handles=handles_col, loc="best",
                           bbox_to_anchor=(-0.45, 0., 0.5, 0.5), ncol=1, fontsize=10, title=None)

# Legend (gene class)
handles_row = [Patch(facecolor=row_palette[k], label=k) for k in ["worse_up", "better_up"]]
g.ax_row_dendrogram.legend(handles=handles_row, loc="best",
                           bbox_to_anchor=(0.0, 0.9, 0.5, 0.5), ncol=1, fontsize=9, title=None)

# Reposition the colour bar
g.cax.set_position([-.10, .2, .03, .45])
plt.setp(g.cax.yaxis.get_majorticklabels(), fontsize=10)
# Save
plt.savefig("figures/after_luminal_deg_heatmap_by_batch.svg", bbox_inches="tight")
plt.show()


#### 通路富集分析

In [None]:
# Wilcoxon marker ranking per 'emt-type' group.
# NOTE(review): no key_added is given, so a later rank_genes_groups call on
# adata_epi presumably overwrites this result — confirm that is intended.
sc.tl.rank_genes_groups(adata_epi, groupby = 'emt-type', method = 'wilcoxon')

In [5]:
# Wilcoxon marker ranking per 'effect' group; the enrichment cells below read
# this result from adata_epi.uns['rank_genes_groups'].
sc.tl.rank_genes_groups(adata_epi, groupby = 'effect', method = 'wilcoxon')

... storing 'effect' as categorical


In [None]:
import gseapy as gp

# Build the pre-ranked GSEA metric from the significant DEGs only.
# .copy() makes gseada an independent frame, so the column assignments below
# do not raise SettingWithCopyWarning and never touch deg_result_filter.
gseada = deg_result_filter.loc[deg_result_filter['sig'] != 'normal'].copy()

# metric = -log10(pvalue) carrying a direction taken from log2FC.
# NOTE(review): fcsign is the *negated* sign of log2FC, so genes with positive
# log2FC get a negative metric — confirm this orientation is intended.
gseada['fcsign'] = -np.sign(gseada['log2FC'])
gseada['logp'] = -np.log10(gseada['pvalue'])   # pvalue == 0 gives inf; handled when building rnk
gseada['metric'] = gseada['logp'] / gseada['fcsign']
gseada.head()

In [None]:
# 数据排序
gseada=gseada.sort_values(by=['metric'],ascending=False)
gseada.head()

In [None]:
# rnk矩阵提取 我们发现数据中有inf，这对后续的分析有一定的影响，所以我们对inf进行赋值
rnk=pd.DataFrame()
rnk['gene_name']=gseada.index
rnk['rnk']=gseada['metric'].values
k=1
total=0
for i in range(len(rnk)):
    if rnk.loc[i,'rnk']==np.inf: 
        total+=1
#200跟274根据你的数据进行更改，保证inf比你数据最大的大，-inf比数据最小的小就好
for i in range(len(rnk)):
    if rnk.loc[i,'rnk']==np.inf: 
        rnk.loc[i,'rnk']=200+(total-k)
        k+=1
    elif rnk.loc[i,'rnk']==-np.inf: 
        rnk.loc[i,'rnk']=-(274+k)
        k+=1
#rnk=rnk.replace(np.inf,300)
#rnk=rnk.replace(-np.inf,-300)
rnk.head()

```
rnk：输入排序矩阵

gene_sets：需要富集到的通路数据集

processes：并行使用的线程数

permutation_num：置换检验的次数（次数越少运行越快，但 p 值的精度越低）

outdir：结果输出目录
```

In [None]:
#我们可以通过以下函数观察都有哪些数据集可被用来富集
names = gp.get_library_name() # default: Human
names

In [None]:
# Pre-ranked GSEA on the rnk table; results are written under outdir as SVG.
pre_res = gp.prerank(rnk=rnk, gene_sets='KEGG_2021_Human',# alternatives: KEGG_2021_Human, MSigDB_Hallmark_2020
                     processes=4,
                     permutation_num=100, # reduce number to speed up testing
                     outdir='gseapy/after_total_KEGG', format='svg', seed=6)

In [None]:
# Show the prerank result table ordered by its index.
pre_res.res2d.sort_index()

In [None]:
from gseapy.plot import gseaplot

# Name of the pathway (term) to visualise; it must be a key of pre_res.results
term = "Cytokine-cytokine receptor interaction"

# Draw the enrichment curve for that term; the per-term result dict is
# forwarded to gseaplot as keyword arguments.
gseaplot(rank_metric=pre_res.ranking,
         term=term,
         **pre_res.results[term])

In [6]:
import gseapy as gp
from gseapy import Msigdb

# Download the MSigDB hallmark collection for human.
# The category must match the release: 'h.all' is the human hallmark set for
# '*.Hs' releases, whereas 'mh.all' is the mouse set and only exists in '*.Mm'
# releases. The previous 'mh.all' + '2024.1.Hs' combination names a file that
# is not part of the human release.
msig = Msigdb()
gmt = msig.get_gmt(category='h.all', dbver="2024.1.Hs")
gmt

In [7]:
# get deg result
result = adata_epi.uns['rank_genes_groups']
groups = result['names'].dtype.names
degs = pd.DataFrame(
    {group + '_' + key: result[key][group]
    for group in groups for key in ['names','scores', 'pvals','pvals_adj','logfoldchanges']})
degs

Unnamed: 0,better_names,better_scores,better_pvals,better_pvals_adj,better_logfoldchanges,worse_names,worse_scores,worse_pvals,worse_pvals_adj,worse_logfoldchanges
0,APOE,33.339470,1.035174e-243,4.113263e-239,3.710596,KRT19,31.427105,8.628224e-217,1.142808e-212,5.395644
1,SPARCL1,32.597927,4.388298e-233,8.718452e-229,3.029246,KRT18,31.398434,2.125431e-216,2.111350e-212,4.111528
2,TPM2,31.125917,1.074475e-212,8.538850e-209,3.106501,CD24,30.232851,8.768407e-201,5.806877e-197,4.623875
3,ID4,30.042572,2.729969e-198,1.549647e-194,2.934598,CLDN4,29.284061,1.655119e-188,7.307351e-185,4.153338
4,ECRG4,29.813145,2.639052e-195,1.310784e-191,2.868367,ELF3,28.769592,5.152225e-182,1.861124e-178,3.748566
...,...,...,...,...,...,...,...,...,...,...
39730,ELF3,-28.769592,5.152225e-182,1.861124e-178,-3.748566,ECRG4,-29.813145,2.639052e-195,1.310784e-191,-2.868367
39731,CLDN4,-29.284061,1.655119e-188,7.307351e-185,-4.153338,ID4,-30.042572,2.729969e-198,1.549647e-194,-2.934598
39732,CD24,-30.232851,8.768407e-201,5.806877e-197,-4.623875,TPM2,-31.125917,1.074475e-212,8.538850e-209,-3.106501
39733,KRT18,-31.398434,2.125431e-216,2.111350e-212,-4.111528,SPARCL1,-32.597927,4.388298e-233,8.718452e-229,-3.029246


In [8]:
# subset up or down regulated genes
degs_sig = degs[degs.better_pvals_adj < 0.05]
degs_up = degs_sig[degs_sig.better_logfoldchanges > 0]
degs_dw = degs_sig[degs_sig.better_logfoldchanges < 0]

In [9]:
# Inspect the genes up-regulated in "better".
degs_up

Unnamed: 0,better_names,better_scores,better_pvals,better_pvals_adj,better_logfoldchanges,worse_names,worse_scores,worse_pvals,worse_pvals_adj,worse_logfoldchanges
0,APOE,33.339470,1.035174e-243,4.113263e-239,3.710596,KRT19,31.427105,8.628224e-217,1.142808e-212,5.395644
1,SPARCL1,32.597927,4.388298e-233,8.718452e-229,3.029246,KRT18,31.398434,2.125431e-216,2.111350e-212,4.111528
2,TPM2,31.125917,1.074475e-212,8.538850e-209,3.106501,CD24,30.232851,8.768407e-201,5.806877e-197,4.623875
3,ID4,30.042572,2.729969e-198,1.549647e-194,2.934598,CLDN4,29.284061,1.655119e-188,7.307351e-185,4.153338
4,ECRG4,29.813145,2.639052e-195,1.310784e-191,2.868367,ELF3,28.769592,5.152225e-182,1.861124e-178,3.748566
...,...,...,...,...,...,...,...,...,...,...
35903,SRRT,-6.971061,3.145592e-12,3.008186e-11,0.017128,FADS6,-0.017310,9.861894e-01,1.000000e+00,-2.711586
35909,SNX2,-6.977830,2.997743e-12,2.872325e-11,0.028342,AC048382.2,-0.017310,9.861894e-01,1.000000e+00,-3.087893
35923,MRPL22,-6.991432,2.720944e-12,2.615938e-11,0.001455,ZFHX4-AS1,-0.017310,9.861894e-01,1.000000e+00,-5.637582
35942,CDC40,-7.012558,2.340007e-12,2.260092e-11,0.036961,SLC6A1,-0.017310,9.861894e-01,1.000000e+00,-2.986125


In [13]:
# Enrichr over-representation analysis for the genes up-regulated in "better".
# All five results are defined here because the plotting cells further down
# read enr_up, cc_up, bp_up, mf_up and hm_up; with four of them commented out
# a fresh "Restart & Run All" stops with a NameError.
# Each call needs network access to the Enrichr service.
enr_up = gp.enrichr(degs_up.better_names,
                    gene_sets='KEGG_2021_Human',
                    outdir=None)
cc_up = gp.enrichr(degs_up.better_names,
                    gene_sets='GO_Cellular_Component_2023',
                    outdir=None)
bp_up = gp.enrichr(degs_up.better_names,
                    gene_sets='GO_Biological_Process_2023',
                    outdir=None)
mf_up = gp.enrichr(degs_up.better_names,
                    gene_sets='GO_Molecular_Function_2023',
                    outdir=None)
hm_up = gp.enrichr(degs_up.better_names,
                    gene_sets='MSigDB_Hallmark_2020',
                    outdir=None)

SSLError: HTTPSConnectionPool(host='maayanlab.cloud', port=443): Max retries exceeded with url: /Enrichr/datasetStatistics (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1007)')))

In [None]:
def _enrich_down(library):
    """Run Enrichr for the genes down-regulated in "better" against one library."""
    return gp.enrichr(degs_dw.better_names, gene_sets=library, outdir=None)


# One result object per gene-set library (KEGG, GO CC / BP / MF, Hallmark).
enr_down = _enrich_down('KEGG_2021_Human')
cc_down = _enrich_down('GO_Cellular_Component_2023')
bp_down = _enrich_down('GO_Biological_Process_2023')
mf_down = _enrich_down('GO_Molecular_Function_2023')
hm_down = _enrich_down('MSigDB_Hallmark_2020')

In [None]:
# Stack the first 15 rows of every "up" enrichment table and draw one grouped
# bar plot, coloured per gene-set library.
up_tables = [res.res2d.head(15) for res in (enr_up, hm_up, cc_up, bp_up, mf_up)]
go_res = pd.concat(up_tables)
ax = gp.barplot(
    go_res,
    figsize=(6,20),
    group='Gene_set',
    top_term=15,
    color=['#8b9cc4', '#f08961', '#62bb9f', '#e71f19', '#3d5eaa'],
    title="The Most Enriched Terms Up in Better",
)
ax.figure.savefig("figures/Epi_After_total_effect_Enrichment_UP.svg")

In [None]:
import re, numpy as np, pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Rectangle, PathPatch
from matplotlib.path import Path
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
import gseapy as gp

# ---------- helpers ----------
def parse_overlap(s):
    """Parse an Enrichr overlap string such as "12/200".

    Returns (hits, set_size, hits / set_size). The ratio is NaN when set_size
    is 0, and all three values are NaN when `s` is not of the form "<int>/<int>".
    """
    hit = re.match(r'^\s*(\d+)\s*/\s*(\d+)\s*$', str(s))
    if hit is None:
        return np.nan, np.nan, np.nan
    numerator, denominator = (int(token) for token in hit.groups())
    if denominator:
        ratio = numerator / denominator
    else:
        ratio = np.nan
    return numerator, denominator, ratio

def std_enrich_cols(df):
    """Return `df` with the Enrichr columns renamed to their canonical spelling.

    Column names are matched ignoring case, spaces and hyphens; columns that do
    not match a canonical name are left untouched. `df` itself is not modified.
    """
    def _normalize(label):
        return label.lower().replace(' ','').replace('-','')

    actual_by_key = {_normalize(col): col for col in df.columns}
    canonical_names = ('Term', 'Overlap', 'P-value', 'Adjusted P-value', 'Genes', 'Combined Score')
    renames = {}
    for canonical in canonical_names:
        # fall back to the canonical name itself when no column matches
        present = actual_by_key.get(_normalize(canonical), canonical)
        renames[present] = canonical
    return df.rename(columns=renames)

def topk(df, k=5):
    """Return the `k` rows of an enrichment table with the smallest adjusted p-value.

    Parameters
    ----------
    df : DataFrame with at least 'Adjusted P-value', 'P-value', 'Overlap' and
        'Term' columns.
    k : number of rows to keep.

    Returns
    -------
    A new DataFrame (index reset) with these columns added: HitCount,
    GeneSetSize, GeneRatio (parsed from 'Overlap'), neglog10P, neglog10Padj
    and Term_clean. An empty input yields an empty frame with the same columns.
    """
    d = df.replace([np.inf, -np.inf], np.nan).dropna(subset=['Adjusted P-value']).copy()
    d = d.sort_values('Adjusted P-value').head(k).reset_index(drop=True)
    # Build the parsed columns with explicit names: Series.apply(pd.Series)
    # produces no columns at all for an empty frame, which broke the renaming.
    xy = pd.DataFrame(
        [parse_overlap(v) for v in d['Overlap']],
        columns=['HitCount', 'GeneSetSize', 'GeneRatio'],
        index=d.index,
        dtype=float,
    )
    d = pd.concat([d, xy], axis=1)
    d['HitCount'] = d['HitCount'].fillna(0).astype(int)
    d['GeneRatio'] = d['GeneRatio'].astype(float)
    # clip keeps -log10 finite when a p-value is exactly 0
    d['neglog10P'] = -np.log10(d['P-value'].clip(lower=1e-300))
    d['neglog10Padj'] = -np.log10(d['Adjusted P-value'].clip(lower=1e-300))
    d['Term_clean'] = d['Term'].astype(str)
    return d

def split_genes(s):
    """Split a ';', ',' or '/' separated gene string into upper-cased, non-empty symbols."""
    symbols = []
    for piece in re.split(r'[;,/]', str(s)):
        cleaned = piece.strip()
        if cleaned:
            symbols.append(cleaned.upper())
    return symbols

def pick_rep_genes(all_genes_in_term, allowed_pool, logfc_map, n=8):
    """Pick up to `n` representative genes of a term.

    Only genes present in `allowed_pool` are considered; they are ordered by
    decreasing |log fold change| (genes missing from `logfc_map` count as 0,
    ties keep their input order).
    """
    def _effect_size(gene):
        return abs(logfc_map.get(gene, 0))

    candidates = [gene for gene in all_genes_in_term if gene in allowed_pool]
    candidates.sort(key=_effect_size, reverse=True)
    return candidates[:n]

def spaced_positions(n, top_padding=0.02, bottom_padding=0.02):
    """Return `n` evenly spaced positions between the paddings of a 0..1 axis.

    For n <= 1 the single position 0.5 (the middle) is returned.
    """
    if n > 1:
        span = 1.0 - top_padding - bottom_padding
        return list(np.linspace(top_padding, top_padding+span, n))
    return [0.5]

# ---------- main ----------
def _resolve_deg_columns(degs_up, degs, group=None):
    """Return the (names, logfoldchanges) column names of the DE group to plot.

    With group=None the group is detected from `degs_up`: 'After' when an
    'After_names' column exists (or when no '*_names' column exists at all),
    otherwise the prefix of the first column that ends in '_names'.
    Raises ValueError when the required columns are missing.
    """
    if group is None:
        prefixes = [
            col[:-len('_names')]
            for col in degs_up.columns
            if isinstance(col, str) and col.endswith('_names')
        ]
        if 'After' in prefixes or not prefixes:
            group = 'After'
        else:
            group = prefixes[0]
    names_col, lfc_col = f'{group}_names', f'{group}_logfoldchanges'
    if names_col not in degs_up.columns:
        raise ValueError(f"需要 degs_up['{names_col}'] 作为上调基因列表。")
    if not {names_col, lfc_col} <= set(degs.columns):
        raise ValueError(f"需要 degs['{names_col}','{lfc_col}']。")
    return names_col, lfc_col


def plot_sankey_bubble_sns(
    degs_up, degs,
    gene_sets='MSigDB_Hallmark_2020',
    TOP_K=8,              # number of pathways kept (largest -log10(Adjusted P-value))
    N_REP=8,
    # bubble size scaling, applied to HitCount
    BUBBLE_SIZE_SCALE=0.6,
    GENE_COLORMAP='tab20',
    PATH_COLORMAP='Set2',
    svg_path='overlapped_genes.svg',
    figsize=(10,8),
    font_family='Noto Sans CJK SC',
    # Sankey styling
    EDGE_LW=3.0, EDGE_ALPHA=0.45, CURV=0.35,
    GENE_NODE_W=0.030, GENE_NODE_H=0.020,
    PATH_NODE_W=0.050, PATH_NODE_H=0.028,
    GAP_TEXT2PATH=0.010,
    # spacing between the two panels
    GRID_WSPACE=0.01,          # base gap between the panels (smaller = closer)
    BUBBLE_SHIFT_LEFT=0.035,   # extra left shift of the bubble panel towards the path blocks
    # thin colour bar
    CBAR_WIDTH="3%", CBAR_HEIGHT="70%",
    group=None
):
    """Draw a gene -> pathway Sankey panel next to an enrichment bubble panel and save it as SVG.

    Parameters
    ----------
    degs_up : DataFrame of up-regulated genes; must contain '<group>_names'.
    degs : DataFrame with '<group>_names' and '<group>_logfoldchanges', used to
        rank the representative genes of each pathway by |log fold change|.
    gene_sets : Enrichr library passed to gp.enrichr (needs network access).
    TOP_K : number of pathways shown, ranked by -log10(Adjusted P-value), ties
        broken by -log10(P-value).
    N_REP : maximum number of representative genes per pathway.
    BUBBLE_SIZE_SCALE : multiplier for the bubble area derived from HitCount.
    GENE_COLORMAP, PATH_COLORMAP : Matplotlib colormap names for the nodes.
    svg_path : output file.
    figsize, font_family : figure size and font. Note that the font is written
        to the global mpl.rcParams and stays active after the call.
    EDGE_*, CURV, *_NODE_W / *_NODE_H, GAP_TEXT2PATH : Sankey geometry and styling.
    GRID_WSPACE, BUBBLE_SHIFT_LEFT : horizontal spacing of the two panels.
    CBAR_WIDTH, CBAR_HEIGHT : size of the colour bar relative to the bubble axes.
    group : prefix of the DE columns (e.g. 'After' or 'better'). None detects
        it from `degs_up`: 'After' when present, otherwise the first
        '*_names' column.

    Returns
    -------
    svg_path, after the figure has been written and closed.

    Raises
    ------
    ValueError if the required columns are missing or the enrichment returns
    no usable term.
    """
    # ----- inputs -----
    names_col, lfc_col = _resolve_deg_columns(degs_up, degs, group)

    # dropna() before astype(str): otherwise NaN turns into the gene "NAN"
    up_genes = degs_up[names_col].dropna().astype(str).str.upper().unique().tolist()
    lfc_table = degs[[names_col, lfc_col]].dropna().copy()
    lfc_table[names_col] = lfc_table[names_col].astype(str).str.upper()
    logfc_map = (lfc_table.drop_duplicates(names_col)
                 .set_index(names_col)[lfc_col]
                 .to_dict())

    # ----- enrichment (up only) -----
    enr_up = gp.enrichr(degs_up[names_col], gene_sets=gene_sets, outdir=None)
    res_up = std_enrich_cols(enr_up.results.copy())

    # statistics used for ranking and plotting
    d = res_up.replace([np.inf, -np.inf], np.nan).dropna(subset=['Adjusted P-value']).copy()
    if d.empty:
        raise ValueError(f"Enrichr returned no terms with a finite adjusted p-value for {gene_sets!r}.")
    xy = pd.DataFrame(
        [parse_overlap(v) for v in d['Overlap']],
        columns=['HitCount', 'GeneSetSize', 'GeneRatio'],
        index=d.index,
        dtype=float,
    )
    d = pd.concat([d, xy], axis=1)
    d['HitCount'] = d['HitCount'].fillna(0).astype(int)
    d['GeneRatio'] = d['GeneRatio'].astype(float)
    # clip keeps -log10 finite when a p-value is exactly 0
    d['neglog10P'] = -np.log10(d['P-value'].clip(lower=1e-300))
    d['neglog10Padj'] = -np.log10(d['Adjusted P-value'].clip(lower=1e-300))
    d['Term_clean'] = d['Term'].astype(str)

    # rank strictly by -log10(Adjusted P-value) and keep the first TOP_K terms
    top_up = (d.sort_values(['neglog10Padj', 'neglog10P'], ascending=[False, False])
                .head(int(TOP_K))
                .reset_index(drop=True))
    path_order = top_up['Term_clean'].tolist()

    # ----- representative genes & sankey edges -----
    allowed_genes = set(up_genes)
    rep_genes = [
        pick_rep_genes(split_genes(gene_field), allowed_genes, logfc_map, n=N_REP)
        for gene_field in top_up['Genes']
    ]
    gene_nodes = sorted(set(g for genes in rep_genes for g in genes))
    path_nodes = path_order[:]
    edges = [(g, term) for term, genes in zip(path_order, rep_genes) for g in genes]

    # ----- colors -----
    # plt.get_cmap(name, lut) replaces mpl.cm.get_cmap, which newer Matplotlib no longer provides
    cmap_gene = plt.get_cmap(GENE_COLORMAP, max(len(gene_nodes), 1))
    cmap_path = plt.get_cmap(PATH_COLORMAP, max(len(path_nodes), 1))
    node_color_map = {g: cmap_gene(i) for i, g in enumerate(gene_nodes)}
    node_color_map.update({p: cmap_path(i) for i, p in enumerate(path_nodes)})

    # ----- fixed node positions -----
    y_genes = spaced_positions(len(gene_nodes))
    y_paths = spaced_positions(len(path_nodes))
    x_left, x_right = 0.06, 0.94
    x_block_left = x_right - PATH_NODE_W
    pos_gene = {g: (x_left, y_genes[i]) for i, g in enumerate(gene_nodes)}
    pos_path = {p: (x_block_left, y_paths[i]) for i, p in enumerate(path_nodes)}

    # ----- figure -----
    mpl.rcParams['font.family'] = font_family
    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(ncols=2, nrows=1, width_ratios=[0.68, 0.32], wspace=GRID_WSPACE)
    ax_sankey = fig.add_subplot(gs[0,0])
    ax_bubble = fig.add_subplot(gs[0,1])

    # ===== Left: Sankey =====
    ax_sankey.set_xlim(0, 1)
    ax_sankey.set_ylim(0, 1)
    ax_sankey.axis('off')

    # gene nodes & labels
    for g in gene_nodes:
        x, y = pos_gene[g]
        ax_sankey.add_patch(Rectangle((x, y - GENE_NODE_H/2), GENE_NODE_W, GENE_NODE_H,
                                      facecolor=node_color_map[g],
                                      edgecolor=(0,0,0,0.35), lw=0.6))
        ax_sankey.text(x - 0.012, y, g, va='center', ha='right', fontsize=9)

    # path nodes: label on the left, colour block on the right
    for p in path_nodes:
        xb, y = pos_path[p]
        ax_sankey.text(xb - GAP_TEXT2PATH, y, p, va='center', ha='right', fontsize=10)
        ax_sankey.add_patch(Rectangle((xb, y - PATH_NODE_H/2), PATH_NODE_W, PATH_NODE_H,
                                      facecolor=node_color_map[p],
                                      edgecolor=(0,0,0,0.35), lw=0.6))

    # edges (cubic Bezier curves)
    def bezier(x0, y0, x1, y1, curv=CURV):
        cx0, cy0 = x0 + curv*(x1 - x0), y0
        cx1, cy1 = x1 - curv*(x1 - x0), y1
        verts = [(x0, y0), (cx0, cy0), (cx1, cy1), (x1, y1)]
        codes = [Path.MOVETO, Path.CURVE4, Path.CURVE4, Path.CURVE4]
        return Path(verts, codes)
    for (g, p) in edges:
        x0, y0 = pos_gene[g][0] + GENE_NODE_W, pos_gene[g][1]
        x1, y1 = pos_path[p][0], pos_path[p][1]
        ax_sankey.add_patch(PathPatch(bezier(x0, y0, x1, y1),
                                      facecolor='none',
                                      edgecolor=(0.6,0.6,0.6,EDGE_ALPHA),
                                      lw=EDGE_LW, capstyle='round'))

    # ===== Right: Bubble =====
    # X = GeneRatio; colour = -log10(Adjusted P-value); size = HitCount.
    # Bubbles are placed at the same y positions (and y limits) as the path
    # nodes, so each bubble lines up with its pathway block in the left panel.
    sizes_px = (top_up["HitCount"].fillna(0).astype(float) * (40 * BUBBLE_SIZE_SCALE) + 20)
    ax_bubble.scatter(
        top_up["GeneRatio"], y_paths,
        s=sizes_px,
        c=top_up["neglog10Padj"],    # coloured by adjusted p-value
        cmap="Reds",
        edgecolors="k", linewidths=0.4, alpha=0.9,
        clip_on=False                # bubbles near the axes edge stay fully visible
    )
    ax_bubble.set_ylim(0, 1)
    ax_bubble.set_yticks(y_paths)

    # hide y labels and ticks (the pathway names of the left panel are shared)
    ax_bubble.set_ylabel("")
    ax_bubble.set_yticklabels([])
    ax_bubble.tick_params(axis='y', length=0)

    ax_bubble.set_xlabel("Gene Ratio")
    ax_bubble.grid(True, alpha=0.3, linewidth=0.6)

    # Lay out the grid first: tight_layout() resets the axes positions, so
    # running it after the manual shift below would silently undo that shift.
    fig.tight_layout()

    # move the bubble panel left so that it touches the path blocks
    pos = ax_bubble.get_position()
    ax_bubble.set_position([pos.x0 - BUBBLE_SHIFT_LEFT, pos.y0, pos.width, pos.height])

    # colour bar (thin, on the right); anchored to the bubble axes, so it follows the shift
    norm = mpl.colors.Normalize(vmin=top_up["neglog10Padj"].min(), vmax=top_up["neglog10Padj"].max())
    mappable = mpl.cm.ScalarMappable(norm=norm, cmap="Reds")
    cb_ax = inset_axes(ax_bubble, width=CBAR_WIDTH, height=CBAR_HEIGHT, loc="center right",
                       bbox_to_anchor=(0.10, 0., 1, 1), bbox_transform=ax_bubble.transAxes,
                       borderpad=0)
    cbar = plt.colorbar(mappable, cax=cb_ax)
    cbar.set_label("-Log10(Adjusted P-value)")
    cbar.ax.tick_params(labelsize=9)

    fig.savefig(svg_path, format="svg", dpi=300)
    plt.close(fig)
    return svg_path


In [None]:
# 假设你已有 degs_up, degs 两个 DataFrame
out = plot_sankey_bubble_sns(
    degs_up, degs,
    gene_sets='MSigDB_Hallmark_2020',
    N_REP=8,
    BUBBLE_SIZE_SCALE=0.1,
    GENE_COLORMAP='tab20',
    PATH_COLORMAP='Set3',
    svg_path='figures/Epi-after-total-upgrades-Hallmark.svg'   # 输出文件名
)

In [None]:
# Stack the first 15 rows of every "down" enrichment table and draw one
# grouped bar plot, coloured per gene-set library.
down_tables = [res.res2d.head(15) for res in (enr_down, hm_down, cc_down, bp_down, mf_down)]
godown_res = pd.concat(down_tables)
ax = gp.barplot(
    godown_res,
    figsize=(6,20),
    group='Gene_set',
    top_term=15,
    color=['#8b9cc4', '#f08961', '#62bb9f', '#e71f19', '#3d5eaa'],
    title="The Most Enriched Terms Down in Better",
)
ax.figure.savefig("figures/Epi_Better_Enrichment_DOWN.svg")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from matplotlib.cm import ScalarMappable
# matplotlib.cm.get_cmap no longer exists in recent Matplotlib releases; fall
# back to the pyplot function so the name stays available and the cell still runs.
try:
    from matplotlib.cm import get_cmap
except ImportError:
    from matplotlib.pyplot import get_cmap

def _get_results_df(enrichr_obj):
    """兼容不同 gseapy 版本：优先用 .results，备用 .res2d"""
    if hasattr(enrichr_obj, "results") and isinstance(enrichr_obj.results, pd.DataFrame):
        return enrichr_obj.results.copy()
    elif hasattr(enrichr_obj, "res2d") and isinstance(enrichr_obj.res2d, pd.DataFrame):
        return enrichr_obj.res2d.copy()
    else:
        raise AttributeError("enrichr object has neither .results nor .res2d as a DataFrame")

def top10_from_enrichr(enrichr_obj, label, n=10):
    """Return the `n` most significant terms of a gseapy enrichr result.

    Parameters
    ----------
    enrichr_obj : object returned by gp.enrichr.
    label : value written to the 'set' column (e.g. 'BP', 'KEGG').
    n : number of terms to keep (default 10).

    Returns
    -------
    DataFrame with columns term, Count, padj, neglog10_padj, set, ordered by
    decreasing -log10(adjusted p-value).
    """
    df = _get_results_df(enrichr_obj)
    df = df.rename(columns={
        'Term': 'term',
        'Adjusted P-value': 'padj',
        'P-value': 'pvalue',
        'Overlap': 'overlap'
    })
    # Count = numerator of the "hits/size" overlap string; entries that cannot
    # be parsed count as 0 instead of raising.
    numerators = df['overlap'].astype(str).str.split('/').str[0]
    df['Count'] = pd.to_numeric(numerators, errors='coerce').fillna(0).astype(int)
    # -log10(FDR); clip keeps the value finite when padj is exactly 0
    df['neglog10_padj'] = -np.log10(df['padj'].clip(lower=1e-300))
    df['set'] = label
    # A stable sort keeps tied terms in their original order; the result is
    # already in plotting order, so no second sort is needed.
    df = df.sort_values('neglog10_padj', ascending=False, kind='stable').head(n)
    return df[['term', 'Count', 'padj', 'neglog10_padj', 'set']]

# 取三个集合（根据你的变量名）
# Top-10 terms of three "up" enrichment results (variable names from the cells above)
df_bp   = top10_from_enrichr(bp_up,  'BP')
df_kegg = top10_from_enrichr(enr_up, 'KEGG')
df_hm   = top10_from_enrichr(hm_up,  'HALLMARK')

df_all = pd.concat([df_bp, df_kegg, df_hm], ignore_index=True)

# Bar colour encodes -log10(Adjusted P-value) on one scale shared by all panels
sets_order = ['BP', 'KEGG', 'HALLMARK']   # order of the panels, top to bottom
cmap = plt.get_cmap("RdBu_r")   # reversed red-blue map: larger -log10(padj), i.e. smaller padj, maps to red
norm = Normalize(vmin=df_all['neglog10_padj'].min(),
                 vmax=df_all['neglog10_padj'].max())

fig = plt.figure(figsize=(9, 9))
gs = fig.add_gridspec(nrows=3, ncols=1, hspace=0.35)

for i, s in enumerate(sets_order):
    ax = fig.add_subplot(gs[i, 0])
    sub = df_all[df_all['set'] == s].copy()
    # most significant term first
    sub = sub.sort_values('neglog10_padj', ascending=False)

    y = np.arange(len(sub))
    colors = cmap(norm(sub['neglog10_padj'].values))

    ax.barh(y, sub['Count'].values, color=colors, edgecolor='none')
    ax.set_yticks(y)
    ax.set_yticklabels(sub['term'].values, fontsize=9)
    ax.invert_yaxis()  # puts the first (most significant) term at the top
    ax.set_xlabel('Count')
    ax.set_title(s, loc='right', fontsize=11, fontweight='bold')

    # hide the right and top spines
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

# One shared colour bar showing -log10(Adjusted P-value)
cbar = fig.colorbar(
    ScalarMappable(norm=norm, cmap=cmap),
    ax=fig.axes,
    shrink=0.6, pad=0.01
)
cbar.set_label(r'$-\log_{10}$(Adjusted P-value)')

fig.suptitle('Enrichment (Top 10 per Set, sorted by -log10 FDR)', fontsize=13, fontweight='bold')
plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.savefig("figures/Epi_After_Luminal_Better_Enrichment_Up_Top10.svg", bbox_inches='tight')
plt.show()



#### 细胞比例分布

In [None]:
# Load adata_epi into scCODA, counting cells per "batch" sample and
# "epi-type" cell type, with "effect" as the sample covariate.
sccoda_model = pt.tl.Sccoda()
sccoda_data = sccoda_model.load(
    adata_epi,
    type="cell_level",
    generate_sample_level=True,
    cell_type_identifier="epi-type",
    sample_identifier="batch",
    covariate_obs=["effect"],
)
sccoda_data


In [None]:
sccoda_model.plot_boxplots(sccoda_data, modality_key="coda", feature_name="effect", add_dots=True)
# Save before plt.show(): once the figure has been shown it is no longer the
# current figure, and savefig would write an empty canvas.
plt.savefig('figures/Proportion-epi-after-epitype.svg', format='svg')
plt.show()


In [None]:
# Same scCODA setup as above, this time with "annotation" as the cell-type column.
sccoda_model = pt.tl.Sccoda()
sccoda_data = sccoda_model.load(
    adata_epi,
    type="cell_level",
    generate_sample_level=True,
    cell_type_identifier="annotation",
    sample_identifier="batch",
    covariate_obs=["effect"],
)
sccoda_data

In [None]:
sccoda_model.plot_boxplots(sccoda_data, modality_key="coda", feature_name="effect", add_dots=True)
# Save before plt.show(): once the figure has been shown it is no longer the
# current figure, and savefig would write an empty canvas.
plt.savefig('figures/Proportion-epi-after-majortype.svg', format='svg')
plt.show()


In [None]:
# 创建milo对象
milo = pt.tl.Milo()
mdata = milo.load(adata_epi)

In [None]:
# Neighbour graph on the "rna" modality.
# NOTE(review): this uses "X_pca", while the helper functions at the top of
# the notebook default to "X_pca_harmony" — confirm which embedding is intended.
sc.pp.neighbors(mdata["rna"], use_rep="X_pca", n_neighbors=30)

In [None]:
# Build neighbourhoods on the "rna" modality (prop=0.1).
milo.make_nhoods(mdata["rna"], prop=0.1)

In [None]:
# Inspect the neighbourhood matrix stored in .obsm["nhoods"].
mdata["rna"].obsm["nhoods"]

In [None]:
# Rows whose "nhood_ixs_refined" flag is non-zero, with their k-th neighbour distance.
mdata["rna"][mdata["rna"].obs["nhood_ixs_refined"] != 0].obs[["nhood_ixs_refined", "nhood_kth_distance"]]

In [None]:
# Histogram of neighbourhood sizes (column sums of the nhoods matrix).
nhood_size = np.array(mdata["rna"].obsm["nhoods"].sum(0)).ravel()
plt.hist(nhood_size, bins=100)
plt.xlabel("# cells in nhood")
plt.ylabel("# nhoods")

In [None]:
# Count cells per neighbourhood, using "batch" as the sample column.
mdata = milo.count_nhoods(mdata, sample_col="batch")
mdata

In [None]:
# Inspect the "milo" modality.
mdata["milo"]

In [None]:
# Fix the category order of "effect" to ["better", "worse"], then run the
# differential-abundance test with the design "~effect".
# NOTE(review): reorder_categories needs "effect" to be categorical with
# exactly these two categories — confirm.
mdata["rna"].obs["effect"] = mdata["rna"].obs["effect"].cat.reorder_categories(["better", "worse"])
milo.da_nhoods(mdata, design="~effect")

In [None]:
# Inspect .obs of the "milo" modality.
mdata["milo"].obs

In [None]:
# Inspect .var of the "milo" modality (the plots below read PValue, logFC and SpatialFDR from it).
mdata["milo"].var

In [None]:
# Diagnostic plots of the DA test: p-value histogram (left) and a volcano-style
# scatter of logFC against -log10(SpatialFDR) (right).
# rc_context applies the figure size to this cell only and restores the
# previous setting even if one of the plotting calls raises.
with plt.rc_context({"figure.figsize": [10, 5]}):
    plt.subplot(1, 2, 1)
    plt.hist(mdata["milo"].var.PValue, bins=50)
    plt.xlabel("P-Vals")
    plt.subplot(1, 2, 2)
    plt.plot(mdata["milo"].var.logFC, -np.log10(mdata["milo"].var.SpatialFDR), ".")
    plt.xlabel("log-Fold Change")
    plt.ylabel("- log10(Spatial FDR)")
    plt.tight_layout()

In [None]:
# Annotate neighbourhoods using the "epi-type" column.
milo.annotate_nhoods(mdata, anno_col="epi-type")

In [None]:
# Histogram of the "nhood_annotation_frac" values.
plt.hist(mdata["milo"].var["nhood_annotation_frac"], bins=30)
plt.xlabel("celltype fraction")

In [None]:
# Inspect .var of the "milo" modality again after the annotation step.
mdata["milo"].var

In [None]:
# Beeswarm plot of the differential-abundance results (alpha=0.1).
milo.plot_da_beeswarm(mdata, alpha=0.1)

In [None]:
## Get IDs of the neighbourhoods annotated as "LumHR Normal" with SpatialFDR < 0.1
pl_nhoods = mdata["milo"].var_names[
    (mdata["milo"].var["SpatialFDR"] < 0.1) & (mdata["milo"].var["nhood_annotation"] == "LumHR Normal")
]

## Visualize cell counts by condition ("effect") for the selected neighbourhoods only
milo.plot_nhood_counts_by_cond(mdata, test_var="effect", subset_nhoods=pl_nhoods, log_counts=False)

#### 细胞交互分析

In [None]:
# Persist the annotated epithelial object to disk.
adata_epi.write_h5ad('epi-after.h5ad')