## 1. 环境设置和导入


In [1]:
# 设置工作目录
import os
import sys
import torch
import pandas as pd
import scanpy as sc
from os.path import join as pjoin

# Add the project root to the import path and make it the working directory.
project_root = "/home/mjin/GEARS_Clean"
# Guard the append so that re-running this cell in a live kernel does not
# accumulate duplicate sys.path entries.
if project_root not in sys.path:
    sys.path.append(project_root)
os.chdir(project_root)

# Report the runtime environment.
print(f"当前工作目录: {os.getcwd()}")
print(f"PyTorch版本: {torch.__version__}")
print(f"CUDA可用: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Device details are only queried when CUDA is reported as available.
    print(f"CUDA设备数量: {torch.cuda.device_count()}")
    print(f"当前CUDA设备: {torch.cuda.current_device()}")


当前工作目录: /home/mjin/GEARS_Clean
PyTorch版本: 2.4.1+cu118
CUDA可用: True
CUDA设备数量: 7
当前CUDA设备: 0


In [2]:
# Import the GEARS modules used by the rest of this notebook.
# NOTE(review): the recorded output of this cell shows an embedding file being
# loaded while these imports run — confirm that import-time side effect is intended.
from gears.pertdata import PertData
from gears.gears import GEARS

print("GEARS模块导入成功")


  from .autonotebook import tqdm as notebook_tqdm


loading embedding from scfoundation for dataset adamson
embedding path: /home/mjin/GEARS_Clean/embed/scfoundation_adamson_embedding.npy
embedding shape: torch.Size([3952, 1069, 512])
GEARS模块导入成功


## 2. 配置参数


In [3]:
# Training configuration
class Config:
    """Static settings for this run, stored as plain class attributes.

    The device string is chosen once, when the class body executes.
    NOTE(review): no cell visible in this notebook passes `mode` explicitly;
    it only appears in the attribute dumps — confirm whether it is still needed.
    """

    # Dataset and split
    data_dir = "/home/mjin/GEARS_Clean/data"
    data_name = "adamson"
    split = "simulation"
    seed = 3
    train_gene_set_size = 0.75

    # Optimisation
    epochs = 15
    batch_size = 32
    test_batch_size = 32
    lr = 0.001
    weight_decay = 0.0005

    # Model
    hidden_size = 64
    accumulation_steps = 5
    highres = 0
    device = "cpu" if not torch.cuda.is_available() else "cuda"

    # Model type and checkpoint path
    model_type = "emb"
    singlecell_model_path = None
    finetune_method = "random"
    mode = "v1"

    # Embedding source
    pretrained_model_name = "scfoundation"
    custom_embedding_path = None

    # Where results are written
    result_dir = "/home/mjin/GEARS_Clean/result/training_result"

# Instantiate the configuration
config = Config()

# Print every public attribute with its value, in the order dir() yields them.
print("配置参数:")
public_names = [key for key in dir(config) if not key.startswith('_')]
for key in public_names:
    print(f"  {key}: {getattr(config, key)}")


配置参数:
  accumulation_steps: 5
  batch_size: 32
  custom_embedding_path: None
  data_dir: /home/mjin/GEARS_Clean/data
  data_name: adamson
  device: cuda
  epochs: 15
  finetune_method: random
  hidden_size: 64
  highres: 0
  lr: 0.001
  mode: v1
  model_type: emb
  pretrained_model_name: scfoundation
  result_dir: /home/mjin/GEARS_Clean/result/training_result
  seed: 3
  singlecell_model_path: None
  split: simulation
  test_batch_size: 32
  train_gene_set_size: 0.75
  weight_decay: 0.0005


## 3. 数据加载和预处理


In [4]:
# Check that the required input files are present
gene2go_pkl = os.path.join(config.data_dir, "gene2go.pkl")
go_csv = os.path.join(config.data_dir, "go.csv")
embedding_file = "/home/mjin/GEARS_Clean/embed/scfoundation_adamson_embedding.npy"

print("检查必要文件:")
for required_label, required_path in (
    ("gene2go.pkl", gene2go_pkl),
    ("go.csv", go_csv),
    ("embedding文件", embedding_file),
):
    # Report only; a missing file does not stop the cell.
    found = '存在' if os.path.exists(required_path) else '不存在'
    print(f"{required_label}: {found}")

# Smoke-test loading of the GO file
print("\n测试GO文件加载:")
from gears.utils import get_go_auto
test_genes = ['GENE1', 'GENE2', 'GENE3']  # placeholder gene list for the test
go_df = get_go_auto(test_genes, config.data_dir, config.data_name)
print(f"GO文件加载成功，形状: {go_df.shape}")


检查必要文件:
gene2go.pkl: 存在
go.csv: 存在
embedding文件: 存在

测试GO文件加载:
Looking for GO file at: /home/mjin/GEARS_Clean/data/go.csv


GO文件加载成功，形状: (1178995, 3)


In [None]:
# Build the PertData object on top of the configured data directory
print("初始化PertData...")
pert_data = PertData(
    data_path=config.data_dir,
    gi_go=False,
)
print("PertData初始化完成")


初始化PertData...
read /home/mjin/GEARS_Clean/data/gene2go.pkl
✓ PertData初始化完成


In [None]:
# Load the dataset
print(f"加载数据集: {config.data_name}")

# Read the existing .h5ad file straight from the data directory
h5ad_path = os.path.join(config.data_dir, f"{config.data_name}.h5ad")
adata = sc.read_h5ad(h5ad_path)
print(f"读取h5ad文件: {adata.shape}")

# Hand the loaded object to PertData.new_data_process
pert_data.new_data_process(
    dataset_name=config.data_name,
    adata=adata,
    skip_calc_de=True,
)
print("数据处理成功")

# Report the dimensions of the data now held by pert_data
processed_adata = pert_data.adata
print(f"数据集形状: {processed_adata.shape}")
print(f"基因数量: {processed_adata.n_vars}")
print(f"细胞数量: {processed_adata.n_obs}")


加载数据集: adamson
读取h5ad文件: (47795, 1069)


Creating pyg object for each cell in the data...
 38%|███▊      | 30/79 [01:16<02:40,  3.27s/it]

In [None]:
# Split the data
print("准备数据划分...")
split_kwargs = {
    "split": config.split,
    "seed": config.seed,
    "train_gene_set_size": config.train_gene_set_size,
}
pert_data.prepare_split(**split_kwargs)
print("数据划分完成")

# Build the data loaders
print("创建数据加载器...")
loader_kwargs = {
    "batch_size": config.batch_size,
    "test_batch_size": config.test_batch_size,
}
pert_data.get_dataloader(**loader_kwargs)
print("数据加载器创建完成")

# Summarise the split, when pert_data exposes `set2conditions`
if hasattr(pert_data, 'set2conditions'):
    print("\n数据划分统计:")
    for subset_name, subset_conditions in pert_data.set2conditions.items():
        n_conditions = len(subset_conditions)
        print(f"  {subset_name}: {n_conditions} 个扰动条件")


Local copy of split is detected. Loading...
Simulation split test composition:
combo_seen0:0
combo_seen1:0
combo_seen2:0
unseen_single:20
Done!
Creating dataloaders....
Done!


准备数据划分...
数据划分完成
创建数据加载器...
数据加载器创建完成

数据划分统计:
  test: 20 个扰动条件
  train: 53 个扰动条件
  val: 6 个扰动条件


## 4. 模型初始化和训练


In [None]:
# Create the GEARS model from the prepared data on the configured device
print("初始化GEARS模型...")
gears_model = GEARS(
    pert_data,
    device=config.device,
)
print(f"GEARS模型初始化完成，使用设备: {config.device}")


初始化GEARS模型...
GEARS模型初始化完成，使用设备: cuda


In [None]:
# Configure the model
# NOTE(review): the recorded output of this cell ends in
# "ModuleNotFoundError: No module named 'load'" — resolve that before relying
# on the cells that follow.
print("配置模型参数...")
init_kwargs = {
    "hidden_size": config.hidden_size,
    "model_type": config.model_type,
    "load_path": config.singlecell_model_path,
    "finetune_method": config.finetune_method,
    "accumulation_steps": config.accumulation_steps,
    "highres": config.highres,
    "pretrained_model_name": config.pretrained_model_name,
    "dataset_name": config.data_name,
    "custom_embedding_path": config.custom_embedding_path,
}
gears_model.model_initialize(**init_kwargs)
print("✓ 模型配置完成")

# Echo the settings that were passed in
summary_rows = (
    ("隐藏层大小", config.hidden_size),
    ("模型类型", config.model_type),
    ("微调方法", config.finetune_method),
    ("累积步数", config.accumulation_steps),
    ("高分辨率", config.highres),
    ("预训练模型", config.pretrained_model_name),
    ("数据集", config.data_name),
)
print("\n模型配置信息:")
for row_label, row_value in summary_rows:
    print(f"  {row_label}: {row_value}")


配置模型参数...
Use accumulation steps: 5
Use mode: v1
Use higres: 0
loading embedding from scfoundation for dataset adamson
embedding path: /home/mjin/GEARS_Clean/embed/scfoundation_adamson_embedding.npy


embedding shape: torch.Size([3952, 1069, 512])
No G_go
Looking for GO file at: /home/mjin/GEARS_Clean/data/go.csv


ModuleNotFoundError: No module named 'load'

In [None]:
# Make sure the result directory exists
os.makedirs(config.result_dir, exist_ok=True)
print(f"结果目录创建: {config.result_dir}")

# Persist the configuration: one row per public attribute, one 'params' column
param_dict = {}
for attr in dir(config):
    if attr.startswith('_'):
        continue
    param_dict[attr] = getattr(config, attr)
param_df = pd.DataFrame(param_dict, index=['params']).transpose()
params_csv_path = os.path.join(config.result_dir, 'params.csv')
param_df.to_csv(params_csv_path)
print("配置参数已保存")


In [None]:
# Start training
print("开始训练...")
print(f"训练轮数: {config.epochs}")
print(f"学习率: {config.lr}")
print(f"权重衰减: {config.weight_decay}")
print("-" * 50)

import time
# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments during a long training run.
start_time = time.perf_counter()

gears_model.train(
    epochs=config.epochs,
    result_dir=config.result_dir,
    lr=config.lr,
    weight_decay=config.weight_decay
)

end_time = time.perf_counter()
# Elapsed seconds for the train() call above.
training_time = end_time - start_time

print("-" * 50)
print(f"训练完成！总用时: {training_time:.2f} 秒")


## 5. 模型保存和结果分析


In [None]:
# Save the model
print("保存模型...")
gears_model.save_model(config.result_dir)
print("模型已保存")

# List the regular files now present in the result directory, with sizes
saved_files = os.listdir(config.result_dir)
print("\n保存的文件:")
for entry_name in saved_files:
    entry_path = os.path.join(config.result_dir, entry_name)
    if not os.path.isfile(entry_path):
        # Skip sub-directories and other non-file entries.
        continue
    n_bytes = os.path.getsize(entry_path)
    print(f"  {entry_name} ({n_bytes} bytes)")
