In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

import torch 
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim 
from datasets import load_dataset 

import os 
import h5py
from tqdm import tqdm
import pyarrow
import time 
import json 

import sys 
sys.path.append('../code')
import utils
import model 
import loss 
from custom_dataset import CustomDataset
import train_v1 as train


In [3]:
# Make CUDA device index 1 the current device for this process, so that later
# code addressing the bare 'cuda' device targets that GPU.
# NOTE(review): the index is hard-coded for the original author's machine;
# adjust it (or restrict visibility with CUDA_VISIBLE_DEVICES) when running
# on a host with a different GPU layout.
torch.cuda.set_device(1)

In [4]:
# Build the gene vocabulary with the project helper and wrap it in the
# project's tokenizer.
gene_dict, dataset_gene, dataset_gene_ids = utils.generate_gene_dic()
tokenizer = utils.tokenizer_v1(
    gene_dict=gene_dict,
    dataset_gene=dataset_gene,
    dataset_gene_ids=dataset_gene_ids,
)

# Snapshot of the vocabulary size BEFORE the special tokens are registered.
# It is not refreshed afterwards; read `tokenizer.vocab_size` for the
# current size.
vocab_size = tokenizer.vocab_size
print(vocab_size)

# Register the special tokens ('<cls>' first, then '<pad>'), then report the
# new vocabulary size followed by the id stored for each special token.
special_tokens = ('<cls>', '<pad>')
for special_token in special_tokens:
    tokenizer.add_token(token=special_token)

print(tokenizer.vocab_size)
for special_token in special_tokens:
    print(tokenizer.gene_dict[special_token])

33524
33526
33524
33525


In [5]:

# Batch collater built from the project's `utils.collater`; it is handed to
# the DataLoaders as their `collate_fn`.
# NOTE(review): the meaning of max_expression / mask_ratio / max_num / rho is
# defined in utils.collater and is not visible from this notebook — confirm
# there before tuning these values.
collate_fn = utils.collater(
    tokenizer=tokenizer,
    max_expression=100,
    mask_ratio=0.1,
    max_num=2000,
    rho=0.1,
    pad_idx=tokenizer.gene_dict['<pad>'],  # id registered for the '<pad>' token
)

In [6]:

# Dataset loading configuration.
HF_CACHE_DIR = '/work/sunrui/huggingface'
N_SAMPLES = 3000       # rows taken from each corpus; set to None to use the full 'train' split
TEST_FRACTION = 0.05   # share of each corpus held out for evaluation
SPLIT_SEED = 0         # fixed seed so the train/test partition is reproducible


def load_and_split(path, n_samples=N_SAMPLES):
    """Load one corpus and split it into train/test parts.

    Args:
        path: Directory passed to `datasets.load_dataset`.
        n_samples: Number of leading rows of the 'train' split to keep, or
            None to keep all rows.

    Returns:
        The mapping returned by `Dataset.train_test_split`, with 'train' and
        'test' entries.
    """
    rows = load_dataset(path=path, cache_dir=HF_CACHE_DIR)['train']
    if n_samples is not None:
        rows = rows.select(range(n_samples))
    return rows.train_test_split(test_size=TEST_FRACTION, seed=SPLIT_SEED)


dataset_1 = load_and_split('/work/sunrui/pretrain_dataset/allen_2021_data')
dataset_2 = load_and_split('/work/sunrui/pretrain_dataset/allen_2023_data')

# Each corpus contributes its OWN train and test split. (Previously
# test_dataset_1 was taken from dataset_2, so the 2021 test split was never
# evaluated and the 2023 test split was counted twice.)
train_dataset_1, test_dataset_1 = dataset_1['train'], dataset_1['test']
train_dataset_2, test_dataset_2 = dataset_2['train'], dataset_2['test']

# Combine the per-corpus splits with the project's CustomDataset wrapper.
train_dataset = CustomDataset([train_dataset_1, train_dataset_2])
test_dataset = CustomDataset([test_dataset_1, test_dataset_2])


Resolving data files:   0%|          | 0/117 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/117 [00:00<?, ?it/s]

  table = cls._concat_blocks(blocks, axis=0)


Resolving data files:   0%|          | 0/419 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/412 [00:00<?, ?it/s]

In [7]:
# ---- Embedding tables -------------------------------------------------------
# NOTE(review): 104 / 103 are hard-coded; they presumably relate to the
# collater's max_expression plus some special count values — confirm against
# utils.collater and model.sc_pretrain.
count_embedding_num = 104
count_padding_idx = 103
gene_embedding_num = tokenizer.vocab_size           # current tokenizer vocabulary size
gene_padding_idx = tokenizer.gene_dict['<pad>']     # id stored for the '<pad>' token

# ---- Transformer encoder settings -------------------------------------------
d_model = 256
n_head = 8
dim_ffn = d_model * 4
num_layers = 4
dropout = 0.1
layer_norm_eps = 1e-5
batch_first = True
norm_first = False
norm = None

# ---- Head -------------------------------------------------------------------
num_hiddens = 256

# All arguments are passed positionally, so the order below must match the
# signature of model.sc_pretrain exactly.
my_model = model.sc_pretrain(
    count_embedding_num,
    gene_embedding_num,
    d_model,
    gene_padding_idx,
    count_padding_idx,
    n_head,
    dim_ffn,
    dropout,
    layer_norm_eps,
    batch_first,
    norm_first,
    num_layers,
    norm,
    num_hiddens,
)



In [8]:

# Create the DataLoader instances.
# Both loaders share the batch size and the collater; only the training
# loader shuffles its samples.
batch_size = 8
loader_kwargs = dict(batch_size=batch_size, collate_fn=collate_fn)

train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)


In [9]:

# Loss object from the project's `loss` module; the training log reports a
# total loss together with its "Exp_loss" and "Clip_loss" parts.
pretrain_loss = loss.pretrain_loss()

# Adam over all model parameters.
optimizer = optim.Adam(my_model.parameters(), lr=5e-5, weight_decay=0.01)

# Run training. `my_model` is rebound to whatever train_multi_epoch returns,
# so re-running this cell continues from that returned model rather than from
# a freshly constructed one.
# NOTE(review): how gradient_accumulation_steps / save_steps / save_dir are
# interpreted is defined in train_v1 and is not visible here — confirm there.
my_model = train.train_multi_epoch(
    my_model,
    train_loader,
    test_loader,
    pretrain_loss,
    optimizer=optimizer,
    device='cuda',
    gradient_accumulation_steps=24,
    save_steps=100,
    save_dir='test_1',
    epochs=5,
)



  batch_data['counts_0'] = torch.tensor(batch_data['counts_0'], dtype = torch.int)


Step 1, Loss: 6.6520, Exp_loss : 4.6722, Clip_loss ; 1.9798
Step 2, Loss: 6.3365, Exp_loss : 4.4048, Clip_loss ; 1.9317
Step 3, Loss: 6.0938, Exp_loss : 4.1683, Clip_loss ; 1.9256
Step 4, Loss: 5.8609, Exp_loss : 3.9679, Clip_loss ; 1.8930
Step 5, Loss: 5.5725, Exp_loss : 3.7774, Clip_loss ; 1.7951
Step 6, Loss: 5.3786, Exp_loss : 3.6483, Clip_loss ; 1.7303
Step 7, Loss: 5.2578, Exp_loss : 3.5547, Clip_loss ; 1.7032
Step 8, Loss: 5.0828, Exp_loss : 3.4517, Clip_loss ; 1.6311
Step 9, Loss: 4.9553, Exp_loss : 3.3990, Clip_loss ; 1.5564
Step 10, Loss: 4.8056, Exp_loss : 3.3351, Clip_loss ; 1.4706
Step 11, Loss: 4.6533, Exp_loss : 3.2768, Clip_loss ; 1.3764
Step 12, Loss: 4.5233, Exp_loss : 3.2559, Clip_loss ; 1.2674
Step 13, Loss: 4.5071, Exp_loss : 3.1847, Clip_loss ; 1.3225
Step 14, Loss: 4.4165, Exp_loss : 3.1789, Clip_loss ; 1.2376
Step 15, Loss: 4.4342, Exp_loss : 3.1412, Clip_loss ; 1.2931
Step 16, Loss: 4.3070, Exp_loss : 3.0387, Clip_loss ; 1.2683
Step 17, Loss: 4.2755, Exp_loss :

  return torch._transformer_encoder_layer_fwd(


avg total loss:3.7659, avg exp loss:2.8355, avg_clip_loss:0.9304
Step 1, Loss: 3.7774, Exp_loss : 2.8165, Clip_loss ; 0.9609
Step 2, Loss: 3.8758, Exp_loss : 2.8283, Clip_loss ; 1.0475
Step 3, Loss: 3.8207, Exp_loss : 2.7873, Clip_loss ; 1.0334
Step 4, Loss: 3.8379, Exp_loss : 2.7694, Clip_loss ; 1.0685
Step 5, Loss: 3.8010, Exp_loss : 2.7981, Clip_loss ; 1.0029
Step 6, Loss: 3.7237, Exp_loss : 2.7408, Clip_loss ; 0.9829
Step 7, Loss: 3.7368, Exp_loss : 2.7218, Clip_loss ; 1.0150
Step 8, Loss: 3.6974, Exp_loss : 2.7432, Clip_loss ; 0.9542
Step 9, Loss: 3.5855, Exp_loss : 2.6808, Clip_loss ; 0.9047
Step 10, Loss: 3.5943, Exp_loss : 2.6627, Clip_loss ; 0.9316
Step 11, Loss: 3.5857, Exp_loss : 2.7173, Clip_loss ; 0.8683
Step 12, Loss: 3.6215, Exp_loss : 2.6999, Clip_loss ; 0.9216
Step 13, Loss: 3.5969, Exp_loss : 2.6805, Clip_loss ; 0.9164
Step 14, Loss: 3.5635, Exp_loss : 2.6711, Clip_loss ; 0.8924
Step 15, Loss: 3.5787, Exp_loss : 2.6745, Clip_loss ; 0.9041
Step 16, Loss: 3.5488, Exp_lo

Step 30, Loss: 2.9867, Exp_loss : 2.4012, Clip_loss ; 0.5854
Finished Training
avg total loss:2.9696, avg exp loss:2.4418, avg_clip_loss:0.5279
