# 2.5 基于多模态数据构建大模型

In [1]:
import subprocess
import os
# 设置环境变量, autodl一般区域
result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

In [2]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
from transformers import AutoTokenizer

In [3]:
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False) #use_regex=False,空格当成一般字符串
trainer = trainers.BpeTrainer(vocab_size=90000, special_tokens=["<|endoftext|>"]) #9w words

In [None]:
tokenizer.train(["../01-data_env/data/dna_1g.txt","../01-data_env/data/protein_1g.txt","../01-data_env/data/english_500m.txt"]
                , trainer=trainer) #all file list, take 10-20 min

In [None]:
tokenizer.save("gene_eng_dict.json")

In [3]:
#然后我们可以使用from_file() 方法从该文件里重新加载 Tokenizer 对象：
new_tokenizer = Tokenizer.from_file("gene_eng_dict.json")

#要在 🤗 Transformers 中使用这个标记器，我们必须将它包裹在一个 PreTrainedTokenizerFast 类中
from transformers import GPT2TokenizerFast
gene_eng_tokenizer = GPT2TokenizerFast(tokenizer_object=new_tokenizer)
gene_eng_tokenizer.save_pretrained("gene_eng_dict")
#dna_tokenizer.push_to_hub("dna_bpe_dict_1g", organization="dnagpt", use_auth_token="hf_*****") # push to huggingface

('gene_eng_dict/tokenizer_config.json',
 'gene_eng_dict/special_tokens_map.json',
 'gene_eng_dict/vocab.json',
 'gene_eng_dict/merges.txt',
 'gene_eng_dict/added_tokens.json',
 'gene_eng_dict/tokenizer.json')

In [4]:
tokenizer_new = AutoTokenizer.from_pretrained('gene_eng_dict')
tokenizer_new.tokenize("TGGCGTGAACCCGGGATCGGG,hello world hello gene, MANITWMANHTGWSDFILLGLFRQSKHPALLCVVIFVVFLMAL")

['TGGCGTGAACCC',
 'GGGATC',
 'GGG',
 ',',
 'h',
 'elloĠ',
 'worldĠ',
 'h',
 'elloĠ',
 'gene',
 ',Ġ',
 'M',
 'AN',
 'I',
 'TWM',
 'AN',
 'H',
 'TGW',
 'SD',
 'FILL',
 'GLF',
 'RQ',
 'SKHP',
 'ALLC',
 'VVIF',
 'VVFL',
 'MAL']

## 训练混合模型

In [3]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig,GPT2Tokenizer
from transformers import GPT2Tokenizer,GPT2Model,AutoModel
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import LineByLineTextDataset
from tokenizers import Tokenizer
from datasets import load_dataset

In [4]:
tokenizer = GPT2Tokenizer.from_pretrained("gene_eng_dict")
tokenizer.pad_token = tokenizer.eos_token

In [5]:
max_length = 256 #最大输入长度

config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=max_length, #最大长度
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

model = GPT2LMHeadModel(config) #for pretrain,从头预训练

In [6]:
# 1. load dna dataset
raw_dataset = load_dataset('text',  
                           data_files=["../01-data_env/data/dna_1g.txt","../01-data_env/data/protein_1g.txt","../01-data_env/data/english_500m.txt"])

dataset = raw_dataset["train"].train_test_split(test_size=0.05, shuffle=True)

# 2. tokenize
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=max_length)

# 3. 对数据集应用分词函数
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['text'], num_proc=15)  # 设置为你的 CPU 核心数或根据需要调整

# 4. 创建一个数据收集器，用于动态填充和遮蔽
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

Map (num_proc=15):   0%|          | 0/3115167 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/163957 [00:00<?, ? examples/s]

In [7]:
run_path = "gpt2_run"
train_epoches = 5
batch_size = 10


training_args = TrainingArguments(
        output_dir=run_path,
        overwrite_output_dir=True,
        num_train_epochs=train_epoches,
        per_device_train_batch_size=batch_size,
        save_steps=2000,
        save_total_limit=2,
        prediction_loss_only=True,
        fp16=True, #v100没法用
    )


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
)

[2024-12-27 17:09:13,776] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/root/miniconda3/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for bool@CXXABI_1.3'

In [None]:
trainer.train()
trainer.save_model("gene_eng_gpt2_v0")
tokenizer.save_pretrained("gene_eng_gpt2_v0")

Step,Training Loss
500,8.2515
1000,7.4957
1500,7.4472
2000,7.4032
2500,7.2965
3000,7.1948
3500,7.3128
4000,7.2252
4500,7.1797
5000,7.1645


In [9]:
import math
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 365.99


In [10]:
#upload model
model.push_to_hub("gene_eng_gpt2_v0", organization="dnagpt", use_auth_token="hf***")



model.safetensors:   0%|          | 0.00/620M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dnagpt/gene_eng_gpt2_v0/commit/6ce5d5f05e9bce5497de036e114de0827c824f32', commit_message='Upload model', commit_description='', oid='6ce5d5f05e9bce5497de036e114de0827c824f32', pr_url=None, repo_url=RepoUrl('https://huggingface.co/dnagpt/gene_eng_gpt2_v0', endpoint='https://huggingface.co', repo_type='model', repo_id='dnagpt/gene_eng_gpt2_v0'), pr_revision=None, pr_num=None)