In [7]:
import subprocess
import os
import json

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

In [8]:
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from transformers import AutoTokenizer, DataCollatorWithPadding
import math

In [9]:
aws_dataset = load_dataset("dnagpt/kaggle_amazon_reviews_multi")

train-00000-of-00001.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/5.30M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/5.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1200000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/30000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/30000 [00:00<?, ? examples/s]

In [10]:
aws_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 1200000
    })
    test: Dataset({
        features: ['Unnamed: 0', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 30000
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 30000
    })
})

In [11]:
english_dataset = aws_dataset.filter(lambda example: example["language"] == "en", num_proc=4)
english_dataset

Filter (num_proc=4):   0%|          | 0/1200000 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/30000 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/30000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 200000
    })
    test: Dataset({
        features: ['Unnamed: 0', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 5000
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 5000
    })
})

In [13]:
# 加载 GPT-2 分词器
tokenizer = AutoTokenizer.from_pretrained("dnagpt/gene_eng_gpt2_v1_ft")
tokenizer.pad_token = tokenizer.eos_token  # 设置填充标记为 EOS 标记
max_length = 256

english_dataset = english_dataset.filter(lambda x: x["review_title"] and (len(x["review_title"].split()) > 2) )

# 数据预处理函数
def preprocess_function(example):
    return tokenizer(example["review_body"]+ " TL;DR:" +example["review_title"], truncation=True,max_length=max_length, padding="max_length")

# 应用预处理
tokenized_dataset = english_dataset.map(
    preprocess_function,
    batched=False,
)

Filter:   0%|          | 0/200000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/125893 [00:00<?, ? examples/s]

Map:   0%|          | 0/3146 [00:00<?, ? examples/s]

Map:   0%|          | 0/3166 [00:00<?, ? examples/s]

In [14]:
# 5. 数据收集器
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # 因果语言建模
)

# 6. 加载 GPT-2 模型，并调整词表大小
model = GPT2LMHeadModel.from_pretrained("dnagpt/gene_eng_gpt2_v1_ft")
model.config.pad_token_id = model.config.eos_token_id

# 7. 训练参数
training_args = TrainingArguments(
    output_dir="./gpt2-summary-train",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=2000,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=2000,
    logging_dir="./logs",
    logging_steps=5000,
    learning_rate=1e-5,
    warmup_steps=500,
    weight_decay=0.01,
    fp16=True,  # 启用混合精度训练
    #deepspeed="ds_zero2_no_offload.json"
)

# 8. 初始化 Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/648M [00:00<?, ?B/s]

  trainer = Trainer(


[2025-01-19 18:06:51,923] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/root/miniconda3/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for bool@CXXABI_1.3'

In [15]:
# 9. 开始训练
trainer.train()

# 10. 保存模型和分词器
trainer.save_model("./gpt2-gene-summary-ft")
tokenizer.save_pretrained("./gpt2-gene-summary-ft")

# 11. 评估模型 - 计算 perplexity
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity}")

Step,Training Loss,Validation Loss
2000,No log,3.408186
4000,No log,3.336873
6000,3.539200,3.300543
8000,3.539200,3.269731
10000,3.357800,3.247886
12000,3.357800,3.231866
14000,3.357800,3.216528
16000,3.303800,3.204599
18000,3.303800,3.195496
20000,3.203000,3.187745


Perplexity: 22.93847425950335
