In [1]:
# Optional GPU pinning, left disabled. If uncommented, these lines would set
# CUDA_VISIBLE_DEVICES to "0" and select CUDA device 0 as the current torch device.
# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# import torch
# torch.cuda.set_device(0)

In [1]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig
from transformers import GPT2Tokenizer,GPT2Model,AutoModel

from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import LineByLineTextDataset
from tokenizers import Tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import PreTrainedTokenizerFast

# Reload the previously saved `tokenizers.Tokenizer` object from its JSON file
# using the from_file() method.
new_tokenizer = Tokenizer.from_file("tokenizer8.json")

# Special-token strings registered on the transformers-side wrapper.
special_token_kwargs = {
    "bos_token": "<s>",
    "eos_token": "</s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "cls_token": "<cls>",
    "sep_token": "<sep>",
    "mask_token": "<mask>",
}

# Wrap the raw tokenizer so it can be used with the transformers APIs below.
# Alternative noted by the author (not used here):
#   from transformers import GPT2TokenizerFast
#   tokenizer = GPT2TokenizerFast(tokenizer_object=new_tokenizer)
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=new_tokenizer,
    padding_side="left",
    **special_token_kwargs,
)

In [3]:
# The model is instantiated from a config object rather than loaded with
# GPT2LMHeadModel.from_pretrained("gpt2"); only the "gpt2" configuration is fetched.
context_length = 512

# Values that replace the corresponding entries of the stock "gpt2" config.
config_overrides = {
    # Number of token ids, taken from the custom tokenizer.
    "vocab_size": len(tokenizer),
    # Dimensionality of the causal mask (usually the same as n_positions); default 1024.
    # NOTE(review): recent transformers releases appear to size GPT-2's position
    # embeddings from `n_positions` only - confirm `n_ctx` still has an effect
    # in the installed version.
    "n_ctx": context_length,
    "bos_token_id": tokenizer.bos_token_id,
    "eos_token_id": tokenizer.eos_token_id,
}
config = AutoConfig.from_pretrained("gpt2", **config_overrides)

model = GPT2LMHeadModel(config)

In [4]:
from datasets import load_dataset
# Load the sequence file with the generic "text" builder. As the output below shows,
# the result is a DatasetDict with a single "train" split and one 'text' column.
dna_dataset = load_dataset("text", data_files="human3.fna.line")
dna_dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 634318
    })
})

In [5]:
# Hold out 10% of the rows as a "test" split; the fixed seed makes the split repeatable.
ds_train_devtest = dna_dataset['train'].train_test_split(test_size=0.1, seed=42)
ds_train_devtest

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 570886
    })
    test: Dataset({
        features: ['text'],
        num_rows: 63432
    })
})

In [6]:
from datasets import load_dataset, DatasetDict

# Re-label the splits: the held-out "test" portion is used as the "valid" split.
# For a quick trial run the author's notes suggest subsampling, e.g.
#   train: .shuffle().select(range(50000))    valid: .shuffle().select(range(500))
split_mapping = {}
split_mapping["train"] = ds_train_devtest["train"]
split_mapping["valid"] = ds_train_devtest["test"]

raw_datasets = DatasetDict(split_mapping)
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 570886
    })
    valid: Dataset({
        features: ['text'],
        num_rows: 63432
    })
})

In [7]:
def tokenize(element):
    """Tokenize a batch of text and keep only the full-length chunks.

    The module-level `tokenizer` is called on ``element["text"]`` with truncation
    at `context_length`, returning the overflowing tokens and the
    length of every resulting entry.

    Args:
        element: Batch mapping that provides the raw strings under the "text" key.

    Returns:
        A dict with one key, "input_ids", holding the token-id lists whose
        reported length equals `context_length`. Shorter entries are dropped,
        so nothing here needs padding.
    """
    encoded = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    # No padding: keep only the entries that are exactly context_length long.
    full_chunks = [
        ids
        for n_tokens, ids in zip(encoded["length"], encoded["input_ids"])
        if n_tokens == context_length
    ]
    return {"input_ids": full_chunks}


# The original columns are removed because `tokenize` returns a different number
# of rows than it receives (only an "input_ids" column is produced).
source_columns = raw_datasets["train"].column_names
tokenized_datasets = raw_datasets.map(
    tokenize,
    batched=True,
    remove_columns=source_columns,
)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 541337
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 60111
    })
})

In [8]:
# Peek at the first tokenized training example (a dict holding one 'input_ids' list).
tokenized_datasets["train"][0]

{'input_ids': [73,
  107,
  20547,
  19867,
  14306,
  22312,
  4412,
  36978,
  33592,
  5468,
  29037,
  22001,
  15135,
  5025,
  12427,
  31382,
  19551,
  590,
  16250,
  203,
  21008,
  11141,
  1028,
  2153,
  45467,
  167,
  21638,
  490,
  3626,
  46856,
  16632,
  6603,
  3940,
  19247,
  3880,
  21002,
  15365,
  8824,
  2271,
  12868,
  24938,
  29905,
  11964,
  39709,
  12863,
  22195,
  12588,
  116,
  35063,
  20,
  7790,
  1040,
  14252,
  32395,
  1234,
  10215,
  1782,
  25701,
  5631,
  14778,
  3227,
  5135,
  36054,
  9635,
  45091,
  15549,
  32,
  4087,
  35213,
  32651,
  425,
  35970,
  33822,
  23929,
  29644,
  343,
  41474,
  25537,
  18959,
  129,
  28308,
  36009,
  3986,
  30262,
  4326,
  35175,
  48905,
  11467,
  4792,
  6366,
  17781,
  2703,
  24981,
  49026,
  2009,
  26687,
  37821,
  8902,
  10946,
  21718,
  31230,
  37402,
  43040,
  25200,
  29235,
  91,
  26718,
  17708,
  30280,
  15076,
  4658,
  10128,
  7676,
  27260,
  6232,
  1961,
  27

In [9]:
from transformers import DataCollatorForLanguageModeling

# Training hyper-parameters and output location.
max_seq_length = context_length
out_model_path = "mygpt_unigram8"
train_epoches = 5
batch_size = 15

# Only fall back to EOS-as-padding when the tokenizer has no pad token of its own.
# With mlm=False the collator replaces every label equal to pad_token_id by -100
# (ignored by the loss), so unconditionally aliasing pad to EOS would remove every
# end-of-sequence token from the training signal.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Causal-LM collator (mlm=False): labels are a copy of input_ids.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir=out_model_path,
    overwrite_output_dir=True,
    num_train_epochs=train_epoches,
    per_device_train_batch_size=batch_size,
    save_steps=2000,       # write a checkpoint every 2000 optimizer steps
    save_total_limit=2,    # keep only the two most recent checkpoints
    prediction_loss_only=True,
    # fp16=True,  # left disabled: the author notes it could not be used on the V100
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
)


In [10]:
# Run the training loop, then persist the final model to the output directory.
trainer.train()
trainer.save_model(out_model_path)
# The Trainer was constructed without the tokenizer, so save_model() does not write
# any tokenizer files. Save the tokenizer next to the model so the output directory
# can be reloaded on its own.
tokenizer.save_pretrained(out_model_path)

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,10.4464
1000,10.1696
1500,9.9125
2000,9.7478
2500,9.6526
3000,9.5704
3500,9.495
4000,9.4427
4500,9.4088
5000,9.3622


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [None]:
import math

# Evaluate on the eval dataset given to the Trainer and report perplexity,
# computed as exp(eval_loss).
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity:.2f}")