In [3]:
# Hugging Face Hub ids of the main model and of the smaller assistant model.
name_model = "EleutherAI/gpt-neo-1.3B"
name_assistant_model = "EleutherAI/gpt-neo-125M"

# Variants of the same ids with every "/" swapped for "_".
alt_name_model = "_".join(name_model.split("/"))
alt_name_assistant_model = "_".join(name_assistant_model.split("/"))


In [4]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,    
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)

import torch
import os
import math

# --- Training hyperparameters (consumed by TrainingArguments further down) ---
batch_size = 8                     # per-device batch size, used for train and eval
gradient_accumulation_steps = 32   # forwarded as gradient_accumulation_steps
context_length = 512               # token length used when tokenizing and chunking
learning_rate = 5e-5
num_train_epochs = 1
# max_steps = 5

# Mixed-precision switches: bf16 on, fp16 off.
bf16, fp16 = True, False

# Logging and checkpoint saving share the same step interval.
logging_steps = save_steps = 100

# Process count reused for dataset.map(num_proc=...) and the dataloaders.
num_workers = os.cpu_count()



# Checkpoint ids: the main model and the smaller assistant model.
model_name = "EleutherAI/gpt-neo-1.3B"
ass_model_name = "EleutherAI/gpt-neo-125M"

# GLUE SST-2; only the train and validation splits are kept.
dataset_dict = load_dataset("glue", "sst2")
dataset_train, dataset_valid = dataset_dict["train"], dataset_dict["validation"]

# The tokenizer's pad token is aliased to its EOS token so that padding can be
# requested during tokenization.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Collator for language modeling with masked-LM mode switched off.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    return_tensors="pt",
)


In [7]:
# Show the column names and the first record of the training split.
print(dataset_train.column_names, dataset_train[0], sep="\n")

['sentence', 'label', 'idx']
{'sentence': 'hide new secretions from the parental units ', 'label': 0, 'idx': 0}


In [5]:
# Configure PyTorch's CUDA caching allocator through its environment variable.
# NOTE(review): per the PyTorch memory-management notes, max_split_size_mb limits
# the size of cached blocks the allocator may split — confirm 256 MB suits the GPU.
os.environ.update(PYTORCH_CUDA_ALLOC_CONF='max_split_size_mb:256')


def preprocess_function(example, text_column=None):
    """Tokenize one dataset row to exactly ``context_length`` tokens.

    The notebook maps this function over two datasets whose text lives in
    different columns (``'sentence'`` for GLUE SST-2, ``'text'`` for WikiText),
    so the column is auto-detected instead of being hard-coded.

    Args:
        example: A single (non-batched) dataset row, mapping column name to value.
        text_column: Optional explicit name of the column holding the raw text.
            When ``None`` the first of ``'sentence'`` / ``'text'`` present in
            ``example`` is used.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the text, called
        with truncation and ``padding="max_length"`` at ``context_length``.

    Raises:
        KeyError: If ``text_column`` is not given and the row has neither a
            ``'sentence'`` nor a ``'text'`` column.
    """
    if text_column is None:
        for candidate in ("sentence", "text"):
            if candidate in example:
                text_column = candidate
                break
        else:
            raise KeyError(
                "preprocess_function: expected a 'sentence' or 'text' column, "
                f"got {list(example.keys())}"
            )

    final_tokens = tokenizer(
        example[text_column],
        max_length=context_length,
        truncation=True,
        padding="max_length",
    )
    return final_tokens


def group_texts(examples):
    """Concatenate a batch of tokenized rows and re-cut it into fixed-size chunks.

    Every column in ``examples`` (a mapping of column name to a list of lists)
    is flattened into one long list and then sliced into pieces of
    ``context_length`` items. When the flattened length is at least
    ``context_length`` the trailing remainder is dropped; when it is shorter,
    the whole batch is kept as a single, shorter chunk.

    The length is measured on the first column only and applied to all of
    them, so all columns are assumed to have the same flattened length.

    Args:
        examples: Batched rows, e.g. ``{"input_ids": [[...], ...], ...}``.
            Must contain an ``"input_ids"`` column.

    Returns:
        A dict with the same columns, chunked, plus a ``"labels"`` column that
        is a shallow copy of the chunked ``"input_ids"`` list.
    """
    # Local import keeps this cell self-contained.
    from itertools import chain

    # chain.from_iterable flattens in linear time; sum(list_of_lists, [])
    # re-copies the accumulator on every addition and is quadratic.
    concatenated_examples = {
        key: list(chain.from_iterable(examples[key])) for key in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    if total_length >= context_length:
        # Round down to a whole number of chunks.
        total_length = (total_length // context_length) * context_length
    result = {
        key: [
            tokens[start : start + context_length]
            for start in range(0, total_length, context_length)
        ]
        for key, tokens in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

GPTNeoForCausalLM(
  (transformer): GPTNeoModel(
    (wte): Embedding(50257, 2048)
    (wpe): Embedding(2048, 2048)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPTNeoBlock(
        (ln_1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (attn): GPTNeoAttention(
          (attention): GPTNeoSelfAttention(
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (resid_dropout): Dropout(p=0.0, inplace=False)
            (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (out_proj): Linear(in_features=2048, out_features=2048, bias=True)
          )
        )
        (ln_2): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTNeoMLP(
          (c_fc): Linear(in_features=2048, out_features=8192, bias=True)
          (c_proj):

Step,Training Loss,Validation Loss
100,3.8035,3.530045
200,3.4708,3.511658


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


('neo_model/tokenizer_config.json',
 'neo_model/special_tokens_map.json',
 'neo_model/vocab.json',
 'neo_model/merges.txt',
 'neo_model/added_tokens.json',
 'neo_model/tokenizer.json')

In [None]:
# Step 1: tokenize each split row by row, dropping the original columns so only
# the tokenizer's outputs remain.
tokenized_train, tokenized_valid = (
    split.map(
        preprocess_function,
        remove_columns=split.column_names,
        num_proc=num_workers,
    )
    for split in (dataset_train, dataset_valid)
)

# Step 2: regroup the tokenized rows into context_length-sized chunks (batched).
lm_dataset_train, lm_dataset_valid = (
    tokenized.map(group_texts, batched=True, num_proc=num_workers)
    for tokenized in (tokenized_train, tokenized_valid)
)


# os.environ['HF_HOME'] = '/root/autodl-tmp/cache/'

# Load the main model; when bf16 is enabled its weights are cast to bfloat16.
model = AutoModelForCausalLM.from_pretrained(model_name)
if bf16:
    model = model.to(dtype=torch.bfloat16)

print(model)

# Parameter census: all parameters vs. only those with requires_grad set.
total_params = sum(weight.numel() for weight in model.parameters())
total_trainable_params = sum(
    weight.numel() for weight in model.parameters() if weight.requires_grad
)
print(f"main model {total_params:,} total parameters.")
print(f"main model {total_trainable_params:,} training parameters.")



# Shared training configuration; the values come from the hyperparameter cell.
training_args = TrainingArguments(
    # Output location and metric reporting.
    output_dir='logs',
    report_to='tensorboard',
    # Optimisation.
    learning_rate=learning_rate,
    lr_scheduler_type='constant',
    weight_decay=0.01,
    num_train_epochs=num_train_epochs,
    # max_steps=max_steps,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Numeric precision.
    bf16=bf16,
    fp16=fp16,
    # Step-based logging, evaluation and checkpointing.
    # NOTE(review): eval_steps is not passed; presumably it falls back to
    # logging_steps per the TrainingArguments docs — confirm.
    logging_strategy='steps',
    logging_steps=logging_steps,
    eval_strategy='steps',
    save_strategy='steps',
    save_steps=save_steps,
    save_total_limit=2,
    load_best_model_at_end=True,
    # Data loading.
    dataloader_num_workers=num_workers,
)

# Wire the main model, the chunked datasets and the collator into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=lm_dataset_train,
    eval_dataset=lm_dataset_valid,
)

# Release cached CUDA memory before the run starts, then train.
torch.cuda.empty_cache()
history = trainer.train()

# Store the model and its tokenizer together under 'neo_model'. The last line is
# left as a bare expression so the notebook displays the saved file paths.
model.save_pretrained('neo_model')
tokenizer.save_pretrained('neo_model')

In [6]:
# Switch the working dataset to WikiText-2 (raw); this rebinds the dataset_* and
# lm_dataset_* names used by the trainer cells.
# NOTE(review): WikiText rows carry their content in a 'text' column, so
# preprocess_function must read that column for this cell to run — confirm.
dataset_dict = load_dataset("wikitext", "wikitext-2-raw-v1")
dataset_train, dataset_valid = dataset_dict["train"], dataset_dict["validation"]

# Tokenize each split row by row, discarding the original columns.
tokenized_train, tokenized_valid = [
    split.map(
        preprocess_function,
        remove_columns=split.column_names,
        num_proc=num_workers,
    )
    for split in (dataset_train, dataset_valid)
]

# Regroup the tokenized rows into context_length-sized chunks (batched).
lm_dataset_train, lm_dataset_valid = [
    tokenized.map(group_texts, batched=True, num_proc=num_workers)
    for tokenized in (tokenized_train, tokenized_valid)
]

# Load the assistant model; when bf16 is enabled its weights are cast to bfloat16.
ass_model = AutoModelForCausalLM.from_pretrained(ass_model_name)
if bf16:
    ass_model = ass_model.to(dtype=torch.bfloat16)

# os.environ['CURL_CA_BUNDLE'] = ''

print(ass_model)

# Parameter census: all parameters vs. only those with requires_grad set.
total_params = sum(weight.numel() for weight in ass_model.parameters())
total_trainable_params = sum(
    weight.numel() for weight in ass_model.parameters() if weight.requires_grad
)
print(f"assistant model {total_params:,} total parameters.")
print(f"assistant model {total_trainable_params:,} training parameters.")




config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/733k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

GPTNeoForCausalLM(
  (transformer): GPTNeoModel(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(2048, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPTNeoBlock(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPTNeoAttention(
          (attention): GPTNeoSelfAttention(
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (resid_dropout): Dropout(p=0.0, inplace=False)
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=False)
            (q_proj): Linear(in_features=768, out_features=768, bias=False)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTNeoMLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear(in_fe

In [7]:
# Trainer for the assistant model, built from the current lm_dataset_* splits.
# NOTE(review): this reuses training_args, whose output_dir is 'logs', so the
# assistant run writes checkpoints and logs to the same directory as the main
# model's run — confirm that sharing the directory is intended.
ass_trainer = Trainer(
    args=training_args,
    model=ass_model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=lm_dataset_train,
    eval_dataset=lm_dataset_valid,
)
ass_history = ass_trainer.train()

# Store the assistant model and the tokenizer together under 'neo_assis_model'.
# The last line is left as a bare expression so the notebook displays the paths.
ass_model.save_pretrained('neo_assis_model')
tokenizer.save_pretrained('neo_assis_model')

Step,Training Loss,Validation Loss
100,4.9797,4.429636
200,4.5114,4.314662


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


('neo_assis_model/tokenizer_config.json',
 'neo_assis_model/special_tokens_map.json',
 'neo_assis_model/vocab.json',
 'neo_assis_model/merges.txt',
 'neo_assis_model/added_tokens.json',
 'neo_assis_model/tokenizer.json')