In [1]:
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaConfig, LlamaForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
import torch
import os
from trl import SFTTrainer
from accelerate import init_empty_weights
from datasets import load_dataset, DatasetDict
import tensorboard


In [2]:
# Load Data and model
cache_dir = "/workspace/Sys_team/yuxuan_workspace/dsc180a/.cache"
model_name = "NousResearch/Meta-Llama-3-8B"
model_name = "Weyaxi/Einstein-v8-Llama3.2-1B"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir = cache_dir)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir = cache_dir ).to(device)
# dataset = load_from_disk('./dataset/RedPajama-Data-1T-Sample')
# model

In [3]:
# - reinitialize weights
# with init_empty_weights():
#     model = AutoModelForCausalLM.from_config(model.config)
model = AutoModelForCausalLM.from_config(model.config)
model.to(device)
model_size = sum(t.numel() for t in model.parameters())
print(f"TinyLlama size: {model_size/1000**2:.1f}M parameters")

TinyLlama size: 1235.8M parameters


In [None]:
# - load Small Dataset

ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train[:10%]", cache_dir=cache_dir)
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation[:10%]", cache_dir=cache_dir)

raw_datasets = DatasetDict(
    {
        "train": ds_train,  # .shuffle().select(range(50000)),
        "valid": ds_valid,  # .shuffle().select(range(500))
    }
)

raw_datasets


DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 6067
    })
    valid: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 166
    })
})

In [5]:
context_length = 2048
outputs = tokenizer(
    raw_datasets["train"][:2]["content"],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Input IDs length: 3
Input chunk lengths: [2048, 504, 1483]
Chunk mapping: [0, 0, 1]


In [6]:
# - batch tokenized dataset
def tokenize(element):
    outputs = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names, batch_size=64, num_proc=64
)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 7047
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 161
    })
})

In [7]:
# - initialize data collator
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([5, 2048])
attention_mask shape: torch.Size([5, 2048])
labels shape: torch.Size([5, 2048])


In [None]:
# Set up training arguments
torch.cuda.empty_cache()
args = TrainingArguments(
    output_dir="model_checkpoints",
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    evaluation_strategy="steps",
    eval_steps=30,
    logging_steps=50,
    gradient_accumulation_steps=8,
    num_train_epochs=10,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_0,
    fp16=True,
    report_to=["tensorboard"],
    logging_dir = "tensorboard_logs",
    gradient_checkpointing=True
)


trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"]
)


  trainer = Trainer(


In [10]:
trainer.train()

Step,Training Loss,Validation Loss
30,No log,9.502385
60,10.264400,7.706743
90,10.264400,6.708943
120,7.201500,6.178133
150,6.081800,5.649419


TrainOutput(global_step=176, training_loss=7.4955572648481885, metrics={'train_runtime': 2692.7325, 'train_samples_per_second': 2.617, 'train_steps_per_second': 0.065, 'total_flos': 8.418445674676224e+16, 'train_loss': 7.4955572648481885, 'epoch': 0.9985815602836879})