In [1]:
import torch
from datasets import load_from_disk
from trl import SFTConfig, SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, get_scheduler
from torch.utils.data import DataLoader
from torch.optim import AdamW

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True




In [2]:
# Model used (HuggingFace)
model_id = "microsoft/phi-1_5" 

# Path to the tokenized dataset
tokenized_dataset_path = "C:/AI_Stuff/data_processed"

# Load the tokenized dataset
tokenized_dataset = load_from_disk(tokenized_dataset_path)

max_length = 1024

In [3]:
# Create DataLoader
def create_dataloader(dataset, batch_size=2, shuffle=True):
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

train_dataloader = create_dataloader(tokenized_dataset, batch_size=2)

In [4]:
# Load the model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    use_cache=False,
    trust_remote_code=True,
    torch_dtype="bfloat16",
    attn_implementation="flash_attention_2"
)

# Select device and move model to the device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

model.gradient_checkpointing_enable()

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


In [5]:
# Create optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-4)
num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [6]:
sft_config = SFTConfig(
    dataset_text_field=["inputs", "outputs"],
    max_seq_length=max_length,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    neftune_noise_alpha=True,
    bf16=True,
    tf32=True,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    optim="adamw_hf",
    num_train_epochs=1,
    remove_unused_columns=True,
    output_dir="C:/AI_Stuff/Models/temp"
)

In [7]:
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=tokenized_dataset,  # The dataset can be passed directly here
    data_collator=lambda data: {
        'input_ids': torch.stack([f['input_ids'] for f in data]),
        'attention_mask': torch.stack([f['attention_mask'] for f in data]),
        'labels': torch.stack([f['labels'] for f in data]),
    }
)


In [8]:
trainer.train()
model.save_pretrained("C:/AI_Stuff/Models/phi-1_5-test-4")




Step,Training Loss
500,0.7115
1000,0.6757
1500,0.6561
2000,0.6535
2500,0.645
3000,0.6562
3500,0.6647
4000,0.6345
4500,0.6485
5000,0.6271


RuntimeError: [enforce fail at inline_container.cc:603] . unexpected pos 4211178240 vs 4211178128