In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Check if GPU is available
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

Using device: mps


In [3]:

# Load dataset
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')

dataset


DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [None]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2',use_fast=true)
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    inputs = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)
    inputs['labels'] = inputs['input_ids'].copy()
    return inputs

tokenised_datasets = dataset.map(tokenize_function, batched=True)

tokenised_datasets

Map: 100%|██████████| 36718/36718 [00:02<00:00, 13789.85 examples/s]


DatasetDict({
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3760
    })
})

In [5]:
%env PYTORCH_ENABLE_MPS_FALLBACK=1

env: PYTORCH_ENABLE_MPS_FALLBACK=1


In [6]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='model/',
    eval_strategy='epoch',
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='model/logs',
    # no_cuda=True,
    use_mps_device=True,  #VERY IMPORTANT PARAM
)

model = AutoModelForCausalLM.from_pretrained('distilgpt2')
model.to(device)
# base_model.to("mps")

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenised_datasets['train'],
    eval_dataset=tokenised_datasets['validation']
)

# Train the model
# trainer.train(resume_from_checkpoint='model/checkpoints/checkpoint')
trainer.train()



`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,1.3791,1.382495


TrainOutput(global_step=9180, training_loss=1.4348974506319998, metrics={'train_runtime': 1876.0005, 'train_samples_per_second': 19.572, 'train_steps_per_second': 4.893, 'total_flos': 1199286761029632.0, 'train_loss': 1.4348974506319998, 'epoch': 1.0})

In [7]:
# save the model and tokenizer explicitly
model_output_dir = 'model/trained_model'

model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)

('model/trained_model/tokenizer_config.json',
 'model/trained_model/special_tokens_map.json',
 'model/trained_model/vocab.json',
 'model/trained_model/merges.txt',
 'model/trained_model/added_tokens.json',
 'model/trained_model/tokenizer.json')