In [None]:
import torch
from transformers import DataCollatorForLanguageModeling, GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
import pandas as pd

# GPT-2 tokenizer; the end-of-sequence token is reused as the padding token
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Read the corpus from the local parquet file
data = pd.read_parquet("C:\\Users\\USER\\Downloads\\0005.parquet")

# The documents are expected in a column named 'text'
texts = data['text'].tolist()

# Keep only the first tenth of the documents (1/10 of 250MB equivalent)
portion_size = len(texts) // 10
small_texts = texts[:portion_size]

# Tokenize the dataset
def tokenize_function(texts):
    """Encode ``texts`` with the module-level ``tokenizer``.

    Every sequence is truncated and padded to exactly 1024 tokens.
    Returns whatever the tokenizer call returns for these options.
    """
    encode_options = dict(padding='max_length', truncation=True, max_length=1024)
    return tokenizer(texts, **encode_options)

tokenized_texts = tokenize_function(texts=small_texts)

# Ensure all token IDs are within the valid range
def ensure_valid_token_ids(tokenized_data, tokenizer):
    """Replace token IDs that do not round-trip through the vocabulary with the pad ID.

    Each ID is converted to its token string and back again. If the round trip
    yields ``None`` or an ID that is not below ``len(tokenizer)``, the pad
    token's ID is substituted; otherwise the round-tripped ID is kept.

    Args:
        tokenized_data: Mapping with an ``'input_ids'`` entry holding an
            iterable of ID sequences. It is modified in place.
        tokenizer: Object providing ``convert_ids_to_tokens``,
            ``convert_tokens_to_ids``, ``pad_token`` and ``__len__``.

    Returns:
        The same ``tokenized_data`` object, with ``'input_ids'`` replaced by a
        list of lists of sanitised IDs. All other entries are left untouched.
    """
    vocab_size = len(tokenizer)
    # Loop-invariant: resolve the pad ID once instead of once per rejected token.
    pad_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

    def _sanitize(token_id):
        # A single ID -> token -> ID round trip per token (the previous
        # implementation repeated it up to three times).
        round_tripped = tokenizer.convert_tokens_to_ids(
            tokenizer.convert_ids_to_tokens(token_id)
        )
        if round_tripped is not None and round_tripped < vocab_size:
            return round_tripped
        return pad_id

    tokenized_data['input_ids'] = [
        [_sanitize(token_id) for token_id in ids]
        for ids in tokenized_data['input_ids']
    ]
    return tokenized_data

# The encoding is updated in place and the same object is returned
tokenized_texts = ensure_valid_token_ids(tokenized_data=tokenized_texts, tokenizer=tokenizer)

# Convert to PyTorch tensors
class CustomDataset(torch.utils.data.Dataset):
    """Dataset over pre-tokenized sequences.

    Stores the ``'input_ids'`` and ``'attention_mask'`` entries of the given
    encoding and converts one example to tensors each time it is indexed.
    """

    def __init__(self, tokenized_data):
        # Keep references to both parallel sequences; nothing is copied here.
        self.attention_mask = tokenized_data['attention_mask']
        self.input_ids = tokenized_data['input_ids']

    def __len__(self):
        # One example per entry in input_ids
        return len(self.input_ids)

    def __getitem__(self, idx):
        # Tensors are built per access from the stored sequences
        ids_tensor = torch.tensor(self.input_ids[idx])
        mask_tensor = torch.tensor(self.attention_mask[idx])
        return {'input_ids': ids_tensor, 'attention_mask': mask_tensor}

# Create an instance of the custom dataset
train_dataset = CustomDataset(tokenized_texts)

# Create a data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # We are not using masked language modeling for GPT-2
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=500,  # Log every 500 steps
    report_to="all"  # Report to all available loggers (TensorBoard, console, etc.)
)

# Initialize the model
model = GPT2LMHeadModel.from_pretrained("gpt2")
# Keep the embedding matrix the same size as the tokenizer. The pad token
# reuses the EOS token, so no new tokens were added to the vocabulary above.
model.resize_token_embeddings(len(tokenizer))

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset
)

# Train the model
trainer.train()

# Write the final weights and the tokenizer files directly into output_dir.
# Trainer checkpoints live in checkpoint-<step> subdirectories, so without this
# step from_pretrained("./results") has nothing to load.
trainer.save_model(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)


An optimized version of the above algorithm

In [1]:
import torch
from transformers import DataCollatorForLanguageModeling, GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, AdamW, get_linear_schedule_with_warmup
import pandas as pd

# Enable CUDA launch blocking through the environment.
# NOTE(review): this is usually a debugging setting — confirm it is still
# wanted in the optimized run.
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# GPT-2 tokenizer; the end-of-sequence token is reused as the padding token
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Read the corpus from the local parquet file
data = pd.read_parquet("C:\\Users\\USER\\Downloads\\0005.parquet")

# The documents are expected in a column named 'text'
texts = data['text'].tolist()

# Keep only the first 1/100 of the documents
portion_size = len(texts) // 100
small_texts = texts[:portion_size]

# Tokenize the dataset
def tokenize_function(texts):
    """Encode ``texts`` with the module-level ``tokenizer``.

    Every sequence is truncated and padded to exactly 1024 tokens.
    Returns whatever the tokenizer call returns for these options.
    """
    encode_options = dict(padding='max_length', truncation=True, max_length=1024)
    return tokenizer(texts, **encode_options)

tokenized_texts = tokenize_function(texts=small_texts)

# Ensure all token IDs are within the valid range
def ensure_valid_token_ids(tokenized_data, tokenizer):
    """Replace token IDs that do not round-trip through the vocabulary with the pad ID.

    Each ID is converted to its token string and back again. If the round trip
    yields ``None`` or an ID that is not below ``len(tokenizer)``, the pad
    token's ID is substituted; otherwise the round-tripped ID is kept.

    Args:
        tokenized_data: Mapping with an ``'input_ids'`` entry holding an
            iterable of ID sequences. It is modified in place.
        tokenizer: Object providing ``convert_ids_to_tokens``,
            ``convert_tokens_to_ids``, ``pad_token`` and ``__len__``.

    Returns:
        The same ``tokenized_data`` object, with ``'input_ids'`` replaced by a
        list of lists of sanitised IDs. All other entries are left untouched.
    """
    vocab_size = len(tokenizer)
    # Loop-invariant: resolve the pad ID once instead of once per rejected token.
    pad_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

    def _sanitize(token_id):
        # A single ID -> token -> ID round trip per token (the previous
        # implementation repeated it up to three times).
        round_tripped = tokenizer.convert_tokens_to_ids(
            tokenizer.convert_ids_to_tokens(token_id)
        )
        if round_tripped is not None and round_tripped < vocab_size:
            return round_tripped
        return pad_id

    tokenized_data['input_ids'] = [
        [_sanitize(token_id) for token_id in ids]
        for ids in tokenized_data['input_ids']
    ]
    return tokenized_data

# The encoding is updated in place and the same object is returned
tokenized_texts = ensure_valid_token_ids(tokenized_data=tokenized_texts, tokenizer=tokenizer)

# Convert to PyTorch tensors
class CustomDataset(torch.utils.data.Dataset):
    """Dataset over pre-tokenized sequences.

    Stores the ``'input_ids'`` and ``'attention_mask'`` entries of the given
    encoding and converts one example to tensors each time it is indexed.
    """

    def __init__(self, tokenized_data):
        # Keep references to both parallel sequences; nothing is copied here.
        self.attention_mask = tokenized_data['attention_mask']
        self.input_ids = tokenized_data['input_ids']

    def __len__(self):
        # One example per entry in input_ids
        return len(self.input_ids)

    def __getitem__(self, idx):
        # Tensors are built per access from the stored sequences
        ids_tensor = torch.tensor(self.input_ids[idx])
        mask_tensor = torch.tensor(self.attention_mask[idx])
        return {'input_ids': ids_tensor, 'attention_mask': mask_tensor}

# Create an instance of the custom dataset
train_dataset = CustomDataset(tokenized_texts)

# Create a data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # We are not using masked language modeling for GPT-2
)

# fp16 mixed precision needs a CUDA device; fall back to full precision
# instead of failing when none is available.
use_fp16 = torch.cuda.is_available()

# On Windows, DataLoader worker processes are spawned and have to re-import the
# dataset class. A class defined in a notebook cell cannot be re-imported, so
# the workers fail or hang; load in the main process there.
num_loader_workers = 0 if os.name == "nt" else 4

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    save_steps=1000,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=500,  # Log every 500 steps
    report_to="all",  # Report to all available loggers (TensorBoard, console, etc.)
    no_cuda=False,
    fp16=use_fp16,
    gradient_accumulation_steps=4,  # 16 x 4 = 64 sequences per optimizer update
    dataloader_num_workers=num_loader_workers,
    dataloader_pin_memory=True
)

# Initialize the model
model = GPT2LMHeadModel.from_pretrained("gpt2")
# Keep the embedding matrix the same size as the tokenizer. The pad token
# reuses the EOS token, so no new tokens were added to the vocabulary above.
model.resize_token_embeddings(len(tokenizer))

# Optimizer and scheduler
# NOTE(review): transformers.AdamW is deprecated in favour of torch.optim.AdamW;
# switching changes the default eps / weight_decay, so it is left as is here.
optimizer = AdamW(model.parameters(), lr=5e-5)

# The scheduler advances once per optimizer update, not once per batch, so
# gradient accumulation has to be part of the step count. Counting batches only
# makes the linear schedule several times too long and the learning rate never
# decays to zero. Ceiling division is used for both counts so the schedule is
# never shorter than the run; depending on the Trainer version the trailing
# partial accumulation window may or may not count as an update.
batches_per_epoch = -(-len(train_dataset) // training_args.train_batch_size)
updates_per_epoch = max(1, -(-batches_per_epoch // training_args.gradient_accumulation_steps))
num_training_steps = updates_per_epoch * training_args.num_train_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    optimizers=(optimizer, scheduler)
)

# Train the model
trainer.train()

# Write the final weights and the tokenizer files directly into output_dir.
# Trainer checkpoints live in checkpoint-<step> subdirectories, so without this
# step from_pretrained("./results") has nothing to load.
trainer.save_model(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)


  from .autonotebook import tqdm as notebook_tqdm
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmsfasha[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/19 [00:00<?, ?it/s]

Loading the Saved Model
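To load the saved model and tokenizer for inference or further training, you can use the following code. It assumes the final model and tokenizer files were written directly to ./results (for example with trainer.save_model("./results") and tokenizer.save_pretrained("./results")); the Trainer's own checkpoints are stored in checkpoint-<step> subdirectories instead: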

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./results")
tokenizer = GPT2Tokenizer.from_pretrained("./results")

# Use the model and tokenizer for inference or further training
inputs = tokenizer("Hello, how are you?", return_tensors="pt")
# Pass the attention mask and an explicit pad token ID so generate() does not
# have to guess them (GPT-2 defines no pad token of its own; EOS is used).
outputs = model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Summary
Checkpoints are saved in checkpoint-<step> subdirectories of the ./results directory at intervals defined by save_steps.
Logs are saved in the ./logs directory.
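The final model and tokenizer end up directly in the ./results directory only if they are saved explicitly after training, e.g. with trainer.save_model("./results") and tokenizer.save_pretrained("./results"); trainer.train() on its own only writes checkpoint subdirectories.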
This setup ensures that you have access to intermediate checkpoints and the final trained model, which can be used for evaluation, inference, or further fine-tuning.