In [1]:
import os
os.environ['KMP_DUPLICATE_LIB_OK']='TRUE'


In [2]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

tokenizer.pad_token = tokenizer.eos_token

with open('stupid_questions.txt', 'r', encoding='utf-8') as file:
    # Read the entire content as a single string
    content = file.read().strip()  # strip() to remove any trailing newlines at the end of the file

# Split the content by double newlines which separate sections
texts = content.split('\n\n')

# Tokenize and encode sequences
encodings = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Assume encodings contain all the tokenized data
total_data_size = len(encodings['input_ids'])
# Define a minimum size for the evaluation dataset
min_eval_size = max(2, int(0.1 * total_data_size))  # At least 10% or 1 sample for evaluation

# Calculate the split index ensuring there's at least some eval data
split_index = max(1, total_data_size - min_eval_size)

# Split the dataset into training and evaluation
train_encodings = {
    'input_ids': encodings['input_ids'][:split_index],
    'attention_mask': encodings['attention_mask'][:split_index]
}
eval_encodings = {
    'input_ids': encodings['input_ids'][split_index:],
    'attention_mask': encodings['attention_mask'][split_index:]
}

# Print out the sizes to confirm the split
print(f"Training dataset size: {len(train_encodings['input_ids'])}")
print(f"Evaluation dataset size: {len(eval_encodings['input_ids'])}")


Training dataset size: 72
Evaluation dataset size: 7


In [4]:
import torch
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = item['input_ids'].clone()  # Ensure labels are included for loss computation
        return item

# Example usage of the TextDataset
train_dataset = TextDataset(train_encodings)  
eval_dataset = TextDataset(eval_encodings)   


In [5]:
from accelerate import Accelerator, DataLoaderConfiguration

accelerator = Accelerator(
    dataloader_config=DataLoaderConfiguration(
        dispatch_batches=None, 
        split_batches=False, 
        even_batches=True, 
        use_seedable_sampler=True
    )
)


In [6]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments

# Path to the output directory and specific checkpoint
output_dir = r'C:\Users\Maisy 2\Desktop\small and stupid questions\results'
checkpoint_path = r'C:\Users\Maisy 2\Desktop\small and stupid questions\results\checkpoint'

# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Load the model from the checkpoint if it exists, otherwise from 'gpt2'
#if os.path.exists(checkpoint_path):
    #model = GPT2LMHeadModel.from_pretrained(checkpoint_path)
    #print("Loaded model from checkpoint:", checkpoint_path)
#else:
model = GPT2LMHeadModel.from_pretrained('gpt2')
print("Loaded pre-trained 'gpt2' model")

# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,  # Ensure this matches the directory containing the checkpoints
    num_train_epochs=600,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=os.path.join(output_dir, 'logs'),  # Logs can be in a subdirectory
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=10,
    gradient_accumulation_steps=2,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
    load_best_model_at_end=True,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Train the model
trainer.train(resume_from_checkpoint=checkpoint_path if os.path.exists(checkpoint_path) else None)

# Save the model and tokenizer at the end of training
model.save_pretrained(os.path.join(output_dir, 'newmodel'))
tokenizer.save_pretrained(os.path.join(output_dir, 'newmodel'))

print(f"Training dataset size: {len(train_dataset)}")
print(f"Evaluation dataset size: {len(eval_dataset)}")




Loaded pre-trained 'gpt2' model


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  0%|          | 10/5400 [02:29<21:42:42, 14.50s/it]

{'loss': 8.6363, 'grad_norm': 152.97425842285156, 'learning_rate': 1.0000000000000002e-06, 'epoch': 1.11}


                                                    
  0%|          | 10/5400 [02:34<21:42:42, 14.50s/it]

{'eval_loss': 9.943297386169434, 'eval_runtime': 4.5152, 'eval_samples_per_second': 1.55, 'eval_steps_per_second': 0.221, 'epoch': 1.11}


  0%|          | 12/5400 [03:03<23:18:27, 15.57s/it]