## Fine Tuning GPT 2 Pretrained Model

Here the GPT 2 model is fine-tuned on a custom dataset of stories.

These dataset is generated by combining a list of stories stored in a set location within google drive.

In [None]:
# The libraries that are required to run and fine-tune the GPT 2 model
# These libraries are provided by Hugging Face
!pip install -U accelerate
!pip install -U transformers

# This resets the environment automatically to apply the installations above.
import os
os.kill(os.getpid(), 9)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Here the code checks if the system has a GPU and if so the processing is set
# to the GPU
import torch

device = torch.device(
    "cuda" if torch.cuda.is_available() else "cpu"
)
print(f'Using Device: {device}')

Using Device: cuda


In [4]:
# Here the individual story files are accessed and combined to one text file.
# This allows the model to be fine-tuned at once on all the data of all the
# selected stories.
import os
import string
import re
from google.colab import files

data_dir = "/content/drive/MyDrive/Colab Notebooks/Story Gen/Dataset/01 Normal"

# This dataset contains stories that have been repeated to match the longest
# story in the collection to try and eliminate bias
# data_dir = "/content/drive/MyDrive/Colab Notebooks/The Story Gen/Dataset/02 Equal Length"

output_file = "all_data.txt"

def is_hidden(filepath):
  return os.path.basename(filepath).startswith('.')

with open(output_file, "w") as outfile:
  for filename in os.listdir(data_dir):
    filepath = os.path.join(data_dir,filename)
    if not is_hidden(filepath):
      with open(filepath,"r",encoding='utf-8-sig') as infile:
        for line in infile:
          if line.strip():
            clean = line.replace("\n", " ").replace('. ', '.\n')
            outfile.write(clean)

# files.download('all_data.txt')

In [5]:
# Here the pretrained GPT-2 model and relevant Tokenizer is loaded onto memory
# Additionally the pad token is set to the EOS token of the GPT 2 tokenizer
from transformers import GPT2Tokenizer,GPT2LMHeadModel,TrainingArguments,Trainer,DataCollatorWithPadding

from torch.utils.data import Dataset

tokenizer = GPT2Tokenizer.from_pretrained("/content/drive/MyDrive/Colab Notebooks/Story Gen/GPT2 Pretrained/tokenizer")
model = GPT2LMHeadModel.from_pretrained("/content/drive/MyDrive/Colab Notebooks/Story Gen/GPT2 Pretrained/model")

# If a local copy of the fine-tuned model is unavailable, use the following two lines to load the model and tokenizer to load it directly from Huggingface:

# tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
# model = GPT2LMHeadModel.from_pretrained("gpt2-medium")


if tokenizer.pad_token is None:
  tokenizer.pad_token = tokenizer.eos_token
  tokenizer.pad_token_id = tokenizer.eos_token_id


In [28]:
 hempnuts


NameError: name 'hempnuts' is not defined

In [26]:
# This class creates the dataset using the combined text file and the GPT-2
# Tokenizer. The combined text file's words are converted to Tokens for
# processing.
class CustomDataset(Dataset):
  def __init__(self, tokenizer, file_path, block_size):
    self.tokenizer = tokenizer
    with open(file_path,"r") as f:
      self.text = f.read().splitlines()

  def __len__(self):
    return len(self.text)

  def __getitem__(self,idx):
    tokenized_inputs = self.tokenizer(
        self.text[idx],
        truncation = True,
        max_length = 128,
        padding = "max_length",
        return_tensors = "pt"
    )
    global jembus
    jembus = tokenized_inputs
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"]
    return tokenized_inputs

In [27]:
# Here, the dataset, data collator and the training arguments are set.
# These training arguments control the how fast the model learns as well as
# the sizing of batches given to the model. The Epochs for training are set
# here as well to ensure sufficient repetitions are carried out on the dataset.
data = CustomDataset(tokenizer, "all_data.txt", 128)
print(CustomDataset.jembus)
rented = CustomDataset.jembus
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    per_device_train_batch_size = 50,
    num_train_epochs = 4,
    learning_rate = 1e-4,
    output_dir = './trained',
    load_best_model_at_end = False,
    evaluation_strategy = "no",
    remove_unused_columns = False,
    push_to_hub = False,
    save_total_limit = 5,
)

IndexError: string index out of range

In [None]:
# The infamous trainer.train() command passes all the training arguments
# tokenized dataset and data collator on for training the model.
# The training loss of the model can be seen in steps of 500.
trainer = Trainer(
    model=model,
    args = training_args,
    train_dataset = data,
    eval_dataset = None,
    data_collator = data_collator,
)

trainer.train()

Step,Training Loss
500,0.8311
1000,0.7803
1500,0.7584
2000,0.7443
2500,0.6559
3000,0.6356
3500,0.6459
4000,0.6481
4500,0.5777
5000,0.5479


TrainOutput(global_step=8528, training_loss=0.60569942243551, metrics={'train_runtime': 4196.9564, 'train_samples_per_second': 32.506, 'train_steps_per_second': 2.032, 'total_flos': 3.1675194588266496e+16, 'train_loss': 0.60569942243551, 'epoch': 4.0})

In [None]:
# The infamous trainer.train() command passes all the training arguments
# tokenized dataset and data collator on for training the model.
# The training loss of the model can be seen in steps of 500.
# This training was on the dataset with equal length stories and so
# consumed over 3 hours for training (on a A 100 GPU)
trainer = Trainer(
    model=model,
    args = training_args,
    train_dataset = data,
    eval_dataset = None,
    data_collator = data_collator,
)

trainer.train()

Step,Training Loss
500,0.6898
1000,0.607
1500,0.5488
2000,0.4902
2500,0.4478
3000,0.4115
3500,0.3424
4000,0.3227
4500,0.3065
5000,0.2871


TrainOutput(global_step=12252, training_loss=0.3047974041930375, metrics={'train_runtime': 12547.4402, 'train_samples_per_second': 48.809, 'train_steps_per_second': 0.976, 'total_flos': 1.4219150593779302e+17, 'train_loss': 0.3047974041930375, 'epoch': 4.0})

In [None]:
#The below two lines of code save the fine-tuned model in Google Drive
model.save_pretrained("/content/drive/MyDrive/Colab Notebooks/Story Gen/GPT 2 Finetuned/model")
tokenizer.save_pretrained("/content/drive/MyDrive/Colab Notebooks/Story Generator/GPT 2 Finetuned/tokenizer")

In [None]:
# Here the pretrained GPT-2 model and relevant Tokenizer is loaded onto memory
# Additionally the pad token is set to the EOS token of the GPT 2 tokenizer
from transformers import GPT2Tokenizer,GPT2LMHeadModel,TrainingArguments,Trainer,DataCollatorWithPadding

from torch.utils.data import Dataset

tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")

if tokenizer.pad_token is None:
  tokenizer.pad_token = tokenizer.eos_token
  tokenizer.pad_token_id = tokenizer.eos_token_id


# The below two lines of code save the pretrained model in Google Drive to
# easily load the model each time it is required instead of downloading from
# Hugging face
model.save_pretrained("/content/drive/MyDrive/Colab Notebooks/Story Gen/GPT2 Pretrained/model")
tokenizer.save_pretrained("/content/drive/MyDrive/Colab Notebooks/Story Generator/GPT2 Pretrained/tokenizer")