# About

Proof-of-concept notebook for fine-tuning GPT-2 with Hugging Face Transformers and PyTorch.

## Setup

In [1]:
# Detect the hosting environment and, on Colab, install dependencies and mount Drive.

import sys 
import os 

# Flags recording where the notebook runs. Each check looks for a
# platform-specific module name among the modules already loaded in `sys.modules`.
IS_COLAB = "google.colab" in sys.modules 
IS_KAGGLE = "kaggle_secrets" in sys.modules 
# Anything that is neither Colab nor Kaggle is treated as a local run.
IS_LOCAL = not (IS_COLAB or IS_KAGGLE)

if IS_COLAB:
    # Install (or upgrade to the latest release of) the required packages.
    # NOTE(review): the versions are unpinned, and tensorflow-addons does not
    # appear to be used by the visible cells - confirm it is still needed.
    %pip install -q -U tensorflow-addons
    %pip install -q -U transformers
    %pip install -q -U datasets
    print("You can safely ignore the package incompatibility errors.")
    # Mount Google Drive at /drive.
    from google.colab import drive 
    drive.mount("/drive")

In [2]:

import os
import pandas as pd
import numpy as np
from tqdm import tqdm 

import random 
import shutil 
# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) numerically. Comparing the raw version strings is
# lexicographic, so e.g. "0.9" >= "0.20" would wrongly evaluate to True.
_sklearn_version = tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups())
assert _sklearn_version >= (0, 20), (
    "scikit-learn >= 0.20 is required, found {}".format(sklearn.__version__))


# Pytorch imports 
import torch
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader

# Others 
import glob 

# Transformers 
import transformers 
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead
from transformers import AutoTokenizer
import datasets 

# To plot pretty figures
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Notebook-wide label sizes for the axes and for the x/y ticks.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)



In [3]:
# --  Set environment global vars. 

# Shared env. vars. 
GLOBAL_SEED = 42 
IS_CUDA_ENV = torch.cuda.is_available()
GLOBAL_DEVICE = torch.device('cuda' if IS_CUDA_ENV else 'cpu')
SET_SEED = True # If true, sets the global seeds for this notebook. 

# Choose the model size for every environment so that SMALL_MODEL is always
# defined. Previously it was only assigned for local and Colab runs, which left
# it undefined (NameError further down) on Kaggle.
#   - Colab: always use the large model.
#   - Anywhere else: fall back to the small model when no CUDA device exists.
SMALL_MODEL = False if IS_COLAB else not IS_CUDA_ENV

In [None]:
# Configuring env. 
if SET_SEED:
    # To make this notebook's output stable across runs, seed every random
    # number generator the notebook imports: the stdlib `random` module
    # (previously left unseeded), NumPy and PyTorch.
    random.seed(GLOBAL_SEED)
    np.random.seed(GLOBAL_SEED) 
    torch.manual_seed(GLOBAL_SEED)

In [4]:
# Project Paths
NOTEBOOK_NAME = "gpt_finetune_textDataset_poc"
# The repository root differs per machine, so it can be overridden through the
# PROJECT_ROOT_DIR environment variable; the original location stays the default.
PROJECT_ROOT_DIR = os.environ.get(
    "PROJECT_ROOT_DIR",
    "/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue")
# --- Input data dirs. 
DATASET_NAME = "in_conversation_corpus_poc"
DATASET_TYPE = "csv"
PROCESSED_DATA_DIR = os.path.join(PROJECT_ROOT_DIR,"data", "processed", DATASET_NAME)

# --- Result dirs. 
SAVE_MODEL_DIR = os.path.join(PROJECT_ROOT_DIR,"models",NOTEBOOK_NAME)
REPORTS_DIR = os.path.join(PROJECT_ROOT_DIR,"reports",NOTEBOOK_NAME)

# Create the output directories up front; exist_ok makes the cell re-runnable.
os.makedirs(REPORTS_DIR,exist_ok=True)
os.makedirs(SAVE_MODEL_DIR,exist_ok=True)
# Display the resolved input directory as the cell output.
PROCESSED_DATA_DIR


'/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/processed/in_conversation_corpus_poc'

## Finetuning HuggingFace GPT

In [5]:

# Special tokens used to mark up the conversations.
# NOTE: these must match the markers used in the dataset, which is assumed to
# contain exactly two speakers.
SPEAKER_1_TOKEN, SPEAKER_2_TOKEN = "<SP1>", "<SP2>"
CONV_START_TOKEN, CONV_END_TOKEN = "<START>", "<END>"
# Padding and end-of-sequence tokens handed to the tokenizer.
PAD_TOKEN, EOS_TOKEN = "<PAD>", "<|endoftext|>"



In [7]:
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into REPORTS_DIR.

    Args:
        fig_id: File name (without extension) for the saved figure.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to savefig as the format.
        resolution: Value passed to savefig as ``dpi``.
    """
    file_name = "".join((fig_id, ".", fig_extension))
    destination = os.path.join(REPORTS_DIR, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)


### Tokenizing

In [8]:
# Tokenizer configuration: the pretrained checkpoint to load the tokenizer from
# and the batch size reserved for tokenization.
TOKENIZER_CHECKPOINT, TOKENIZER_BATCH_SIZE = "gpt2", 128

In [9]:
# Conversation markers registered as additional special tokens.
dialogue_special_tokens = (
    SPEAKER_1_TOKEN, SPEAKER_2_TOKEN, CONV_START_TOKEN, CONV_END_TOKEN)

# Load the pretrained tokenizer, declaring the pad/eos tokens and the extra
# conversation markers at construction time.
tokenizer = AutoTokenizer.from_pretrained(
    TOKENIZER_CHECKPOINT,
    pad_token=PAD_TOKEN,
    eos_token=EOS_TOKEN,
    additional_special_tokens=dialogue_special_tokens,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
# Persist the tokenizer, including the newly added special tokens, to
# SAVE_MODEL_DIR. The call's return value is shown as the cell output.
tokenizer.save_pretrained(SAVE_MODEL_DIR) 

('/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/models/gpt_finetune_textDataset_poc/tokenizer_config.json',
 '/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/models/gpt_finetune_textDataset_poc/special_tokens_map.json',
 '/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/models/gpt_finetune_textDataset_poc/vocab.json',
 '/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/models/gpt_finetune_textDataset_poc/merges.txt',
 '/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/models/gpt_finetune_textDataset_poc/added_tokens.json',
 '/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/models/gpt_finetune_textDataset_poc/tokenizer.json')

In [11]:
# Reload the tokenizer from the files just written to SAVE_MODEL_DIR and display
# it so the registered special tokens can be inspected.
# NOTE: this rebinds `tokenizer`; `from_pretrained` is invoked through the
# existing instance.
tokenizer = tokenizer.from_pretrained(SAVE_MODEL_DIR)
tokenizer

PreTrainedTokenizerFast(name_or_path='/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/models/gpt_finetune_textDataset_poc', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<PAD>', 'additional_special_tokens': ['<SP1>', '<SP2>', '<START>', '<END>']})

In [12]:
# The collator assembles dataset examples into batches during training.
# mlm=False turns masked-language-modeling off, and return_tensors="pt"
# requests PyTorch tensors.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False, return_tensors="pt")


In [13]:
# Map every split name (the file stem) to its path inside the processed data
# directory. The extension comes from DATASET_TYPE instead of being hardcoded.
dataset_paths = {
    os.path.splitext(os.path.basename(path))[0]: path
    for path in glob.glob(os.path.join(PROCESSED_DATA_DIR, "*." + DATASET_TYPE))}
# Only keep the required keys / verify that they exist, failing with an
# explicit message (rather than a bare KeyError) when a split file is absent.
REQUIRED_SPLITS = ('train', 'validation')
_missing_splits = [split for split in REQUIRED_SPLITS if split not in dataset_paths]
if _missing_splits:
    raise FileNotFoundError(
        "Missing {} file(s) for split(s) {} in {}".format(
            DATASET_TYPE, _missing_splits, PROCESSED_DATA_DIR))
dataset_paths = {split: dataset_paths[split] for split in REQUIRED_SPLITS}
dataset_paths

{'train': '/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/processed/in_conversation_corpus_poc/train.csv',
 'validation': '/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/processed/in_conversation_corpus_poc/validation.csv'}

In [14]:
# NOTE: TextDataset is deprecated, but it is what this proof of concept trains
# on, so this cell is active and must be run: the Trainer below consumes
# `train_dataset` and `validation_dataset`.
# NOTE(review): the inputs are .csv files; confirm they contain nothing (for
# example a header row) that should be kept out of the training text.

# Value passed as `block_size` to both datasets; defined once so that the
# train and validation splits cannot drift apart.
BLOCK_SIZE = 128

train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=dataset_paths["train"],
    block_size=BLOCK_SIZE)
validation_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=dataset_paths["validation"],
    block_size=BLOCK_SIZE)

Token indices sequence length is longer than the specified maximum sequence length for this model (119311 > 1024). Running this sequence through the model will result in indexing errors


# Model and Training

### Model / Training

In [15]:
# Select the pretrained checkpoint according to the SMALL_MODEL switch.
if SMALL_MODEL:
    MODEL_CHECKPOINT = "distilgpt2"
else:
    MODEL_CHECKPOINT = "gpt2-large"


In [16]:
from transformers import AutoModelForCausalLM

In [17]:
# Ids the tokenizer uses for padding and end-of-sequence, forwarded to the model.
special_token_ids = {
    "pad_token_id": tokenizer.pad_token_id,
    "eos_token_id": tokenizer.eos_token_id,
}
# Instantiate the causal language model from the selected checkpoint.
model = AutoModelForCausalLM.from_pretrained(MODEL_CHECKPOINT, **special_token_ids)
# Resize the token embeddings to the tokenizer's current length, which includes
# the added special tokens. The returned embedding is shown as the cell output.
model.resize_token_embeddings(len(tokenizer))

Embedding(50262, 768)

In [18]:
# Defining training arguments 
training_args = TrainingArguments(
    # Checkpoints go to the same directory the tokenizer was saved to.
    output_dir=SAVE_MODEL_DIR,
    overwrite_output_dir=False,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=1,
    prediction_loss_only=True,
    # Evaluate and checkpoint once per epoch. `eval_steps` is only consulted
    # when the evaluation strategy is "steps", so it is deliberately not set.
    evaluation_strategy='epoch',
    save_strategy="epoch",
    logging_dir=REPORTS_DIR)



In [20]:
# Build the Trainer from the model, the training arguments, the batch collator
# and the train / validation datasets.
# NOTE: Trainer should automatically put the model and dataset to GPU 
trainer = Trainer(
    model=model, args=training_args, data_collator=data_collator,
    train_dataset=train_dataset, eval_dataset=validation_dataset)

In [21]:
import gc 

In [22]:
# Clear caches before training 
# Run the garbage collector first so memory held by unreachable objects is
# handed back to PyTorch's caching allocator, and only then release the cached
# blocks. In the opposite order the memory freed by the collector stays cached.
n_unreachable = gc.collect()
torch.cuda.empty_cache()
# Show the number of unreachable objects the collector found (cell output).
n_unreachable


181

In [23]:

# Run fine-tuning, then save the resulting model.
# NOTE(review): save_model() is called without a path; presumably it writes to
# the output_dir configured in TrainingArguments - confirm.
trainer.train() 
trainer.save_model()

***** Running training *****
  Num examples = 932
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 59


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/59 [00:00<?, ?it/s]