# About

Proof of concept notebook for finetuning GPT using HuggingFace and Torch 

## Setup

In [47]:
# Download libraries for environment. 

import sys 
import os 

# Env. vars to check if the notebook is running on colab, kaggle etc. 
IS_COLAB = "google.colab" in sys.modules 
IS_KAGGLE = "kaggle_secrets" in sys.modules 
IS_LOCAL = not (IS_COLAB or IS_KAGGLE)

if IS_COLAB:
    # Install the packages 
    %pip install -q -U tensorflow-addons
    %pip install -q -U transformers
    %pip install -q -U datasets
    print("You can safely ignore the package incompatibility errors.")
    # Mount the drive 
    from google.colab import drive 
    drive.mount("/drive")

In [48]:

import os
import pandas as pd
import numpy as np
from tqdm import tqdm 

import random 
import shutil 
# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"


# Pytorch imports 
import torch
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader

# Others 
import glob 

# Transformers 
import transformers 
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead
from transformers import AutoTokenizer
import datasets 

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)



In [49]:
# --  Set environment global vars. 

# Shared env. vars. 
GLOBAL_SEED = 42 
IS_CUDA_ENV = torch.cuda.is_available()
GLOBAL_DEVICE = torch.device('cuda') if IS_CUDA_ENV else torch.device('cpu')
SET_SEED = True # If true, sets the global seeds for this notebook. 

if IS_LOCAL:
    SMALL_MODEL = True if not IS_CUDA_ENV else False # Use a small dataset if no cuda env. 

if IS_COLAB:
    SMALL_MODEL = False 

In [50]:
# Configuring env. 
if SET_SEED:
    # to make this notebook's output stable across runs
    np.random.seed(GLOBAL_SEED) 
    torch.manual_seed(GLOBAL_SEED)

In [51]:
# Project Paths
NOTEBOOK_NAME = "2.0-MU-GPT-Finetune-TextDataset-POC"
PROJECT_ROOT_DIR = "/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue" 
# --- Input data dirs. 
DATASET_NAME = "ICC/julia_dissertation"
DATASET_TYPE = "csv"
PROCESSED_DATA_DIR = os.path.join(PROJECT_ROOT_DIR,"data","datasets", "processed", DATASET_NAME)

# --- Result dirs. 
SAVE_MODEL_DIR = os.path.join(PROJECT_ROOT_DIR,"data","models",NOTEBOOK_NAME)
REPORTS_DIR = os.path.join(PROJECT_ROOT_DIR,"data","reports",NOTEBOOK_NAME)

PROCESSED_DATA_DIR


'/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/datasets/processed/ICC/julia_dissertation'

## Finetuning HuggingFace GPT

In [52]:

# NOTE: The below should be the same in the dataset - assuming there are 2 speakers! 
SPEAKER_1_TOKEN = "<SP1>"
SPEAKER_2_TOKEN = "<SP2>"
CONV_START_TOKEN = "<START>"
CONV_END_TOKEN = "<END>"
PAD_TOKEN = "<PAD>"
EOS_TOKEN = "<|endoftext|>"



In [53]:
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(REPORTS_DIR, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)


### Tokenizing

In [54]:
# Tokenizer vars. 

TOKENIZER_CHECKPOINT = "gpt2"

In [55]:
# Load the tokenizer with special tokens defined. 
tokenizer = AutoTokenizer.from_pretrained(
    TOKENIZER_CHECKPOINT, 
    pad_token = PAD_TOKEN, 
    eos_token = EOS_TOKEN, 
    additional_special_tokens=(
        SPEAKER_1_TOKEN, SPEAKER_2_TOKEN, CONV_START_TOKEN, 
        CONV_END_TOKEN))

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /Users/muhammadumair/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_pr

In [56]:
# Save the tokenizer after adding new tokens 
tokenizer.save_pretrained(os.path.join(SAVE_MODEL_DIR,"tokenizer")) 

tokenizer config file saved in /Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/models/2.0-MU-GPT-Finetune-TextDataset-POC/tokenizer/tokenizer_config.json
Special tokens file saved in /Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/models/2.0-MU-GPT-Finetune-TextDataset-POC/tokenizer/special_tokens_map.json


('/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/models/2.0-MU-GPT-Finetune-TextDataset-POC/tokenizer/tokenizer_config.json',
 '/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/models/2.0-MU-GPT-Finetune-TextDataset-POC/tokenizer/special_tokens_map.json',
 '/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/models/2.0-MU-GPT-Finetune-TextDataset-POC/tokenizer/vocab.json',
 '/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/models/2.0-MU-GPT-Finetune-TextDataset-POC/tokenizer/merges.txt',
 '/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/models/2.0-MU-GPT-Finetune-TextDataset-POC/tokenizer/added_tokens.json',
 '/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/models/2.0-MU-GPT-Finetune-TextDataset-POC/tokenizer/tokenizer.json')

In [57]:
tokenizer = tokenizer.from_pretrained(os.path.join(SAVE_MODEL_DIR,"tokenizer"))
tokenizer

loading file /Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/models/2.0-MU-GPT-Finetune-TextDataset-POC/tokenizer/vocab.json
loading file /Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/models/2.0-MU-GPT-Finetune-TextDataset-POC/tokenizer/merges.txt
loading file /Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/models/2.0-MU-GPT-Finetune-TextDataset-POC/tokenizer/tokenizer.json
loading file /Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/models/2.0-MU-GPT-Finetune-TextDataset-POC/tokenizer/added_tokens.json
loading file /Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/models/2.0-MU-GPT-Finetune-TextDataset-POC/tokenizer/special_tokens_map.json
loading file /Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/models/2.0-MU-GPT-Finetune-TextDataset

PreTrainedTokenizerFast(name_or_path='/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/models/2.0-MU-GPT-Finetune-TextDataset-POC/tokenizer', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<PAD>', 'additional_special_tokens': ['<SP1>', '<SP2>', '<START>', '<END>']})

In [58]:
# Create the data collator, which is responsible for creating batches from the
# datasets during training.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=False, 
    return_tensors="pt")


In [61]:

# NOTE: Loading the text files for the TextDataset class instead of the csv files. 
dataset_paths = glob.glob("{}/*.txt".format(PROCESSED_DATA_DIR))
dataset_paths = {os.path.splitext(os.path.basename(p))[0] : p for p in dataset_paths}
# Only keep the required keys / verify that they exist.
dataset_paths = {k : dataset_paths[k] for k in ('train','validation')} 
dataset_paths

{'train': '/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/datasets/processed/ICC/julia_dissertation/train.txt',
 'validation': '/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/datasets/processed/ICC/julia_dissertation/validation.txt'}

In [62]:
# NOTE: The TextDataset is deprecated - please see the next cell for the updated 
# method of creating the text dataset. Note that this is still included in 
# case the new approach does not work. 

####  UNCOMMENT IF NEEDED ######

train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=dataset_paths["train"],
    block_size=128)
validation_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=dataset_paths["validation"],
    block_size=128)

#################################

Loading features from cached file /Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/datasets/processed/ICC/julia_dissertation/cached_lm_GPT2TokenizerFast_128_train.txt [took 0.004 s]
Loading features from cached file /Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/datasets/processed/ICC/julia_dissertation/cached_lm_GPT2TokenizerFast_128_validation.txt [took 0.017 s]


In [63]:
len(tokenizer)

50262

In [64]:
# TextDataset is loading the tokenized version of the utterances
# NOTE: It includes the newlines in the data as tokens. 
train_dataset[0]

tensor([50260,   198, 50258,   428,   318,  1257,   220, 50258,   198, 50259,
          428,   318, 35607,   502,   503,  2728,  1312,  1254,   588,   545,
         2045,   379,   588,   616, 14580,   220, 50259,   198, 50258,   547,
         8066,  1716,  3881,  5788,   220, 50258,   198, 50259,  1312,  1254,
          588,   545,   588,   287,   262,  2003,   428,   318, 38598,   502,
          503,   220, 50259,   198, 50258, 24486,   993,   663,   588,   340,
         3073,   588,   340,   714,   307,   257, 10162,   475,   663,   407,
          220, 50258,   198, 50259,  1312,  1254,   588,  1312,   588,   340,
          318,   257, 10162,   588,   545,  1107, 24069,  1312, 17666,   892,
          547,  2035,   588,   287,   262,  1181,   284,  5412,   220, 50259,
          198, 50258,  7926, 10194,  1312,   760,  1312,  7360,   423,   257,
        50082,   290,   545,   290,   588,   304,    86,  8788,   644,   373,
         1312,  2282,   878,   356,  6807,   287,   994,   220])

In [65]:
tokenizer.decode(train_dataset[0][7]), len(tokenizer.decode(train_dataset[0]))

('<SP1>', 546)

# Model and Training

### Model / Training

In [66]:
MODEL_CHECKPOINT = "distilgpt2" if SMALL_MODEL else "gpt2-large"


In [67]:
from transformers import AutoModelForCausalLM

In [68]:
# Load the model 
model = AutoModelForCausalLM.from_pretrained(
    MODEL_CHECKPOINT, 
    pad_token_id = tokenizer.pad_token_id, 
    eos_token_id = tokenizer.eos_token_id
)
model.resize_token_embeddings(len(tokenizer))

loading configuration file https://huggingface.co/distilgpt2/resolve/main/config.json from cache at /Users/muhammadumair/.cache/huggingface/transformers/f985248d2791fcff97732e4ee263617adec1edb5429a2b8421734c6d14e39bee.422318838d1ec4e061efb4ea29671cb2a044e244dc69229682bebd7cacc81631
Model config GPT2Config {
  "_name_or_path": "distilgpt2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "pad_token_id": 50257,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "s

Embedding(50262, 768)

In [43]:
# Defining training arguments 
# NOTE: These args. are taken from Julia's original code - I've found that 
# deviations drastically change the output during training. 
training_args = TrainingArguments(
        output_dir=SAVE_MODEL_DIR,
        overwrite_output_dir=False,
        num_train_epochs=1,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        eval_steps=200,
        save_steps=400,
        warmup_steps=300,
        prediction_loss_only=True,
        evaluation_strategy='epoch')

In [44]:
# Create the trainer 
# NOTE: Trainer should automatically put the model and dataset to GPU 
trainer = Trainer(
    model=model,
    args=training_args, 
    data_collator=data_collator, 
    train_dataset=train_dataset,
    eval_dataset=validation_dataset
)

In [45]:
import gc 

In [46]:
# Clear caches before training 
torch.cuda.empty_cache()
gc.collect() 


396

In [29]:
############# NOTE: DO NOT RUN UNLESS GPU ENVIRONMENT 
trainer.train() 
trainer.save_model()

***** Running training *****
  Num examples = 658
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 42


  0%|          | 0/42 [00:00<?, ?it/s]