# About

Proof-of-concept notebook for fine-tuning GPT using HuggingFace and Torch. It is 
similar to notebook 2.0; however, in this notebook we replace 
TextDataset with custom chunking of the datasets. 

## Setup

In [1]:
# Download libraries for environment. 

import sys 
import os 

# Flags describing where the notebook is running (Colab, Kaggle or locally).
# Each hosted platform is detected by checking whether its helper module has
# already been loaded into sys.modules; anything else is treated as local.
IS_COLAB = "google.colab" in sys.modules 
IS_KAGGLE = "kaggle_secrets" in sys.modules 
IS_LOCAL = not (IS_COLAB or IS_KAGGLE)

if IS_COLAB:
    # Install / upgrade the packages in the Colab kernel environment.
    # NOTE(review): tensorflow-addons does not appear to be used by any cell
    # visible in this notebook — confirm whether it is still required.
    %pip install -q -U tensorflow-addons
    %pip install -q -U transformers
    %pip install -q -U datasets
    print("You can safely ignore the package incompatibility errors.")
    # Mount Google Drive at /drive.
    # NOTE(review): the project paths defined later must be reachable from
    # this mount point when running on Colab — confirm.
    from google.colab import drive 
    drive.mount("/drive")

In [2]:

import os
import pandas as pd
import numpy as np
from tqdm import tqdm 

import random 
import shutil 
# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) numerically: a plain string comparison mis-orders
# versions (for example "0.9" >= "0.20" evaluates to True).
_sklearn_version_match = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert (
    _sklearn_version_match is not None
    and tuple(int(part) for part in _sklearn_version_match.groups()) >= (0, 20)
), f"scikit-learn >= 0.20 is required, found {sklearn.__version__}"


# Pytorch imports 
import torch
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader

# Others 
import glob 

# Transformers 
import transformers 
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead
from transformers import AutoTokenizer
import datasets 

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)



In [3]:
# --  Set environment global vars. 

# Shared env. vars. 
GLOBAL_SEED = 42 
IS_CUDA_ENV = torch.cuda.is_available()
GLOBAL_DEVICE = torch.device('cuda') if IS_CUDA_ENV else torch.device('cpu')
SET_SEED = True # If true, sets the global seeds for this notebook. 

# Number of rows kept per split when resources are limited. Defined on every
# platform so that cells reading it never hit a NameError.
SMALL_DATASET_SIZE = 500

# Colab always runs the full configuration. Everywhere else (local machine,
# Kaggle) fall back to the reduced configuration when no GPU is available.
# Defining this unconditionally keeps the later cells runnable on Kaggle too.
LIMITED_RESOURCES = False if IS_COLAB else not IS_CUDA_ENV

In [4]:
# Configuring env. 
if SET_SEED:
    # to make this notebook's output stable across runs, seed every random
    # number generator the notebook imports: Python's `random`, NumPy and Torch.
    random.seed(GLOBAL_SEED)
    np.random.seed(GLOBAL_SEED) 
    torch.manual_seed(GLOBAL_SEED)

In [5]:
# Project Paths
NOTEBOOK_NAME = "gpt_finetune_custom_dataset_poc"
# The repository root defaults to the original author's checkout. Set the
# GPT_M2D_PROJECT_ROOT environment variable to run the notebook from any other
# location (another machine, Colab, Kaggle) without editing this cell.
PROJECT_ROOT_DIR = os.environ.get(
    "GPT_M2D_PROJECT_ROOT",
    "/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue")
# --- Input data dirs. 
DATASET_NAME = "in_conversation_corpus_poc"
DATASET_TYPE = "csv"
PROCESSED_DATA_DIR = os.path.join(PROJECT_ROOT_DIR, "data", "processed", DATASET_NAME)

# --- Result dirs. 
SAVE_MODEL_DIR = os.path.join(PROJECT_ROOT_DIR, "models", NOTEBOOK_NAME)
REPORTS_DIR = os.path.join(PROJECT_ROOT_DIR, "reports", NOTEBOOK_NAME)

# Create the output directories up front so later cells can write into them.
os.makedirs(REPORTS_DIR, exist_ok=True)
os.makedirs(SAVE_MODEL_DIR, exist_ok=True)
PROCESSED_DATA_DIR


'/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/processed/in_conversation_corpus_poc'

## Finetuning HuggingFace GPT

In [6]:

# Special marker tokens for the conversation data.
# NOTE: These must be the same as in the dataset, which is assumed to contain
# exactly 2 speakers!
SPEAKER_1_TOKEN, SPEAKER_2_TOKEN = "<SP1>", "<SP2>"
CONV_START_TOKEN, CONV_END_TOKEN = "<START>", "<END>"
PAD_TOKEN, EOS_TOKEN = "<PAD>", "<|endoftext|>"



In [7]:
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current matplotlib figure to REPORTS_DIR.

    Args:
        fig_id: File name (without extension) for the saved figure.
        tight_layout: When true, call plt.tight_layout() before saving.
        fig_extension: File extension, also passed to savefig as the format.
        resolution: Value passed to savefig as dpi.
    """
    target_path = os.path.join(REPORTS_DIR, ".".join((fig_id, fig_extension)))
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target_path, format=fig_extension, dpi=resolution)


### Tokenizer Loading

In [85]:
# Tokenizer vars. 

TOKENIZER_CHECKPOINT = "gpt2"

In [86]:
# TODO: The next cell used to register the additional tokens as new tokens.
# However, it seems that the special tokens are not masked by the data
# collator, which might lead to weird training results.
# Therefore, a basic version of the tokenizer is loaded here instead.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

In [87]:
# # Load the tokenizer with special tokens defined. 
# tokenizer = AutoTokenizer.from_pretrained(
#     TOKENIZER_CHECKPOINT, 
#     pad_token = PAD_TOKEN, 
#     eos_token = EOS_TOKEN, 
#     additional_special_tokens=(
#         SPEAKER_1_TOKEN, SPEAKER_2_TOKEN, CONV_START_TOKEN, 
#         CONV_END_TOKEN))

In [88]:
# Save the tokenizer files to SAVE_MODEL_DIR.
# NOTE: The cell that adds the extra special tokens is currently commented
# out, so this saves the tokenizer as loaded from TOKENIZER_CHECKPOINT.
tokenizer.save_pretrained(SAVE_MODEL_DIR) 

('/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/models/gpt_finetune_custom_dataset_poc/tokenizer_config.json',
 '/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/models/gpt_finetune_custom_dataset_poc/special_tokens_map.json',
 '/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/models/gpt_finetune_custom_dataset_poc/vocab.json',
 '/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/models/gpt_finetune_custom_dataset_poc/merges.txt',
 '/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/models/gpt_finetune_custom_dataset_poc/added_tokens.json',
 '/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/models/gpt_finetune_custom_dataset_poc/tokenizer.json')

In [89]:
# Reload the tokenizer from SAVE_MODEL_DIR and display it.
# from_pretrained is called through the existing tokenizer object, and the
# `tokenizer` name is rebound to the reloaded result.
tokenizer = tokenizer.from_pretrained(SAVE_MODEL_DIR)
tokenizer

PreTrainedTokenizerFast(name_or_path='/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/models/gpt_finetune_custom_dataset_poc', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'})

In [90]:
# This is the maximum sequence length (context size) reported by the tokenizer. 
# NOTE: This can be changed depending on the amount of GPU memory that is available. 
tokenizer.model_max_length

1024

### Dataset Loading

In [91]:
# Map every CSV file in the processed-data directory to its split name, i.e.
# the file name without its extension ("train.csv" -> "train").
REQUIRED_SPLITS = ('train', 'validation')
available_split_paths = {
    os.path.splitext(os.path.basename(path))[0]: path
    for path in glob.glob(os.path.join(PROCESSED_DATA_DIR, "*.csv"))
}
# Fail early with an actionable message instead of a bare KeyError when a
# required split file is missing.
missing_splits = [split for split in REQUIRED_SPLITS if split not in available_split_paths]
if missing_splits:
    raise FileNotFoundError(
        "Missing dataset file(s) for split(s) {} in {}".format(
            missing_splits, PROCESSED_DATA_DIR))
# Only keep the required splits.
dataset_paths = {split: available_split_paths[split] for split in REQUIRED_SPLITS}
dataset_paths

{'train': '/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/processed/in_conversation_corpus_poc/train.csv',
 'validation': '/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/processed/in_conversation_corpus_poc/validation.csv'}

In [92]:
# NOTE: The TextDataset is deprecated - please see the next cell for the updated 
# method of creating the text dataset. Note that this is still included in 
# case the new approach does not work. 

#####  UNCOMMENT IF NEEDED ######

# train_dataset = TextDataset(
#     tokenizer=tokenizer,
#     file_path=dataset_paths["train"],
#     block_size=128)
# validation_dataset = TextDataset(
#     tokenizer=tokenizer,
#     file_path=dataset_paths["validation"],
#     block_size=128)

##################################

In [93]:
from datasets import load_dataset, DatasetDict

In [94]:
# Load the splits with `datasets.load_dataset`.
if not LIMITED_RESOURCES:
    # Full run: load every row of both splits.
    dataset = load_dataset(DATASET_TYPE, data_files=dataset_paths)
else:
    # Limited run: only read the first SMALL_DATASET_SIZE rows of each split.
    subset_splits = [
        datasets.ReadInstruction(split_name, from_=0, to=SMALL_DATASET_SIZE, unit='abs')
        for split_name in ('train', 'validation')
    ]
    train_dataset, validation_dataset = load_dataset(
        DATASET_TYPE, data_files=dataset_paths, split=subset_splits)
    dataset = DatasetDict(train=train_dataset, validation=validation_dataset)



Using custom data configuration default-ec291898ba3afb33
Reusing dataset csv (/Users/muhammadumair/.cache/huggingface/datasets/csv/default-ec291898ba3afb33/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)


  0%|          | 0/2 [00:00<?, ?it/s]

In [95]:
# Each row in this dataset is an utterance. 
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'convID', 'Utterance'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'convID', 'Utterance'],
        num_rows: 500
    })
})

### Dataset Preprocessing and Tokenizing

In [96]:
TOKENIZER_BATCH_SIZE = 128

In [97]:
# Once loaded, the dataset needs to be processed 
def tokenize_fn(tokenizer):
    """Build a function that tokenizes the "Utterance" field of its input.

    Args:
        tokenizer: Callable invoked as tokenizer(utterances, truncation=True).

    Returns:
        A one-argument function that takes a mapping containing the key
        "Utterance" and returns whatever the tokenizer returns for it.
    """
    def _tokenize_utterances(data):
        return tokenizer(data["Utterance"], truncation=True)

    return _tokenize_utterances
# NOTE: batched=True lets tokenize_fn process many rows of the input data per
# call for faster processing - but does not affect the tokenizer results
# themselves.
# Every original column is dropped so only the tokenizer outputs remain. The
# list is taken from the dataset itself rather than hard-coded, so the cell
# does not depend on the "Unnamed: 0" index column being present in the CSV.
tokenized_datasets = dataset.map(
    tokenize_fn(tokenizer),
    batched=True,
    remove_columns=dataset["train"].column_names)
# tokenized_datasets = dataset.map(
#     tokenize_fn(tokenizer), batched=True, batch_size=TOKENIZER_BATCH_SIZE, remove_columns=["Unnamed: 0","convID","Utterance"])

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [98]:
# Tokenizing adds the input_ids and attention_mask. 
tokenized_datasets['train']

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 500
})

In [99]:
# Each item in the tokenized dataset is simply that utterance
tokenized_datasets['train'][10]

{'input_ids': [50259, 8788, 220, 50259], 'attention_mask': [1, 1, 1, 1]}

In [100]:
# Slicing produces a list of lists for each feature
# Cap the printed preview so the cell output stays readable on large datasets.
MAX_PREVIEW_UTTERANCES = 20
if LIMITED_RESOURCES:
    tokenized_samples = tokenized_datasets["train"][:SMALL_DATASET_SIZE]
else:
    # [:] selects every row (a [:-1] slice would drop the last utterance).
    tokenized_samples = tokenized_datasets["train"][:]
utterance_lengths = [len(sample) for sample in tokenized_samples["input_ids"]]
for idx, length in enumerate(utterance_lengths[:MAX_PREVIEW_UTTERANCES]):
    print(f"'>>> Utterance index {idx} : Utterance length: {length}'")
if len(utterance_lengths) > MAX_PREVIEW_UTTERANCES:
    print(f"... {len(utterance_lengths) - MAX_PREVIEW_UTTERANCES} more utterances not shown")
tokenized_samples.keys()

'>>> Utterance index 0 : Utterance length: 1'
'>>> Utterance index 1 : Utterance length: 6'
'>>> Utterance index 2 : Utterance length: 18'
'>>> Utterance index 3 : Utterance length: 8'
'>>> Utterance index 4 : Utterance length: 16'
'>>> Utterance index 5 : Utterance length: 18'
'>>> Utterance index 6 : Utterance length: 27'
'>>> Utterance index 7 : Utterance length: 28'
'>>> Utterance index 8 : Utterance length: 4'
'>>> Utterance index 9 : Utterance length: 14'
'>>> Utterance index 10 : Utterance length: 4'
'>>> Utterance index 11 : Utterance length: 14'
'>>> Utterance index 12 : Utterance length: 24'
'>>> Utterance index 13 : Utterance length: 8'
'>>> Utterance index 14 : Utterance length: 24'
'>>> Utterance index 15 : Utterance length: 9'
'>>> Utterance index 16 : Utterance length: 22'
'>>> Utterance index 17 : Utterance length: 12'
'>>> Utterance index 18 : Utterance length: 7'
'>>> Utterance index 19 : Utterance length: 10'
'>>> Utterance index 20 : Utterance length: 31'
'>>> Utter

dict_keys(['input_ids', 'attention_mask'])

In [101]:
# We can manually create chunks of data of the dataset using dict. comprehension. 
# NOTE: We need to decide whether to pad the last chunk or discard it. 

def chunk_tokenized_samples(tokenized_samples,chunk_size=128):
    """Concatenate tokenized utterances and split them into fixed-size chunks.

    Args:
        tokenized_samples: Mapping with the keys 'input_ids' and
            'attention_mask', each holding a list of per-utterance lists.
        chunk_size: Number of tokens per chunk. Must be positive.

    Returns:
        A dict with the same two keys, each holding a list of chunks of
        exactly chunk_size items. Trailing tokens that do not fill a whole
        chunk are discarded.

    Raises:
        ValueError: If chunk_size is not positive.
    """
    # Imported locally so the function does not depend on another cell.
    from itertools import chain

    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    keys = ('input_ids', 'attention_mask')
    # Flatten in linear time; sum(list_of_lists, []) copies the accumulated
    # list at every step and is therefore quadratic.
    concatenated_examples = {
        k: list(chain.from_iterable(tokenized_samples[k])) for k in keys
    }
    # Largest multiple of chunk_size that fits into the concatenated tokens.
    # NOTE: This method is discarding the last (incomplete) chunk.
    usable_length = (len(concatenated_examples[keys[0]]) // chunk_size) * chunk_size
    return {
        k: [
            concatenated_examples[k][start:start + chunk_size]
            for start in range(0, usable_length, chunk_size)
        ]
        for k in keys
    }
    

# Apply the chunking to every split using the default chunk_size (128).
# NOTE(review): with batched=True the function is presumably called once per
# batch of rows, so the incomplete trailing chunk would be dropped for every
# batch rather than once per split — confirm against the `datasets` map docs.
lm_datasets = tokenized_datasets.map(chunk_tokenized_samples,batched=True)
lm_datasets

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 52
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 47
    })
})

In [107]:
# This is the number of tokens in each chunk - which should be equal to chunk size. 
len(lm_datasets['train'][0]['input_ids'])

128

In [109]:
# This is the result of decoding the first CHUNK_SIZE tokens in the dataset. 
tokenizer.decode(lm_datasets['train'][0]['input_ids'])

'<START><SP1> this is fun <SP1><SP2> this is freaking me out cause i feel like im looking at like my reflection <SP2><SP1> were gonna become rock stars <SP1><SP2> i feel like im like in the future this is creeping me out <SP2><SP1> woah its like it looks like it could be a mirror but its not <SP1><SP2> i feel like i like it is a mirror like im really disturbed i dont think were either like in the state to handle <SP2><SP1> sorry yeah i know i literally have a migraine and im and like ew okay what was i saying before we walked in here <SP1><SP2> stats <SP2><SP1> stats'

In [79]:
# The data collator turns dataset items into batches during training.
# mlm=False disables masked-language-modelling; batches are returned as
# PyTorch tensors.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="pt")

# Model and Training

### Model / Training

In [None]:
MODEL_CHECKPOINT = "distilgpt2" if LIMITED_RESOURCES else "gpt2-large"


In [None]:
from transformers import AutoModelForCausalLM

In [None]:
# Load the model 
# The base tokenizer loaded above defines no pad token, so its pad_token_id is
# None. Fall back to the EOS id in that case so the model config always
# carries a valid padding id; a tokenizer with its own pad token is unaffected.
model_pad_token_id = tokenizer.pad_token_id
if model_pad_token_id is None:
    model_pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(
    MODEL_CHECKPOINT,
    pad_token_id=model_pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
# Match the embedding matrix size to the tokenizer length.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Defining training arguments 
training_args = TrainingArguments(
    output_dir=SAVE_MODEL_DIR,
    overwrite_output_dir=False,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # NOTE(review): eval_steps only applies to step-based evaluation; with
    # evaluation_strategy='epoch' below it is expected to have no effect.
    eval_steps=200,
    save_steps=400,
    warmup_steps=300,
    prediction_loss_only=True,
    evaluation_strategy='epoch',
    # Tie the trainer's seed to the notebook-wide seed instead of relying on
    # the library default.
    seed=GLOBAL_SEED)


In [None]:
# Build the trainer from the model, training arguments, collator and the
# chunked train / validation splits.
# NOTE: Trainer should automatically put the model and dataset to GPU 
trainer = Trainer(
    model=model, args=training_args, data_collator=data_collator,
    train_dataset=lm_datasets['train'], eval_dataset=lm_datasets['validation'],
)

In [None]:
import gc 

In [None]:
# Clear caches before training 
# Run the garbage collector first so memory held by unreachable objects is
# freed before PyTorch's cached CUDA memory is released.
gc.collect()
torch.cuda.empty_cache()


In [None]:
# Run training, then save the resulting model through the trainer.
# NOTE: This will probably not run locally. 
trainer.train() 
trainer.save_model()