# About

Proof-of-concept notebook for fine-tuning GPT-2 with Hugging Face Transformers and PyTorch.

## Setup

In [8]:
# Environment bootstrap: detect where the notebook is running and, on Colab,
# install the required packages and mount Google Drive.

import sys 
import os 

# Platform flags. Detection relies on the platform-specific module already
# being present in sys.modules when this cell runs.
IS_COLAB = "google.colab" in sys.modules 
IS_KAGGLE = "kaggle_secrets" in sys.modules 
# Anything that is neither Colab nor Kaggle is treated as a local run.
IS_LOCAL = not (IS_COLAB or IS_KAGGLE)

if IS_COLAB:
    # Install the packages (-q: quiet output, -U: upgrade to the newest release).
    # NOTE(review): tensorflow-addons does not appear to be used by the visible
    # cells, which are PyTorch-based — confirm it is still needed.
    %pip install -q -U tensorflow-addons
    %pip install -q -U transformers
    %pip install -q -U datasets
    print("You can safely ignore the package incompatibility errors.")
    # Mount Google Drive at /drive.
    from google.colab import drive 
    drive.mount("/drive")

In [9]:

import os
import pandas as pd
import numpy as np
from tqdm import tqdm 

import random 
import shutil 
# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare the numeric (major, minor) components instead of the raw strings:
# string comparison is lexicographic, so e.g. "0.9" >= "0.20" would wrongly
# pass. The regex only reads the leading "<major>.<minor>" digits, so suffixes
# such as "rc1" or ".dev0" do not break the parse.
_sklearn_version_match = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sklearn_version_match is not None, (
    "Unrecognised scikit-learn version: {}".format(sklearn.__version__))
assert tuple(int(part) for part in _sklearn_version_match.groups()) >= (0, 20), (
    "Scikit-Learn >= 0.20 is required, found {}".format(sklearn.__version__))


# Pytorch imports 
import torch
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader

# Others 
import glob 

# Transformers 
import transformers 
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead
from transformers import AutoTokenizer
import datasets 

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)



NameError: name '_C' is not defined

In [None]:
# --  Set environment global vars. 

# Shared env. vars. 
GLOBAL_SEED = 42 
IS_CUDA_ENV = torch.cuda.is_available()
GLOBAL_DEVICE = torch.device('cuda') if IS_CUDA_ENV else torch.device('cpu')
SET_SEED = True # If true, sets the global seeds for this notebook. 

# SMALL_MODEL selects the lightweight checkpoint further down the notebook.
# Colab always uses the full-size model; every other platform falls back to
# the small model when no CUDA device is available. Assigning in a single
# if/else guarantees the name is defined on every platform (previously it was
# left undefined when running on Kaggle).
if IS_COLAB:
    SMALL_MODEL = False
else:
    SMALL_MODEL = not IS_CUDA_ENV

In [None]:
# Configuring env. 
if SET_SEED:
    # To make this notebook's output stable across runs, seed every random
    # number generator the notebook imports: Python's `random`, NumPy and
    # PyTorch.
    random.seed(GLOBAL_SEED)
    np.random.seed(GLOBAL_SEED) 
    torch.manual_seed(GLOBAL_SEED)

In [None]:
# Project Paths
NOTEBOOK_NAME = "gpt_finetune_poc"
# Root of the repository checkout. The default is the original author's local
# path; set the PROJECT_ROOT_DIR environment variable to run the notebook from
# another machine (or from Colab / Kaggle) without editing this cell.
PROJECT_ROOT_DIR = os.environ.get(
    "PROJECT_ROOT_DIR",
    "/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue")
# --- Input data dirs. 
DATASET_NAME = "in_conversation_corpus_poc"
DATASET_TYPE = "csv"
PROCESSED_DATA_DIR = os.path.join(PROJECT_ROOT_DIR, "data", "processed", DATASET_NAME)

# --- Result dirs. 
SAVE_MODEL_DIR = os.path.join(PROJECT_ROOT_DIR, "models", NOTEBOOK_NAME)
REPORTS_DIR = os.path.join(PROJECT_ROOT_DIR, "reports", NOTEBOOK_NAME)

# Create the output directories up front; exist_ok makes the cell re-runnable.
os.makedirs(REPORTS_DIR, exist_ok=True)
os.makedirs(SAVE_MODEL_DIR, exist_ok=True)
# Display the resolved input directory as the cell output.
PROCESSED_DATA_DIR


## Finetuning HuggingFace GPT

In [3]:

# Special tokens used to mark up conversations.
# NOTE: these must be the same markers used in the dataset, which is assumed
# to contain exactly 2 speakers.
SPEAKER_1_TOKEN, SPEAKER_2_TOKEN = "<SP1>", "<SP2>"
CONV_START_TOKEN, CONV_END_TOKEN = "<START>", "<END>"
PAD_TOKEN, EOS_TOKEN = "<PAD>", "<|endoftext|>"



In [4]:
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into REPORTS_DIR.

    Args:
        fig_id: File name (without extension) for the saved figure.
        tight_layout: When true, call plt.tight_layout() before saving.
        fig_extension: File extension, also passed to savefig as the format.
        resolution: Value passed to savefig as dpi.
    """
    file_name = ".".join((fig_id, fig_extension))
    target = os.path.join(REPORTS_DIR, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)


### Tokenizing

In [6]:
# Tokenizer vars. 

# Checkpoint name passed to AutoTokenizer.from_pretrained below.
TOKENIZER_CHECKPOINT = "gpt2"
# batch_size used when mapping the tokenizer over the dataset.
TOKENIZER_BATCH_SIZE = 128

In [7]:
# Load the tokenizer with special tokens defined. 
# The pad / eos tokens are set explicitly, and the speaker and conversation
# boundary markers are registered as additional special tokens.
tokenizer = AutoTokenizer.from_pretrained(
    TOKENIZER_CHECKPOINT, 
    pad_token = PAD_TOKEN, 
    eos_token = EOS_TOKEN, 
    additional_special_tokens=(
        SPEAKER_1_TOKEN, SPEAKER_2_TOKEN, CONV_START_TOKEN, 
        CONV_END_TOKEN))

NameError: name 'AutoTokenizer' is not defined

In [None]:
# Save the tokenizer after adding new tokens 
# It is written to SAVE_MODEL_DIR, the same directory later used as the
# training output_dir.
tokenizer.save_pretrained(SAVE_MODEL_DIR) 

In [None]:
# Reload the tokenizer from the files written to SAVE_MODEL_DIR and display it.
# NOTE(review): from_pretrained is invoked through the existing instance, so
# this cell needs `tokenizer` to exist already — presumably a round-trip check
# of the saved files; confirm.
tokenizer = tokenizer.from_pretrained(SAVE_MODEL_DIR)
tokenizer

In [None]:
# Create the data collator, which is responsible for creating batches from the
# datasets during training.
# mlm=False turns off masked-language-modelling; batches are returned as
# PyTorch tensors ("pt").
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=False, 
    return_tensors="pt")


In [None]:
# Discover the CSV files in the processed-data directory and key them by split
# name, i.e. the file name without its extension (".../train.csv" -> "train").
csv_files = glob.glob(os.path.join(PROCESSED_DATA_DIR, "*.csv"))
available_splits = {os.path.splitext(os.path.basename(p))[0]: p for p in csv_files}
# Verify that the required splits exist and fail with an actionable message
# (instead of a bare KeyError) when one is missing.
missing_splits = [k for k in ('train', 'validation') if k not in available_splits]
if missing_splits:
    raise FileNotFoundError(
        "Missing dataset file(s) for split(s) {} in {}".format(
            missing_splits, PROCESSED_DATA_DIR))
# Only keep the required keys.
dataset_paths = {k: available_splits[k] for k in ('train', 'validation')}
dataset_paths

In [None]:
# NOTE: The TextDataset is deprecated - please see the next cell for the updated 
# method of creating the text dataset. Note that this is still included in 
# case the new approach does not work. 

#####  UNCOMMENT IF NEEDED ######

# train_dataset = TextDataset(
#     tokenizer=tokenizer,
#     file_path=dataset_paths["train"],
#     block_size=128)
# validation_dataset = TextDataset(
#     tokenizer=tokenizer,
#     file_path=dataset_paths["validation"],
#     block_size=128)

##################################

In [None]:
from datasets import load_dataset

In [None]:
# Loading text dataset using new method 
# The splits are loaded as regular (non-streaming) datasets so that they have a
# known length: Trainer refuses a train_dataset without __len__ unless
# max_steps is set, and this notebook configures training by num_train_epochs.
dataset = load_dataset(DATASET_TYPE, data_files=dataset_paths, streaming=False)

In [None]:
# Each row in this dataset is an utterance. 
# Display the 'train' split as the cell output.
dataset['train']

In [None]:
# Display the loaded dataset object (all splits) as the cell output.
dataset

In [None]:
# Once loaded, the dataset needs to be processed 
def tokenize_fn(tokenizer):
    """Build the mapping function used to tokenize the dataset.

    Args:
        tokenizer: Callable applied to the "Utterance" field of the data it
            receives; it is called with truncation=True.

    Returns:
        A one-argument function that takes `data` and returns the tokenizer's
        output for data["Utterance"].
    """
    def _tokenize_utterances(data):
        return tokenizer(data["Utterance"], truncation=True)

    return _tokenize_utterances
# Tokenize every split. batched=True hands the mapping function a batch of
# rows per call, with TOKENIZER_BATCH_SIZE rows in each batch.
tokenized_datasets = dataset.map(
    tokenize_fn(tokenizer), batched=True, batch_size=TOKENIZER_BATCH_SIZE)

In [None]:
# Display the tokenized 'train' split as the cell output.
tokenized_datasets['train']

In [None]:
# Peek at the first tokenized training example.
next(iter(tokenized_datasets['train']))

# Model and Training

### Model / Training

In [None]:
# Checkpoint to fine-tune: "distilgpt2" when SMALL_MODEL is set, otherwise
# "gpt2-large".
MODEL_CHECKPOINT = "distilgpt2" if SMALL_MODEL else "gpt2-large"


In [None]:
from transformers import AutoModelForCausalLM

In [None]:
# Load the model 
# The pad / eos token ids are taken from the tokenizer so that the model and
# the tokenizer agree on them.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_CHECKPOINT, 
    pad_token_id = tokenizer.pad_token_id, 
    eos_token_id = tokenizer.eos_token_id
)
# Resize the embedding matrix to the tokenizer's current vocabulary size, which
# includes the special tokens added when the tokenizer was created.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Defining training arguments 
# NOTE(review): with evaluation_strategy='epoch', eval_steps is presumably
# ignored, while save_steps=1 combined with the default step-based save
# strategy would write a checkpoint after every optimisation step — confirm
# this is intended, especially for the large checkpoint.
training_args = TrainingArguments(
    output_dir=SAVE_MODEL_DIR,
    overwrite_output_dir=False,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_steps=1,
    save_steps=1,
    warmup_steps=1,
    prediction_loss_only=True,
    evaluation_strategy='epoch',
    # Tie the Trainer's own seed to the notebook-wide seed; Trainer seeds from
    # its arguments, so without this a change to GLOBAL_SEED would not reach
    # training.
    seed=GLOBAL_SEED,
    logging_dir=REPORTS_DIR)



In [None]:
# Create the trainer 
# NOTE: Trainer should automatically put the model and dataset to GPU 
# NOTE(review): if the datasets passed here are streamed (no __len__), Trainer
# is expected to require max_steps in the training arguments — verify.
trainer = Trainer(
    model=model,
    args=training_args, 
    data_collator=data_collator, 
    train_dataset=tokenized_datasets['train'], 
    eval_dataset=tokenized_datasets['validation']
)

In [None]:
import gc 

In [None]:
# Clear caches before training 
# Run the garbage collector first so that memory held by unreachable objects is
# released; emptying the CUDA cache afterwards can then also return those
# freed blocks.
gc.collect()
torch.cuda.empty_cache()


In [None]:

# Run fine-tuning, then save the resulting model. No path is passed to
# save_model, so the Trainer's own output location is used.
trainer.train() 
trainer.save_model()