# About 

Proof of concept for the using a finetuned GPT model to infer word probabilities 
given the previous context/ 

## Setup 

In [1]:
# Download libraries for environment. 

import sys 
import os 

# Env. vars to check if the notebook is running on colab, kaggle etc. 
IS_COLAB = "google.colab" in sys.modules 
IS_KAGGLE = "kaggle_secrets" in sys.modules 
IS_LOCAL = not (IS_COLAB or IS_KAGGLE)

if IS_COLAB:
    # Install the packages 
    %pip install -q -U tensorflow-addons
    %pip install -q -U transformers
    %pip install -q -U datasets
    print("You can safely ignore the package incompatibility errors.")
    # Mount the drive 
    from google.colab import drive 
    drive.mount("/drive")

In [2]:

import os
import pandas as pd
import numpy as np
from tqdm import tqdm 

import random 
import shutil 
# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"


# Pytorch imports 
import torch
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader

# Others 
import glob 

# Transformers 
import transformers 
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments,AutoModelForCausalLM
from transformers import AutoTokenizer
import datasets 

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)



In [3]:
# --  Set environment global vars. 

# Shared env. vars. 
GLOBAL_SEED = 42 
IS_CUDA_ENV = torch.cuda.is_available()
GLOBAL_DEVICE = torch.device('cuda') if IS_CUDA_ENV else torch.device('cpu')
SET_SEED = True # If true, sets the global seeds for this notebook. 

if IS_LOCAL:
    SMALL_MODEL = True if not IS_CUDA_ENV else False # Use a small dataset if no cuda env. 

if IS_COLAB:
    SMALL_MODEL = False 

In [4]:
# Configuring env. 
if SET_SEED:
    # to make this notebook's output stable across runs
    np.random.seed(GLOBAL_SEED) 
    torch.manual_seed(GLOBAL_SEED)

In [58]:
# Project Paths
NOTEBOOK_NAME = "gpt_inference_poc"
PROJECT_ROOT_DIR = "/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue" 
# --- Input data dirs. 
DATASET_NAME = "daily_dialog_poc"
DATASET_TYPE = "csv"
PROCESSED_DATA_DIR = os.path.join(PROJECT_ROOT_DIR,"data", "processed", DATASET_NAME)
RAW_DATA_DIR = os.path.join(PROJECT_ROOT_DIR,"data", "raw", NOTEBOOK_NAME)

# --- Result dirs. 
# NOTE: The model dir will have to change depending on where the models are stored. 
REPORTS_DIR = os.path.join(PROJECT_ROOT_DIR,"reports",NOTEBOOK_NAME)

# --- Saved model / tokenizer paths 
TRAINED_MODEL_DIR = os.path.join(PROJECT_ROOT_DIR,"models","gpt_finetune_poc","checkpoint-3")
TRAINED_TOKENIZER_DIR = os.path.join(PROJECT_ROOT_DIR,"models","gpt_finetune_poc")

os.makedirs(REPORTS_DIR,exist_ok=True)


In [6]:
# Methods for GPU Support 

def to_device(data, device):
    """Move tensor(s) to the given device"""
    if isinstance(data, (list, tuple)):
        return [to_device(x,device) for x in data] 
    return data.to(device, non_blocking=True) 

class DeviceDataLoader():
    """Wrapper for a dataloader to move all the data to the specified device"""
    def __init__(self, dl, device):
        self.dl = dl 
        self.device = device 

    def __iter__(self):
        for b in self.dl:
            yield to_device(b, self.device)
    
    def __len__(self):
        return len(self.dl)


## Initialization 

In [7]:
MODEL_CHECKPOINT = "distilgpt2" if SMALL_MODEL else "gpt2-large"
# Tokenizer vars. 
TOKENIZER_CHECKPOINT = "gpt2"
TOKENIZER_BATCH_SIZE = 128

# 

In [8]:

# Loading the model / tokenizer 
model = AutoModelForCausalLM.from_pretrained(TRAINED_MODEL_DIR)
model


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50262, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [9]:
tokenizer = AutoTokenizer.from_pretrained(TRAINED_TOKENIZER_DIR)
tokenizer

PreTrainedTokenizerFast(name_or_path='/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/models/gpt_finetune_poc', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<PAD>', 'additional_special_tokens': ['<SP1>', '<SP2>', '<START>', '<END>']})

In [10]:
# Loading the dataset 
from datasets import load_dataset

In [11]:
dataset_paths = glob.glob("{}/*.csv".format(PROCESSED_DATA_DIR))
dataset_paths = {os.path.splitext(os.path.basename(p))[0] : p for p in dataset_paths}
# Only keep the required keys / verify that they exist. 
dataset_paths = {k : dataset_paths[k] for k in ('train','validation','test')}
dataset_paths

{'train': '/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/processed/daily_dialog_poc/train.csv',
 'validation': '/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/processed/daily_dialog_poc/validation.csv',
 'test': '/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/processed/daily_dialog_poc/test.csv'}

In [12]:
# Read the test dataset as a dataframe 
test_df = pd.read_csv(dataset_paths['test'],index_col=0)
test_df

Unnamed: 0,convID,Utterance
0,0,<START>
1,0,<SP1> hey man you wanna buy some weed <SP1>
2,0,<SP2> some what <SP2>
3,0,<SP1> weed you know <SP1>
4,0,<SP1> pot ganja mary jane some chronic <SP1>
5,0,<SP2> oh umm no thanks <SP2>
6,0,<SP1> i also have blow if you prefer to do a f...
7,0,<SP2> no i am ok really <SP2>
8,0,<SP1> come on man i even got dope and acid try...
9,0,<SP2> do you really have all of these drugs <SP2>


## Inference

In [13]:

# NOTE: The below should be the same in the dataset - assuming there are 2 speakers! 
SPEAKER_1_TOKEN = "<SP1>"
SPEAKER_2_TOKEN = "<SP2>"
CONV_START_TOKEN = "<START>"
CONV_END_TOKEN = "<END>"
PAD_TOKEN = "<PAD>"
EOS_TOKEN = "<|endoftext|>"

# The ID of the conversation in test.csv from where to start (included)
START_CONVERSATION_NO = 0 

# The ID of the last conversation in test.csv, -1 for all conversations.
# NOTE: If END_CONVERSATION_NO = 80 and START_CONVERSATION_NO = 10, 
# includes conversations with IDs 10 to 79. 
END_CONVERSATION_NO = -1

# Defines the number of words in each turn, from the end, for which the
# probability is calculated.
# NOTE: If set to -1, the probability of all the words will be calculated.
N_PROBS = -1  # TODO: Change to -1 before running on HPC.

# The maximum size, in words, of the context.
# NOTE: Larger buffer size will increase inference time.
CONTEXT_BUFFER_SIZE = 512 


In [None]:
# Segment the dataset into conversations 
conversation_dfs = [test_df.loc[test_df['convID'] == i] for i in range(np.max(test_df['convID'].unique()) + 1)]
assert len(conversation_dfs) >= END_CONVERSATION_NO
conversation_dfs = conversation_dfs[START_CONVERSATION_NO:END_CONVERSATION_NO]
len(conversation_dfs)

In [None]:
# All conversations are not in separate lists 
conversation_dfs[0]

In [52]:
# Function for loading and segment the data. 

def load_inference_dataset(csv_path, start_conv_no=START_CONVERSATION_NO, 
        end_conv_no=END_CONVERSATION_NO):
    df = pd.read_csv(csv_path,index_col=0)
    conversation_dfs = [df.loc[df['convID'] == i] for i in range(np.max(df['convID'].unique()) + 1)]
    if end_conv_no > len(conversation_dfs) or end_conv_no == -1:
        end_conv_no = len(conversation_dfs)
    assert len(conversation_dfs) >= end_conv_no
    assert start_conv_no < end_conv_no
    conversation_dfs = conversation_dfs[start_conv_no:end_conv_no]
    return conversation_dfs


In [53]:
# Reading all the conversations
conversation_dfs = load_inference_dataset(
    dataset_paths['test'], 
    start_conv_no=0,
    end_conv_no=-1)
len(conversation_dfs)

3

In [54]:
# Reading all the conversations with a very large end value. 
conversation_dfs = load_inference_dataset(
    dataset_paths['test'], 
    start_conv_no=1,
    end_conv_no=100)
len(conversation_dfs)

2

In [55]:
# Loading only 1 conversation 
conversation_dfs = load_inference_dataset(
    dataset_paths['test'], 
    start_conv_no=1,
    end_conv_no=2)
len(conversation_dfs)

1

In [41]:
def get_last_word_prob(model, tokenizer, text):
    sentence_so_far = text
    context = ' '.join(text.split()[:-1])
    # Encode
    context_encoding = tokenizer.encode(
        context, return_tensors="pt")
    whole_text_encoding = tokenizer.encode(
        sentence_so_far, return_tensors="pt")
    cw_encoding = whole_text_encoding[:, context_encoding.shape[1]:]
    output = model(whole_text_encoding)
    # Obtain the logits for the last hidden state and the logits
    # that provide values for the tokens in the critical word.
    # i.e., if cw token starts at position i in the sentence, then the logits
    # are from i-1 to len(tokens) - 1.
    cw_extracted_logits = output.logits[-1, context_encoding.shape[1]-1:-1, :]
    # Obtain the probabilities from the logits
    softmax = torch.nn.Softmax(dim=-1)
    cw_extracted_probs_from_logits = softmax(cw_extracted_logits)
    # NOTE: Converting to log scale and taking exponential sum of the log
    # probabilities at the end will ensure that there is not floating point
    # overflow issue for very small probability values.
    cw_extracted_log_probs_from_logits = torch.log(
        cw_extracted_probs_from_logits)
    # Extract the probabilities of the specific tokens
    cw_tokens_probs = []
    for cw_subtoken, probs in zip(cw_encoding[0], cw_extracted_log_probs_from_logits):
        cw_tokens_probs.append(probs[cw_subtoken])
    return float(torch.exp(torch.sum(torch.Tensor(cw_tokens_probs))))

In [42]:
text = " ".join(conversation_dfs[0].iloc[:5]["Utterance"])
text

'<START> <SP1> the taxi drivers are on strike again <SP1> <SP2> what for <SP2> <SP1> they want the government to reduce the price of the gasoline <SP1> <SP2> it is really a hot potato <SP2>'

In [43]:
get_last_word_prob(model, tokenizer, text)

1.1585732888349357e-24

In [44]:
def get_final_n_word_probs(model, tokenizer, text, 
        N, context_buffer_size=CONTEXT_BUFFER_SIZE):
    words = text.strip().split(' ')
    if N == -1:
        N = len(words)
    assert not (N > len(words) or N<= 0)
    words[:len(words) - N]
    sentence_so_far = " ".join(words[:len(words) - N])
    results = []
    for word in words[len(words) - N:]:
        sentence_so_far += " " + word.strip()
        # Reset the buffer if required 
        num_words_so_far = len(sentence_so_far.split(' '))
        if num_words_so_far > context_buffer_size:
            sentence_so_far = " ".join(
                sentence_so_far.split(' ')[num_words_so_far - context_buffer_size - 1:])
        last_word_prob = get_last_word_prob(
            model, tokenizer, sentence_so_far)
        context = " ".join(sentence_so_far.split(' ')[:-1])
        results.append((context,word,last_word_prob))
    return np.asarray(results)


In [45]:
text = " ".join(conversation_dfs[0].iloc[:5]["Utterance"])
text


'<START> <SP1> the taxi drivers are on strike again <SP1> <SP2> what for <SP2> <SP1> they want the government to reduce the price of the gasoline <SP1> <SP2> it is really a hot potato <SP2>'

In [46]:
results = get_final_n_word_probs(model, tokenizer, text,N=-1)
results_df = pd.DataFrame(results,columns=['context','word','prob'])
results_df 

Unnamed: 0,context,word,prob
0,,<START>,1.0
1,<START>,<SP1>,2.834191283922965e-28
2,<START> <SP1>,the,3.0940359940300984e-19
3,<START> <SP1> the,taxi,2.193785995242145e-39
4,<START> <SP1> the taxi,drivers,0.0
5,<START> <SP1> the taxi drivers,are,0.0
6,<START> <SP1> the taxi drivers are,on,0.0
7,<START> <SP1> the taxi drivers are on,strike,0.0
8,<START> <SP1> the taxi drivers are on strike,again,0.0
9,<START> <SP1> the taxi drivers are on strike ...,<SP1>,0.0


In [47]:
def generate_conditional_probs(model, tokenizer, conversation_df, 
        N = -1, context_buffer_size=CONTEXT_BUFFER_SIZE):
    results_list = []
    text = ""
    for turn_no, turn in enumerate(conversation_df.itertuples()):
        text += " " +  turn.Utterance.strip()
        text = text.strip()
        turn_length = len(turn.Utterance.split(' '))
        n_probs = turn_length if N == -1 or N > turn_length else N 
        results = get_final_n_word_probs(
            model, tokenizer,text,n_probs, context_buffer_size)
        for result_no, result in enumerate(results):
            word_no = turn_length - n_probs + result_no
            context, word, prob = result
            results_list.append((
                turn.convID, turn_no, word_no, context,word,prob))
    return results_list

In [48]:
data = [] 
for conversation_df in conversation_dfs:
    results = generate_conditional_probs(
        model=model, 
        tokenizer=tokenizer, 
        conversation_df=conversation_df)
    data.extend(results)

In [80]:
results_df = pd.DataFrame(data, columns=['conversationNumber', 
            'turnNumber','wordNumber','context','word','probability'])

results_df[:5]

Unnamed: 0,conversationNumber,turnNumber,wordNumber,context,word,probability
0,1,0,0,,<START>,1.0
1,1,1,0,<START>,<SP1>,5.068076883288909e-18
2,1,1,1,<START> <SP1>,the,4.292436239150078e-25
3,1,1,2,<START> <SP1> the,taxi,2.681073475126427e-23
4,1,1,3,<START> <SP1> the taxi,drivers,8.376588547418521e-26


In [50]:
results_df.to_csv(os.path.join(REPORTS_DIR,"./sample_results.csv"))

In [None]:
# TODO: Write a multi-threaded function when writing the script because 
# it does not work in the notebook. 

# from functools import partial
# from itertools import repeat
# from multiprocessing import Pool
# import multiprocess as mp



# import os
# os.environ["TOKENIZERS_PARALLELISM"] = "false"

# # Generate the probs. for all conversations in a multithreaded way. 
# N = -1 
# results_list = []
# with mp.Pool(processes=5) as pool:

#     results_list = pool.map(lambda df : generate_conditional_probs(
#         model, tokenizer, df, N, CONTEXT_BUFFER_SIZE),conversation_dfs)



### Inference in different cases 

#### Very Large Context 

In [59]:
dataset_paths = glob.glob("{}/*.csv".format(RAW_DATA_DIR))
dataset_paths = {os.path.splitext(os.path.basename(p))[0] : p for p in dataset_paths}
# Only keep the required keys / verify that they exist. 
dataset_paths = {k : dataset_paths[k] for k in ('train','validation','test')}

In [83]:
# Inference if the input text is very large 

conversation_dfs = load_inference_dataset(
    dataset_paths['test'], 
    start_conv_no=0,
    end_conv_no=-1)

# NOTE: Pretending that the last K number of conversations are part of the 
# context to make sure that the model behaves as expected when given a large 
# context. 
context_df = pd.concat(conversation_dfs[:int(len(conversation_dfs)/2)])
large_text = " ".join(list(context_df["Utterance"]))
# Making inference 
results = get_final_n_word_probs(model, tokenizer, text,N=10)
results_df = pd.DataFrame(results,columns=['context','word','prob'])


In [87]:
# Since the text was very large, the context should have been trimmed to be 
# the context buffer size. 
assert len(results_df['context'][0].split(' ')) == CONTEXT_BUFFER_SIZE
 