# About

Proof-of-concept notebook for obtaining and preprocessing the DailyDialog dataset from
HuggingFace.

## Setup

In [2]:
# Download libraries for environment. 

import sys 
import os 

# Flags recording where the notebook is running, detected by checking whether a
# platform-specific module is already loaded in this interpreter session.
# NOTE(review): assumes Kaggle kernels pre-load `kaggle_secrets` — confirm.
IS_COLAB = "google.colab" in sys.modules 
IS_KAGGLE = "kaggle_secrets" in sys.modules 
# Anything that is neither Colab nor Kaggle is treated as a local run.
IS_LOCAL = not (IS_COLAB or IS_KAGGLE)

if IS_COLAB:
    # Install / upgrade the packages into the kernel's environment.
    # NOTE(review): versions are not pinned, so reruns may pick up different releases.
    %pip install -q -U tensorflow-addons
    %pip install -q -U transformers
    %pip install -q -U datasets
    print("You can safely ignore the package incompatibility errors.")
    # Mount Google Drive at /drive.
    from google.colab import drive 
    drive.mount("/drive")

In [18]:

import os
import pandas as pd
import numpy as np
from tqdm import tqdm 

import random 
import shutil 
# Scikit-Learn ≥0.20 is required
import sklearn
# Compare (major, minor) numerically: as plain strings, "0.3" >= "0.20" is True,
# so the original lexicographic check accepted versions older than 0.20.
import re
_sklearn_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sklearn_version is not None and \
    tuple(int(part) for part in _sklearn_version.groups()) >= (0, 20), \
    "Scikit-Learn >= 0.20 is required, found {}".format(sklearn.__version__)


# Pytorch imports 
import torch
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader

# Others 
import glob 

# Transformers 
import transformers 
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead
from transformers import AutoTokenizer
import datasets 

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)



In [27]:
# -- Set environment global vars.

# Shared env. vars.
GLOBAL_SEED = 42
IS_CUDA_ENV = torch.cuda.is_available()
GLOBAL_DEVICE = torch.device('cuda') if IS_CUDA_ENV else torch.device('cpu')

# Use the small dataset only for local runs without CUDA. The flag is assigned
# unconditionally so that it also exists when the notebook runs on Kaggle
# (neither local nor Colab); previously it was never defined there and the
# later `if SMALL_DATASET:` cell raised NameError.
SMALL_DATASET = IS_LOCAL and not IS_CUDA_ENV

# To make this notebook's output stable across runs, seed every RNG it imports.
random.seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
torch.manual_seed(GLOBAL_SEED)

<torch._C.Generator at 0x7f792fe6b450>

In [70]:
# Project Paths
# The repository root defaults to the original author's checkout, but can be
# overridden with the PROJECT_ROOT_DIR environment variable so the notebook can
# run on other machines without editing this cell.
PROJECT_ROOT_DIR = os.environ.get(
    "PROJECT_ROOT_DIR",
    "/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue",
)
# Directory that receives the processed train / validation / test CSV files.
PROCESSED_DATA_DIR = os.path.join(PROJECT_ROOT_DIR, "data", "processed", "daily_dialog_poc")


In [71]:
# Methods for GPU Support 

def to_device(data, device):
    """Recursively move tensor(s) to the given device.

    Args:
        data: An object exposing ``.to(device, non_blocking=...)`` (e.g. a
            tensor), or a list / tuple / dict (possibly nested) of such objects.
        device: Target device, passed through unchanged to ``.to``.

    Returns:
        A list for list or tuple input (tuples come back as lists, as before),
        a dict with the same keys for dict input, and otherwise whatever
        ``data.to(device, non_blocking=True)`` returns.
    """
    if isinstance(data, (list, tuple)):
        return [to_device(x, device) for x in data]
    # Dict batches previously fell through to ``data.to`` and raised
    # AttributeError; move each value and keep the keys.
    if isinstance(data, dict):
        return {key: to_device(value, device) for key, value in data.items()}
    return data.to(device, non_blocking=True)

class DeviceDataLoader:
    """Thin wrapper around a dataloader that yields every batch moved to a device."""

    def __init__(self, dl, device):
        # Keep references to the wrapped loader and the target device.
        self.dl, self.device = dl, device

    def __len__(self):
        """Return the number of batches reported by the wrapped dataloader."""
        return len(self.dl)

    def __iter__(self):
        """Yield each batch of the wrapped dataloader after passing it to ``to_device``."""
        yield from (to_device(batch, self.device) for batch in self.dl)


## DailyDialog - HuggingFace

In [72]:
# Number of data items in the small version of this dataset. 
SMALL_DATASET_SIZE = 3 


In [73]:
from datasets import list_datasets, load_dataset

In [74]:
# NOTE: Download hugging face version: https://huggingface.co/datasets/daily_dialog
dataset = load_dataset('daily_dialog')

Using custom data configuration default
Reusing dataset daily_dialog (/Users/muhammadumair/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd)


  0%|          | 0/3 [00:00<?, ?it/s]

In [75]:
# Pull the list of dialogues out of each split (order: train, test, validation).
train_dataset_full, test_dataset_full, val_dataset_full = (
    dataset[split_name]["dialog"] for split_name in ("train", "test", "validation")
)

In [76]:
# When the small-dataset flag is set, keep only the first SMALL_DATASET_SIZE
# conversations of every split.
if SMALL_DATASET:
    train_dataset_full, test_dataset_full, val_dataset_full = (
        conversations[:SMALL_DATASET_SIZE]
        for conversations in (train_dataset_full, test_dataset_full, val_dataset_full)
    )
    


In [77]:
# Each data item is a list of utterances in a single conversation. 
train_dataset_full[0]

['Say , Jim , how about going for a few beers after dinner ? ',
 ' You know that is tempting but is really not good for our fitness . ',
 ' What do you mean ? It will help us to relax . ',
 " Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ? ",
 " I guess you are right.But what shall we do ? I don't feel like sitting at home . ",
 ' I suggest a walk over to the gym where we can play singsong and meet some of our friends . ',
 " That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them . ",
 ' Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too . ',
 " Good.Let ' s go now . ",
 ' All right . ']

In [78]:
import re 
from sklearn.utils import shuffle 
from copy import deepcopy 
import itertools 

In [151]:
# Define the start and end tokens 
# SPEAKER_TOK is a format template: "{}" is a placeholder to be filled via str.format.
# NOTE(review): the visible preprocessing code hardcodes "[SP{}]" instead of using
# this constant — confirm whether SPEAKER_TOK is used elsewhere.
SPEAKER_TOK = "SP{}" 
# Marker rows that preprocess_huggingface_daily_dialog puts before and after
# the utterances of each conversation.
CONV_START_TOK = "<START>"
CONV_END_TOK = "<END>"

In [160]:
def _clean_turn_segments(turn):
    """Split one dialogue turn into cleaned, lowercased sentence segments."""
    segments = []
    # Sentence boundaries are ". " and "? " only (pattern unchanged).
    for segment in re.split(r"\. |\? ", turn.strip()):
        # Remove all punctuation and lowercase.
        segment = re.sub(r'[^\w\s]', '', segment).lower()
        # Collapse repeated spaces and trim the edges so every segment is
        # wrapped by the speaker tags with exactly one space on each side.
        segment = re.sub(' +', ' ', segment).strip()
        # Skip segments that are empty once punctuation is gone (e.g. a lone "?"),
        # which previously produced rows such as "[SP1] [SP1]".
        if segment:
            segments.append(segment)
    return segments


def preprocess_huggingface_daily_dialog(dataset, num_speakers=2, seed=GLOBAL_SEED):
    """Preprocess the huggingface daily dialogue corpus.

    Conversations are shuffled, every turn is split into sentence segments,
    and each segment is cleaned and wrapped in the tag of the speaker who
    uttered it. Speakers are assigned round-robin by turn position.

    Args:
        dataset: Sequence of conversations, each a sequence of utterance strings.
        num_speakers: Number of speakers to cycle through; must be >= 1.
        seed: Random state used to shuffle the conversations.

    Returns:
        A DataFrame with columns ``convID`` (position of the conversation after
        shuffling) and ``Utterance``. Each conversation contributes a
        CONV_START_TOK row, one "[SPn] text [SPn]" row per segment, and a
        CONV_END_TOK row.

    Raises:
        ValueError: If ``num_speakers`` is smaller than 1.
    """
    if num_speakers < 1:
        raise ValueError("num_speakers must be at least 1")
    # sklearn's shuffle returns a shuffled copy and leaves its input untouched,
    # so no separate deep copy is needed. Use the caller's seed (it was
    # previously ignored in favour of GLOBAL_SEED).
    conversations = shuffle(dataset, random_state=seed)
    rows = []
    # Iterating through tqdm closes the progress bar when the loop finishes.
    progress = tqdm(
        conversations, desc="Preprocessing DailyDialog", total=len(conversations))
    for conv_id, conv in enumerate(progress):
        rows.append((conv_id, CONV_START_TOK))
        for turn_idx, turn in enumerate(conv):
            # Add the speaker tokens for each turn.
            speaker_tag = "[SP{}]".format((turn_idx % num_speakers) + 1)
            rows.extend(
                (conv_id, "{0} {1} {0}".format(speaker_tag, segment))
                for segment in _clean_turn_segments(turn)
            )
        rows.append((conv_id, CONV_END_TOK))
    # Create dataframe
    return pd.DataFrame(rows, columns=["convID", "Utterance"])



In [161]:
from typing import List 

In [162]:
# Preprocess the training split and display the resulting DataFrame as a sanity check.
train_dataset_full_processed \
    =  preprocess_huggingface_daily_dialog(train_dataset_full) 
train_dataset_full_processed





Preprocessing DailyDialog: 100%|██████████| 3/3 [00:00<00:00, 2606.24it/s]


Unnamed: 0,convID,Utterance
0,0,<START>
1,0,[SP1] say jim how about going for a few beers ...
2,0,[SP2] you know that is tempting but is really ...
3,0,[SP1] what do you mean [SP1]
4,0,[SP1] it will help us to relax [SP1]
5,0,[SP2] do you really think so [SP2]
6,0,[SP2] i dont [SP2]
7,0,[SP2] it will just make us fat and act silly [...
8,0,[SP2] remember last time [SP2]
9,0,[SP1] i guess you are rightbut what shall we d...


In [164]:
# Run the same preprocessing over every split (train, then validation, then test).
(
    train_dataset_full_processed,
    val_dataset_full_processed,
    test_dataset_full_processed,
) = [
    preprocess_huggingface_daily_dialog(split_conversations)
    for split_conversations in (train_dataset_full, val_dataset_full, test_dataset_full)
]





Preprocessing DailyDialog: 100%|██████████| 3/3 [00:00<00:00, 1560.19it/s]




Preprocessing DailyDialog: 100%|██████████| 3/3 [00:00<00:00, 2577.41it/s]




Preprocessing DailyDialog: 100%|██████████| 3/3 [00:00<00:00, 2619.80it/s]


In [171]:
def save_processed_to_file(save_dir, filename, dataset_df):
    """Write a processed split to ``<save_dir>/<filename>.csv``.

    Args:
        save_dir: Directory that receives the file; created if it is missing.
        filename: File name without the ``.csv`` extension.
        dataset_df: DataFrame to serialise.

    Returns:
        The path of the written CSV file.
    """
    # Create the target directory on demand so the function does not depend on
    # a separate os.makedirs call having run first. An empty save_dir means the
    # current directory and needs no creation.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    csv_path = os.path.join(save_dir, filename) + ".csv"
    # The DataFrame index is still written as the first (unnamed) column, so
    # the file layout is unchanged.
    dataset_df.to_csv(csv_path)
    return csv_path

In [172]:
# Make sure the output directory exists (no error if it is already there),
# then echo its path as the cell output.
os.makedirs(PROCESSED_DATA_DIR,exist_ok=True)
PROCESSED_DATA_DIR

'/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/processed/daily_dialog_poc'

In [173]:
# Write each processed split to its own CSV file inside PROCESSED_DATA_DIR.
for split_name, split_df in (
    ("train", train_dataset_full_processed),
    ("validation", val_dataset_full_processed),
    ("test", test_dataset_full_processed),
):
    save_processed_to_file(PROCESSED_DATA_DIR, split_name, split_df)