# About 


Proof-of-concept notebook for obtaining and preprocessing In-Conversation-Corpus (ICC) data for fine-tuning a HuggingFace GPT model.

## Setup

In [None]:
# Detect the runtime platform and, on Colab only, install the required
# packages and mount Google Drive.

import sys 
import os 

# Env. vars to check if the notebook is running on colab, kaggle etc. 
# Each flag only tests whether the platform's helper module is already present
# in sys.modules, i.e. already imported by the hosting runtime.
# NOTE(review): assumes Kaggle pre-imports `kaggle_secrets` - confirm.
IS_COLAB = "google.colab" in sys.modules 
IS_KAGGLE = "kaggle_secrets" in sys.modules 
# Anything that is neither Colab nor Kaggle is treated as a local run.
IS_LOCAL = not (IS_COLAB or IS_KAGGLE)

if IS_COLAB:
    # Install the packages (-q: quiet, -U: upgrade to the latest release).
    %pip install -q -U tensorflow-addons
    %pip install -q -U transformers
    %pip install -q -U datasets
    print("You can safely ignore the package incompatibility errors.")
    # Mount Google Drive at /drive.
    from google.colab import drive 
    drive.mount("/drive")

In [None]:

import os
import pandas as pd
import numpy as np
from tqdm import tqdm 

import random 
import shutil 
# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare the numeric (major, minor) components. Comparing the raw version
# strings is lexicographic and would, for example, accept "0.3" or "0.9" as
# being newer than "0.20".
_SKLEARN_VERSION = tuple(int(part) for part in re.findall(r"\d+", sklearn.__version__)[:2])
assert _SKLEARN_VERSION >= (0, 20), (
    "scikit-learn >= 0.20 is required, found {}".format(sklearn.__version__)
)


# Pytorch imports 
import torch
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader

# Others 
import glob 

# Transformers 
import transformers 
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments,AutoModelForCausalLM
from transformers import AutoTokenizer
import datasets 

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes for axis labels (14) and x/y tick labels (12).
for rc_group, label_size in (("axes", 14), ("xtick", 12), ("ytick", 12)):
    mpl.rc(rc_group, labelsize=label_size)



In [None]:
# --  Set environment global vars.

# Shared env. vars.
GLOBAL_SEED = 42
IS_CUDA_ENV = torch.cuda.is_available()
GLOBAL_DEVICE = torch.device('cuda') if IS_CUDA_ENV else torch.device('cpu')
SET_SEED = True  # If true, sets the global seeds for this notebook.

# Dataset-size switches. Both names are defined unconditionally so that later
# cells never hit a NameError, whichever platform the notebook runs on
# (previously SMALL_DATASET was undefined on Kaggle and SMALL_DATASET_SIZE was
# only defined for local runs).
SMALL_DATASET_SIZE = 3   # Number of conversation files kept when SMALL_DATASET is True.
SMALL_DATASET = False    # Hosted runtimes (Colab / Kaggle) process the full dataset.

if IS_LOCAL:
    # Locally, fall back to a small dataset when no CUDA device is available.
    SMALL_DATASET = not IS_CUDA_ENV

In [None]:
# Configuring env.
if SET_SEED:
    # To make this notebook's output stable across runs, seed every random
    # number generator the notebook imports: the stdlib `random` module
    # (imported above but previously left unseeded), NumPy, and PyTorch.
    random.seed(GLOBAL_SEED)
    np.random.seed(GLOBAL_SEED)
    torch.manual_seed(GLOBAL_SEED)

In [None]:
# Project Paths
NOTEBOOK_NAME = "preprocess_icc_gpt"
# The repository root can be overridden by setting the PROJECT_ROOT_DIR
# environment variable before starting the kernel, so the notebook can run on
# other machines. The original author's checkout path remains the default.
PROJECT_ROOT_DIR = os.environ.get(
    "PROJECT_ROOT_DIR",
    "/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue",
)
# --- Input data dirs.
DATASET_NAME = "in_conversation_corpus_poc"
DATASET_TYPE = "csv"
# Destination for the CSV files produced by this notebook.
PROCESSED_DATA_DIR = os.path.join(PROJECT_ROOT_DIR, "data", "processed", DATASET_NAME)
# Location of the raw training-split .cha files.
RAW_DATA_DIR = os.path.join(PROJECT_ROOT_DIR, "data", "raw", DATASET_NAME, "train")

# --- Result dirs.
# NOTE: The model dir will have to change depending on where the models are stored.
REPORTS_DIR = os.path.join(PROJECT_ROOT_DIR, "reports", NOTEBOOK_NAME)

# Create the reports directory up front; a no-op if it already exists.
os.makedirs(REPORTS_DIR, exist_ok=True)


## In-Conversation Corpus

### Loading Processed CHAT data 

In this section, we process .cha files and convert them to .csv files for use with the HuggingFace GPT model. 

Note that these files were previously processed to include speaker labels at the start and end of a turn. 

In [None]:
import re 
from copy import deepcopy 
from sklearn.utils import shuffle 

In [None]:
# Special tokens written into the processed utterances.
# SPEAKER_TOK is a format template; the preprocessing function fills the
# placeholder with the speaker number ("1" or "2") via str.format.
SPEAKER_TOK = "<SP{}>" 
# Markers substituted for the conversation start / end tokens found in the raw files.
CONV_START_TOK = "<START>"
CONV_END_TOK = "<END>"

In [None]:
# NOTE: Each of these files represents a single conversation.
# glob returns paths in an arbitrary, filesystem-dependent order. Sorting makes
# the small-dataset slice below (and the seeded shuffle applied during
# preprocessing) select the same files in the same order on every machine.
cha_paths = sorted(glob.glob("{}/*.cha".format(RAW_DATA_DIR)))

if SMALL_DATASET:
    # Keep only the first few conversations for quick local experiments.
    cha_paths = cha_paths[:SMALL_DATASET_SIZE]


In [None]:
''' 
Assumptions about the previous data:
    1. Has previously been pre-processed by Julia. 
    2. Contains start / end tokens at the start and end of each .cha file. 
    3. Each line format is SPX\t <text> \tSPX 
    4. There are only two speakers per conversation.
'''

def preprocess_huggingface_icc(cha_paths, seed=GLOBAL_SEED):
    """Creates a dataset dataframe from Julia's processed .cha files.

    The files are shuffled with a fixed seed, and each file is treated as one
    conversation whose ID is its position in the shuffled order. Every line is
    split on ". ", "?" and tab runs; each piece is stripped of punctuation,
    lowercased and whitespace-normalized. Pieces that are exactly a raw marker
    ("sp1", "sp2", "start", "end") are replaced by the corresponding model
    token. A line that yields exactly three pieces (the expected
    "speaker, text, speaker" layout) is joined into a single utterance;
    otherwise every piece becomes its own row.

    Args:
        cha_paths: Sequence of paths to .cha files, one conversation per file.
            The caller's sequence is not modified.
        seed: Random state passed to ``sklearn.utils.shuffle``.

    Returns:
        pandas.DataFrame with columns ``convID`` (int index of the
        conversation in the shuffled order) and ``Utterance`` (str).
    """
    columns = ["convID", "Utterance"]
    if len(cha_paths) == 0:
        # Nothing to shuffle or read: return an empty, correctly shaped frame.
        return pd.DataFrame(columns=columns)

    # shuffle() returns a shuffled copy, so the caller's list stays untouched
    # without an extra deepcopy.
    cha_paths = shuffle(cha_paths, random_state=seed)

    # Raw markers (after punctuation removal and lowercasing) -> model tokens.
    # Lookup is by exact match: a prefix match would also rewrite ordinary
    # utterances that merely begin with these letters ("started", "ending").
    # NOTE(review): an utterance whose entire text is "start" or "end" is still
    # indistinguishable from a marker - confirm this cannot occur in the data.
    marker_map = {
        "sp1": SPEAKER_TOK.format("1"),
        "sp2": SPEAKER_TOK.format("2"),
        "start": CONV_START_TOK,
        "end": CONV_END_TOK,
    }

    data = []
    # Iterating through tqdm closes the progress bar once the loop finishes.
    for conv_id, cha_path in enumerate(
            tqdm(cha_paths, desc="Preprocessing ICC conversations")):
        # Explicit encoding so the result does not depend on the platform locale.
        # NOTE(review): assumes the .cha files are UTF-8 encoded - confirm.
        with open(cha_path, 'r', encoding="utf-8") as f:
            for line in f:
                split_toks = []
                for piece in re.split(r"\. |\?|\t+", line.strip()):
                    # Remove all punctuation and lowercase all
                    piece = re.sub(r'[^\w\s]', '', piece).lower()
                    # Collapse repeated spaces and trim the ends so that the
                    # join below cannot reintroduce double spaces.
                    piece = re.sub(' +', ' ', piece).strip()
                    if not piece:
                        # Drop pieces that were empty or punctuation-only.
                        continue
                    # Swap raw markers for the tokens needed by the model.
                    split_toks.append(marker_map.get(piece, piece))
                if len(split_toks) == 3:
                    # "<speaker> text <speaker>": keep the turn as one utterance.
                    split_toks = [" ".join(split_toks)]
                data.extend((conv_id, tok) for tok in split_toks)
    dataset_df = pd.DataFrame(data, columns=columns)
    return dataset_df

In [None]:
# Build the utterance dataframe from the .cha files collected above.
dataset_df = preprocess_huggingface_icc(cha_paths, seed=GLOBAL_SEED)

In [None]:
# Ensure the output directory exists; a no-op if it is already there.
os.makedirs(PROCESSED_DATA_DIR,exist_ok=True)

In [None]:
# Write the dataframe to <PROCESSED_DATA_DIR>/train.csv.
# With the default to_csv settings the dataframe index is written as an
# unnamed first column.
dataset_df.to_csv(os.path.join(PROCESSED_DATA_DIR,"train")+".csv")

### Train / Val Datasets 

Julia originally split the training and validation `.cha` files into separate folders. Here, we process each split and save it as a CSV file in the processed-data directory.

In [None]:
# Raw .cha directory for each split, keyed by the split name that is also
# used for the output CSV file.
dataset_paths = {
    split_name: os.path.join(PROJECT_ROOT_DIR, "data", "raw", DATASET_NAME, split_name)
    for split_name in ("train", "validation")
}

In [None]:
# Ensure the output directory exists before the per-split CSV files are written.
os.makedirs(PROCESSED_DATA_DIR,exist_ok=True)

In [None]:
# Override the value chosen in the configuration cell so that the export
# below does not truncate the list of conversation files.
SMALL_DATASET = False 

In [None]:

# Convert every split's .cha files into <PROCESSED_DATA_DIR>/<split>.csv.
# The "train" output overwrites any train.csv written earlier in the notebook.
for dataset_name, path in dataset_paths.items():
    # NOTE: Each of these files represents a single conversation.
    # glob returns paths in an arbitrary, filesystem-dependent order. Sorting
    # keeps the optional slice and the seeded shuffle reproducible.
    cha_paths = sorted(glob.glob("{}/*.cha".format(path)))
    if SMALL_DATASET:
        cha_paths = cha_paths[:SMALL_DATASET_SIZE]
    dataset_df = preprocess_huggingface_icc(cha_paths, seed=GLOBAL_SEED)
    dataset_df.to_csv(os.path.join(PROCESSED_DATA_DIR,dataset_name)+".csv")


