# About 

Proof-of-concept notebook that formats the In Conversation Corpus (ICC) and
Speaker Identity datasets for use with TurnGPT.

This reformatting is needed because TurnGPT expects its input data in a
different format than GPT-2 does.

NOTE: The purpose of this notebook is to generate the data quickly, so the
code may be somewhat hacky and may need refactoring once the project is complete.


## Setup 

In [1]:
# Download libraries for environment. 

import sys 
import os 

# Detect the hosting platform by checking whether the Colab / Kaggle marker
# modules have already been loaded; anything else is treated as a local run.
IS_COLAB, IS_KAGGLE = (
    marker in sys.modules for marker in ("google.colab", "kaggle_secrets")
)
IS_LOCAL = not IS_COLAB and not IS_KAGGLE

# Colab-only bootstrap: install/upgrade packages in the kernel environment and
# mount Google Drive.
if IS_COLAB:
    # `%pip` (rather than `!pip`) targets the running kernel's environment.
    # NOTE(review): tensorflow-addons is not referenced by any cell visible in
    # this notebook -- confirm it is still needed.
    %pip install -q -U tensorflow-addons
    %pip install -q -U transformers
    %pip install -q -U datasets
    print("You can safely ignore the package incompatibility errors.")
    # Mount Google Drive at /drive.
    from google.colab import drive 
    drive.mount("/drive")

In [12]:

import os
import pandas as pd
import numpy as np
from tqdm import tqdm 

import random 
import shutil 
# Scikit-Learn ≥0.20 is required
import sklearn
# Compare the leading (major, minor) components as integers. Comparing the raw
# version strings is lexicographic, so e.g. "0.3" >= "0.20" would wrongly pass.
# Components that are not purely numeric (e.g. the "20rc1" of a pre-release)
# are ignored, which makes a 0.20 pre-release fail the check.
_SKLEARN_VERSION = tuple(
    int(part) for part in sklearn.__version__.split(".")[:2] if part.isdigit()
)
assert _SKLEARN_VERSION >= (0, 20), "scikit-learn >= 0.20 is required"


# Pytorch imports 
import torch
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader

# Others 
import glob 

# Transformers 
import transformers 
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments,AutoModelForCausalLM
from transformers import AutoTokenizer
import datasets 

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)



In [13]:
# -- Notebook-wide settings shared by the cells below.

# Reproducibility: the seed value, and whether the seeding cell applies it.
GLOBAL_SEED = 42
SET_SEED = True

# Compute device: the GPU when CUDA is available, otherwise the CPU.
IS_CUDA_ENV = torch.cuda.is_available()
GLOBAL_DEVICE = torch.device('cuda' if IS_CUDA_ENV else 'cpu')


In [14]:
# Configuring env.
if SET_SEED:
    # Seed every random number generator this notebook imports so that its
    # output is stable across runs: Python's `random`, NumPy's global RNG and
    # PyTorch.
    random.seed(GLOBAL_SEED)
    np.random.seed(GLOBAL_SEED)
    torch.manual_seed(GLOBAL_SEED)

In [15]:
# Project Paths
NOTEBOOK_NAME = "8.0-MU-TurnGPT-Data-Preprocess-POC"
# Root of the repository checkout. The original absolute path is kept as the
# default; set the GPT_MONOLOGUE_TO_DIALOGUE_ROOT environment variable to run
# the notebook on another machine without editing this cell.
PROJECT_ROOT_DIR = os.environ.get(
    "GPT_MONOLOGUE_TO_DIALOGUE_ROOT",
    "/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue",
)



## Finetuning - In Conversation Corpus

In [66]:
# Dataset configuration for the In Conversation Corpus (ICC) data.
DATASET_TYPE = "csv"
DATASET_NAME = "ICC/julia_dissertation_turngpt"

# Raw inputs are read from <root>/data/datasets/raw; processed output for this
# dataset goes to <root>/data/datasets/processed/<DATASET_NAME>.
RAW_DATA_DIR = os.path.join(PROJECT_ROOT_DIR, "data", "datasets", "raw")
PROCESSED_DATA_DIR = os.path.join(
    PROJECT_ROOT_DIR, "data", "datasets", "processed", DATASET_NAME
)


In [53]:
import re 
from copy import deepcopy 
from sklearn.utils import shuffle 

In [119]:
# Tokens that mark structure rather than speech (speaker labels and
# conversation boundaries), as they look after punctuation removal and
# lowercasing.
_ICC_MARKER_TOKENS = frozenset({"sp1", "sp2", "start", "end"})


def _clean_icc_line(line):
    """Split one raw ICC line into a list of normalised utterance strings."""
    utterances = []
    # Sentence boundaries (". " and "?") and tabs separate the pieces.
    for piece in re.split(r"\. |\?|\t+", line.strip()):
        # Remove all punctuation and lowercase.
        cleaned = re.sub(r'[^\w\s]', '', piece).lower()
        # Collapse runs of spaces and trim the padding around the text.
        cleaned = re.sub(' +', ' ', cleaned).strip()
        # Markers are matched exactly (not by prefix) so that real utterances
        # which merely begin with e.g. "start" or "end" are kept. Pieces left
        # empty by the clean-up are dropped.
        if not cleaned or cleaned in _ICC_MARKER_TOKENS:
            continue
        utterances.append(cleaned)
    # NOTE(review): a line that yields exactly three utterances is merged back
    # into a single one, while any other count is kept split. The intent of
    # this rule is not evident from the code -- confirm before relying on it.
    if len(utterances) == 3:
        utterances = [" ".join(utterances)]
    return utterances


def preprocess_huggingface_icc(cha_paths, seed=GLOBAL_SEED):
    """Creates a dataset dataframe from Julia's processed .cha files.

    Assumptions about the input data:
        1. It has previously been pre-processed by Julia.
        2. Each .cha file contains start / end tokens at its start and end.
        3. Each line has the format SPX\\t <text> \\tSPX.
        4. There are only two speakers per conversation.

    Args:
        cha_paths: List of paths to the .cha files. The list itself is not
            modified; a shuffled copy is processed.
        seed: Random state used to shuffle the file order.

    Returns:
        A DataFrame with columns "convName" (file name without extension),
        "convID" (position of the file in the shuffled order) and "Utterance"
        (lowercased text without punctuation, speaker labels or start / end
        markers).
    """
    cha_paths = shuffle(deepcopy(cha_paths), random_state=seed)
    data = []
    # Iterating through tqdm closes the progress bar once the loop finishes.
    for conv_id, cha_path in enumerate(
            tqdm(cha_paths, desc="Preprocessing ICC conversations")):
        conv_name = os.path.splitext(os.path.basename(cha_path))[0]
        with open(cha_path, 'r', encoding="utf-8") as f:
            for line in f:
                data.extend(
                    (conv_name, conv_id, utterance)
                    for utterance in _clean_icc_line(line))
    return pd.DataFrame(data, columns=["convName", "convID", "Utterance"])

In [88]:
def preprocess_and_save_cha(cha_paths,dataset_name, output_dir):
    """Preprocess ICC .cha files and save the result as CSV and plain text.

    Args:
        cha_paths: Paths of the .cha files; passed to
            preprocess_huggingface_icc together with GLOBAL_SEED.
        dataset_name: File stem (without extension) of the output files.
        output_dir: Directory to write into; created if it does not exist.

    Writes <output_dir>/<dataset_name>.csv (the dataframe, including its index
    column) and <output_dir>/<dataset_name>.txt (one utterance per line, with
    no trailing newline).
    """
    os.makedirs(output_dir, exist_ok=True)
    dataset_df = preprocess_huggingface_icc(cha_paths, seed=GLOBAL_SEED)
    output_stem = os.path.join(output_dir, dataset_name)
    dataset_df.to_csv(output_stem + ".csv")
    # Save the utterances as a text file as well.
    # NOTE: This is important to make sure that TextDataset can read these
    # files during finetuning.
    # The encoding is set explicitly so the text file matches the CSV
    # (pandas writes UTF-8 by default) regardless of the platform locale.
    with open(output_stem + ".txt", "w", encoding="utf-8") as f:
        f.write("\n".join(dataset_df["Utterance"]))
    

In [89]:
# Sanity check: display the resolved raw-data directory.
RAW_DATA_DIR

'/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/datasets/raw'

In [105]:
# NOTE: Point DIR_PATH at the directory that contains the .cha files to
# process. NAME and OUT_DIR select the output file stem and output directory.
NAME = "test"
OUT_DIR = os.path.join(PROCESSED_DATA_DIR, "28_train_14_test_set")
DIR_PATH = os.path.join(
    RAW_DATA_DIR,
    "ICC/julia_finetune_experiments/28_train_14_test_set/test",
)

In [106]:
# Collect every .cha file directly inside DIR_PATH and report how many were
# found (a count of 0 usually means DIR_PATH is wrong).
cha_paths = glob.glob("{}/*.cha".format(DIR_PATH))
print(len(cha_paths))


14


In [107]:
# Preprocess the collected files and write NAME.csv / NAME.txt into OUT_DIR.
preprocess_and_save_cha(cha_paths,NAME,OUT_DIR)

Preprocessing ICC conversations: 100%|██████████| 14/14 [00:00<00:00, 54.42it/s]


## Inference - Speaker Identity Stims

In [113]:
# Dataset configuration for the speaker-identity stimuli.
# NOTE: this rebinds the dataset constants set for the ICC section above.
DATASET_TYPE = "csv"
DATASET_NAME = "speaker_identity_stims_turngpt"

RAW_DATA_DIR = os.path.join(PROJECT_ROOT_DIR, "data", "datasets", "raw")
PROCESSED_DATA_DIR = os.path.join(
    PROJECT_ROOT_DIR, "data", "datasets", "processed", DATASET_NAME
)

In [114]:
# Input directory of the speaker-identity stimuli and the output location.
NAME = "test"
DIR_PATH = os.path.join(RAW_DATA_DIR, "speaker_identity_stims")
OUT_DIR = PROCESSED_DATA_DIR
# Display the output directory as a sanity check.
OUT_DIR

'/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/datasets/processed/speaker_identity_stims_turngpt'

In [120]:
# Collect every .cha file directly inside DIR_PATH and report how many were
# found (a count of 0 usually means DIR_PATH is wrong).
cha_paths = glob.glob("{}/*.cha".format(DIR_PATH))
print(len(cha_paths))


750


In [150]:
def _extract_stimulus_utterance(line):
    """Return the normalised utterance of one '<label>: <utterance>' line.

    Returns None when the line has no speaker-label separator, and an empty
    string when nothing is left of the utterance after normalisation.
    """
    # Split off the speaker label at the FIRST separator only, so the whole
    # turn is kept even when it contains several sentences or another colon.
    # A tab is accepted as a separator as well as the documented colon.
    parts = re.split(r":|\t+", line.strip(), maxsplit=1)
    if len(parts) < 2:
        return None
    # Remove all punctuation and lowercase.
    text = re.sub(r'[^\w\s]', '', parts[1]).lower()
    # Collapse any run of whitespace to one space and trim the ends.
    return re.sub(r'\s+', ' ', text).strip()


def preprocess_lena_stimuli(cha_paths,num_speakers=2, seed=GLOBAL_SEED):
    """Creates a dataset dataframe from the .cha files of Lena's stimuli.

    Assumptions about the input data:
        1. Each line is one turn in the format: <Speaker label>: <Utterance>
        2. There are only two speakers per conversation.
        3. Any lines starting with @ are metadata and are ignored.

    Args:
        cha_paths: List of paths to the .cha files. The list itself is not
            modified; a shuffled copy is processed.
        num_speakers: Currently unused; kept so existing calls keep working.
        seed: Random state used to shuffle the file order.

    Returns:
        A DataFrame with columns "convName" (file name without extension),
        "convID" (position of the file in the shuffled order) and "Utterance"
        (the turn's text, lowercased and without punctuation or the speaker
        label). Each turn produces one row; blank lines and turns whose text
        is empty after normalisation produce none.

    Raises:
        ValueError: If a non-blank, non-metadata line has no speaker-label
            separator.
    """
    cha_paths = shuffle(deepcopy(cha_paths), random_state=seed)
    data = []
    # Iterating through tqdm closes the progress bar once the loop finishes.
    for conv_id, cha_path in enumerate(
            tqdm(cha_paths, desc="Preprocessing Lena's stimuli")):
        conv_name = os.path.splitext(os.path.basename(cha_path))[0]
        with open(cha_path, 'r', encoding="utf-8") as f:
            for line_no, line in enumerate(f, start=1):
                # Skip metadata lines and blank lines.
                if line.startswith("@") or not line.strip():
                    continue
                utterance = _extract_stimulus_utterance(line)
                if utterance is None:
                    raise ValueError(
                        "{}:{}: expected '<speaker label>: <utterance>', "
                        "got {!r}".format(cha_path, line_no, line))
                if utterance:
                    data.append((conv_name, conv_id, utterance))
    return pd.DataFrame(data, columns=["convName", "convID", "Utterance"])

In [151]:
# Build the stimuli dataframe from the collected .cha files.
dataset_df = preprocess_lena_stimuli(cha_paths, seed=GLOBAL_SEED)

Preprocessing Lena's stimuli: 100%|██████████| 750/750 [00:00<00:00, 7964.84it/s]


In [152]:
# Display the turns ordered by conversation. A stable sort keeps the rows of
# each conversation in their original (turn) order; the default quicksort
# gives no such guarantee for rows that share a convID.
dataset_df.sort_values(by=['convID'], kind="mergesort")

Unnamed: 0,convName,convID,Utterance
0,19bd,0,i havent told you this story yet
1,19bd,0,why not
2,9ba,1,you handled that situation so calmly
3,9ba,1,well done
4,11c,2,you parked in my spot again
...,...,...,...
1495,77c,747,i am
1496,45ba,748,i have the secret santa gift in my bag
1497,45ba,748,dont look
1498,1c,749,ive been trying to unscrew this bolt for fifte...


In [153]:
# Save the dataset to file
# Create the output directory first: `to_csv` does not create missing parent
# directories, so the write would fail if the directory does not exist yet.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
dataset_df.to_csv(os.path.join(PROCESSED_DATA_DIR,"test")+".csv")