# About 

Proof of concept for obtaining and preprocessing Lena's stimuli for finetuning 
HuggingFace GPT.

## Setup

In [1]:
# Download libraries for environment. 

import sys 
import os 

# Env. vars to check if the notebook is running on colab, kaggle etc. 
IS_COLAB = "google.colab" in sys.modules 
IS_KAGGLE = "kaggle_secrets" in sys.modules 
IS_LOCAL = not (IS_COLAB or IS_KAGGLE)

if IS_COLAB:
    # Install the packages 
    %pip install -q -U tensorflow-addons
    %pip install -q -U transformers
    %pip install -q -U datasets
    print("You can safely ignore the package incompatibility errors.")
    # Mount the drive 
    from google.colab import drive 
    drive.mount("/drive")

In [2]:

import os
import pandas as pd
import numpy as np
from tqdm import tqdm 

import random 
import shutil 
# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"


# Pytorch imports 
import torch
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader

# Others 
import glob 

# Transformers 
import transformers 
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments,AutoModelForCausalLM
from transformers import AutoTokenizer
import datasets 

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)



In [3]:
# --  Set environment global vars. 

# Shared env. vars. 
GLOBAL_SEED = 42 
IS_CUDA_ENV = torch.cuda.is_available()
GLOBAL_DEVICE = torch.device('cuda') if IS_CUDA_ENV else torch.device('cpu')
SET_SEED = True # If true, sets the global seeds for this notebook. 


In [4]:
# Configuring env. 
if SET_SEED:
    # to make this notebook's output stable across runs
    np.random.seed(GLOBAL_SEED) 
    torch.manual_seed(GLOBAL_SEED)

In [5]:
# Project Paths
NOTEBOOK_NAME = "7.0-MU-Lena-Stimuli-Preprocess-POC"
PROJECT_ROOT_DIR = "/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue" 
# --- Input data dirs. 
DATASET_NAME = "lena_stimuli"
DATASET_TYPE = "csv"
PROCESSED_DATA_DIR = os.path.join(PROJECT_ROOT_DIR,"data","datasets", "processed", DATASET_NAME)
RAW_DATA_DIR = os.path.join(PROJECT_ROOT_DIR,"data","datasets", "raw", DATASET_NAME)

# --- Result dirs. 
# NOTE: The model dir will have to change depending on where the models are stored. 
REPORTS_DIR = os.path.join(PROJECT_ROOT_DIR,"data","reports",NOTEBOOK_NAME)


## Lena's Stimuli

This data is from Lena's experiment for congruent and incongruent speakers. 

All files are in .cha format and each file contains turns with format:

```
<Speaker Label>: <Utterance>
```


In [7]:
import re 
from copy import deepcopy 
from sklearn.utils import shuffle 

In [8]:
# Define the start and end tokens 
SPEAKER_TOK = "<SP{}>" 
CONV_START_TOK = "<START>"
CONV_END_TOK = "<END>"

In [9]:
# NOTE: Each of these files represents a single conversation. 
cha_paths = glob.glob("{}/*.cha".format(RAW_DATA_DIR))

In [117]:
''' 
Assumptions about the previous data:
    1. Each line is one turn in format: <Speaker label>: <Utterance>
    2. There are only two speakers per conversation.
    3. Any lines starting with @ are metadata and ignored. 
'''

def preprocess_lena_stimuli(cha_paths,num_speakers=2, seed=GLOBAL_SEED):
    """Creates a dataset dataframe from Julia's processed .cha files."""
    cha_paths = deepcopy(cha_paths)
    cha_paths = shuffle(cha_paths,random_state=seed)
    pbar = tqdm(desc="Preprocessing Lena's stimuli", total=len(cha_paths))
    data = [] 
    for i in range(len(cha_paths)):
        with open(cha_paths[i],'r') as f:
            # Read all lines as a list 
            conv_name = os.path.splitext(os.path.basename(cha_paths[i]))[0]
            conv = f.readlines()
            # Ignore / Remove all lines that start with comment marker. 
            conv = [line for line in conv if line[0] != "@"]
            data.append((conv_name,i,CONV_START_TOK))
            for j in range(len(conv)):
                # Splitting utterance by punctuation i.e., punctuated substrings 
                # will appear as separate lines. 
                target_str = conv[j].strip() 
                # NOTE: Splitting by colon for the speaker ID. 
                split_toks = re.split(r"\. |\?|\t+|:", target_str)
                split_toks = [tok.strip() for tok in split_toks if len(tok) > 0] 
                # Remove all punctuation and lowercase all 
                split_toks = [re.sub(r'[^\w\s]', '', tok).lower() for tok in split_toks]
                # Remove any double whitespaces 
                split_toks = [re.sub(' +', ' ', tok).lower() for tok in split_toks]
                # Removing existing speaker tokens to add the ones needed by the model. 
                # NOTE: Assuming that speaker ids start from 1.
                for speaker_id in range(num_speakers+1):
                    split_toks = [SPEAKER_TOK.format(speaker_id) if re.match("sp{}".format(speaker_id),tok) else tok for tok in split_toks]
                # Add the trailing speaker token 
                split_toks.append(split_toks[0])
                if len(split_toks) == 3:
                    split_toks = [" ".join(split_toks)]
                    # split_toks = list( " ".join(split_toks))
                data.extend([(conv_name,i, tok) for tok in split_toks])
        data.append((conv_name,i,CONV_END_TOK))
        pbar.update()
    dataset_df = pd.DataFrame(data, columns=["convName","convID", "Utterance"])    
    return dataset_df

In [118]:
re.findall('[0-9]+',os.path.splitext(os.path.basename(cha_paths[0]))[0])[0]

'30'

In [119]:
dataset_df = preprocess_lena_stimuli(cha_paths, seed=GLOBAL_SEED)

Preprocessing Lena's stimuli: 100%|██████████| 750/750 [00:00<00:00, 2294.14it/s]


In [120]:
os.makedirs(PROCESSED_DATA_DIR,exist_ok=True)
PROCESSED_DATA_DIR

'/Users/muhammadumair/Documents/Repositories/mumair01-repos/GPT-Monologue-to-Dialogue/data/datasets/processed/lena_stimuli'

In [121]:
dataset_df.sort_values(by=['convID'])

Unnamed: 0,convName,convID,Utterance
0,19bd,0,<START>
1,19bd,0,<SP1> i havent told you this story yet <SP1>
2,19bd,0,<SP1> why not <SP1>
3,19bd,0,<END>
4,9ba,1,<START>
...,...,...,...
2993,45ba,748,<SP1> i have the secret santa gift in my bag <...
2997,1c,749,<SP1> ive been trying to unscrew this bolt for...
2998,1c,749,<SP2> want help <SP2>
2996,1c,749,<START>


In [122]:
# Save the dataset to file
dataset_df.to_csv(os.path.join(PROCESSED_DATA_DIR,"test")+".csv")