# About 

This notebook is intended to be a POC for the PauseDataset required for Experiment 4.2 in the Skantze 2017 paper. 

## Setup 

In [6]:
 # Download libraries for environment. 

import sys 
import os 

# Env. vars to check if the notebook is running on colab, kaggle etc. 
IS_COLAB = "google.colab" in sys.modules 
IS_KAGGLE = "kaggle_secrets" in sys.modules 
IS_LOCAL = not (IS_COLAB or IS_KAGGLE)

if IS_COLAB:
    # Install the packages 
    %pip install -q -U tensorflow-addons
    %pip install -q -U transformers
    %pip install -q -U datasets
    print("You can safely ignore the package incompatibility errors.")
    # Mount the drive 
    from google.colab import drive 
    drive.mount("/drive")

In [7]:

import os
import pandas as pd
import numpy as np
from copy import deepcopy 

import random 
import shutil 
# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"


# Pytorch imports 
import torch
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader

# Others 
import glob 

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)




In [8]:
# --  Set environment global vars. 

# Shared env. vars. 
GLOBAL_SEED = 42 
IS_CUDA_ENV = torch.cuda.is_available()
GLOBAL_DEVICE = torch.device('cuda') if IS_CUDA_ENV else torch.device('cpu')
SET_SEED = True # If true, sets the global seeds for this notebook. 
LIMITED_RESOURCES = not IS_CUDA_ENV

if LIMITED_RESOURCES:
    SMALL_DATASET_SIZE = 10

if IS_COLAB:
    LIMITED_RESOURCES = False 
   

In [9]:
# Configuring env. 
if SET_SEED:
    # to make this notebook's output stable across runs
    np.random.seed(GLOBAL_SEED) 
    torch.manual_seed(GLOBAL_SEED)

In [10]:
# Project Paths
NOTEBOOK_NAME = "skantze2017_pause_dataset_poc"
PROJECT_ROOT_DIR = "/Users/muhammadumair/Documents/Repositories/mumair01-repos/TRP-Modeling/skantze_2017_continuous" 
# --- Input data dirs. 
DATASET_NAME = "maptask"
DATASET_TYPE = "csv"
RAW_DATA_DIR = os.path.join(PROJECT_ROOT_DIR,"data", "raw", "maptask")
PROCESSED_DATA_DIR = os.path.join(PROJECT_ROOT_DIR,"data", "processed", NOTEBOOK_NAME)

# --- Result dirs. 
# NOTE: The model dir will have to change depending on where the models are stored. 
REPORTS_DIR = os.path.join(PROJECT_ROOT_DIR,"reports",NOTEBOOK_NAME)

# Paths to the specific feature sets 
FULL_PROCESSED_FEATURE_DIR = os.path.join(PROJECT_ROOT_DIR,"data", "processed", "maptask","full")
PROSODY_PROCESSED_FEATURE_DIR = os.path.join(PROJECT_ROOT_DIR,"data", "processed", "maptask","prosody")
FULL_PROCESSED_FEATURE_DIR

os.makedirs(REPORTS_DIR,exist_ok=True)
os.makedirs(PROCESSED_DATA_DIR,exist_ok=True )


## Required Code 

**NOTE** In this section, we are copying some required code from notebooks 1.0-* to 3.0-*.

Ay bugs in this code **should be fixed in the appropriate notebooks**. 

### MapTask Dataset Generation Code

In [11]:
RAW_DATA_DIR = os.path.join(PROJECT_ROOT_DIR,"data", "raw", "maptask")
MAPTASK_DIR = os.path.join(RAW_DATA_DIR,"maptaskv2-1")
# Paths within the maptask corpus 
STEREO_AUDIO_PATH = os.path.join(MAPTASK_DIR,"Data/signals/dialogues")
MONO_AUDIO_PATH = os.path.join(MAPTASK_DIR,"Data/signals/mono_signals")
# NOTE: The timed units are also used for Voice Activity annotations. 
TIMED_UNIT_PATHS = os.path.join(MAPTASK_DIR,"Data/timed-units") 
POS_PATH = os.path.join(MAPTASK_DIR,"Data/pos")


In [12]:
def get_maptask_participant(csv_path):
    filename, ext = os.path.splitext(os.path.basename(csv_path))
    filename_split = filename.split(".")
    participant = filename_split[1]
    return participant

def get_maptask_dialogue(csv_path):
    filename, ext = os.path.splitext(os.path.basename(csv_path))
    filename_split = filename.split(".")
    dialogue = filename_split[0]
    return dialogue

def read_data(dir_path,dialogue_name, participant,ext):
    """
    Assumption is that the basename . is the dialogue name. 
    """
    results = []
    data_paths = [p for p in os.listdir(dir_path)]
    data_paths = [os.path.join(dir_path,p) for p in data_paths if os.path.splitext(p)[1][1:] == ext]
    for path in data_paths:
       if get_maptask_dialogue(path) == dialogue_name and \
                get_maptask_participant(path) == participant:
            results.append(path)
    return results 

def get_mono_audio(dialogue_name, participant):
    return read_data(MONO_AUDIO_PATH,dialogue_name, participant,"wav")[0]

def get_stereo_audio(dialogue_name):
    return read_data(STEREO_AUDIO_PATH,dialogue_name,"mix","wav")[0]

def get_timed_unit(dialogue_name, participant):
    return read_data(TIMED_UNIT_PATHS,dialogue_name, participant,"xml")[0]


In [13]:
def collect_dialogue_features(dialogue_names, features_dir):
    """
    Collect the dialogue f and g feature files.
    Assumes that features_dir contains both the f and g feature files. 
    """
    collected = {}
    for dialogue in dialogue_names:
        collected[dialogue] = {
            "f" : read_data(features_dir,dialogue,"f","csv")[0], 
            "g" : read_data(features_dir,dialogue,"g","csv")[0]}
    return collected 




In [14]:
import random
from copy import deepcopy
from sklearn.model_selection import train_test_split 

In [15]:
def get_train_val_test_dialogues(dataset_paths, test_size=0.25, val_size=0.2, 
        seed=GLOBAL_SEED):
    dataset_paths = deepcopy(dataset_paths)
    dialogue_names = sorted(list(set([get_maptask_dialogue(p) for p in dataset_paths])))
    train_dialogues, test_dialogues = train_test_split(dialogue_names, 
        test_size=test_size,random_state=seed)
    train_dialogues, val_dialogues = train_test_split(train_dialogues, 
        test_size=val_size,random_state=seed)
    return train_dialogues, val_dialogues, test_dialogues 


### MapTask Dataclass and Method Definitions 

These are required to be in the same notebook to be loaded. 


NOTE: **DO NOT** modify these here, refer to 2.0-MU-Skantze-MapTask-Dataset-POC

In [16]:
# NOTE:Seed worker can be used to ensure reproducibility in DataLoader 
# across runs. 
def seed_worker(worker_id):
    worker_seed =GLOBAL_SEED
    np.random.seed(worker_seed)
    random.seed(worker_seed)

def generate_dataloader(dataset, batch_size=32, shuffle=True, num_workers=0, 
        drop_last=True, pin_memory=True):
    return DataLoader(
        dataset, 
        batch_size=batch_size, 
        shuffle=shuffle, 
        num_workers=num_workers, 
        drop_last=drop_last, # We always want to remove the last incomplete batch. 
        pin_memory=pin_memory, 
        worker_init_fn=seed_worker)

## Experiment 4.2: MapTask Pause Dataset

### DataSet Class 

In [27]:
class Skantze2017PausesDataset(Dataset):

    HOLD_LABEL = 0 
    SHIFT_LABEL = 1 

    def __init__(self, feature_paths_map , sequence_length_ms, min_pause_length_ms, 
                max_future_silence_window_ms, s0_participant, frame_step_size_ms, 
                save_dir = None):
        # Params . 
        self.feature_paths_map = feature_paths_map 
        self.sequence_length_ms = sequence_length_ms 
        self.min_pause_length_ms = min_pause_length_ms
        self.max_future_silence_window_ms = max_future_silence_window_ms
        self.s0_participant = s0_participant
        self.frame_step_size_ms = frame_step_size_ms  
        self.save_dir = save_dir # If the save dir is provided, saves the dataset as it is built. 
        # Calculated 
        self.num_context_frames = int(sequence_length_ms / frame_step_size_ms)
        self.num_pause_frames = int(min_pause_length_ms / frame_step_size_ms)
        self.future_window_frames = int(max_future_silence_window_ms / frame_step_size_ms)
        # Data Storage vars. 
        self.xs = [] 
        self.ys = [] 
        self.num_silences = 0 
        self.num_holds = 0 
        self.num_shifts = 0 
        self.num_pauses = 0 
        # Prepare the data 
        for dialogue in list(self.feature_paths_map.keys()):
            self.__prepare_items(dialogue, self.__prepare_pauses_df(dialogue))

    def __len__(self):
        return len(self.xs)
    
    def __getitem__(self, idx):
        if idx > self.__len__():
            raise Exception 
        # NOTE: xs has target speaker features concatenated with non-target speaker features. 
        # ys is the previous speaker, hold / shift label, and the next speaker. 
        return self.xs[idx], self.ys[idx]
    
    def get_pause_statistics(self):
        return {
            "min_pause_length_ms " : self.min_pause_length_ms , 
            "sequence_length_ms" : self.sequence_length_ms, 
            "max_future_silence_window_ms" : self.max_future_silence_window_ms, 
            "num_silences" : self.num_silences, 
            "num_holds" : self.num_holds, 
            "num_shifts" : self.num_shifts,
            "num_pauses" : self.num_pauses,
            "s0_participant" : self.s0_participant
        }
    
    def __prepare_items(self,dialogue,pauses_df):
        s0_feature_df, s1_feature_df = self.__load_dataframes(dialogue)
        # Collect the data for both models
        s0_s1_df = pd.concat([s0_feature_df.loc[:,s0_feature_df.columns != 'frameTime'],s1_feature_df.loc[:,s1_feature_df.columns != 'frameTime']],axis=1)   
        s1_s0_df = pd.concat([s1_feature_df.loc[:,s1_feature_df.columns != 'frameTime'],s0_feature_df.loc[:,s0_feature_df.columns != 'frameTime']],axis=1) 
        for pause_data in pauses_df.itertuples():
            _,_, previous_speaker, idx_after_silence_frames, _, hold_shift_label, next_speaker = pause_data 
            idx_after_silence_frames = int(idx_after_silence_frames)
            # Collect features for each speaker equal to sequence length / num context frames. 
            if idx_after_silence_frames- self.num_context_frames >= 0:
                x_s0 = np.asarray(s0_s1_df.iloc[idx_after_silence_frames-self.num_context_frames:idx_after_silence_frames])
                x_s1 = np.asarray(s1_s0_df.iloc[idx_after_silence_frames-self.num_context_frames:idx_after_silence_frames])
            else:
                num_pad = self.num_context_frames - idx_after_silence_frames -1 
                x_s0 = np.pad(np.asarray(s0_s1_df.iloc[0:idx_after_silence_frames+1]),[(num_pad,0),(0,0)],'constant')
                x_s1 = np.pad(np.asarray(s1_s0_df.iloc[0:idx_after_silence_frames+1]),[(num_pad,0),(0,0)],'constant')
            self.xs.append((x_s0,x_s1)) 
            self.ys.append((int(previous_speaker), int(hold_shift_label), int(next_speaker)))

    def __prepare_pauses_df(self, dialogue):
        s0_feature_df, s1_feature_df = self.__load_dataframes(dialogue)
        # Obtain frame indices where both speakers are speaking. 
        s0_va_idxs =  np.where(s0_feature_df['voiceActivity'] == 1)[0]
        s1_va_idxs =  np.where(s1_feature_df['voiceActivity'] == 1)[0]
        va_idxs = np.union1d(s0_va_idxs,s1_va_idxs)
        # Obtain index of last speaking frame before silences 
        speak_before_silence_frames_idx = \
            va_idxs[np.where(np.diff(va_idxs) > self.num_pause_frames)]
        self.num_silences += len(speak_before_silence_frames_idx)
        # Remove scenarios where both speakers were speaking last i.e., only 
        # one speaking could have been speaking before te pause
        speak_before_silence_frames_idx = [idx for idx in speak_before_silence_frames_idx \
            if not (idx in s0_va_idxs and idx in s1_va_idxs) and (idx in s0_va_idxs or idx in s1_va_idxs)]
        # Next, we want to find all the instances where one (and only one) 
        # speaker continues within the next future_window_ms seconds. 
        pauses_df = pd.DataFrame(columns=['pauseStartFrameTime', 'previousSpeaker',
            'pauseEndFrameIndex', 'nextSpeechFrameIndex', 'holdShiftLabel', 'nextSpeaker'])
        for i,idx in enumerate(speak_before_silence_frames_idx):
            # Obtain index of the frame after the specified pause length. 
            idx_after_silence_frames = idx + self.num_pause_frames + 1 
            # Get the voice activity in the specified future window
            s0_window_va = np.asarray((s0_feature_df['voiceActivity'])[
                idx_after_silence_frames:idx_after_silence_frames+self.future_window_frames] == 1)
            s1_window_va = np.asarray((s1_feature_df['voiceActivity'])[
                idx_after_silence_frames:idx_after_silence_frames+self.future_window_frames] == 1)
            # Determine the last speaker before silence 
            last_participant = 0 if s0_feature_df['voiceActivity'].iloc[idx] else 1 
            # NOTE: Both speakers might start speaking in the future window but we want 
            # to make sure that only one of the speakers starts i.e., no overlap. 
            # NOTE: 0 = hold, 1 = shift 
            # Condition 1: Speaker 0 is next. 
            if s0_window_va.any() and not s1_window_va.any():
                next_va_idx = np.argmax(s0_window_va) + idx_after_silence_frames  
                hold_shift_label = self.HOLD_LABEL if last_participant == 0 else self.SHIFT_LABEL
                pauses_df.loc[i] = (
                    s0_feature_df.iloc[idx]['frameTime'], last_participant,
                     idx_after_silence_frames, next_va_idx, hold_shift_label,0)             
            # Condition 2: Speaker 1 is next. 
            elif s1_window_va.any() and not s0_window_va.any():
                next_va_idx = np.argmax(s1_window_va) + idx_after_silence_frames 
                hold_shift_label = self.HOLD_LABEL if last_participant == 1 else self.SHIFT_LABEL
                pauses_df.loc[i] = (
                    s1_feature_df.iloc[idx]['frameTime'], last_participant,
                    idx_after_silence_frames, next_va_idx, hold_shift_label,1) 
        # Update the shift / hold values 
        self.num_holds += (pauses_df['holdShiftLabel'] == self.HOLD_LABEL).sum()
        self.num_shifts += (pauses_df['holdShiftLabel'] == self.SHIFT_LABEL).sum()
        self.num_pauses = self.num_holds + self.num_shifts
        # Save the pauses df if save dir provided 
        if self.save_dir != None:
            pauses_df.to_csv("{}/{}_pauses_df.csv".format(self.save_dir, dialogue))
        return pauses_df 

    def __load_dataframes(self, dialogue):
        if self.s0_participant == "f":
            s0_feature_df = pd.read_csv(self.feature_paths_map[dialogue]["f"], index_col=0,delimiter=",") 
            s1_feature_df = pd.read_csv(self.feature_paths_map[dialogue]["g"],index_col=0,delimiter=",")
        else:
            s0_feature_df = pd.read_csv(self.feature_paths_map[dialogue]["g"],index_col=0,delimiter=",") 
            s1_feature_df = pd.read_csv(self.feature_paths_map[dialogue]["f"],index_col=0,delimiter=",")
        # Trim the dataframes to the same length 
        min_num_frames = np.min([len(s0_feature_df.index),len(s1_feature_df.index)])
        s0_feature_df = s0_feature_df[:min_num_frames]
        s1_feature_df = s1_feature_df[:min_num_frames]
        assert len(s0_feature_df) == len(s1_feature_df)
        return s0_feature_df, s1_feature_df 


### Testing Dataset Class 

In [18]:
FRAME_STEP_SIZE_MS = 50 

In [19]:
# Load the processed data. 
dataset_csv_paths =  glob.glob("{}/*.csv".format(FULL_PROCESSED_FEATURE_DIR))

if LIMITED_RESOURCES:
    dataset_csv_paths = dataset_csv_paths[:SMALL_DATASET_SIZE]

len(dataset_csv_paths)

10

In [20]:
train_dialogues, val_dialogues, test_dialogues = get_train_val_test_dialogues(dataset_csv_paths)
len(train_dialogues), len(val_dialogues), len(test_dialogues)

(5, 2, 3)

In [21]:
feature_paths_map =  collect_dialogue_features(
    train_dialogues,FULL_PROCESSED_FEATURE_DIR)
len(feature_paths_map)

5

In [22]:

test_pauses_dataset = Skantze2017PausesDataset(
    feature_paths_map=feature_paths_map,
    sequence_length_ms= 60_000, 
    min_pause_length_ms=500,  
    max_future_silence_window_ms=1000, 
    s0_participant="f",
    frame_step_size_ms=FRAME_STEP_SIZE_MS,
    save_dir = PROCESSED_DATA_DIR)

In [23]:
test_pauses_dataset.get_pause_statistics()

{'min_pause_length_ms ': 500,
 'sequence_length_ms': 60000,
 'max_future_silence_window_ms': 1000,
 'num_silences': 348,
 'num_holds': 129,
 'num_shifts': 119,
 'num_pauses': 248,
 's0_participant': 'f'}

In [24]:
dataloader = generate_dataloader(test_pauses_dataset, batch_size=1)

In [25]:
len(dataloader)

248

In [26]:
for x_batch, y_batch in dataloader:
    x_s0, x_s1 = x_batch 
    print(x_s0.shape, x_s1.shape)
    print(y_batch)
    break 

torch.Size([1, 1200, 130]) torch.Size([1, 1200, 130])
[tensor([0]), tensor([1]), tensor([1])]
