# About

The goal in this notebook is to take the specific GeMaps feature csv files required 
for this project and create a Dataset class for the Skantze2017 model. 

## Setup 

In [1]:
# Download libraries for environment. 

import sys 
import os 

# Env. vars to check if the notebook is running on colab, kaggle etc. 
IS_COLAB = "google.colab" in sys.modules 
IS_KAGGLE = "kaggle_secrets" in sys.modules 
IS_LOCAL = not (IS_COLAB or IS_KAGGLE)

if IS_COLAB:
    # Install the packages 
    %pip install -q -U tensorflow-addons
    %pip install -q -U transformers
    %pip install -q -U datasets
    print("You can safely ignore the package incompatibility errors.")
    # Mount the drive 
    from google.colab import drive 
    drive.mount("/drive")

In [5]:

import os
import pandas as pd
import numpy as np
import sys 
# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Tensorflow imports 
# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"


# Pytorch imports 
import torch
from torch.utils.data import Dataset, DataLoader

# Others 
import glob 

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)




In [6]:
# --  Set environment global vars. 

# Shared env. vars. 
GLOBAL_SEED = 42 
IS_CUDA_ENV = torch.cuda.is_available()
GLOBAL_DEVICE = torch.device('cuda') if IS_CUDA_ENV else torch.device('cpu')
SET_SEED = True # If true, sets the global seeds for this notebook. 
# LIMITED_RESOURCES = not IS_CUDA_ENV
LIMITED_RESOURCES = False 

if LIMITED_RESOURCES:
    SMALL_DATASET_SIZE = 10

if IS_COLAB:
    SMALL_DATASET = False 

In [7]:
# Configuring env. 
if SET_SEED:
    # to make this notebook's output stable across runs
    np.random.seed(GLOBAL_SEED) 
    torch.manual_seed(GLOBAL_SEED)

In [8]:
# Project Paths
NOTEBOOK_NAME = "2.0-MU-Skantze2017-MapTask-Dataset-POC"
PROJECT_ROOT_DIR = "/Users/muhammadumair/Documents/Repositories/mumair01-repos/TRP-Modeling/skantze_2017_continuous" 
# --- Input data dirs. 
DATASET_NAME = "maptask" # NOTE: MapTask contains the full and prosody feature sets. 
DATASET_TYPE = "csv"
PROCESSED_DATA_DIR = os.path.join(PROJECT_ROOT_DIR,"data", "processed", DATASET_NAME)
RAW_DATA_DIR = os.path.join(PROJECT_ROOT_DIR,"data", "raw", "maptask")

# --- Result dirs. 
# NOTE: The model dir will have to change depending on where the models are stored. 
REPORTS_DIR = os.path.join(PROJECT_ROOT_DIR,"reports",NOTEBOOK_NAME)
SAVE_DATASET_DIR = os.path.join(PROJECT_ROOT_DIR,"data","processed",NOTEBOOK_NAME)


os.makedirs(REPORTS_DIR,exist_ok=True)
os.makedirs(SAVE_DATASET_DIR,exist_ok=True)


In [9]:
# Paths to the specific feature sets 
FULL_PROCESSED_FEATURE_DIR = os.path.join(PROCESSED_DATA_DIR,"full")
PROSODY_PROCESSED_FEATURE_DIR = os.path.join(PROCESSED_DATA_DIR,"prosody")
FULL_PROCESSED_FEATURE_DIR

'/Users/muhammadumair/Documents/Repositories/mumair01-repos/TRP-Modeling/skantze_2017_continuous/data/processed/maptask/full'

In [10]:
MAPTASK_DIR = os.path.join(RAW_DATA_DIR,"maptaskv2-1")
# Paths within the maptask corpus 
STEREO_AUDIO_PATH = os.path.join(MAPTASK_DIR,"Data/signals/dialogues")
MONO_AUDIO_PATH = os.path.join(MAPTASK_DIR,"Data/signals/mono_signals")
# NOTE: The timed units are also used for Voice Activity annotations. 
TIMED_UNIT_PATHS = os.path.join(MAPTASK_DIR,"Data/timed-units") 
POS_PATH = os.path.join(MAPTASK_DIR,"Data/pos")


## Data Preprocessing 

### Utility Methods 

In [11]:
def get_maptask_participant(csv_path):
    filename, ext = os.path.splitext(os.path.basename(csv_path))
    filename_split = filename.split(".")
    participant = filename_split[1]
    return participant

def get_maptask_dialogue(csv_path):
    filename, ext = os.path.splitext(os.path.basename(csv_path))
    filename_split = filename.split(".")
    dialogue = filename_split[0]
    return dialogue

def read_data(dir_path,dialogue_name, participant,ext):
    """
    Assumption is that the basename . is the dialogue name. 
    """
    results = []
    data_paths = [p for p in os.listdir(dir_path)]
    data_paths = [os.path.join(dir_path,p) for p in data_paths if os.path.splitext(p)[1][1:] == ext]
    for path in data_paths:
       if get_maptask_dialogue(path) == dialogue_name and \
                get_maptask_participant(path) == participant:
            results.append(path)
    return results 

def get_mono_audio(dialogue_name, participant):
    return read_data(MONO_AUDIO_PATH,dialogue_name, participant,"wav")[0]

def get_stereo_audio(dialogue_name):
    return read_data(STEREO_AUDIO_PATH,dialogue_name,"mix","wav")[0]

def get_timed_unit(dialogue_name, participant):
    return read_data(TIMED_UNIT_PATHS,dialogue_name, participant,"xml")[0]


In [12]:
def collect_dialogue_features(dialogue_names, features_dir):
    """
    Collect the dialogue f and g feature files.
    Assumes that features_dir contains both the f and g feature files. 
    """
    collected = {}
    for dialogue in dialogue_names:
        collected[dialogue] = {
            "f" : read_data(features_dir,dialogue,"f","csv")[0], 
            "g" : read_data(features_dir,dialogue,"g","csv")[0]}
    return collected 




In [13]:
# Load the processed data. 
dataset_csv_paths =  glob.glob("{}/*.csv".format(FULL_PROCESSED_FEATURE_DIR))

if LIMITED_RESOURCES:
    dataset_csv_paths = dataset_csv_paths[:SMALL_DATASET_SIZE]

len(dataset_csv_paths)

256

In [14]:
get_maptask_participant(dataset_csv_paths[1])

'g'

In [15]:
get_maptask_dialogue(dataset_csv_paths[1])

'q3nc7'

### Target Voice Activity Labels 

Here, we want to be able to generate target voice activity labels depending 
on the length of the prediction window. 

In [16]:
FRAME_STEP_SIZE_MS = 50
PREDICTION_SIZE_MS = 1000 

In [17]:
def extract_voice_activity_labels(feature_df, N):
    # TODO: FIx the delimiter 
    feature_df = feature_df[["frameTime","voiceActivity"]]
    assert not feature_df.isnull().values.any()
    frame_times_ms = np.asarray(feature_df["frameTime"])
    voice_activity_annotations = np.asarray(feature_df["voiceActivity"])
    assert frame_times_ms.shape[0] == voice_activity_annotations.shape[0]
    labels = np.zeros((frame_times_ms.shape[0],N)) # target label shape: Num Frames x N
    for i in range(len(frame_times_ms)):
        # Pad the last labels with 0 if the conversation has ended 
        if i + N > len(frame_times_ms):
            concat = np.concatenate(
                [voice_activity_annotations[i:],
                 np.zeros(N - (len(frame_times_ms)-i))])
            labels[i] = concat
        else:
            labels[i] = voice_activity_annotations[i:i+N]
    labels_df = pd.DataFrame(labels) 
    labels_df.insert(0,"frameTime",frame_times_ms)
    assert not labels_df.isnull().values.any()
    return labels_df 


In [18]:
# NOTE; Here, 20 means the next 50ms * 20 i.e., the next one second. 
N = int(PREDICTION_SIZE_MS/FRAME_STEP_SIZE_MS)
N 

20

In [19]:
feature_df = pd.read_csv(dataset_csv_paths[0], delimiter=",", index_col=0)

In [20]:

extract_voice_activity_labels(feature_df, N=5)

Unnamed: 0,frameTime,0,1,2,3,4
0,0.00,0.0,0.0,0.0,0.0,0.0
1,0.05,0.0,0.0,0.0,0.0,0.0
2,0.10,0.0,0.0,0.0,0.0,0.0
3,0.15,0.0,0.0,0.0,0.0,0.0
4,0.20,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
8025,401.25,0.0,1.0,1.0,1.0,1.0
8026,401.30,1.0,1.0,1.0,1.0,0.0
8027,401.35,1.0,1.0,1.0,0.0,0.0
8028,401.40,1.0,1.0,0.0,0.0,0.0


### Data Splits 

We want to split the MapTask files into different files based on the dialogue. 





In [21]:
import random
from copy import deepcopy
from sklearn.model_selection import train_test_split 

In [22]:
def get_train_val_test_dialogues(dataset_paths, test_size=0.25, val_size=0.2, 
        seed=GLOBAL_SEED):
    dataset_paths = deepcopy(dataset_paths)
    dialogue_names = sorted(list(set([get_maptask_dialogue(p) for p in dataset_paths])))
    train_dialogues, test_dialogues = train_test_split(dialogue_names, 
        test_size=test_size,random_state=seed)
    train_dialogues, val_dialogues = train_test_split(train_dialogues, 
        test_size=val_size,random_state=seed)
    return train_dialogues, val_dialogues, test_dialogues 


In [23]:
train_dialogues, val_dialogues, test_dialogues = get_train_val_test_dialogues(dataset_csv_paths)
len(train_dialogues), len(val_dialogues), len(test_dialogues)

(76, 20, 32)

In [24]:
feature_paths_map =  collect_dialogue_features(
    train_dialogues,FULL_PROCESSED_FEATURE_DIR)
len(feature_paths_map)

76

## MapTask Training Dataset 



Here, we develop a dataset that can be used to train the Skantze 2017 model using the MapTask corpus. 

In [25]:
df = pd.read_csv(feature_paths_map[train_dialogues[0]]["g"],index_col=0,delimiter=",")
len(df.columns[df.columns != "frameTime"])

65

In [26]:

class Skantze2017VAPredictionMapTaskDataset(Dataset):
    """
    Maptask dataset for voice activity annotation sequence prediction.  
    NOTE: Needs a large amount of memory to load this. 
    """

    def __init__(self, feature_paths_map, sequence_length_ms, 
            prediction_length_ms, target_participant, frame_step_size_ms):
        # Vars. 
        self.feature_paths_map = feature_paths_map 
        self.sequence_length_ms = sequence_length_ms 
        self.prediction_length_ms = prediction_length_ms 
        self.target_participant = target_participant 
        self.frame_step_size_ms = frame_step_size_ms 
        # Calculated 
        self.num_context_frames = int(sequence_length_ms / frame_step_size_ms)
        self.num_target_frames = int(prediction_length_ms / frame_step_size_ms)
        # Storage 
        self.xs = [] 
        self.ys = [] 
        for dialogue in list(self.feature_paths_map.keys()):
            self.__load_data(dialogue)
        assert len(self.xs) == len(self.ys)

    def __len__(self):
        return len(self.xs)
    
    def __getitem__(self, idx):
        if idx > self.__len__():
            raise Exception 
        return self.xs[idx], self.ys[idx]

    def __load_data(self, dialogue):
        s0_feature_df, s1_feature_df = self.__load_dataframes(dialogue)
        # Extract the voice activity labels for s0 as the target labels
        s0_target_labels_df = extract_voice_activity_labels(
            s0_feature_df,self.num_target_frames)
        # Make sure none of the dfs have any nan values 
        assert not s0_feature_df.isnull().values.any() and \
            not s1_feature_df.isnull().values.any() and \
            not s0_target_labels_df.isnull().values.any()
        # Trim the dataframes to the same length 
        min_num_frames = np.min([len(s0_feature_df.index),len(s1_feature_df.index)])
        s0_feature_df = s0_feature_df[:min_num_frames]
        s1_feature_df = s1_feature_df[:min_num_frames]
        s0_target_labels_df = s0_target_labels_df[:min_num_frames]
        # Make sure they all have common frametimes
        assert s0_feature_df['frameTime'].equals(s1_feature_df['frameTime'])
        assert s0_feature_df['frameTime'].equals(s0_target_labels_df['frameTime'])
        s0_s1_df = pd.concat([s0_feature_df,s1_feature_df],axis=1)     
        assert not s0_s1_df.isnull().values.any()     
        # Determine the number of sequences for this dialogue 
        num_sequences = int(np.floor(len(s0_feature_df.index))/self.num_context_frames)
        for i in range(num_sequences):
            x = np.asarray(s0_s1_df.loc[:,s0_s1_df.columns != 'frameTime'][i * \
                self.num_context_frames : (i * self.num_context_frames) \
                    + self.num_context_frames])
            y = np.asarray(s0_target_labels_df.loc[:,s0_target_labels_df.columns\
                    != 'frameTime'][i * self.num_context_frames : \
                        (i * self.num_context_frames) + self.num_context_frames])[-1,:]
            self.xs.append(x)
            self.ys.append(y)

    def __load_dataframes(self, dialogue):
        if self.target_participant == "f":
            s0_feature_df = pd.read_csv(self.feature_paths_map[dialogue]["f"], index_col=0,delimiter=",") 
            s1_feature_df = pd.read_csv(self.feature_paths_map[dialogue]["g"],index_col=0,delimiter=",")
        else:
            s0_feature_df = pd.read_csv(self.feature_paths_map[dialogue]["g"],index_col=0,delimiter=",") 
            s1_feature_df = pd.read_csv(self.feature_paths_map[dialogue]["f"],index_col=0,delimiter=",")
        return s0_feature_df, s1_feature_df 



In [27]:
dataset = Skantze2017VAPredictionMapTaskDataset(
    feature_paths_map=feature_paths_map, 
    sequence_length_ms=60_000, 
    prediction_length_ms=3000, 
    target_participant="f", 
    frame_step_size_ms=FRAME_STEP_SIZE_MS)

In [28]:
next(iter(dataset))[0].shape,next(iter(dataset))[1].shape

((1200, 130), (60,))

In [29]:
len(dataset)

494

In [30]:
# NOTE:Seed worker can be used to ensure reproducibility in DataLoader 
# across runs. 
def seed_worker(worker_id):
    worker_seed =GLOBAL_SEED
    np.random.seed(worker_seed)
    random.seed(worker_seed)

def generate_dataloader(dataset, batch_size=32, shuffle=True, num_workers=0, 
        drop_last=True, pin_memory=True):
    return DataLoader(
        dataset, 
        batch_size=batch_size, 
        shuffle=shuffle, 
        num_workers=num_workers, 
        drop_last=drop_last, # We always want to remove the last incomplete batch. 
        pin_memory=pin_memory, 
        worker_init_fn=seed_worker)

In [31]:
# Create a dataloader for the data 
dataloader = DataLoader(dataset, batch_size = 32)
dataloader, len(dataloader)

(<torch.utils.data.dataloader.DataLoader at 0x7fc24865dd90>, 16)

In [32]:
import time 

In [33]:
start_time = time.time()
for x_batch, y_batch in dataloader:
    pass 
print("Total time taken to iterate over dataloader: {:.4f} secs".format(
    time.time() - start_time))

Total time taken to iterate over dataloader: 1.3312 secs


### Creating Feature Datasets For Later Use

In [34]:
# Saving a dataloader for the dataset 

feature_set = "full"
if feature_set == "full":
    features_dir = FULL_PROCESSED_FEATURE_DIR
else:
    features_dir = PROSODY_PROCESSED_FEATURE_DIR

save_dir = os.path.join(SAVE_DATASET_DIR,feature_set)
os.makedirs(save_dir,exist_ok=True)

dataset_csv_paths =  glob.glob("{}/*.csv".format(features_dir))

train_dialogues, val_dialogues, test_dialogues = \
    get_train_val_test_dialogues(dataset_csv_paths)
print(len(train_dialogues), len(val_dialogues), len(test_dialogues))

datasets = [] 
for dialogues in (train_dialogues, val_dialogues, test_dialogues):
    dataset = Skantze2017VAPredictionMapTaskDataset(
        feature_paths_map=collect_dialogue_features(
                                dialogues,features_dir), 
        sequence_length_ms=60_000, 
        prediction_length_ms=3000, 
        target_participant="f", 
        frame_step_size_ms=FRAME_STEP_SIZE_MS)
    datasets.append(deepcopy(dataset))

for i, (dataset_name, batch_size) in enumerate(zip(('train','val','test'),(32,32,1))):
    dataloader = generate_dataloader(datasets[i],batch_size=batch_size)
    path = "{}/{}_dataloader_{}.pt".format(save_dir, dataset_name, feature_set)
    torch.save(dataloader,path)


76 20 32


In [35]:
# Loading the saved models 

dataloaders = [] 
for dataset_name in ('train','val','test'):
    path = "{}/{}_dataloader_{}.pt".format(save_dir, dataset_name, feature_set)
    dataloader = torch.load(path)
    dataloaders.append(deepcopy(dataloader))
dataloaders


[<torch.utils.data.dataloader.DataLoader at 0x7fc0c6c28e50>,
 <torch.utils.data.dataloader.DataLoader at 0x7fbfecad6ee0>,
 <torch.utils.data.dataloader.DataLoader at 0x7fc2486e0550>]