# This notebook is used to prepare Excel sheets with all training data
It divides all real and synthetic data into train and ground-truth data paths.
Three folders are used to create dataset: `real`, `real_doubled` and `synthetic`

First we need to include all required libs

In [23]:
from os import path
import logging
import os
import pandas as pd
import re

Create global paths used in notebook

In [24]:
# The dataset root is derived from the current working directory by swapping
# every "notebooks" substring of the path for "data".
CUR_PATH = os.getcwd()
DATA_PATH_PREFIX = "{}/AEC-Challenge/datasets/".format(CUR_PATH.replace("notebooks", "data"))
# Top-level dataset folders whose existence is verified before any processing.
DATA_PATHS = [DATA_PATH_PREFIX + folder for folder in ("test_set", "test_set_interspeech2021")]

Check that the folders we want to use exist

In [25]:
# Abort early when a required dataset folder is missing.
for path_ in DATA_PATHS:
    if not path.exists(path_):
        logging.error("%s is not existing!", path_)
        # FileNotFoundError is a subclass of Exception, so existing handlers still
        # catch it, but the traceback now names the missing folder.
        raise FileNotFoundError("Required dataset folder does not exist: {}".format(path_))

Create a list of filename postfixes to search for later

In [26]:
# Scenario keywords that mark where the recording ID ends inside a filename.
# Each entry needs its own trailing comma: adjacent string literals are
# concatenated by Python, which would silently merge two keywords into one.
EXCLUDE_FILEENDINGS = [
    "doubletalk",
    "farend",
    "nearend",
    "sweep",
]

## Create Training Dataset from Real Recordings
Create a list of directories containing real recordings

In [27]:
# Directories that hold the real recordings.
# Order matters: other cells compare against TEST_PATHS[-1] to special-case
# the last entry, so "test_set_interspeech2021" must stay at the end.
TEST_PATHS = [
    DATA_PATH_PREFIX + sub_path
    for sub_path in ("test_set/clean", "test_set/noisy", "test_set_interspeech2021")
]

Create Regex object to find one of excluded file postfixes and define function to perform Regex action on given filename

In [28]:
# Matches the first scenario keyword in a filename; everything before it is the recording ID.
to_find = re.compile("doubletalk|farend|nearend|sweep")

def get_recording_id(filename_pv):
    """Return the part of ``filename_pv`` that precedes the first scenario keyword.

    Parameters
    ----------
    filename_pv : str
        File name to inspect.

    Returns
    -------
    str
        The prefix of ``filename_pv`` up to (not including) the first occurrence of
        ``doubletalk``, ``farend``, ``nearend`` or ``sweep``. An empty string is
        returned when no keyword is present, when the keyword is at position 0,
        or when ``filename_pv`` is not a string.
    """
    try:
        # Search the argument itself rather than a same-named global variable.
        match_obj = to_find.search(filename_pv)
    except TypeError:
        # Non-string input: keep the best-effort contract of returning "".
        return ""

    if match_obj is None:
        return ""
    return filename_pv[:match_obj.start()]


Define function to save real recording to Excel sheet

In [29]:
def save_record_to_csv(work_dir, record_id, out_df):
    """Append one row describing a real recording to ``out_df`` and return the result.

    For every expected audio file a candidate path is built; the path is stored in
    the row only when the file exists on disk, otherwise the cell is left as "".

    Parameters
    ----------
    work_dir : str
        Directory the recording belongs to. It is compared with ``TEST_PATHS[-1]``
        to choose between the two path layouts below.
    record_id : str
        Recording identifier, stored in the "ID" column.
    out_df : pandas.DataFrame
        Frame to extend. It is not modified in place.

    Returns
    -------
    pandas.DataFrame
        A new frame consisting of ``out_df`` plus the new row, with a fresh index.
    """
    # NOTE(review): the path layout below is kept exactly as it was, but looks
    # suspicious. For the last TEST_PATHS entry `record_id` is not part of any path
    # and no scenario sub-folder is used, while for the other entries `record_id`
    # is glued directly onto a "<scenario>/" folder name. Confirm against the
    # on-disk dataset layout.
    if work_dir != TEST_PATHS[-1]:
        base_path = work_dir + "/" + record_id
        dt_dir = base_path + "doubletalk/"
        fe_dir = base_path + "farend-singletalk/"
        ne_dir = base_path + "nearend-singletalk/"
    else:
        base_path = work_dir + "/"
        dt_dir = fe_dir = ne_dir = base_path

    # Column name -> candidate file path, in the column order of the output sheet.
    candidates = {
        "DT Clean": dt_dir + "doubletalk_lpb.wav",
        "DT Mic": dt_dir + "doubletalk_mic.wav",
        "DT MV Clean": dt_dir + "doubletalk_with_movement_lpb.wav",
        "DT MV Mic": dt_dir + "doubletalk_with_movement_mic.wav",
        "FE Clean": fe_dir + "farend_singletalk_lpb.wav",
        "FE Mic": fe_dir + "farend_singletalk_mic.wav",
        "FE MV Clean": fe_dir + "farend_singletalk_with_movement_lpb.wav",
        "FE MV Mic": fe_dir + "farend_singletalk_with_movement_mic.wav",
        "NE Clean": ne_dir + "nearend_singletalk_mic.wav",
        # No sweep paths are built, so these two columns are always empty.
        "SP Clean": "",
        "SP Mic": "",
    }

    data_row = {"ID": record_id}
    for column, file_path in candidates.items():
        # Each column gets its own path; os.path.isfile("") is False, so empty
        # candidates stay empty.
        data_row[column] = file_path if os.path.isfile(file_path) else ""

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    return pd.concat([out_df, pd.DataFrame([data_row])], ignore_index=True)


Create Pandas DataFrame object to save recordings records

In [30]:
# Column layout of the output sheet: the recording ID followed by one path column per audio file.
cols = [
    "ID",
    "DT Clean", "DT Mic",
    "DT MV Clean", "DT MV Mic",
    "FE Clean", "FE Mic",
    "FE MV Clean", "FE MV Mic",
    "NE Clean",
    "SP Clean", "SP Mic",
]
# Empty frame that the following cells extend row by row.
recs_df = pd.DataFrame(columns=cols)

For every directory, gather the unique recording IDs and save each recording's file paths into the corresponding DataFrame columns

In [31]:
for tr_dir in TEST_PATHS:
    # The last TEST_PATHS entry is scanned through its three scenario sub-folders;
    # every other entry is scanned directly.
    if tr_dir == TEST_PATHS[-1]:
        tst_dir = [tr_dir + "/doubletalk", tr_dir + "/farend-singletalk", tr_dir + "/nearend-singletalk"]
    else:
        tst_dir = [tr_dir]

    # A set removes duplicates: several files share one recording ID.
    found_ids = set()
    for sub_dir in tst_dir:  # not named `dir`, which would shadow the builtin
        for filename in os.listdir(sub_dir):
            rec_id = get_recording_id(filename)
            if rec_id != "":
                found_ids.add(rec_id)

    # Sort so the row order of the output sheet is the same on every run
    # (iteration order of a set of strings is not stable between interpreter runs).
    recordings_ids = sorted(found_ids)
    print("Found {} unique IDs in {} directory.".format(len(recordings_ids), tr_dir))

    for ids in recordings_ids:
        recs_df = save_record_to_csv(tr_dir, ids, recs_df)

Found 98 unique IDs in D:\Repos\NAEC\data/AEC-Challenge/datasets/test_set/clean directory.
Found 100 unique IDs in D:\Repos\NAEC\data/AEC-Challenge/datasets/test_set/noisy directory.
Found 426 unique IDs in D:\Repos\NAEC\data/AEC-Challenge/datasets/test_set_interspeech2021 directory.


## Create Training Dataset from Synthetic Recordings

Define a function to save synthetic recordings to the Excel sheet

In [32]:
def save_synth_record_to_csv(work_dir, record_id, out_df):
    """Append one row describing a synthetic recording to ``out_df`` and return the result.

    Candidate file paths are built from ``work_dir`` and ``record_id``; a path is
    stored only when the file exists on disk, otherwise the cell is left as "".
    Columns that have no synthetic counterpart are always "".

    Parameters
    ----------
    work_dir : str
        Root of the synthetic dataset. It is concatenated directly with the
        sub-folder names, so it must end with a path separator.
    record_id : int or str
        File ID inserted into the file names; stored as a string in the "ID" column.
    out_df : pandas.DataFrame
        Frame to extend. It is not modified in place.

    Returns
    -------
    pandas.DataFrame
        A new frame consisting of ``out_df`` plus the new row, with a fresh index.
    """
    nearend_speech = work_dir + "nearend_speech/" + "nearend_speech_fileid_{}.wav".format(record_id)
    nearend_mic_signal = work_dir + "nearend_mic_signal/" + "nearend_mic_fileid_{}.wav".format(record_id)
    farend_speech = work_dir + "farend_speech/" + "farend_speech_fileid_{}.wav".format(record_id)

    def existing_or_empty(file_path):
        # Keep the path only when the file is actually present.
        return file_path if os.path.isfile(file_path) else ""

    data_row = {
        "ID": str(record_id),
        "DT Clean": existing_or_empty(nearend_speech),
        "DT Mic": existing_or_empty(nearend_mic_signal),
        "DT MV Clean": "",
        "DT MV Mic": "",
        "FE Clean": "",
        "FE Mic": existing_or_empty(farend_speech),
        "FE MV Clean": "",
        "FE MV Mic": "",
        # Same near-end speech file as "DT Clean".
        "NE Clean": existing_or_empty(nearend_speech),
        "SP Clean": "",
        "SP Mic": "",
    }

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    return pd.concat([out_df, pd.DataFrame([data_row])], ignore_index=True)

Read the provided CSV file and add only the rows of the `test` split to the DataFrame

In [33]:
# Root folder of the synthetic dataset and its metadata table.
SYNTH_PATH = DATA_PATH_PREFIX + "synthetic/"
synth_df = pd.read_csv(SYNTH_PATH + "meta.csv")

# Only rows whose "split" column equals "test" contribute a record.
is_test_split = synth_df["split"] == "test"
for file_id in synth_df.loc[is_test_split, "fileid"]:
    recs_df = save_synth_record_to_csv(SYNTH_PATH, file_id, recs_df)

Save data to Excel

In [34]:
# Write the synthetic metadata table and the collected recording paths
# as Excel workbooks into the current working directory.
synth_df.to_excel(excel_writer="meta.xlsx")
recs_df.to_excel(excel_writer="test_data.xlsx")