# WSJ0

In [2]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt

import os

import pandas as pd
import distutils.dir_util
from sklearn.model_selection import train_test_split

In [7]:
# Load the speaker metadata tables (tab-separated; columns: speaker_id, gender).
# speaker_id is forced to text: some IDs look numeric (e.g. "205"), and if a
# whole column were inferred as integers the later string join against the
# lower-cased directory names would silently match nothing.
metadata_train = pd.read_csv('/rzhome/ammannma/datasets/csr-i-speakerinfo.csv', sep='\t', dtype={'speaker_id': str})
metadata_eval = pd.read_csv('/rzhome/ammannma/datasets/csr-i-speakerinfo-eval.csv', sep='\t', dtype={'speaker_id': str})

# Summarise the training speakers per gender.
print(metadata_train[metadata_train.gender == 'M'].describe())
print(metadata_train[metadata_train.gender == 'F'].describe())

# Combined table used by create_full_dataset() to attach gender to directories.
metadata = pd.concat([metadata_train, metadata_eval])
print(metadata.columns)

       speaker_id gender
count          64     64
unique         64      1
top           01z      M
freq            1     64
       speaker_id gender
count          65     65
unique         65      1
top           205      F
freq            1     65
Index(['speaker_id', 'gender'], dtype='object')


In [41]:
# Number of rows in the combined (train + eval) speaker metadata table.
print("Speaker count: %s" % metadata.shape[0])

Speaker count: 149


In [42]:
def create_full_dataset(dataset_path, speaker_metadata=None):
    """Attach speaker metadata to the speaker directories under ``dataset_path``.

    Every immediate sub-directory of ``dataset_path`` is treated as one
    speaker; its lower-cased name is used as the ``speaker_id``. Plain files
    are ignored. The directory table is inner-joined with the metadata on
    ``speaker_id``, so directories without metadata (and metadata rows
    without a directory) are dropped.

    Parameters
    ----------
    dataset_path : str
        Directory containing one sub-directory per speaker.
    speaker_metadata : pandas.DataFrame, optional
        Table with at least a ``speaker_id`` column. Defaults to the
        notebook-level ``metadata`` frame.

    Returns
    -------
    pandas.DataFrame
        The metadata columns plus a ``speaker_path`` column.
    """
    if speaker_metadata is None:
        # Fall back to the notebook-level table so existing calls keep working.
        speaker_metadata = metadata

    speaker_dirs = []
    for name in os.listdir(dataset_path):
        path = os.path.join(dataset_path, name)
        if os.path.isdir(path):
            # Lower-case the ID for the join; the path keeps the on-disk spelling.
            speaker_dirs.append((path, name.lower()))
    print("Speakers in dataset: %s" % len(speaker_dirs))

    speaker_dirs_frame = pd.DataFrame(speaker_dirs, columns=["speaker_path", "speaker_id"])
    # A single join key is enough (it was previously listed twice).
    return pd.merge(speaker_metadata, speaker_dirs_frame, on='speaker_id')

In [43]:
def create_fair_set(data):
    """Return a gender-balanced subset of ``data``.

    The rows are split by the ``gender`` column into 'M' and 'F'; the larger
    group is truncated (keeping its leading rows) to the size of the smaller
    one. The result lists the male rows first, then the female rows, with
    their original index labels.
    """
    by_gender = {label: data.loc[data.gender == label] for label in ('M', 'F')}
    # Size of the smaller group decides how many rows each gender contributes.
    per_gender = min(len(subset) for subset in by_gender.values())

    balanced_males = by_gender['M'].iloc[:per_gender]
    balanced_females = by_gender['F'].iloc[:per_gender]

    print("Male count: %s" % len(balanced_males))
    print("Female count: %s" % len(balanced_females))
    return pd.concat([balanced_males, balanced_females])

In [47]:
def collect_files(data, output_path):
    """Copy every speaker directory listed in ``data`` into ``output_path``.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``speaker_path`` (source directory) and
        ``speaker_id`` (name of the destination sub-directory).
    output_path : str
        Directory to create and fill. Missing parent directories are created
        as well.

    Raises
    ------
    FileExistsError
        If ``output_path`` already exists, so an earlier export is never
        silently mixed with a new one.

    Rows sharing a ``speaker_id`` are copied into the same destination
    directory.
    """
    # makedirs, unlike mkdir, also creates missing parents; it still refuses
    # to reuse an existing output directory.
    os.makedirs(output_path)
    for speaker_path, speaker_id in zip(data['speaker_path'], data['speaker_id']):
        distutils.dir_util.copy_tree(speaker_path, os.path.join(output_path, speaker_id))

In [20]:
# Training / validation data comes from the SI_TR_S speaker directories.
# The SI_TR_S subset of the other-microphone set is not used here.
dataset_path = "/rzhome/ammannma/datasets/csr-i-wsj0-complete-wav-resampled-restructured/SI_TR_S/"

# Sort by speaker so the rows reach the seeded split in a deterministic order.
full = create_full_dataset(dataset_path).sort_values(by=['speaker_id'])
train, validation = train_test_split(full, test_size=0.33, random_state=3359)

# Balance each split by gender after splitting.
train = create_fair_set(train)
validation = create_fair_set(validation)

# Copy the selected speaker directories: validation first, then train.
out_dir = "/fast/ammannma/speech-separation/workspace/data/WSJ0/"
for split_name, split_frame in (("validation", validation), ("train", train)):
    collect_files(split_frame, os.path.join(out_dir, split_name))

Speakers in dataset: 101
Male count: 32
Female count: 32
Male count: 16
Female count: 16


In [50]:
# Evaluation
# Gather the evaluation speakers from three directories and balance by gender.
# NOTE(review): SI_ET_05 is read from both microphone sets; if the same
# speaker IDs occur in both, `full` holds several rows per speaker and the
# gender counts below count rows, not unique speakers -- confirm intended.

eval_frames = []
for dataset_path in (
    "/rzhome/ammannma/datasets/csr-i-wsj0-complete-wav-other-mic-resampled-restructured/SI_DT_05/",
    "/rzhome/ammannma/datasets/csr-i-wsj0-complete-wav-other-mic-resampled-restructured/SI_ET_05/",
    "/rzhome/ammannma/datasets/csr-i-wsj0-complete-wav-resampled-restructured/SI_ET_05/",
):
    eval_frames.append(create_full_dataset(dataset_path))
full_a, full_b, full_c = eval_frames

full = pd.concat(eval_frames)
evaluation = create_fair_set(full)

Speakers in dataset: 10
Speakers in dataset: 8
Speakers in dataset: 8
Male count: 10
Female count: 10


In [54]:
# Copy the balanced evaluation speakers into the shared WSJ0 output directory.
out_dir = "/fast/ammannma/speech-separation/workspace/data/WSJ0/"
evaluation_dir = os.path.join(out_dir, "evaluation")
collect_files(evaluation, evaluation_dir)