# TEDLIUM 3 data preparation

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt

import os

import pandas as pd
from shutil import copyfile
import distutils.dir_util
from sklearn.model_selection import train_test_split

In [10]:
# Load the talk metadata and list the columns it provides.
metadata = pd.read_csv('meta_audio.csv')
print(metadata.columns)

# Keep only the columns needed for dataset preparation.
metadata = metadata[['fileName', 'num_speaker', 'duration', 'gender_name']]
# Keep single-speaker talks whose gender label is 'male' or 'female'.
# The comparison must be parenthesised: `&` binds tighter than `==`, so
# `a & b == 1` is evaluated as `(a & b) == 1` instead of `a & (b == 1)`.
metadata = metadata[metadata.gender_name.isin(['male', 'female']) & (metadata.num_speaker == 1)]

Index(['Unnamed: 0', 'comments', 'description', 'duration', 'event',
       'film_date', 'languages', 'main_speaker', 'name', 'num_speaker',
       'published_date', 'ratings', 'related_talks', 'speaker_occupation',
       'tags', 'title', 'url', 'views', 'link', 'annualTED', 'film_year',
       'published_year', 'num_speaker_talks', 'technology', 'science',
       'global issues', 'culture', 'design', 'business', 'entertainment',
       'health', 'innovation', 'society', 'Fascinating', 'Courageous',
       'Longwinded', 'Obnoxious', 'Jaw-dropping', 'Inspiring', 'OK',
       'Beautiful', 'Funny', 'Unconvincing', 'Ingenious', 'Informative',
       'Confusing', 'Persuasive', 'wpm', 'words_per_min', 'first_name',
       'gender_name', 'gender_name_class', 'fileName', 'ZCR', 'Energy',
       'EnergyEntropy', 'SpectralCentroid', 'SpectralSpread',
       'SpectralEntropy', 'SpectralFlux', 'SpectralRollof', 'mfcc1', 'mfcc2',
       'mfcc3', 'mfcc4', 'mfccC5', 'mfcc6', 'mfcc7', 'mfcc8', 'mfcc9

In [11]:
# Show summary statistics for the numeric columns of the filtered metadata.
metadata.describe()

Unnamed: 0,num_speaker,duration
count,1126.0,1126.0
mean,1.0,14.612433
std,0.0,5.886083
min,1.0,2.4
25%,1.0,10.1
50%,1.0,15.9
75%,1.0,18.4
max,1.0,44.6


In [17]:
def create_full_dataset(dataset, wav_dir="/rzhome/ammannma/datasets/TEDLIUM-wav/"):
    """Return a copy of ``dataset`` with a ``path`` column locating each audio file.

    Args:
        dataset: DataFrame with a ``fileName`` column of string base names.
        wav_dir: Directory containing the ``<fileName>.sph.resampled.wav`` files.
            Defaults to the location that was previously hard-coded here.

    Returns:
        A new DataFrame with the extra ``path`` column; the input frame is
        left unmodified.
    """
    dataset = dataset.copy()
    # Map over the single column instead of a row-wise DataFrame.apply: it is
    # simpler and always yields a Series, including for an empty frame (where
    # a row-wise apply can return a DataFrame instead of a Series).
    dataset['path'] = dataset['fileName'].map(
        lambda name: os.path.join(wav_dir, name + '.sph.resampled.wav')
    )
    return dataset

def create_fair_set(metadata):
    """Build a gender-balanced subset of ``metadata``.

    Rows are grouped by their ``gender_name`` value ('male' / 'female') and
    each group is truncated, in its existing row order, to the size of the
    smaller group. The resulting group sizes are printed.

    Returns:
        DataFrame with the kept male rows followed by the kept female rows.
    """
    by_gender = [metadata[metadata.gender_name == label] for label in ('male', 'female')]
    per_gender = min(len(group) for group in by_gender)
    # Keep the leading rows of each group so both end up the same size.
    male_talks, female_talks = (group[:per_gender] for group in by_gender)
    print("Male count: %s" % len(male_talks))
    print("Female count: %s" % len(female_talks))
    return pd.concat([male_talks, female_talks])

def mean_select(talks, hours):
    """Select the talks whose duration lies closest to the mean duration.

    A copy of ``talks`` gains a ``dev`` column holding the absolute deviation
    of ``duration`` from the mean; rows are sorted by that column and the
    first ``int(1/(mean / 60) * hours)`` rows are returned.

    NOTE(review): the ``mean / 60`` term suggests ``duration`` is in minutes
    and ``hours`` is the total audio length wanted — confirm against the data.

    Returns:
        The selected rows, including the added ``dev`` column. The input
        frame is not modified.
    """
    ranked = talks.copy()
    mean_duration = ranked['duration'].mean()
    ranked['dev'] = (ranked['duration'] - mean_duration).abs()
    # Number of mean-length talks needed to cover ``hours``.
    n_talks = int(1/(mean_duration / 60) * hours)
    closest_first = ranked.sort_values(by='dev')
    return closest_first[:n_talks]

In [23]:
# Attach audio paths, then keep only the talks at or above the 80th
# percentile of duration.
full = create_full_dataset(metadata)
duration_cutoff = full.duration.quantile(.80)
full = full[full.duration >= duration_cutoff]
full.describe()

Unnamed: 0,num_speaker,duration
count,236.0,236.0
mean,1.0,21.833898
std,0.0,3.558382
min,1.0,19.0
25%,1.0,19.7
50%,1.0,20.6
75%,1.0,22.625
max,1.0,44.6


In [19]:
# Hold out 10% of the talks for evaluation (fixed seed for reproducibility),
# then balance each split by gender independently.
split_train, split_evaluation = train_test_split(full, test_size=0.10, random_state=9580)
train = create_fair_set(split_train)
evaluation = create_fair_set(split_evaluation)

Male count: 47
Female count: 47
Male count: 7
Female count: 7


In [20]:
def collect_files(data, output_path):
    """Copy every audio file listed in ``data`` into ``output_path``.

    Args:
        data: DataFrame with a ``path`` column (source file) and a
            ``fileName`` column (base name for the copy).
        output_path: Destination directory. It is created, including any
            missing parent directories, if it does not exist yet.

    Each file is written as ``<fileName>.wav`` inside ``output_path``.
    """
    # makedirs with exist_ok keeps the cell re-runnable: a plain os.mkdir
    # raises FileExistsError on the second run and fails on missing parents.
    os.makedirs(output_path, exist_ok=True)
    for source, name in zip(data['path'], data['fileName']):
        copyfile(source, os.path.join(output_path, name + '.wav'))

In [22]:
# Export each split into its own directory.
export_targets = [
    (train, "/fast/ammannma/speech-separation/workspace/data/TEDLIUM/continuous-train/"),
    (evaluation, "/fast/ammannma/speech-separation/workspace/data/TEDLIUM/continuous-evaluation/"),
]
for subset, target_dir in export_targets:
    collect_files(subset, target_dir)