In [1]:
import os
os.sys.path.append('../')
import data.data_loader as DL
import sys
import wave
from glob import glob
from sklearn.utils import shuffle
import itertools

In [2]:
# Root of the RedDots (r2015q4_v1) dataset. Set the REDDOTS_HOME environment
# variable to point at a local copy; when it is unset the original author's
# path is used, so existing runs behave exactly as before.
rDots_home_dir = os.environ.get('REDDOTS_HOME', '/home/muncok/DL/dataset/reddots_r2015q4_v1/')
# Sub-directories of the dataset root. The trailing slash is kept on purpose:
# later cells concatenate file names directly onto these strings.
rDots_pcm_dir = os.path.join(rDots_home_dir, 'pcm/')
rDots_wav_dir = os.path.join(rDots_home_dir, 'wav/')
rDots_ndx_dir = os.path.join(rDots_home_dir, 'ndx/')
rDots_infos_dir = os.path.join(rDots_home_dir, 'infos/')

In [3]:
def pcm_to_wav(audio_path, target_path, sampling_rate=16000):
    '''
    Wrap a headerless PCM file in a WAV container (1 channel, 2 bytes per sample).

    :param audio_path: path of the source file; its extension must be ``.pcm``
    :param target_path: if it ends with ``.wav`` it is used as the output file.
                        Any other value (for example the wav root directory that
                        rDots_prepare_files passes) selects the mirrored layout:
                        the output sits next to the input, with the extension
                        changed to ``.wav`` and the last path component that is
                        exactly ``pcm`` renamed to ``wav``.
    :param sampling_rate: frame rate written into the WAV header
    :return: path of the WAV file that was written
    :raises ValueError: if audio_path does not have a ``.pcm`` extension
    '''
    stem, ext = os.path.splitext(audio_path)
    if ext != '.pcm':
        raise ValueError('File is not pcm formatted')

    if target_path and os.path.splitext(str(target_path))[1] == '.wav':
        wav_path = str(target_path)
    else:
        # Only the extension and the 'pcm' directory component are rewritten.
        # A blanket str.replace('pcm', 'wav') would also corrupt file or
        # directory names that merely contain the substring 'pcm'.
        src_dir, wav_name = os.path.split(stem + '.wav')
        parts = src_dir.split(os.sep)
        for idx in range(len(parts) - 1, -1, -1):
            if parts[idx] == 'pcm':
                parts[idx] = 'wav'
                break
        wav_path = os.path.join(os.sep.join(parts), wav_name)

    with open(audio_path, 'rb') as pcmfile:
        pcmdata = pcmfile.read()

    # Make sure the destination directory exists so a speaker directory added
    # after the first conversion run does not make the write fail.
    out_dir = os.path.dirname(wav_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with wave.open(wav_path, 'wb') as wavfile:
        # (nchannels, sampwidth, framerate, nframes, comptype, compname);
        # nframes is left at 0 and fixed up by the wave module on close.
        wavfile.setparams((1, 2, sampling_rate, 0, 'NONE', 'NONE'))
        wavfile.writeframes(pcmdata)
    return wav_path

In [4]:
def rDots_prepare_files(rDots_home_dir):
    '''
    Convert every ``*.pcm`` file under ``<rDots_home_dir>/pcm/`` to WAV.

    The sub-directory structure of ``pcm/`` is mirrored under ``wav/`` on every
    call (not only the first one), so directories added after an earlier run
    are picked up as well.

    :param rDots_home_dir: root directory of the RedDots dataset
    '''
    pattern = '*.pcm'

    rDots_pcm_dir = os.path.join(rDots_home_dir, 'pcm/')
    rDots_wav_dir = os.path.join(rDots_home_dir, 'wav/')

    os.makedirs(rDots_wav_dir, exist_ok=True)

    # Single pass over pcm/: mirror each directory and collect its pcm files.
    pcmfiles = []
    for dirpath, _, _ in os.walk(rDots_pcm_dir):
        relative = os.path.relpath(dirpath, rDots_pcm_dir)
        mirrored = os.path.normpath(os.path.join(rDots_wav_dir, relative))
        os.makedirs(mirrored, exist_ok=True)
        pcmfiles.extend(glob(os.path.join(dirpath, pattern)))

    for pcmfile in pcmfiles:
        pcm_to_wav(pcmfile, rDots_wav_dir)

In [12]:
# Run the PCM -> WAV conversion for the dataset rooted at rDots_home_dir.
# NOTE(review): this cell's execution count (12) is out of sequence with its
# neighbours; confirm the notebook still works with Restart Kernel -> Run All.
rDots_prepare_files(rDots_home_dir)

In [5]:
def rDots_parse_ndx(ndx_dir, part_num):
    '''
    Parse the female and male trial index files ``<g>_part_0<part_num>.ndx``.

    Each non-blank line is expected to hold six comma-separated fields:
    ``<spk>_<sent>,<utterance>,<TC>,<TW>,<IC>,<IW>``.

    :param ndx_dir: directory containing the ``.ndx`` files (trailing slash optional)
    :param part_num: part number used to build the file name
    :return: ``[file_names, spk_IDs, sent_IDs, trial_types]`` -- four parallel
             lists. ``trial_types`` holds the position of the first ``'Y'`` flag
             (0 for TC, 1 for TW, 2 for IC, 3 for IW), or -1 when no flag is
             ``'Y'``, so the four lists always have the same length.
    :raises ValueError: if a non-blank line does not have exactly six fields or
                        the sentence ID is not an integer
    '''
    file_names, spk_IDs, sent_IDs, trial_types = [], [], [], []

    for gender in ('f', 'm'):
        ndx_path = os.path.join(ndx_dir, gender + '_part_0' + str(part_num) + '.ndx')
        with open(ndx_path, 'r') as ndx:
            for line in ndx:
                line = line.strip()
                if not line:
                    # Tolerate blank or trailing empty lines.
                    continue
                spk_sent, utterance, TC, TW, IC, IW = line.split(',')
                # Split on the last underscore only: the sentence ID is the suffix.
                spk, sent = spk_sent.rsplit('_', 1)

                file_names.append(utterance)
                spk_IDs.append(spk)
                sent_IDs.append(int(sent))

                flags = (TC, TW, IC, IW)
                trial_types.append(flags.index('Y') if 'Y' in flags else -1)
    return [file_names, spk_IDs, sent_IDs, trial_types]

In [6]:
def rDots_parse_trn(trn_dir, part_num):
    '''
    Parse the female and male enrolment files ``<g>_part_0<part_num>.trn``.

    Each non-blank line is expected to look like
    ``<spk>_<sent> <utter1>,<utter2>,<utter3>`` (whitespace between the two parts).

    :param trn_dir: directory containing the ``.trn`` files (trailing slash optional)
    :param part_num: part number used to build the file name
    :return: ``[spk_ID, sent_ID, file1_name, file2_name, file3_name]`` -- five
             parallel lists with one entry per parsed line
    :raises ValueError: if a non-blank line does not match the expected layout
    '''
    spk_ID, sent_ID, file1_name, file2_name, file3_name = [], [], [], [], []

    for gender in ('f', 'm'):
        trn_path = os.path.join(trn_dir, gender + '_part_0' + str(part_num) + '.trn')
        with open(trn_path, 'r') as trn:
            for line in trn:
                line = line.strip()
                if not line:
                    # Tolerate blank or trailing empty lines.
                    continue
                # split() accepts tabs and repeated spaces as well as one space.
                spk_sent, utters = line.split()
                utter1, utter2, utter3 = utters.split(',')
                spk, sent = spk_sent.rsplit('_', 1)

                spk_ID.append(spk)
                sent_ID.append(int(sent))
                file1_name.append(utter1)
                file2_name.append(utter2)
                file3_name.append(utter3)
    return [spk_ID, sent_ID, file1_name, file2_name, file3_name]

In [7]:
def rDots_spkID_to_label(spk_IDs):
    '''
    Assign consecutive integer labels to speaker IDs in order of first appearance.

    :param spk_IDs: iterable of (hashable) speaker IDs; repeats are allowed
    :return: ``(label, mapping)`` where
             label: list type, the numerical label of every entry of spk_IDs
             mapping: dict type, mapping from actual speaker ID to numerical label
    '''
    mapping = {}
    # setdefault hands out the next free label (the current dict size) the first
    # time a speaker is seen and returns the stored label on every repeat.
    label = [mapping.setdefault(spk_ID, len(mapping)) for spk_ID in spk_IDs]
    return label, mapping

In [8]:
def rDots_get_target_spk(infos_dir):
    '''
    Read the speaker IDs listed in ``f_target.txt`` and ``m_target.txt``.

    The speaker ID is the text before the first ``;`` on each line. Blank lines
    are skipped and surrounding whitespace (including the newline of a line
    without any ``;``) is stripped.

    :param infos_dir: directory containing the two files (trailing slash optional)
    :return: list of speaker IDs, female file first, in file order
    '''
    spk_IDs = []

    for gender in ('f', 'm'):
        with open(os.path.join(infos_dir, gender + '_target.txt'), 'r') as infos:
            for line in infos:
                spk_ID = line.split(';')[0].strip()
                if spk_ID:
                    spk_IDs.append(spk_ID)
    return spk_IDs

In [9]:
def rDots_get_imposter_spk(infos_dir):
    '''
    Read the speaker IDs listed in ``f_imposter.txt`` and ``m_imposter.txt``.

    The speaker ID is the text before the first ``;`` on each line. Blank lines
    are skipped and surrounding whitespace (including the newline of a line
    without any ``;``) is stripped.

    :param infos_dir: directory containing the two files (trailing slash optional)
    :return: list of speaker IDs, female file first, in file order
    '''
    spk_IDs = []

    for gender in ('f', 'm'):
        with open(os.path.join(infos_dir, gender + '_imposter.txt'), 'r') as infos:
            for line in infos:
                spk_ID = line.split(';')[0].strip()
                if spk_ID:
                    spk_IDs.append(spk_ID)
    return spk_IDs

In [10]:
def rDots_make_manifest(rDots_home_dir, default, part_num=None, hard_separate=False, testPOI=None, sent_filter=None,
                        random_state=None):
    '''
    Write the RedDots identification and verification manifest CSV files.

    Files written (relative to the current working directory):
      ../data/RedDots_iden_{train,val,test}_manifest.csv   lines: <wav path>,<speaker label>
      ../data/RedDots_veri_{train,test}_manifest.csv       lines: <wav path>,<wav path>,<1 if same speaker else 0>

    :param rDots_home_dir: dataset root; wav files are read from ``<root>/wav/<speaker>/``
                           and the ndx/trn files from ``<root>/ndx/``
    :param default: True -> make the manifests from the given ndx & trn files.
                    False -> intended to exclude anything in testPOI & sent_filter;
                    that mode is not implemented and the function returns without
                    writing anything.
    :param part_num: RedDots part number, an int in 1..4 (required when default is True)
    :param hard_separate: in default mode, if True every utterance of a speaker that
                          appears in the trn file goes to the verification set; if False
                          only the utterances listed in the ndx & trn files do
    :param testPOI: currently unused
    :param sent_filter: currently unused
    :param random_state: forwarded to ``sklearn.utils.shuffle``; pass an int to get
                         reproducible splits (None keeps the previous random behaviour)
    :raises ValueError: if default is True and part_num is not an int in 1..4
    '''
    def speaker_of(path):
        # .../wav/<speaker>/<utterance>.wav (or .../wav/<speaker>/) -> <speaker>
        return os.path.basename(os.path.dirname(path))

    rDots_wav_dir = os.path.join(rDots_home_dir, 'wav/')
    file_dirs = set(glob(rDots_wav_dir + '*/*'))
    print(len(file_dirs))
    spk_IDs = {speaker_of(spk_dir) for spk_dir in glob(rDots_wav_dir + '*/')}

    if not default:
        # Non-default mode (testPOI / sent_filter) has never been implemented.
        return

    # The type check must come first: comparing the default None with an int
    # raises TypeError instead of reporting the invalid part number.
    if not isinstance(part_num, int) or part_num < 1 or part_num > 4:
        raise ValueError("Invalid part number (only 1~4 possible)")

    rDots_ndx_dir = os.path.join(rDots_home_dir, 'ndx/')
    rDots_trn_dir = os.path.join(rDots_home_dir, 'ndx/')  # trn files are read from ndx/ too

    ndx_list = rDots_parse_ndx(rDots_ndx_dir, part_num)
    trn_list = rDots_parse_trn(rDots_trn_dir, part_num)

    veri_spk_IDs = set(trn_list[0])

    if not hard_separate:
        iden_spk_IDs = spk_IDs
        # Test utterances from the ndx file plus the three enrolment utterances
        # (columns 2..4 of the trn listing).
        veri_names = list(ndx_list[0])
        for utter_column in trn_list[2:5]:
            veri_names.extend(utter_column)
        veri_file_dirs = {rDots_wav_dir + file_name + '.wav' for file_name in veri_names}
    else:
        iden_spk_IDs = spk_IDs - veri_spk_IDs
        veri_file_dirs = set()
        for veri_spk_ID in veri_spk_IDs:
            veri_file_dirs.update(glob(rDots_wav_dir + veri_spk_ID + '/*'))

    # Sort everything that came out of a set: set iteration order changes between
    # interpreter runs, which would make labels and seeded shuffles irreproducible.
    iden_label_mapping = rDots_spkID_to_label(sorted(iden_spk_IDs))[1]
    iden_file_dirs = sorted(file_dirs - veri_file_dirs)
    veri_file_dirs = sorted(veri_file_dirs)

    iden_file_dirs = shuffle(iden_file_dirs, random_state=random_state)
    veri_file_dirs = shuffle(veri_file_dirs, random_state=random_state)

    # 64% / 16% / 20% train / val / test split of the identification files.
    total_len = len(iden_file_dirs)
    train_lim = int(total_len * 0.64)
    val_lim = int(total_len * 0.8)
    iden_datasets = {'train': iden_file_dirs[:train_lim],
                     'val': iden_file_dirs[train_lim:val_lim],
                     'test': iden_file_dirs[val_lim:]}

    for dataset, paths in iden_datasets.items():
        with open("../data/RedDots_iden_{}_manifest.csv".format(dataset), "w") as f:
            for file_dir in paths:
                f.write(file_dir + ',' + str(iden_label_mapping[speaker_of(file_dir)]) + '\n')

    # Verification pairs: every 2-combination of the first tenth of each file list.
    # The combinations are streamed to disk instead of being materialised as lists.
    veri_datasets = {
        'train': itertools.combinations(iden_file_dirs[:int(total_len / 10)], 2),
        'test': itertools.combinations(veri_file_dirs[:int(len(veri_file_dirs) / 10)], 2),
    }

    for dataset, pairs in veri_datasets.items():
        with open('../data/RedDots_veri_{}_manifest.csv'.format(dataset), "w") as f:
            for first, second in pairs:
                same_speaker = speaker_of(first) == speaker_of(second)
                f.write(first + ',' + second + ',' + ('1' if same_speaker else '0') + '\n')

In [13]:
# Build the manifest CSVs for RedDots part 1. With hard_separate=False only the
# utterances listed in the ndx/trn files are held out for verification.
# The number printed below is the count of paths matching wav/*/*.
rDots_make_manifest(rDots_home_dir, True, part_num=1, hard_separate=False)

15305


In [None]:
# Re-run the PCM -> WAV conversion. The previous call used the stale name
# prepare_RedDots_files(), which is not defined in this notebook and raised
# NameError; the defined function is rDots_prepare_files and takes the dataset root.
rDots_prepare_files(rDots_home_dir)

In [None]:
# Scratch check: itertools.combinations(seq, 2) yields every unordered pair once,
# in the order of the input sequence.
a = list(range(1, 6))
b = [pair for pair in itertools.combinations(a, 2)]
print(b)

In [None]:
# Scratch check: range(2, 5) yields 2, 3 and 4 -- the same index range that
# rDots_make_manifest uses to pick the three utterance columns of the trn listing.
for i in range(2, 5):
    print(i)

In [None]:
import numpy as np
import pylab

# Scratch check: map one raw PCM file read-only as an array of dtype 'h'
# (numpy's code for a signed C short), print its first 100 samples and plot
# the whole waveform.
sample_pcm_path = rDots_pcm_dir + 'f0001/20150302171020760_f0001_58.pcm'
data = np.memmap(sample_pcm_path, dtype='h', mode='r')
print("VALUES:", data[:100])
pylab.plot(data)
pylab.show()

In [None]:
# Scratch check: list every speaker directory under wav/ followed by the
# speaker ID taken from its path (the component before the trailing slash).
subdirs = glob(rDots_wav_dir + '*/')
for subdir in subdirs:
    print(subdir, subdir.split('/')[-2], sep='\n')