##### This script creates a master index text file of all of the TORGO data

Run spectrograms.ipynb before running this, in order to create the spectrogram images.

We do not want to use any audio files that are missing prompts or that could not be converted to a spectrogram.

In [1]:
from PIL import Image
import os

# Columns of the master index.
# Each list receives one entry per .wav file, so position i across all of the
# lists describes the same recording.

speaker, session, mic = [], [], []    # Taken from the directory path
prompt_id = []                        # From file name (without extension)
prompt = []                           # Prompt text, or None when no prompt file exists
has_spect = []                        # 'yes'/'no': does a spectrogram image exist for the recording?
spect_width, spect_height = [], []    # Spectrogram width / height, or None when there is no image


In [2]:
# Session directories that hold the source .wav files, grouped by speaker.
# Most sessions have two recordings, one per microphone (see `mics`).

_speaker_sessions = (
    ('F01', ('Session1',)),
    ('F03', ('Session1', 'Session2', 'Session3')),
    ('F04', ('Session1', 'Session2')),
    ('M01', ('Session1', 'Session2_3')),
    ('M02', ('Session1', 'Session2')),
    ('M03', ('Session2',)),
    ('M04', ('Session1', 'Session2')),
    ('M05', ('Session1', 'Session2')),
)

# Expand the table into 'data/TORGO/<speaker>/<session>' paths, keeping the
# speaker/session order of the table.
dir_names = ['data/TORGO/' + spk + '/' + sess
             for spk, sessions in _speaker_sessions
             for sess in sessions]

# Sub-directory names of the two microphone recordings inside a session.
mics = ['wav_headMic', 'wav_arrayMic']

In [3]:
# Extract information
#
# Walk every <session>/<mic> directory and append one entry per .wav file to
# each of the index lists:
#   * speaker, session and mic come from the directory path,
#   * prompt_id is the file name without its '.wav' extension,
#   * prompt is the first line of <session>/prompts/<prompt_id>.txt,
#   * spect_width / spect_height are read from the matching .png in the
#     sibling '<directory>_spect' folder.
# A missing prompt or spectrogram is recorded as None (has_spect = 'no') so
# that all lists keep the same length.

for d in dir_names:
    for m in mics:

        dir_name = d + '/' + m
        print('Processing directory ' + dir_name + '...')

        # Most, but not all, sessions have both microphone directories.
        if not os.path.exists(dir_name):
            continue

        # dir_name looks like data/TORGO/<speaker>/<session>/<mic>; the split
        # is the same for every file in the directory, so do it once.
        path_parts = dir_name.split('/')

        for subdir, dirs, files in os.walk(dir_name):
            for file in files:
                if not file.endswith('.wav'):
                    continue

                stem = file[:-4]   # File name without the '.wav' extension

                speaker.append(path_parts[2])
                session.append(path_parts[3])
                mic.append(path_parts[4])
                prompt_id.append(stem)

                prompt_name = d + '/prompts/' + stem + '.txt'
                if os.path.exists(prompt_name):
                    with open(prompt_name) as f:
                        # Only the first line is used. Drop its trailing
                        # newline so the prompt does not inject a line break
                        # into the exported index.
                        prompt.append(f.readline().rstrip('\n'))
                else:
                    prompt.append(None)

                png_name = subdir + '_spect/' + stem + '.png'
                if os.path.exists(png_name):
                    # Image.open keeps the file open until it is closed; the
                    # context manager releases the handle right away instead
                    # of leaking one per spectrogram.
                    with Image.open(png_name) as im:
                        width, height = im.size
                    has_spect.append('yes')
                    spect_width.append(width)
                    spect_height.append(height)
                else:
                    has_spect.append('no')
                    spect_width.append(None)
                    spect_height.append(None)

print("Index complete!")

Processing directorydata/TORGO/F01/Session1/wav_headMic...
Processing directorydata/TORGO/F01/Session1/wav_arrayMic...
Processing directorydata/TORGO/F03/Session1/wav_headMic...
Processing directorydata/TORGO/F03/Session1/wav_arrayMic...
Processing directorydata/TORGO/F03/Session2/wav_headMic...
Processing directorydata/TORGO/F03/Session2/wav_arrayMic...
Processing directorydata/TORGO/F03/Session3/wav_headMic...
Processing directorydata/TORGO/F03/Session3/wav_arrayMic...
Processing directorydata/TORGO/F04/Session1/wav_headMic...
Processing directorydata/TORGO/F04/Session1/wav_arrayMic...
Processing directorydata/TORGO/F04/Session2/wav_headMic...
Processing directorydata/TORGO/F04/Session2/wav_arrayMic...
Processing directorydata/TORGO/M01/Session1/wav_headMic...
Processing directorydata/TORGO/M01/Session1/wav_arrayMic...
Processing directorydata/TORGO/M01/Session2_3/wav_headMic...
Processing directorydata/TORGO/M01/Session2_3/wav_arrayMic...
Processing directorydata/TORGO/M02/Session1/

In [4]:
# Print the number of records in every index column, one count per line.
# All eight counts must be identical, otherwise the columns are misaligned.
for column in (speaker, session, mic, prompt_id,
               has_spect, spect_width, spect_height, prompt):
    print(len(column))

6179
6179
6179
6179
6179
6179
6179
6179


In [21]:
# Write the index to a temporary text file: a header line followed by one
# comma-separated line per recording (values are converted with str(), so a
# missing value is written as 'None').
# NOTE(review): fields are joined with ',' without any quoting, so a prompt
# that contains a comma produces extra columns - confirm that downstream
# readers tolerate this.

headers = ['speaker', 'session', 'mic', 'prompt_id',
           'has_spect', 'spect_width', 'spect_height', 'prompt']
all_data = list(zip(speaker, session, mic, prompt_id,
                    has_spect, spect_width, spect_height, prompt))

with open('index_TORGO_pre.txt', 'w') as f:
    f.writelines(','.join(map(str, row)) + '\n'
                 for row in [headers] + all_data)

In [22]:
# Copy the temporary index to its final location, dropping every line that is
# empty or contains only whitespace.

with open('index_TORGO_pre.txt') as src:
    kept_lines = [line for line in src.readlines() if line.strip()]

with open('index_TORGO.txt', 'w') as dst:
    dst.writelines(kept_lines)

# The temporary export is no longer needed once the cleaned index is written.

os.remove("index_TORGO_pre.txt")