In [1]:
# File-name format of the recordings: "p<id>_<recording id>.wav".
# The regex captures both numbers; the first one is used as the category id.

FILE_PATTERN = r'p([0-9]+)_([0-9]+)\.wav'

In [2]:
import re

def get_category_cardinality(files):
    '''Find the smallest and largest category id among the given file names.

    Each name is searched for FILE_PATTERN. The first number captured by the
    first match is the category id; the second (the recording id) is ignored.
    Names that do not match the pattern are skipped.

    Args:
        files: iterable of file names or paths.

    Returns:
        Tuple (min_id, max_id) of ints, or (None, None) when no name
        matches the pattern (including when files is empty).
    '''
    id_reg_expression = re.compile(FILE_PATTERN)
    category_ids = []
    for filename in files:
        matches = id_reg_expression.findall(filename)
        if not matches:
            # No id in this name: nothing to contribute. Skipping avoids the
            # IndexError that indexing an empty match list would raise.
            continue
        # Each match is a (category id, recording id) pair of digit strings.
        category_ids.append(int(matches[0][0]))
    if not category_ids:
        return None, None
    return min(category_ids), max(category_ids)

In [3]:
import random

def randomize_files(files):
    '''Lazily yield len(files) random picks from files.

    Picks are drawn independently (with replacement), so one file may appear
    several times and another not at all. Being a generator, the result can
    be iterated only once.
    '''
    for _ in files:
        # random.randint returns an integer index in [0, len(files) - 1].
        pick = random.randint(0, len(files) - 1)
        yield files[pick]

In [4]:
import os
import fnmatch

# Find all audio files and collect their paths in `files`; fnmatch.filter() returns the subset of filenames that match the pattern.
def find_files(directory, pattern='*.wav'):
    ''' Walk directory recursively and return the paths of all files whose
        name matches the shell-style pattern. '''
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in fnmatch.filter(names, pattern)
    ]

In [5]:
import librosa

def load_generic_audio(directory, sample_rate):
    '''Generator that yields audio waveforms from the directory.

    Yields:
        (audio, filename, category_id) tuples, where audio is the loaded
        waveform reshaped to a single column, filename is the path it was
        loaded from, and category_id is the int id parsed from the file name
        or None when the name does not match FILE_PATTERN.
    '''
    audio_paths = find_files(directory)        # every audio file under directory
    id_matcher = re.compile(FILE_PATTERN)      # extracts ids from file names
    print("files length: {}".format(len(audio_paths)))
    for path in randomize_files(audio_paths):
        found = id_matcher.findall(path)
        # First group of the first match is the category id; no match, no id.
        category_id = int(found[0][0]) if found else None
        waveform, _ = librosa.load(path, sr=sample_rate, mono=True)
        # Turn the 1-D waveform into a column so each row is one sample.
        yield waveform.reshape(-1, 1), path, category_id

In [6]:
import numpy as np

def trim_silence(audio, threshold):
    '''Removes silence at the beginning and end of a sample.

    Args:
        audio: waveform array (presumably 1-D; the caller in this file
            passes a single column -- TODO confirm for other callers).
        threshold: frames whose RMS energy is not above this value count
            as silence.

    Returns:
        The slice of audio running from the first sample of the first
        non-silent frame up to the first sample of the last non-silent
        frame, or an empty slice if no frame exceeds the threshold.
    '''
    # Newer librosa releases expose the RMS feature as `rms` (keyword
    # arguments only); older ones call it `rmse`. Use whichever exists.
    compute_rms = getattr(librosa.feature, 'rms', None) or librosa.feature.rmse
    energy = compute_rms(y=audio)
    # np.nonzero returns one index array per axis of `energy`; entry [1]
    # (selected below) holds the frame numbers above the threshold.
    frames = np.nonzero(energy > threshold)
    # Convert frame numbers to positions in the sample array.
    indices = librosa.core.frames_to_samples(frames)[1]

    # Note: indices can be an empty array, if the whole audio was silence.
    return audio[indices[0]:indices[-1]] if indices.size else audio[0:0]

In [7]:
# Check whether any audio file name is missing an id.

def not_all_have_id(files):
    ''' Return true iff at least one of the filenames fails to match the
        pattern used to extract the category id.'''
    id_matcher = re.compile(FILE_PATTERN)
    # A name "has an id" when the pattern is found anywhere in it.
    return any(not id_matcher.findall(name) for name in files)

In [8]:
import tensorflow as tf
import threading

class AudioReader(object):
    '''Generic background audio reader that preprocesses audio files
    and enqueues them into a TensorFlow queue.

    Args:
        audio_dir: directory searched recursively for '*.wav' files.
        coord: coordinator object; reader threads poll its should_stop()
            to know when to exit.
        sample_rate: sampling rate handed to load_generic_audio.
        gc_enabled: whether global conditioning is on. When true, every
            file name must carry a category id, and the id of each enqueued
            sample is pushed to a second queue.
        receptive_field: number of zero samples prepended to every clip;
            also the overlap between consecutive pieces when sample_size
            is set.
        sample_size: if truthy, clips are cut into pieces of at most
            receptive_field + sample_size samples, advancing sample_size
            samples per piece; otherwise each clip is enqueued whole.
        silence_threshold: if not None, passed to trim_silence to strip
            silence from both ends of each clip.
        queue_size: capacity of the audio queue (and of the id queue).

    Raises:
        ValueError: if audio_dir contains no audio files, or if global
            conditioning is enabled and some file name has no id.
    '''

    def __init__(self,
                 audio_dir,
                 coord,
                 sample_rate,
                 gc_enabled,
                 receptive_field,
                 sample_size=None,
                 silence_threshold=None,
                 queue_size=32):

        self.audio_dir = audio_dir
        self.sample_rate = sample_rate
        self.coord = coord
        self.sample_size = sample_size
        self.receptive_field = receptive_field
        self.silence_threshold = silence_threshold
        self.gc_enabled = gc_enabled   # "gc" stands for global conditioning
        self.threads = []
        # NOTE(review): tf.placeholder / tf.PaddingFIFOQueue look like
        # TensorFlow 1.x graph-mode APIs -- confirm the installed version
        # provides them.
        # Audio samples are fed through this placeholder into a padding FIFO
        # queue whose elements are (length, 1) float32 arrays.
        self.sample_placeholder = tf.placeholder(dtype=tf.float32, shape=None)
        self.queue = tf.PaddingFIFOQueue(queue_size,['float32'],shapes=[(None, 1)])
        self.enqueue = self.queue.enqueue([self.sample_placeholder])

        if self.gc_enabled:
            # Scalar int32 placeholder fed (via feed_dict) with the category
            # id of the sample being enqueued.
            self.id_placeholder = tf.placeholder(dtype=tf.int32, shape=())
            # Second queue, same capacity, holding one id per audio sample.
            self.gc_queue = tf.PaddingFIFOQueue(queue_size, ['int32'],shapes=[()])
            self.gc_enqueue = self.gc_queue.enqueue([self.id_placeholder])

        # Locate the audio files and, for global conditioning, derive the
        # number of categories from the ids in their names.
        # TODO Find a better way to check this.
        # Checking inside the AudioReader's thread makes it hard to terminate
        # the execution of the script, so we do it in the constructor for now.
        files = find_files(audio_dir)
        if not files:
            raise ValueError("No audio files found in '{}'.".format(audio_dir))
        if self.gc_enabled and not_all_have_id(files):
            raise ValueError("Global conditioning is enabled, but file names "
                             "do not conform to pattern having id.")
        # Determine the number of mutually-exclusive categories we will
        # accommodate in our embedding table.
        if self.gc_enabled:
            _, self.gc_category_cardinality = get_category_cardinality(files)
            # Add one to the largest index to get the number of categories,
            # since tf.nn.embedding_lookup expects zero-indexing. This
            # means one or more at the bottom correspond to unused entries
            # in the embedding lookup table. But that's a small waste of memory
            # to keep the code simpler, and preserves correspondence between
            # the id one specifies when generating, and the ids in the
            # file names.
            self.gc_category_cardinality += 1
            print("Detected --gc_cardinality={}".format(self.gc_category_cardinality))
        else:
            self.gc_category_cardinality = None

    def dequeue(self, num_elements):
        '''Return the op that dequeues num_elements audio samples.'''
        output = self.queue.dequeue_many(num_elements)
        return output

    def dequeue_gc(self, num_elements):
        '''Return the op that dequeues num_elements category ids.

        The id queue only exists when gc_enabled was true at construction.
        '''
        return self.gc_queue.dequeue_many(num_elements)

    def _enqueue_sample(self, sess, sample, category_id):
        '''Enqueue one audio sample and, with global conditioning, its id.'''
        sess.run(self.enqueue,feed_dict={self.sample_placeholder: sample})
        if self.gc_enabled:
            sess.run(self.gc_enqueue, feed_dict={self.id_placeholder: category_id})

    def thread_main(self, sess):
        '''Reader loop: load, preprocess and enqueue clips until the
        coordinator asks to stop.

        Args:
            sess: session used to run the enqueue ops.
        '''
        stop = False
        # Go through the dataset multiple times
        while not stop:
            iterator = load_generic_audio(self.audio_dir, self.sample_rate)
            for audio, filename, category_id in iterator:
                # The coordinator lets several threads terminate together;
                # should_stop() turns true once a stop has been requested.
                if self.coord.should_stop():
                    stop = True
                    break
                if self.silence_threshold is not None:
                    # Remove silence
                    audio = trim_silence(audio[:, 0], self.silence_threshold)
                    audio = audio.reshape(-1, 1)
                    # An all-silence clip is empty at this point (a warning
                    # about it is currently disabled). After padding it is
                    # exactly receptive_field long, so it produces no pieces
                    # when sample_size is set.
                # Prepend receptive_field zeros along the time axis.
                audio = np.pad(audio, [[self.receptive_field, 0], [0, 0]],'constant')

                if self.sample_size:
                    # Cut samples into pieces of size receptive_field +
                    # sample_size with receptive_field overlap
                    while len(audio) > self.receptive_field:
                        # Poll per piece so a stop request does not have to
                        # wait for the rest of a long clip to be enqueued.
                        if self.coord.should_stop():
                            stop = True
                            break
                        piece = audio[:(self.receptive_field + self.sample_size), :]
                        self._enqueue_sample(sess, piece, category_id)
                        audio = audio[self.sample_size:, :]
                    if stop:
                        break
                else:
                    self._enqueue_sample(sess, audio, category_id)

    def start_threads(self, sess, n_threads=1):
        '''Start n_threads daemon threads running thread_main.

        Returns:
            The list of all threads started by this reader so far.
        '''
        for _ in range(n_threads):
            thread = threading.Thread(target=self.thread_main, args=(sess,))
            thread.daemon = True  # Thread will close when parent quits.
            thread.start()
            self.threads.append(thread)
        return self.threads
