In [0]:
!pip install pydub
%tensorflow_version 1.x
import tensorflow as tf
import urllib
import codecs
import fnmatch
import os
import sys
import tarfile
import tempfile
import unicodedata
from os.path import splitext
from pydub import AudioSegment


Collecting pydub
  Downloading https://files.pythonhosted.org/packages/7b/d1/fbfa79371a8cd9bb15c2e3c480d7e6e340ed5cc55005174e16f48418333a/pydub-0.24.1-py2.py3-none-any.whl
Installing collected packages: pydub
Successfully installed pydub-0.24.1
TensorFlow 1.x selected.


In [0]:
LIBRI_SPEECH_URLS = {
    "train-clean-100":
        "http://www.openslr.org/resources/12/train-clean-100.tar.gz"
}

def download_and_extract(directory, url):
  """
    directory: the directory where to extract the tarball.
    url: the url to download the data file.
  """

  if not tf.gfile.Exists(directory):
    tf.gfile.MakeDirs(directory)

  _, tar_filepath = tempfile.mkstemp(suffix=".tar.gz")

  try:
    tf.logging.info("Downloading %s to %s" % (url, tar_filepath))

    def _progress(count, block_size, total_size):
      sys.stdout.write("\r>> Downloading {} {:.1f}%".format(
          tar_filepath, 100.0 * count * block_size / total_size))
      sys.stdout.flush()

    urllib.request.urlretrieve(url, tar_filepath, _progress)
    print()
    statinfo = os.stat(tar_filepath)
    tf.logging.info(
        "Successfully downloaded %s, size(bytes): %d" % (url, statinfo.st_size))
    with tarfile.open(tar_filepath, "r") as tar:
      tar.extractall(directory)
  finally:
    tf.gfile.Remove(tar_filepath)
def download_and_process_datasets(directory, datasets):
  """
  Args:
    directory: the directory to put all the downloaded and preprocessed data.
    datasets: list of dataset names that will be downloaded and processed.
  """

  tf.logging.info("Preparing LibriSpeech dataset: {}".format(
      ",".join(datasets)))
  for dataset in datasets:
    tf.logging.info("Preparing dataset %s", dataset)
    dataset_dir = os.path.join(directory, dataset)
    download_and_extract(dataset_dir, LIBRI_SPEECH_URLS[dataset])
    convert_audio_and_split_transcript(
        dataset_dir + "/LibriSpeech", dataset, dataset + "-wav",
        dataset_dir + "/LibriSpeech", 
        dataset + ".csv")
    
def convert_audio_and_split_transcript(input_dir, source_name, target_name,
                                       output_dir, output_file):
  """
  Args:
    input_dir: the directory which holds the input dataset.
    source_name: the name of the specified dataset. e.g. test-clean
    target_name: the directory name for the newly generated audio files.
                 e.g. test-clean-wav
    output_dir: the directory to place the newly generated csv files.
    output_file: the name of the newly generated csv file. e.g. test-clean.csv
  """

  tf.logging.info("Preprocessing audio and transcript for %s" % source_name)
  source_dir = os.path.join(input_dir, source_name)
  target_dir = os.path.join(input_dir, target_name)

  if not tf.gfile.Exists(target_dir):
    tf.gfile.MakeDirs(target_dir)

  # Convert all FLAC file into WAV format. At the same time, generate the csv
  # file.
  for root, _, filenames in tf.gfile.Walk(source_dir):
    for filename in fnmatch.filter(filenames, "*.trans.txt"):
      trans_file = os.path.join(root, filename)
      with codecs.open(trans_file, "r", "utf-8") as fin:
        for line in fin:
          seqid, transcript = line.split(" ", 1)
          # We do a encode-decode transformation here because the output type
          # of encode is a bytes object, we need convert it to string.
          transcript = unicodedata.normalize("NFKD", transcript).encode(
              "ascii", "ignore").decode("ascii", "ignore").strip().lower()

          # Convert FLAC to WAV.

          def flac2wav(flac_file):
            wav_path = os.path.join(target_dir, seqid + ".wav")
            song = AudioSegment.from_file(flac_file,format="flac")
            song.export(wav_path, format = "wav")
          flac_file = os.path.join(root, seqid + ".flac")
          #print(flac_file)
          #print(splitext(flac_file)[0])
          wav_file = os.path.join(target_dir, seqid + ".wav")
          print(wav_file)
          if not tf.gfile.Exists(wav_file):
            flac2wav(flac_file)   

In [0]:
tf.gfile.MakeDirs("/tmp/librispeech_data")
download_and_process_datasets("/tmp/librispeech_data",["train-clean-100"])

In [0]:
directory="/tmp/librispeech_data"
dataset="train-clean-100"
dataset_dir = os.path.join(directory, dataset)
convert_audio_and_split_transcript(
        dataset_dir + "/LibriSpeech", dataset, dataset + "-wav",
        dataset_dir + "/LibriSpeech", 
        dataset + ".csv")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
/tmp/librispeech_data/train-clean-100/LibriSpeech/train-clean-100-wav/2911-15084-0004.wav
/tmp/librispeech_data/train-clean-100/LibriSpeech/train-clean-100-wav/2911-15084-0005.wav
/tmp/librispeech_data/train-clean-100/LibriSpeech/train-clean-100-wav/2911-15084-0006.wav
/tmp/librispeech_data/train-clean-100/LibriSpeech/train-clean-100-wav/2911-15084-0007.wav
/tmp/librispeech_data/train-clean-100/LibriSpeech/train-clean-100-wav/2911-15084-0008.wav
/tmp/librispeech_data/train-clean-100/LibriSpeech/train-clean-100-wav/2911-15084-0009.wav
/tmp/librispeech_data/train-clean-100/LibriSpeech/train-clean-100-wav/2911-15084-0010.wav
/tmp/librispeech_data/train-clean-100/LibriSpeech/train-clean-100-wav/2911-15084-0011.wav
/tmp/librispeech_data/train-clean-100/LibriSpeech/train-clean-100-wav/2911-15084-0012.wav
/tmp/librispeech_data/train-clean-100/LibriSpeech/train-clean-100-wav/2911-15084-0013.wav
/tmp/librispeech_data/train-clean-1

In [0]:
labels={}
partitions={}
train_list=[]
data=[]
input_dir=dataset_dir + "/LibriSpeech" 
source_name=dataset
source_dir = os.path.join(input_dir, source_name)
for root, _, filenames in tf.gfile.Walk(source_dir):
    for filename in fnmatch.filter(filenames, "*.trans.txt"):
      trans_file = os.path.join(root, filename)
      with codecs.open(trans_file, "r", "utf-8") as fin:
        for line in fin:
          seqid, transcript = line.split(" ", 1)
          # We do a encode-decode transformation here because the output type
          # of encode is a bytes object, we need convert it to string.
          transcript = unicodedata.normalize("NFKD", transcript).encode(
              "ascii", "ignore").decode("ascii", "ignore").strip().lower()
          labels[seqid+".wav"]=transcript
          sample=(seqid+".wav",transcript)
          data.append(sample)
          train_list.append(seqid+".wav")
partitions["train"]=train_list          
          
def process(sample):
  source,transcript=sample
  line_to_index=[]
  for character in list(transcript):
  if character==' ':
    character='<SPACE>'
    line_to_index.append(char_map[character])  
  processed=(spectrogram(source,aug),line_to_index)   
  return processed      

In [0]:
char_map_str = """
<SPACE> 0
a 1
b 2
c 3
d 4
e 5
f 6
g 7
h 8
i 9
j 10
k 11
l 12
m 13
n 14
o 15
p 16
q 17
r 18
s 19
t 20
u 21
v 22
w 23
x 24
y 25
z 26
' 27
"""

char_map = {}
index_map = {}

for line in char_map_str.strip().split('\n'):
    ch, index = line.split()
    char_map[ch] = int(index)
    index_map[int(index)] = ch
index_map[0] = ' '

In [0]:
import numpy as np
import keras

class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, list_IDs, labels, batch_size=128, dim=(),shuffle=True):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def on_epoch_end(self):
  'Updates indexes after each epoch'
      self.indexes = np.arange(len(self.list_IDs))
      if self.shuffle == True:
        np.random.shuffle(self.indexes)
    
    def __data_generation(self, list_IDs_temp):
  'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
  # Initialization
      X = np.empty((self.batch_size, *self.dim, self.n_channels))
      Y = np.empty((self.batch_size), dtype=int)

  # Generate data
      for i, ID in enumerate(list_IDs_temp):
        x,y=process(self.data[i])
        X[i,]=x
        Y[i,]=y

      return X,Y 
    
    def __len__(self):
        'Denotes the number of batches per epoch'
      return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Generate data
        X, y = self.__data_generation(list_IDs_temp)

        return X, y   

"that had its source away back in the woods of the old cuthbert place it was reputed to be an intricate headlong brook in its earlier course through those woods with dark secrets of pool and cascade but by the time it reached lynde's hollow it was a quiet well conducted little stream"

In [0]:
def spectrogram(samples, sample_rate, stride_ms = 10.0, 
                          window_ms = 20.0, max_freq = None, eps = 1e-14):

    stride_size = int(0.001 * sample_rate * stride_ms)
    window_size = int(0.001 * sample_rate * window_ms)

    input_len=480000
    if len(audio)>input_length:
      samples = samples[:input_length]
    else:
      samples = np.pad(samples, (0, max(0, input_length - len(samples))), "constant")
    truncate_size = (len(samples) - window_size) % stride_size
    samples = samples[:len(samples) - truncate_size]
    nshape = (window_size, (len(samples) - window_size) // stride_size + 1)
    nstrides = (samples.strides[0], samples.strides[0] * stride_size)
    windows = np.lib.stride_tricks.as_strided(samples, 
                                          shape = nshape, strides = nstrides)
    
    assert np.all(windows[:, 1] == samples[stride_size:(stride_size + window_size)])

    # Window weighting, squared Fast Fourier Transform (fft), scaling
    weighting = np.hanning(window_size)[:, None]
    
    fft = np.fft.rfft(windows * weighting, axis=0)
    fft = np.absolute(fft)
    fft = fft**2
    
    scale = np.sum(weighting**2) * sample_rate
    fft[1:-1, :] *= (2.0 / scale)
    fft[(0, -1), :] /= scale
    
    # Prepare fft frequency list
    freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])
    
    # Compute spectrogram feature
    ind = np.where(freqs <= max_freq)[0][-1] + 1
    specgram = np.log(fft[:ind, :] + eps)
    mean = np.mean(specgram, axis=0)
    std = np.std(specgram, axis=0)
    specgram = (specgram - mean) / std
    return specgram
