# MP3 to text with the DeepSpeech model

---

![speech to text](https://uploads-ssl.webflow.com/5985ca0c9abf440001d1f4b0/5a68a52180efb200017181cf_transcription_icon_v2_EN.png =259x)

In [7]:
from __future__ import absolute_import, division, print_function

import os
import numpy as np
import shlex
import subprocess
import sys
import wave

!pip install  deepspeech-gpu 
from deepspeech import Model, printVersions
from timeit import default_timer as timer

# audio converters
!apt update && apt-get install ffmpeg mpg123

# sox package for adjusting sample rate.
!apt-get install libsox-fmt-all libsox-dev sox
if not os.path.exists('downloads'):
    !mkdir downloads
    
if not os.path.exists('models'):
    # neural network model for acoustic recognition
    !wget -O - https://github.com/mozilla/DeepSpeech/releases/download/v0.5.1/deepspeech-0.5.1-models.tar.gz | tar xvfz -
    !mv deepspeech-0.5.1-models models

Reading package lists... Done
E: Could not open lock file /var/lib/apt/lists/lock - open (13: Permission denied)
E: Unable to lock directory /var/lib/apt/lists/
E: Could not open lock file /var/lib/dpkg/lock-frontend - open (13: Permission denied)
E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?


# Acoustic parameters

In [2]:
# Paths to the pre-trained artifacts inside the 'models' directory.
# `model` and `alphabet` are passed to Model(...); `alphabet`, `lm` and `trie`
# are passed to enableDecoderWithLM(...) in the inference cell.
model    = 'models/output_graph.pbmm'
alphabet = 'models/alphabet.txt'
lm       = 'models/lm.binary'
trie     = 'models/trie'

# These constants control the beam search decoder.

# Beam width used in the CTC decoder when building candidate transcriptions.
BEAM_WIDTH = 500

# The alpha hyperparameter of the CTC decoder: the language model weight.
LM_WEIGHT = 1.50

# Valid word insertion weight. This is used to lessen the word insertion penalty
# when the inserted word is part of the vocabulary.
VALID_WORD_COUNT_WEIGHT = 2.10


# These constants are tied to the shape of the graph used (changing them changes
# the geometry of the first layer), so make sure you use the same constants that
# were used during training.

# Number of MFCC features to use.
N_FEATURES = 26

# Size of the context window used for producing timesteps in the input vector.
N_CONTEXT = 9

# Adapt Sample Rate of Audio File

In [3]:
def convert_samplerate(audio_path, desired_sample_rate=16000):
    """Resample an audio file to raw 16-bit mono PCM with the SoX CLI.

    Parameters
    ----------
    audio_path : str
        Path of the audio file to resample. It is handed to SoX as a single
        argument, so spaces or quotes in the path are safe.
    desired_sample_rate : int, optional
        Target sample rate in Hz (default 16000).

    Returns
    -------
    tuple
        ``(desired_sample_rate, samples)`` where ``samples`` is a NumPy
        ``int16`` array built from the bytes SoX wrote to stdout.

    Raises
    ------
    RuntimeError
        If SoX exits with a non-zero status; the message carries its stderr.
    OSError
        If the ``sox`` executable cannot be started (e.g. not installed).
    """
    # Build the argument vector directly instead of formatting a string and
    # splitting it again: the path never gets re-tokenised by shlex.
    sox_cmd = [
        'sox', audio_path,
        '--type', 'raw',
        '--bits', '16',
        '--channels', '1',
        '--rate', str(desired_sample_rate),
        '--encoding', 'signed-integer',
        '--endian', 'little',
        '--compression', '0.0',
        '--no-dither',
        '-',  # write the converted audio to stdout
    ]
    try:
        output = subprocess.check_output(sox_cmd, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        # stderr is captured as bytes; decode it so the message is readable.
        stderr_text = e.stderr.decode('utf-8', errors='replace') if e.stderr else ''
        raise RuntimeError('SoX returned non-zero status: {}'.format(stderr_text)) from e
    except OSError as e:
        raise OSError(e.errno, 'SoX not found, use 16kHz files or install it: {}'.format(e.strerror)) from e

    return desired_sample_rate, np.frombuffer(output, np.int16)



# Input MP3 Audio File

In [4]:
# download mp3 audio file.

import urllib.request
import os.path

def download_file(filename, url):
    """
    Download a URL to a file.

    The data is first written to ``<filename>.part`` and only moved to
    ``filename`` once the transfer has finished, so an interrupted download
    never leaves a truncated file under the final name.

    Parameters
    ----------
    filename : str
        Destination path of the downloaded file.
    url : str
        URL to fetch.

    Raises
    ------
    Whatever ``urllib.request.urlretrieve`` raises (e.g. ``URLError``,
    ``ContentTooShortError``); the partial file is removed before re-raising.
    """
    partial_path = '{}.part'.format(filename)
    try:
        urllib.request.urlretrieve(url, partial_path)
        # Atomic rename within the same directory.
        os.replace(partial_path, filename)
    finally:
        # After a successful replace the partial file is gone; this only
        # cleans up when the download or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)

            
def download_if_not_exists(filename, url):
    """
    Fetch ``url`` into ``filename`` unless that path is already present.

    Returns
    -------
    False when ``filename`` already existed (nothing is downloaded),
    True when the file was missing and has just been downloaded.
    """
    # Guard clause: an existing path means there is nothing to do.
    if os.path.exists(filename):
        return False

    download_file(filename, url)
    return True
mp3_file='downloads/speech.mp3'

print('Beginning file :  ',download_if_not_exists(mp3_file,'http://www.obamadownloads.com/mp3s/dnc-2004-speech.mp3'))


wav_file = 'downloads/speech.wav'

if os.path.exists(wav_file):
    print(wav_file + 'exists. Skip conveting.')
else:  
    # convert to wav file.  
    !ffmpeg -i downloads/speech.mp3 -vn -acodec pcm_s16le -ac 1 -ar 16000 -f wav downloads/speech.wav
    #!mpg123 -w speech.wav speech.mp3

Beginning file :   False
downloads/speech.wavexists. Skip conveting.


# Convert MP3 to Text

In [5]:
# Load the acoustic model and report how long loading took.
print('Loading model from file {}'.format(model), file=sys.stderr)
model_load_start = timer()
ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
model_load_end = timer() - model_load_start
print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

# The language model is optional: it is only enabled when both paths are set.
if lm and trie:
    print('Loading language model from files {} {}'.format(lm, trie), file=sys.stderr)
    lm_load_start = timer()
    ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                           VALID_WORD_COUNT_WEIGHT)
    lm_load_end = timer() - lm_load_start
    print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

# Read the audio; the with-block closes the file even if resampling or
# reading raises.
with wave.open(wav_file, 'rb') as fin:
    original_rate = fin.getframerate()
    n_frames = fin.getnframes()
    fs = original_rate
    if fs != 16000:
        print('Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'.format(fs), file=sys.stderr)
        # convert_samplerate returns the new rate together with the samples.
        fs, audio = convert_samplerate(wav_file)
    else:
        audio = np.frombuffer(fin.readframes(n_frames), np.int16)

# Duration in seconds, computed from the file's own frame rate so it stays
# correct when the file was not recorded at 16 kHz.
audio_length = n_frames / original_rate

print('Running inference.', file=sys.stderr)
print('================================\n')
inference_start = timer()
print(ds.stt(audio, fs))
inference_end = timer() - inference_start
print('\n================================')
print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)

Loading model from file models/output_graph.pbmm
Loaded model in 0.0162s.
Loading language model from files models/lm.binary models/trie
Loaded language model in 0.219s.
Running inference.



i'll be here of the great fertilisation in lettres my deepest gratitude for the privilege of addressing the convention the night of the prosecutor for may because lettaires onepage is pretty unlike my father was a portent on and raised in a small village in kindergarten go went to school and oinometer my grandfather was a cook and the mexican to the british but my grandfather had large dreams for his son through hard work and perseverance my father got his scholarship to study in a magnolia that shone as a beacon a freedom and opportunity to so many who comforted me my man he was born in a town on the other side of the world and can her father were on irenaeus the day answer rabbinate lined up for denjontadenakarondako that home my grandmother raised the baby and went to work on a bonesetter the war had studied on the sibillatory and later moved went all the way to the wide and search of opportunity and made to had big granfarther douncome dream born at tokonoma shared not only an imp

Inference took 503.164s for 777.720s audio file.
