In [102]:
import wave
import IPython
from pydub import AudioSegment
from pydub import effects
import boto3
import srt
from pathlib import Path
import os
import string
import time

# Module-level Amazon Polly client used by genSound for speech synthesis.
# No region or credentials are passed here, so boto3 resolves them from its
# default configuration (environment, shared config files, ...).
polly = boto3.client('polly')

In [104]:
def addSilence(duration):
    """
    Build a silent audio segment.

    duration - length of the silence in ms

    Returns whatever AudioSegment.silent produces for that duration.
    """
    return AudioSegment.silent(duration=duration)

In [105]:
def speedChange(audioClip, speed=1.01):
    """
    Return a faster version of audioClip.

    audioClip - object exposing pydub's speedup() effect (an AudioSegment)
    speed     - playback speed factor; values above 1.0 shorten the clip

    A factor of 1.0 or less is not a speed-up, so the clip is handed back
    untouched instead of being passed to speedup(), which is only meant for
    factors above 1.0.
    """
    if speed <= 1.0:
        return audioClip

    # The second and third positional arguments of speedup() are the chunk
    # size and crossfade (both in ms). Gentle speed-ups use larger chunks and
    # a longer crossfade; stronger ones use smaller chunks and a short one.
    if speed < 1.1:
        return audioClip.speedup(speed, 150, 25)
    return audioClip.speedup(speed, 50, 5)

In [106]:
# Voice names meant to be passed as the VoiceId argument of
# polly.synthesize_speech.
# NOTE(review): the en_/es_ prefixes presumably denote the voice's language
# (English / Spanish) - confirm against the Polly voice list.
en_FEMALE = 'Joanna'
en_MALE = 'Matthew'
es_MALE = 'Miguel'
def genSound(id, outputDir, text, voiceId=None, engine='neural'):
    """
    Synthesize `text` with Amazon Polly and save it as <outputDir>/<id>.mp3.

    id        - file name stem of the clip, as a string (the name shadows the
                builtin id() inside this function; kept for compatibility)
    outputDir - directory the mp3 is written into; it must already exist
    text      - plain text to speak (sent with TextType='text')
    voiceId   - Polly voice name; en_FEMALE is used when omitted
    engine    - Polly engine name, 'neural' by default

    Returns None. Errors from the Polly call or from writing the file are
    not handled here and propagate to the caller.
    """
    if voiceId is None:
        # Resolved at call time so the module-level default can be changed.
        voiceId = en_FEMALE

    res = polly.synthesize_speech(Text=text, TextType='text', Engine=engine,
                                  VoiceId=voiceId, OutputFormat='mp3')

    # Read the whole clip, then close the response stream so it is released
    # even if the read fails.
    stream = res['AudioStream']
    try:
        body = stream.read()
    finally:
        stream.close()

    fileName = outputDir + '/' + id + '.mp3'

    # The with-block closes the file; no explicit close() is needed.
    with open(fileName, 'wb') as file:
        file.write(body)

In [107]:
def _fitClip(audio, startMs, positionMs):
    """
    Decide how a clip is placed on the track so it lines up with its subtitle.

    audio      - clip for the subtitle; len(audio) is its duration in ms
    startMs    - start time of the subtitle in ms
    positionMs - current length of the assembled track in ms

    Returns (padding, audio, speed):
      padding - ms of silence to insert before the clip (never negative)
      audio   - the clip to append, sped up when the track is running late
      speed   - speed factor computed for a late clip, None otherwise
    """
    padding = startMs - positionMs
    if padding >= 0:
        return padding, audio, None

    # The track already extends past this subtitle's start: append the clip
    # without any silence and shorten it to win back part of the overrun.
    overrun = -padding
    clipLen = len(audio)
    if clipLen == 0:
        # Nothing to shorten, and the ratio below would divide by zero.
        return 0, audio, None
    speed = round((overrun + clipLen) / clipLen, 2)
    if speed > 1.0:
        audio = speedChange(audio, speed)
    return 0, audio, speed


def processSubtitles():
    """
    Interactively turn an .srt subtitle file into one narrated .wav file.

    Prompts for the subtitle path, synthesizes every subtitle with a positive
    index through genSound into output/<index>.mp3, and stitches the clips
    together. Silence is inserted so each clip starts at its subtitle's start
    time; a clip whose start has already been passed is sped up instead.
    Finally prompts for an output name and exports <name>.wav.

    Any error is reported and the whole dialogue restarts. The function
    returns None once an export has succeeded. EOFError from input() (stdin
    exhausted) is re-raised, because retrying could never succeed.
    """
    while True:
        try:
            # Get the subtitle filepath (surrounding quotes are stripped)
            message = "Subtitle file: Enter an subtitle filepath of the voice captions to be generated (srt):\n"
            in_subpath = Path(input(message).replace("\"", "").replace("\'", ""))

            # Load subtitle to be processed
            srt_file = in_subpath.read_text()
            print("Processing: ", in_subpath)
            os.makedirs("output", exist_ok=True)

            subs = list(srt.parse(srt_file))
            # Reset per attempt so a failed run leaves no partial track behind.
            audio_out_file = None
            currentTime = 0
            for sub in subs:
                if sub.index <= 0:
                    continue

                print(f'Processing subtitle {sub.index}/{len(subs)}')
                print(f'\tContent: {sub.content}')
                print(f'\tStart: {sub.start}')
                print(f'\tEnd: {sub.end}')

                # Synthesize the clip for this subtitle and load it back
                clipPath = f'output/{sub.index}.mp3'
                print(f'\tCreating: {clipPath}')
                genSound(str(sub.index), 'output', str(sub.content))
                audio = AudioSegment.from_mp3(clipPath)

                # Placement is decided from THIS subtitle's start time and
                # the real track length, never from the previous iteration.
                padding, audio, speed = _fitClip(
                    audio, sub.start.total_seconds() * 1000, currentTime)
                if speed is not None:
                    print(f'\tSpeed: {speed}')

                # Add silence and stitch files
                piece = addSilence(padding) + audio
                if audio_out_file is None:
                    audio_out_file = piece
                else:
                    audio_out_file += piece
                # Use the real track length so rounding cannot accumulate.
                currentTime = len(audio_out_file)

                print(f'\tPadding: {padding}')
                print(f'\tclip len: {len(audio)}')
                print(f'\tcurrentTime: {currentTime}')

            if audio_out_file is None:
                raise ValueError("No subtitles with a positive index were found; nothing to export")

            # Get the output file name
            message = "Output file: Enter a name for the output file  (wav):\n"
            out_name = input(message)
            print(f'\tExporting: {out_name}.wav')
            audio_out_file.export(f'{out_name}.wav', format="wav")
            return

        except EOFError:
            # No more input can arrive; restarting would loop forever.
            raise
        except Exception as e:
            print("Caught exception: %s" % repr(e))
            print("Restarting\n")

In [None]:
# Run the interactive pipeline: it prompts on stdin for the .srt path and,
# once all clips are stitched, for the name of the .wav file to export.
processSubtitles()

Subtitle file: Enter an subtitle filepath of the voice captions to be generated (srt):
ivan.srt
Processing:  ivan.srt
Processing subtitle 1/107
	Content: "Look at me!"
	Start: 0:00:04.782000
	End: 0:00:06.161000
	Creating: output/1.mp3
	Padding: 4782.0
	clip len: 816
	currentTime: 5598.0
Processing subtitle 2/107
	Content: That phrase turned me
into an eye-contact coach.
	Start: 0:00:08.180000
	End: 0:00:12.805000
	Creating: output/2.mp3
	Padding: 2582.0
	clip len: 2664
	currentTime: 10844.0
Processing subtitle 3/107
	Content: I am the mother of Ivan. He is 15 years old.
	Start: 0:00:14.299000
	End: 0:00:17.243000
	Creating: output/3.mp3
	Padding: 3455.0
	clip len: 3432
	currentTime: 17731.0
Processing subtitle 4/107
	Content: Ivan has autism,
	Start: 0:00:18.023000
	End: 0:00:19.967000
	Creating: output/4.mp3
	Padding: 292.0
	clip len: 1464
	currentTime: 19487.0
Processing subtitle 5/107
	Content: he doesn't speak,
	Start: 0:00:19.991000
	End: 0:00:21.536000
	Creating: output/5.mp3
	P

	Padding: 1640.0
	clip len: 3648
	currentTime: 169447.0
Processing subtitle 42/107
	Content: until I turned back.
	Start: 0:02:49.765000
	End: 0:02:51.285000
	Creating: output/42.mp3
	Padding: 318.0
	clip len: 1224
	currentTime: 170989.0
Processing subtitle 43/107
	Content: Only then did he calm down.
	Start: 0:02:52.040000
	End: 0:02:53.662000
	Creating: output/43.mp3
	Padding: 1051.0
	clip len: 1680
	currentTime: 173720.0
Processing subtitle 44/107
	Content: How was it possible
that a two and a half year old
	Start: 0:02:55.032000
	End: 0:02:58.728000
	Creating: output/44.mp3
	Padding: 1312.0
	clip len: 2544
	currentTime: 177576.0
Processing subtitle 45/107
	Content: didn't respond to his own name,
	Start: 0:02:58.752000
	End: 0:03:00.559000
	Creating: output/45.mp3
	Padding: 1176.0
	clip len: 1824
	currentTime: 180576.0
Processing subtitle 46/107
	Content: yet in the middle of the rain and fog,
where I couldn't see anything,
	Start: 0:03:01.355000
	End: 0:03:05.479000
	Creating: out

	Padding: 203.0
	clip len: 3024
	currentTime: 314149.0
Processing subtitle 83/107
	Content: which were his favorite.
	Start: 0:05:14.915000
	End: 0:05:16.661000
	Creating: output/83.mp3
	Padding: 766.0
	clip len: 1296
	currentTime: 316211.0
Processing subtitle 84/107
	Content: He said "yes" right away.
	Start: 0:05:19.347000
	End: 0:05:20.847000
	Creating: output/84.mp3
	Padding: 3136.0
	clip len: 1536
	currentTime: 320883.0
Processing subtitle 85/107
	Content: So that's how it went for a year:
	Start: 0:05:22.319000
	End: 0:05:25.561000
	Creating: output/85.mp3
	Padding: 1436.0
	clip len: 1824
	currentTime: 324143.0
Processing subtitle 86/107
	Content: Ivan would go to Jose's greengrocer,
	Start: 0:05:25.585000
	End: 0:05:27.875000
	Creating: output/86.mp3
	Padding: 1442.0
	clip len: 2280
	currentTime: 327865.0
Processing subtitle 87/107
	Content: help him arrange the shelves
of water bottles
	Start: 0:05:28.816000
	End: 0:05:32.260000
	Creating: output/87.mp3
	Padding: 951.0
	clip le

In [None]:
# Play an exported narration inline.
# NOTE(review): assumes "testOutput" was the name typed at the output-file
# prompt of processSubtitles - confirm, or adjust the path.
IPython.display.Audio("testOutput.wav")

In [58]:
# Scratch check of speedChange on a single clip: load it, speed it up by the
# factor 1.01, export the result as wav and play it inline.
# Requires output/6.mp3 to exist already; processSubtitles writes its clips
# as output/<index>.mp3, so presumably it comes from an earlier run.
# NOTE(review): this cell's execution count is lower than those of the cells
# defining speedChange, so it was run out of order - re-check on a fresh kernel.
sound = AudioSegment.from_file("output/6.mp3")
fast = speedChange(sound, 1.01)
fast.export("testOutputfast.wav", format="wav")
IPython.display.Audio("testOutputfast.wav")