In [None]:
!git clone https://github.com/TensorSpeech/TensorFlowTTS.git
!cd TensorFlowTTS
!pip install -q /content/TensorFlowTTS/

In [None]:
# Python packages: pytesseract (OCR bindings), pdf2image (PDF -> PIL images)
# and a nightly TensorFlow build. None of the versions are pinned.
!pip install -q pytesseract pdf2image tf-nightly
# german_transliterate is installed straight from its GitHub repository
# (presumably required by TensorFlowTTS's text processing -- TODO confirm).
!pip install -q git+https://github.com/repodiac/german_transliterate.git#egg=german_transliterate
# System binaries that the Python wrappers above shell out to:
# poppler-utils for pdf2image, tesseract-ocr for pytesseract.
!apt-get install -y poppler-utils tesseract-ocr

In [None]:
import numpy as np
import soundfile as sf
import yaml
import tensorflow as tf
import sys
sys.path.append('/content/TensorFlowTTS')
from tensorflow_tts.inference import AutoProcessor
from tensorflow_tts.inference import TFAutoModel
import IPython.display as ipd
from PIL import Image
import pytesseract
from pdf2image import convert_from_path
import gc
import logging
from google.colab import files
# Root logger: DEBUG level, each record prefixed with an HH:MM:SS timestamp.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(message)s',
    datefmt='%H:%M:%S',
)
# The version printed here should be >= 2.4.0.
print(tf.__version__)

In [None]:
# Path of the PDF to read (relative to the notebook's working directory).
PDF_file = "input_file.pdf"

# Render every page of the PDF to a PIL image; 500 is pdf2image's dpi argument.
pages = convert_from_path(PDF_file, 500)

# Total number of pages in the PDF.
filelimit = len(pages)

# Text file that receives the OCR output of all pages.
outfile = "out_text.txt"

# Open in write mode, not append: the single handle below already collects
# every page, and "w" keeps the cell idempotent -- with "a", re-running the
# cell would add a second copy of the whole book to the file.
with open(outfile, "w", encoding="utf-8") as f:
    for page_number, page in enumerate(pages, start=1):
        filename = "page_" + str(page_number) + ".jpg"

        # Save the page image to disk, then OCR the saved JPEG.
        page.save(filename, 'JPEG')

        # Recognize the text in the image as a string using pytesseract;
        # the `with` releases the image file handle as soon as OCR is done.
        with Image.open(filename) as page_image:
            page_text = str(pytesseract.image_to_string(page_image))

        # Re-join words that were hyphenated across a line break.
        page_text = page_text.replace('-\n', '')

        f.write(page_text)

# Read the complete OCR text back for the text-to-speech step.
with open(outfile, "r", encoding="utf-8") as f:
    text = f.read()

In [None]:
def _split_into_chunks(text, max_chars):
  """Split text into pieces of at most max_chars characters, breaking at whitespace."""
  import textwrap

  # textwrap.wrap only breaks inside a word when that single word is longer
  # than max_chars, and it returns [] for empty or whitespace-only text.
  return textwrap.wrap(text, width=max_chars)


def text_to_speech(book_name, text, save_every=50, chunk_size=200):
  """Synthesise ``text`` to speech and write it out as numbered WAV files.

  The text is split into chunks of at most ``chunk_size`` characters. Each
  chunk is converted to a mel spectrogram by FastSpeech2, vocoded by
  MB-MelGAN and written to ``./{book_name}_cd_{index}.wav`` (index starting
  at 0) as 16-bit PCM at 22050 Hz.

  Chunks are cut at whitespace rather than at a fixed character offset, so a
  word is never split between two clips (cutting mid-word feeds the model two
  word fragments instead of the word).

  Args:
    book_name: Prefix used for the generated file names.
    text: The full text to synthesise.
    save_every: Currently unused; kept so existing callers keep working.
    chunk_size: Maximum number of characters per synthesised chunk.
      Must be positive (``textwrap.wrap`` raises ``ValueError`` otherwise).

  Returns:
    None. The audio is written to disk; nothing is produced for empty or
    whitespace-only ``text``.
  """
  # Pretrained acoustic model, vocoder and text processor.
  fastspeech2 = TFAutoModel.from_pretrained("tensorspeech/tts-fastspeech2-ljspeech-en")
  mb_melgan = TFAutoModel.from_pretrained("tensorspeech/tts-mb_melgan-ljspeech-en")
  processor = AutoProcessor.from_pretrained("tensorspeech/tts-fastspeech2-ljspeech-en")

  # These inputs are identical for every chunk, so build them once.
  speaker_ids = tf.convert_to_tensor([0], dtype=tf.int32)
  neutral_ratio = tf.convert_to_tensor([1.0], dtype=tf.float32)

  for count, chunk in enumerate(_split_into_chunks(text, chunk_size)):
    input_ids = processor.text_to_sequence(chunk)

    # FastSpeech2 inference; only the second output (mel_after) is used.
    _, mel_after, _, _, _ = fastspeech2.inference(
        input_ids=tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
        speaker_ids=speaker_ids,
        speed_ratios=neutral_ratio,
        f0_ratios=neutral_ratio,
        energy_ratios=neutral_ratio,
    )

    # MB-MelGAN inference; index [0, :, 0] selects the single waveform.
    audio_after = mb_melgan.inference(mel_after)[0, :, 0]

    file_name = f'./{book_name}_cd_{count}.wav'
    print(f'saving audio to {file_name}')
    sf.write(file_name, audio_after, 22050, "PCM_16")

# Book title; text_to_speech uses it as the prefix of every generated WAV file.
book = 'benjamin_franklin_the_way_to_wealth'
text_to_speech(book_name=book, text=text, save_every=1)

In [None]:
import glob
import wave

# Join the per-chunk clips "<book>_cd_<index>.wav" into a single "<book>.wav".
#
# Only this book's numbered clips are collected: a bare '*.wav' would also match
# the combined output file on a re-run, plus any unrelated WAV in the directory.
# The clips are ordered by their numeric index, because glob returns files in
# arbitrary order and a plain string sort would put "_cd_10" before "_cd_2".
clip_prefix = f"{book}_cd_"
clip_suffix = ".wav"
numbered_clips = []
for path in glob.glob(glob.escape(clip_prefix) + "*" + clip_suffix):
    index_text = path[len(clip_prefix):-len(clip_suffix)]
    if index_text.isdigit():
        numbered_clips.append((int(index_text), path))

infiles = [path for _, path in sorted(numbered_clips)]
if not infiles:
    raise FileNotFoundError(
        f"no '{clip_prefix}<index>{clip_suffix}' files found to concatenate"
    )

outfile = book + ".wav"

# Stream the clips one at a time so only a single clip is held in memory.
with wave.open(outfile, 'wb') as output:
    for position, infile in enumerate(infiles):
        with wave.open(infile, 'rb') as w:
            if position == 0:
                # The first clip's parameters define the output format.
                output.setparams(w.getparams())
            output.writeframes(w.readframes(w.getnframes()))

# Trigger a browser download of the combined file (Google Colab helper).
files.download(outfile)