# Audiobook Generator - Proof of Concept

This notebook is intended to be a proof of concept for the end-to-end work of generating an audiobook file from an ebook. This includes converting the .epub book files into raw Python text strings, splitting them into items and sentences, then tokenizing and batching them to run through the NVIDIA implementation of Tacotron2.


## Outline of steps

1. Import .epub file
2. Divide ebook into chapters
3. Remove html tags
4. Tokenize text for use in the model

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import numpy as np
from torch.utils.data import DataLoader
from pydub import AudioSegment
from IPython.display import Audio
from tqdm.notebook import tqdm

# One shared seed for every random number generator the notebook touches:
# NumPy's global generator, PyTorch's CPU generator and PyTorch's CUDA generator.
seed = 1337
for _seed_rng in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(seed)

In [None]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Notebook settings. Project Gutenberg file names for the books tried so far:
# pg2554.epub = Crime and Punishment
# pg174.epub = The Picture of Dorian Gray
# pg1342.epub = Pride and Prejudice
ebook_path = 'pg2554.epub'  # EPUB file to convert; passed to read_ebook
rate = 22050  # presumably the audio sample rate in Hz; not used in the visible cells - TODO confirm
batch_size = 100  # NOTE(review): the DataLoader cell hard-codes batch_size=6 instead of reading this value
max_char_len = 150  # max characters per text chunk: wrap width in read_ebook and slice limit before tokenizing

In [None]:
# !pip install nvidia-pyindex
# !pip install pytorch-quantization

In [None]:
# Load the pretrained Tacotron2 model from NVIDIA's Torch Hub repository.
# Its infer() output is what gets passed to WaveGlow in the inference cell.
tacotron2 = torch.hub.load(
    'NVIDIA/DeepLearningExamples:torchhub',
    'nvidia_tacotron2',
    model_math='fp16',
    map_location=torch.device(device)
)
tacotron2 = tacotron2.to(device)
# This notebook only runs inference, so put the module in evaluation mode.
# Without this call it stays in training mode, where training-only layer
# behaviour (e.g. dropout) remains active. WaveGlow below already gets the
# same treatment.
tacotron2 = tacotron2.eval()

# Text pre-processing helpers; prepare_input_sequence is used for tokenizing.
utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tts_utils')

# Load the pretrained WaveGlow model that consumes Tacotron2's output.
waveglow = torch.hub.load(
    'NVIDIA/DeepLearningExamples:torchhub',
    'nvidia_waveglow',
    model_math='fp16',
    map_location=torch.device(device)
)
# Strip weight normalisation before inference, then move to the target
# device and switch to evaluation mode.
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow = waveglow.to(device)
waveglow = waveglow.eval()

In [None]:
def read_ebook(ebook_path, width=None):
    """Read an EPUB file and split its text into short chunks.

    Every document item of the book is stripped of HTML, split into
    paragraphs on newlines and sentence-tokenized with NLTK. Each sentence
    is then wrapped so that no resulting chunk exceeds the width limit.

    Args:
        ebook_path: Path of the .epub file to read.
        width: Maximum number of characters per chunk. When omitted, the
            notebook-level ``max_char_len`` setting is used, which is what
            the single-argument call has always done.

    Returns:
        A list with one entry per document item, in the order the book
        yields them. Each entry is a flat list of chunk strings; an item
        that produces no text contributes an empty list.
    """
    import ebooklib
    from ebooklib import epub
    from bs4 import BeautifulSoup
    from tqdm.notebook import tqdm
    from nltk import tokenize, download
    from textwrap import TextWrapper

    if width is None:
        # Backward-compatible default: fall back to the notebook setting.
        width = max_char_len

    # Fetch the 'punkt' NLTK resource before sentence tokenizing.
    download('punkt')
    wrapper = TextWrapper(width, fix_sentence_endings=True)

    book = epub.read_epub(ebook_path)

    corpus = []
    for item in tqdm(list(book.get_items())):
        if item.get_type() != ebooklib.ITEM_DOCUMENT:
            continue

        input_text = BeautifulSoup(item.get_content(), "html.parser").text
        chunks = []
        for paragraph in input_text.split('\n'):
            # Replace em dashes with plain hyphens before tokenizing.
            paragraph = paragraph.replace('—', '-')
            for sentence in tokenize.sent_tokenize(paragraph):
                # wrap() splits an over-long sentence into several chunks of
                # at most `width` characters; no text is discarded.
                chunks.extend(wrapper.wrap(sentence))
        corpus.append(chunks)

    return corpus

In [None]:
# Parse the EPUB chosen in the settings cell; the result holds one list of text chunks per document item.
ebook = read_ebook(ebook_path)

In [None]:
# Number of document items read_ebook returned for this book.
len(ebook)

In [None]:
# Inspect the text chunks of one document item. Index 6 is hard-coded; the same item is used as the synthesis input further down.
ebook[6]

In [None]:
# Distribution of chunk lengths (in characters) across every document item.
# Explicit axes plus a title and labels let the figure stand on its own;
# the trailing semicolon suppresses the return-value repr.
fig, ax = plt.subplots()
ax.hist([len(sentence) for chapter in ebook for sentence in chapter])
ax.set(
    title='Text chunk lengths',
    xlabel='Characters per chunk',
    ylabel='Number of chunks',
);

In [None]:
# Use one document item (hard-coded index 6) as the synthesis input. The slice
# is a safety net that caps every chunk at max_char_len characters.
text = [sentence[:max_char_len] for sentence in ebook[6]]

# NVIDIA's helper moves its outputs to CUDA unless cpu_run=True is passed, so
# ask for CPU tensors when no GPU was detected. On a GPU the call is unchanged.
prep_options = {'cpu_run': True} if device == 'cpu' else {}
# NOTE(review): the helper appears to sort sequences by decreasing length, so
# tokens[i] may not correspond to text[i] - confirm before stitching audio
# back together in book order.
tokens, lengths = utils.prepare_input_sequence(text, **prep_options)

In [None]:
# Per-sequence lengths returned by prepare_input_sequence.
lengths

In [None]:
# Number of text chunks handed to the tokenizer.
len(text)

In [None]:
# Shape of the token batch returned by prepare_input_sequence.
tokens.shape

In [None]:
# Pair every token sequence with its length so the DataLoader yields both together.
data = list(zip(tokens, lengths))

In [None]:
# Batch the (tokens, length) pairs for inference.
# NOTE(review): the batch size is hard-coded to 6 here and ignores the `batch_size` setting defined earlier - confirm which one is intended.
dataloader = DataLoader(data, batch_size=6)

In [None]:
# Number of batches the loader will yield.
len(dataloader)

In [None]:
# Synthesize batch by batch: Tacotron2's first output is fed straight into
# WaveGlow, and every WaveGlow result is collected in audio_list.
# NOTE(review): results are stored exactly as waveglow.infer returns them; if
# they live on the GPU, memory use grows with each batch - consider moving
# them to the CPU before appending.
audio_list = []
with torch.no_grad():  # inference only, so skip autograd bookkeeping
    for X, length in tqdm(dataloader):
        Y, _, _ = tacotron2.infer(X, length)
        audio = waveglow.infer(Y)
        audio_list.append(audio)

In [None]:
# Query GPU status with the nvidia-smi command-line tool (run after inference).
!nvidia-smi

In [None]:
# NOTE(review): scratch arithmetic; what 50 and 2.5 stand for is not stated anywhere visible - label it or remove the cell.
50 * 2.5