<a href="https://colab.research.google.com/github/marcosfelt/latex2speech/blob/main/tts_latex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text to speech for Latex

This notebook converts LaTeX into speech. It's useful for having your papers read back to you while editing or proofreading.

How to use:


1. Click the play button on the "Setup" cell to install all the necessary packages.
2. From the Google Colab menu, select "Runtime" -> "Restart Runtime". This is necessary to make sure the correct versions of certain packages are used.
3. Paste your LaTeX code into the text box and click play.
4. You'll get your LaTeX read out to you!

FAQ:

- **Does this remove citation and reference commands?** Yes, `\cite`, `\citep` and `\ref` commands are stripped automatically.
- **How long does it take to generate speech?** The total generation pipeline runs at ~4x real-time speed, so 1 minute of speech will take ~15 seconds to generate. Note that the first run will take longer, since the model needs to be downloaded.
- **What model does this use?** It uses the [Tacotron-DDC](https://coqui.ai/blog/tts/solving-attention-problems-of-tts-models-with-double-decoder-consistency) model from [Coqui-AI](https://github.com/coqui-ai/TTS).

In [158]:
#@title Setup - Click the play icon

# Needed for inflect
import locale


def _utf8_preferred_encoding(do_setlocale=True):
    """Report UTF-8 as the preferred encoding, ignoring the real locale.

    Keeps the ``do_setlocale`` parameter of the stdlib function it replaces,
    so callers that use ``locale.getpreferredencoding(False)`` keep working
    instead of failing with a ``TypeError``.
    """
    return "UTF-8"


# Monkey-patch the stdlib helper for the rest of this runtime.
locale.getpreferredencoding = _utf8_preferred_encoding

# Install packages
!pip install TTS inflect

from TTS.api import TTS
# from pydub import AudioSegment
# from pydub.effects import speedup
import pysbd
import re
import textwrap
import inflect
import string
import random
from IPython.display import display, clear_output, HTML, Audio
from google.colab import files
from pathlib import Path

# Conversion of numbers: one shared inflect engine used by the callback below
p = inflect.engine()


def convert_numbers(matchobj):
    """Return the matched number written out in words.

    Meant to be used as a ``re.sub`` replacement callback: the whole match
    (group 0) is passed to ``p.number_to_words``.
    """
    number_text = matchobj.group(0)
    return p.number_to_words(number_text)


# Clear what this cell has printed so far (e.g. the pip install log)
clear_output(wait=True)

def id_generator(size=6, chars=string.ascii_lowercase):
    """Return a random string of ``size`` characters drawn from ``chars``.

    Each character is picked independently with ``random.choice``; a
    non-positive ``size`` yields the empty string.
    """
    # One random.choice(chars) call per output character.
    picks = map(random.choice, [chars] * size)
    return "".join(picks)

# Abbreviations
# Inspired by https://github.com/coqui-ai/TTS/discussions/987
# Maps each lower-case letter to a respelling used when an abbreviation is
# spelled out letter by letter.
abbreviations = dict(
    a="ay", b="bee", c="sieh", d="dea", e="ee", f="eff", g="jie",
    h="edge", i="eye", j="jay", k="kaye", l="elle", m="emme", n="en",
    o="owe", p="pea", q="cue", r="are", s="esse", t="tea", u="hugh",
    v="vee", w="doub you", x="ex", y="why", z="zee",
)

def isin(l, s):
    """Return True if at least one element of ``l`` is contained in ``s``.

    Args:
        l: Iterable of candidates to look for.
        s: Container (e.g. a string) tested with ``in``.

    Returns:
        ``True`` as soon as one candidate is found in ``s``; ``False`` if
        none is, including when ``l`` is empty.
    """
    # Generator instead of a list so the scan stops at the first hit.
    return any(li in s for li in l)


def abbreviation_preprocessor(text: str):
    """Spell out abbreviations in ``text`` before it is sent to the TTS model.

    The text is split into sentences with pysbd and each sentence into
    space-separated words. Ignoring trailing periods, a word is handed to
    ``abbreviation_replacement`` when it is one of:

    * entirely upper case, e.g. ``GPU``;
    * an upper-case stem followed by a plural ``s``, e.g. ``GPUs``;
    * a single letter, e.g. ``P``.

    Capitalised ordinary words such as ``In`` or ``We`` and punctuation-only
    tokens are left untouched.

    Args:
        text: Plain text to preprocess.

    Returns:
        The processed sentences joined by single spaces.
    """
    # A bit of duplicate work because tts does this as well
    seg = pysbd.Segmenter(language="en", clean=True)
    sentences = seg.segment(text)
    for i, sentence in enumerate(sentences):
        words = sentence.split(" ")
        for j, raw_word in enumerate(words):
            # Classify the word without its trailing periods; a single
            # period is re-attached after the substitution.
            word = raw_word.rstrip(".")
            has_period = word != raw_word
            # str.isupper() requires at least one cased character, so tokens
            # made only of punctuation or digits do not qualify.
            is_all_caps = word.isupper()
            # Upper-case plural: the trailing "s" must really be there,
            # otherwise any capitalised two-letter word would match.
            is_caps_plural = (
                len(word) > 2 and word[-1] == "s" and word[:-1].isupper()
            )
            is_single_letter = len(word) == 1 and word.isalpha()
            if is_all_caps or is_caps_plural or is_single_letter:
                word = abbreviation_replacement(word)
            if has_period:
                word += "."
            words[j] = word
        sentences[i] = " ".join(words)
    return " ".join(sentences)

def abbreviation_replacement(word: str):
    """Heuristic for abbreviations

    Splits ``word`` on hyphens and treats every piece separately:

    * A piece that contains a vowel but neither starts nor ends with one
      (plural ``s`` ignored) is assumed to be pronounceable and is kept
      as it is, including its plural ``s``.
    * Any other piece is spelled out letter by letter using the
      ``abbreviations`` table; characters without an entry are kept as they
      are, and a plural ``s`` is appended to the last spelled letter.

    Args:
        word: A single word, possibly hyphenated.

    Returns:
        The pieces joined by single spaces, with spelled-out letters also
        separated by single spaces and no trailing whitespace.
    """
    vowels = ("a", "e", "i", "o", "u")
    spoken_parts = []
    for subword in word.split("-"):
        if not subword:
            # Empty piece produced by a leading, trailing or doubled hyphen.
            continue
        # A lower-case "s" after at least two other characters is treated as
        # a plural marker and kept out of the acronym heuristic.
        has_s = len(subword) > 2 and subword[-1] == "s"
        stem = subword[:-1] if has_s else subword
        # Only spell out acronyms without middle vowels
        pronounceable = (
            isin(vowels, stem.lower())
            and stem[0].lower() not in vowels
            and stem[-1].lower() not in vowels
        )
        if pronounceable:
            # Keep the original piece so the plural "s" is not lost.
            spoken_parts.append(subword)
            continue
        letters = [abbreviations.get(char.lower(), char) for char in stem]
        if has_s:
            letters[-1] += "s"
        spoken_parts.append(" ".join(letters))
    # Joining with spaces keeps neighbouring pieces from running together.
    return " ".join(spoken_parts)


class CustomAudio(Audio):
    """Audio class with the option to control default playback speed

    Accepts the same arguments as the parent ``Audio`` class plus the
    keyword-only ``playback_speed``. Every instance is given a freshly
    generated ``element_id`` so the script emitted by ``_repr_html_`` can
    address its own audio element.
    """

    def __init__(self, data=None, filename=None, url=None, embed=None, rate=None, autoplay=False, normalize=True, *,
                 playback_speed=1.0):
        parent_kwargs = dict(
            data=data,
            filename=filename,
            url=url,
            embed=embed,
            rate=rate,
            autoplay=autoplay,
            normalize=normalize,
            # Random id for the rendered audio element
            element_id=id_generator(),
        )
        super().__init__(**parent_kwargs)
        self.playback_rate = playback_speed

    def _repr_html_(self):
        """Return the parent's HTML followed by a script setting playbackRate."""
        audio_html = super()._repr_html_()
        # The script refers to the element through a global variable named
        # after its id.
        return f"""
      {audio_html}
      <script>
      {self.element_id}.playbackRate = {self.playback_rate}
      </script>
      """

In [159]:
#@title Generate speech

text = "In this work we propose the Transformer, a model architecture eschewing recurrence and instead relying entirely on an attention mechanism to draw global dependencies between input and output. The Transformer allows for significantly more parallelization and can reach a new state of the art in translation quality after being trained for as little as twelve hours on eight P100 GPUs. " #@param {type:"string"}
smart_abbreviations = True #@param {type:"boolean"}
playback_speed =  1.5 #@param {type:"number"}
if playback_speed < 1.0:
  raise ValueError("Playback speed must be greater than or equal to 1.0")
# Clean up latex
# Strip latex citations and references: \cite and its variants (\citep,
# \citet, ...), \ref, \eqref, \autoref, \pageref, \nameref, \cref and \Cref,
# with an optional star and optional [..] arguments before the {key} group.
text = re.sub(
    r"\\(?:cite[A-Za-z]*|(?:eq|auto|page|name|[cC])?ref)\*?(?:\[[^\]]*\])*\{[^{}]*\}",
    "",
    text,
)
# Split alphanumeric characters (e.g. "P100" -> "P 100") so the number part
# can be converted on its own
pattern = r'(?<=[a-zA-Z])(?=\d)|(?<=\d)(?=[a-zA-Z])'
result = re.split(pattern, text)
text = " ".join(result)
# Convert numbers to words
text = re.sub(r"\d+(\.\d+)?", convert_numbers, text)
# Percent symbols (escaped as \% in latex)
text = text.replace("\\%", " percent")
# Remove random latex symbols
for s in ["$", "\\", "{", "}"]:
  text = text.replace(s, "")
text_with_abbrevs = text.replace("_", "-")
if smart_abbreviations:
  text_final = abbreviation_preprocessor(text_with_abbrevs)
else:
  text_final = text_with_abbrevs

# torch is installed as a dependency of TTS; it is only needed here to check
# for a GPU so the cell also runs on a CPU-only runtime.
import torch

model_name = "tts_models/en/ljspeech/tacotron2-DDC"
tts = TTS(model_name, gpu=torch.cuda.is_available(), progress_bar=False)
wav = tts.tts(text_final)
clear_output(wait=True)
# Show the cleaned text (before abbreviations are spelled out), wrapped to
# 70 columns
print(" \n".join(textwrap.wrap(text_with_abbrevs, width=70)))
print()
display(CustomAudio(
    wav,
    rate=22050,
    playback_speed=playback_speed,
    autoplay=True
))

In this work we propose the Transformer, a model architecture 
eschewing recurrence and instead relying entirely on an attention 
mechanism to draw global dependencies between input and output. The 
Transformer allows for significantly more parallelization and can 
reach a new state of the art in translation quality after being 
trained for as little as twelve hours on eight P one hundred GPUs.

