# Extract text from thesis PDF file and narrate

### Extract text from PDF file, remove document formatting, and save as plain text

Note: the thesis must have been compiled in `speech` mode.

In [1]:
import re
import fitz

In [2]:
# Per-page clean-up rules, applied in order as (pattern, replacement) pairs.
page_rules = [
    (r'(\w)\s([.,])', r'\1\2'),      # drop whitespace between a word and a following . or ,
    (r'(\w);\s([.,])', r'\1\2'),     # drop a "; " left between a word and a following . or ,
    (r'(\w)-\n(\w)', r'\1\2'),       # re-join words hyphenated across a line break
    (r'([\w,&])\n(\w)', r'\1 \2'),   # turn a mid-sentence line break into a space
]

cleaned_pages = []
with fitz.open('thesis.pdf') as pdf:
    for pdf_page in pdf:
        page_text = pdf_page.get_text()
        for pattern, replacement in page_rules:
            page_text = re.sub(pattern, replacement, page_text)
        cleaned_pages.append(page_text)

# Second pass over the joined text: picks up line breaks the per-page pass
# could not see, e.g. a sentence that continues across a page boundary.
fulltext = re.sub(r'([\w,])\n(\w)', r'\1 \2', "".join(cleaned_pages))

# Save the plain text so it can be checked by eye before narrating.
with open('thesis.txt', 'w', encoding='utf-8') as out_file:
    out_file.write(fulltext)

In [3]:
# Sanity check: total number of characters extracted from the PDF.
n_chars = len(fulltext)
print(n_chars)

54311


In [4]:
# Preview the first 1000 characters to check the clean-up by eye.
preview = fulltext[:1000]
print(preview)

Chapter 1 Introduction.
Decarbonisation is a truly enormous challenge. The UK and EU have committed to getting to net zero carbon emissions by 2050. There are lots of ways of slicing up the carbon pie.
But from a scientific perspective, there are three broad areas to decarbonise: industrial processes, where chemists must figure out how to make the materials we need without producing CO2 along the way; agriculture, where biologists need to find ways to grow enough food without releasing CO2 and methane; and energy, where physicists and engineers have to produce the electricity and heat that support modern life without using fossil fuels.
This all needs to be done at a cost that is low enough that society is willing to make the transition, and without ever interrupting supply. Imagine rebuilding your tennis racket out of new materials while making sure you stay in the rally.
Within energy, many sectors such as heating and transportation plan to decarbonise by electrification (potentially

Split the text into rough paragraphs so that each one fits inside the request character limit.

In [5]:
# Each remaining newline in the cleaned text marks a paragraph boundary.
paragraphs = fulltext.split('\n')
print(len(paragraphs))

# Each paragraph is sent as one request, so the longest one has to stay under the limit.
max_len = len(max(paragraphs, key=len))
assert max_len < 10000, f"Paragraphs are too long. Need to be less than 10,000 characters. Current max length: {max_len}"

# Show the first few paragraphs as a quick check of the split.
print(paragraphs[:5])

128
['Chapter 1 Introduction.', 'Decarbonisation is a truly enormous challenge. The UK and EU have committed to getting to net zero carbon emissions by 2050. There are lots of ways of slicing up the carbon pie.', 'But from a scientific perspective, there are three broad areas to decarbonise: industrial processes, where chemists must figure out how to make the materials we need without producing CO2 along the way; agriculture, where biologists need to find ways to grow enough food without releasing CO2 and methane; and energy, where physicists and engineers have to produce the electricity and heat that support modern life without using fossil fuels.', 'This all needs to be done at a cost that is low enough that society is willing to make the transition, and without ever interrupting supply. Imagine rebuilding your tennis racket out of new materials while making sure you stay in the rally.', 'Within energy, many sectors such as heating and transportation plan to decarbonise by electrific

### Narrate text using ElevenLabs API, and save audio file.

**Make sure to check the text is correct before narrating!**

See https://elevenlabs.io/app/settings for more info.

In [6]:
import os
from io import BytesIO
from dotenv import load_dotenv
from elevenlabs.client import ElevenLabs
from elevenlabs import play, save

In [7]:
# Load variables from a local .env file into the process environment first;
# without this call, a key stored only in .env is invisible to os.getenv below.
# Variables already set in the environment are left untouched by default.
load_dotenv()

# API client used by the text-to-speech cells below.
elevenlabs = ElevenLabs(
    api_key=os.getenv("ELEVENLABS_API_KEY"),
)

In [8]:
# audio = elevenlabs.text_to_speech.convert(
#     text=fulltext[:300],
#     voice_id="JBFqnCBsd6RMkjVDRZzb",
#     model_id="eleven_multilingual_v2",
#     output_format="mp3_44100_128",
# )

Perform text-to-speech conversion chunk by chunk due to character limit on requests. Provide previous request as context to improve intonation and continuity.

One concern is that providing this context might use up many more tokens, which are in short supply.
*Commenting out the `previous_request_ids` argument will disable this.*

In [None]:
# Request id of the immediately preceding chunk, passed to the next request as
# context for intonation and continuity. Empty for the first chunk.
prev_request_id = []
audio_buffers = []

for paragraph in paragraphs[:2]:
    # A blank paragraph (e.g. from consecutive newlines in the extracted text)
    # has nothing to narrate, so don't spend a request on it.
    if not paragraph.strip():
        continue

    # Usually we get back a stream from the convert function, but with_raw_response is
    # used to get the headers from the response
    with elevenlabs.text_to_speech.with_raw_response.convert(
        text=paragraph,
        voice_id="JBFqnCBsd6RMkjVDRZzb",
        model_id="eleven_multilingual_v2",
        output_format="mp3_44100_64",
        previous_request_ids=prev_request_id
    ) as response:
        request_id = response._response.headers.get("request-id")
        # Only chain on a real id: if the header is missing, fall back to no
        # context rather than sending [None] with the next request.
        prev_request_id = [request_id] if request_id else []

        # response._response.headers also contains useful information like 'character-cost',
        # which shows the cost of the generation in characters.

        # Drain the streamed chunks into one bytes object per paragraph.
        audio_data = b''.join(chunk for chunk in response.data)
        audio_buffers.append(BytesIO(audio_data))

# Concatenate the per-paragraph audio, in order, into a single in-memory buffer.
combined_audio = BytesIO(b''.join(buffer.getvalue() for buffer in audio_buffers))

In [15]:
# Write the combined narration to disk.
output_path = 'introduction.mp3'
save(combined_audio, output_path)