# Extract text from thesis PDF file and narrate

### Extract text from PDF file, remove document formatting, and save as plain text

Note: the thesis must have been compiled in `speech` mode, and the resulting `thesis.pdf` must have been moved to the root directory.

In [None]:
import re
import fitz

In [None]:
# Regex clean-ups run on every page, in this order: (pattern, replacement).
_page_rules = (
    # remove whitespace between a word and a following '.' or ','
    (r'(\w)\s([.,])', r'\1\2'),
    # remove a ';' plus whitespace sitting between a word and '.' or ','
    (r'(\w);\s([.,])', r'\1\2'),
    # re-join a word that was hyphenated across a line break
    (r'(\w)-\n(\w)', r'\1\2'),
    # replace a line break in the middle of running text with a space
    (r'([\w,&])\n(\w)', r'\1 \2'),
)

page_texts = []
with fitz.open('thesis.pdf') as doc:
    for page in doc:
        cleaned = page.get_text()
        for pattern, replacement in _page_rules:
            cleaned = re.sub(pattern, replacement, cleaned)
        page_texts.append(cleaned)

fulltext = "".join(page_texts)

# Same line-break repair once more on the joined text (the per-page pass
# cannot see a break that falls exactly where two pages meet).
fulltext = re.sub(r'([\w,])\n(\w)', r'\1 \2', fulltext)

# Expand the chapter abbreviation and strip list markers (en dash / bullet).
fulltext = (
    fulltext
    .replace('Chap.', 'Chapter')
    .replace('– ', '')
    .replace('• ', '')
)

# Save the cleaned text to disk.
with open('intro.txt', 'w', encoding='utf-8') as out_file:
    out_file.write(fulltext)

In [None]:
# Sanity check: total character count, then the first 1000 characters.
print(len(fulltext), fulltext[:1000], sep='\n')

Split the text into rough paragraphs for passing to the API, and make sure each one adheres to the character limit.

In [None]:
# Every line of the cleaned text is treated as one paragraph.
paragraphs = fulltext.split('\n')

# Length of the longest paragraph; it has to fit in a single request.
max_len = len(max(paragraphs, key=len))
assert max_len < 10000, f"Paragraphs are too long. Need to be less than 10,000 characters. Current max length: {max_len}"

# Show the first two paragraphs as a quick check.
print(paragraphs[:2])

Find the paragraphs at the start of each section to help with generating the audio bit by bit.

In [None]:
# Indices of all paragraphs that contain the word 'Section'.
start_indices = [idx for idx, para in enumerate(paragraphs) if 'Section' in para]
print(start_indices)

### Narrate text using ElevenLabs API, and save audio file.

**Make sure to check the text is correct before narrating!**

See https://elevenlabs.io/app/settings for more info.

There's a lush Geordie voice here (https://elevenlabs.io/app/voice-library?voiceId=Nk0iUG137ZABNEQYgAMl), but it's quite expensive (too right lass!)

In [None]:
import os
from io import BytesIO
from tqdm.notebook import tqdm
from elevenlabs.client import ElevenLabs
from elevenlabs import play, save

In [None]:
# API client; the key is read from the ELEVENLABS_API_KEY environment variable
# (None is passed if the variable is not set).
elevenlabs = ElevenLabs(api_key=os.environ.get("ELEVENLABS_API_KEY"))

Perform text-to-speech conversion chunk by chunk due to character limit on requests. *Can provide previous request as context to improve intonation and continuity, but this uses loads more tokens, and isn't really necessary for this use case.*

In [None]:
# Convert the paragraphs to speech one request at a time, collecting the audio
# of each paragraph in `audio_buffers`. A checkpoint file containing all audio
# so far is written whenever the next paragraph starts a new section, and the
# complete narration is written to 'introduction.mp3' at the end.
audio_buffers = []
prev_request_id = []  # only used if the previous_request_ids option below is enabled

# Narrow this down (e.g. with a slice of `paragraphs`) to narrate only part of the text.
paragraphs_selection = paragraphs

for i,p in tqdm(enumerate(paragraphs_selection), total=len(paragraphs_selection)):
    # Usually we get back a stream from the convert function, but with_raw_response is
    # used to get the headers from the response
    with elevenlabs.text_to_speech.with_raw_response.convert(
        text=p,
        voice_id="RKCbSROXui75bk1SVpy8",
        model_id="eleven_multilingual_v2",
        output_format="mp3_44100_64",
        #previous_request_ids=prev_request_id
    ) as response:
        #prev_request_id = [response._response.headers.get("request-id")]
        # response._response.headers also contains useful information like 'character-cost',
        # which shows the cost of the generation in characters.

        # Collect the streamed chunks of this paragraph into a single buffer.
        audio_data = b''.join(chunk for chunk in response.data)
        audio_buffers.append(BytesIO(audio_data))

        # Save a checkpoint at the end of a section, i.e. when the *next* paragraph
        # starts a new one. The bound only has to keep index i+1 valid; the previous
        # `len(...) - 2` bound skipped the checkpoint when the final paragraph
        # itself starts a section.
        next_index = i + 1
        if next_index < len(paragraphs_selection) and 'Section' in paragraphs_selection[next_index]:
            print(i)
            combined_audio = BytesIO(b''.join(buffer.getvalue() for buffer in audio_buffers))
            save(combined_audio,f'intro_cp{i}.mp3')

# Write the complete narration.
combined_audio = BytesIO(b''.join(buffer.getvalue() for buffer in audio_buffers))
save(combined_audio,'introduction.mp3')