## initialize

In [1]:
%cd ..

/Users/ales/dev/repos/ai-audio-books


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
%load_ext autoreload
%autoreload 2

In [31]:
import os

from pprint import pprint

import dotenv
import pandas as pd
from httpx import Timeout
from pydantic import BaseModel
from langchain_core.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain_openai import ChatOpenAI
from langchain_community.callbacks import get_openai_callback

from IPython.display import Audio

import data.samples_to_split as samples

from src.lc_callbacks import LCMessageLoggerAsync
from src.schemas import AudioOutputFormat, TTSParams, TTSTimestampsResponse, TTSTimestampsAlignemnt
from src.text_split_chain import create_split_text_chain
from src import tts
from src.utils import GPTModels

In [4]:
dotenv.load_dotenv()

True

## split text into character phrases

In [5]:
# Input for the split-text chain. The full-length Gatsby sample is kept as a
# commented toggle (it used to be assigned and then immediately overwritten);
# uncomment it and comment out the inline dialogue to run on the longer text.
# text = samples.GATSBY_2
text = """\
Margaret: hello, how are you Tom?
Tom: nice, thanks. And you?
"""

In [7]:
# Build the text-splitting chain on GPT-4o; the commented line below swaps in
# the GPT-4 Turbo snapshot instead.
chain = create_split_text_chain(llm_model=GPTModels.GPT_4o)
# chain = create_split_text_chain(llm_model=GPTModels.GPT_4_TURBO_2024_04_09)
# Run the chain inside the OpenAI callback so that `cb` captures the usage
# summary (tokens, request count, cost) printed in a later cell.
with get_openai_callback() as cb:
    res = chain.invoke({"text": text}, config={"callbacks": [LCMessageLoggerAsync()]})

2024-10-31 00:57:14,451 [INFO] audio-books (lc_callbacks.py): call to gpt-4o with 2 messages:
{'role': 'system', 'content': 'you are provided with the book sample.\nplease rewrite it and insert xml tags indicating character to whom current phrase belongs.\nfor example: <narrator>I looked at her</narrator><Jill>What are you looking at?</Jill>\n\nNotes:\n- sometimes narrator is one of characters taking part in the action.\nin this case use narrator\'s name (if available) instead of "narrator"\n- if it\'s impossible to identify character name from the text provided, use codes "c1", "c2", etc,\nwhere "c" prefix means character and number is used to enumerate unknown characters\n- all quotes of direct speech must be attributed to characters, for example:\n<Tom>“She’s a nice girl,”</Tom><narrator>said Tom after a moment.</narrator>\nmind that sometimes narrator could also be a character.\n- use ALL available context to determine the character.\nsometimes the character name becomes clear from

In [8]:
res.characters

['Tom', 'Margaret']

In [9]:
print(res.text_annotated)

<Margaret>hello, how are you Tom?</Margaret><Tom>nice, thanks. And you?</Tom>


In [10]:
res.phrases

[CharacterPhrase(character='Margaret', text='hello, how are you Tom?'),
 CharacterPhrase(character='Tom', text='nice, thanks. And you?')]

In [11]:
print(res.text_raw)

Margaret: hello, how are you Tom?
Tom: nice, thanks. And you?



In [12]:
print(res.to_pretty_text())

characters: ['Tom', 'Margaret']
--------------------
[Margaret] hello, how are you Tom?
[Tom] nice, thanks. And you?


In [13]:
print(f'LLM usage:\n\n{cb}')

LLM usage:

Tokens Used: 278
	Prompt Tokens: 253
	Completion Tokens: 25
Successful Requests: 1
Total Cost (USD): $0.0008825


## map characters to voices

In [16]:
from src.select_voice_chain import VoiceSelector

In [17]:
vs = VoiceSelector()

2024-10-31 00:58:58,926 [INFO] audio-books (select_voice_chain.py): reading voice data from: "data/11labs_available_tts_voices.reviewed.csv"
2024-10-31 00:58:58,933 [INFO] audio-books (select_voice_chain.py): df.shape=(34, 15)
2024-10-31 00:58:58,935 [INFO] audio-books (select_voice_chain.py): filtering df by "manual_quality_review" column
2024-10-31 00:58:58,937 [INFO] audio-books (select_voice_chain.py): df.shape after filtering voices: (25, 15)


In [19]:
chain = vs.create_voice_mapping_chain(llm_model=GPTModels.GPT_4o)

In [20]:
chain

RunnableAssign(mapper={
  charater_props: ChatPromptTemplate(input_variables=['characters', 'text'], input_types={}, partial_variables={'available_genders': '"female", "male"', 'available_age_groups': '"middle_aged", "young", "old"', 'format_instructions': 'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"$defs": {"CharacterProperties": {"properties": {"gender": {"title": "Gender", "type": "string"}, "age_group": {"title": "Age Group", "type": "string"}}, "required": ["gender", "age_group"], "title": "CharacterProperties", "type": "object"}}, "properties": {"character2props": {"addi

In [21]:
# Feed the annotated text and the character list produced by the split step
# into the voice-mapping chain; LCMessageLoggerAsync logs the messages sent to the LLM.
res2 = chain.invoke(
    {"text": res.text_annotated, "characters": res.characters},
    config={"callbacks": [LCMessageLoggerAsync()]},
)

2024-10-31 00:59:29,332 [INFO] audio-books (lc_callbacks.py): call to gpt-4o with 2 messages:
{'role': 'system', 'content': 'You are a helpful assistant proficient in literature and psychology.\nOur goal is to create an audio book from the given text.\nFor that we need to hire voice actors.\nPlease help us to find the right actor for each character present in the text.\n\nYou are provided with the text split by the characters\nto whom text parts belong to.\n\nYour task is to assign available properties to each character provided.\nList of available properties:\n- gender: "female", "male"\n- age_group: "middle_aged", "young", "old"\n\nNOTES:\n- assign EXACTLY ONE property value for each property\n- select properties values ONLY from the list of AVAILABLE property values\n- fill properties for ALL characters from the list provided\n- DO NOT include any characters absent in the list provided\n\nThe output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs

In [22]:
res2

SelectVoiceChainOutput(character2props={'Tom': CharacterPropertiesNullable(gender='male', age_group='middle_aged'), 'Margaret': CharacterPropertiesNullable(gender='female', age_group='middle_aged')}, character2voice={'Tom': 'cjVigY5qzO86Huf0OWal', 'Margaret': '8opUN7sGOKbyojnjvNdl'})

In [24]:
character2voice = res2.character2voice
character2voice

{'Tom': 'cjVigY5qzO86Huf0OWal', 'Margaret': '8opUN7sGOKbyojnjvNdl'}

## generate audio

In [26]:
res.phrases

[CharacterPhrase(character='Margaret', text='hello, how are you Tom?'),
 CharacterPhrase(character='Tom', text='nice, thanks. And you?')]

In [27]:
tts_responses = []

for phrase in res.phrases:
    voice_id = character2voice[phrase.character]
    tts_params = params = TTSParams(voice_id=voice_id, text=phrase.text)
    response = await tts.tts_w_timestamps(params)
    tts_responses.append(response)

2024-10-31 01:02:30,721 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': '8opUN7sGOKbyojnjvNdl', 'output_format': <AudioOutputFormat.MP3_44100_192: 'mp3_44100_192'>} for the following text: "hello, how are you Tom?"
2024-10-31 01:02:33,544 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/8opUN7sGOKbyojnjvNdl/with-timestamps?output_format=mp3_44100_192 "HTTP/1.1 200 OK"
2024-10-31 01:02:33,569 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'cjVigY5qzO86Huf0OWal', 'output_format': <AudioOutputFormat.MP3_44100_192: 'mp3_44100_192'>} for the following text: "nice, thanks. And you?"
2024-10-31 01:02:34,321 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/cjVigY5qzO86Huf0OWal/with-timestamps?output_format=mp3_44100_192 "HTTP/1.1 200 OK"


In [35]:
alignments = [response.alignment for response in tts_responses]

In [42]:
a = TTSTimestampsAlignemnt.combine_alignments(alignments=alignments)

In [43]:
a.to_dataframe()

Unnamed: 0,char,start,end
0,h,0.0,0.081
1,e,0.081,0.139
2,l,0.139,0.174
3,l,0.174,0.232
4,o,0.232,0.36
5,",",0.36,0.511
6,,0.511,0.604
7,h,0.604,0.65
8,o,0.65,0.685
9,w,0.685,0.72


In [49]:
# total number of characters across all phrase texts
sum(len(phrase.text) for phrase in res.phrases)

45

## generate audio with timestamps

In [5]:
text = samples.ARCH_WIKI_1[:200]
pprint(text)

('This document is an annotated index of popular articles and important '
 'information for improving and adding functionalities to the installed Arch '
 'system. Readers are assumed to have read and followed t')


In [6]:
text = '''\
hello, this is the test when I am voicing 123 different phrases (some in parentheses),
with newlines
some unreadable characters: #!@%*&
LooLLL123
how is it??? going!!
and some smiles: :))
'''

In [16]:
# Request parameters for a single TTS call on the multi-line test text.
# NOTE(review): `seed` is commented out here, yet the saved outputs of the
# following cells still show seed=672, so they appear to come from an earlier
# run with the seed enabled — re-run to refresh them.
params = TTSParams(
    voice_id="8opUN7sGOKbyojnjvNdl",
    text=text,
    # seed=672
)

In [17]:
params.output_format

<AudioOutputFormat.MP3_44100_192: 'mp3_44100_192'>

In [18]:
list(params)

[('voice_id', '8opUN7sGOKbyojnjvNdl'),
 ('text',
  'hello, this is the test when I am voicing 123 different phrases (some in parentheses),\nwith newlines\nsome unreadable characters: #!@%*&\nLooLLL123\nhow is it??? going!!\nand some smiles: :))\n'),
 ('output_format', <AudioOutputFormat.MP3_44100_192: 'mp3_44100_192'>),
 ('audio_model_id', Ellipsis),
 ('language_code', Ellipsis),
 ('voice_settings', Ellipsis),
 ('seed', 672),
 ('previous_text', Ellipsis),
 ('next_text', Ellipsis),
 ('previous_request_ids', Ellipsis),
 ('next_request_ids', Ellipsis)]

In [19]:
params.to_dict()

{'voice_id': '8opUN7sGOKbyojnjvNdl',
 'text': 'hello, this is the test when I am voicing 123 different phrases (some in parentheses),\nwith newlines\nsome unreadable characters: #!@%*&\nLooLLL123\nhow is it??? going!!\nand some smiles: :))\n',
 'output_format': <AudioOutputFormat.MP3_44100_192: 'mp3_44100_192'>,
 'seed': 672}

In [22]:
response = await tts.tts_w_timestamps(params)

2024-10-29 22:55:03,113 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': '8opUN7sGOKbyojnjvNdl', 'output_format': <AudioOutputFormat.MP3_44100_192: 'mp3_44100_192'>, 'seed': 672} for the following text: "hello, this is the test when I am voicing 123 different phrases (some in parentheses),
with newlines
some unreadable characters: #!@%*&
LooLLL123
how is it??? going!!
and some smiles: :))
"
2024-10-29 22:55:06,333 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/8opUN7sGOKbyojnjvNdl/with-timestamps?output_format=mp3_44100_192 "HTTP/1.1 200 OK"


In [23]:
response.write_audio_to_file('tmp.672.2', params.output_format)

2024-10-29 22:55:07,034 [INFO] audio-books (utils.py): saving to: "tmp.672.2.mp3"


'tmp.672.2.mp3'

In [64]:
response_raw = await tts.ELEVEN_CLIENT_ASYNC.text_to_speech.convert_with_timestamps(
    **params.to_dict()
)

  Expected `str` but got `ellipsis` with value `Ellipsis` - serialized value may not be as expected
  Expected `str` but got `ellipsis` with value `Ellipsis` - serialized value may not be as expected
  Expected `VoiceSettings` but got `ellipsis` with value `Ellipsis` - serialized value may not be as expected
  Expected `int` but got `ellipsis` with value `Ellipsis` - serialized value may not be as expected
  Expected `str` but got `ellipsis` with value `Ellipsis` - serialized value may not be as expected
  Expected `str` but got `ellipsis` with value `Ellipsis` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
2024-10-27 21:44:22,111 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/8opUN7sGOKbyojnjvNdl/with-timestamps?output_format=mp3_44100_192 "HTTP/1.1 200 OK"


In [75]:
response_parsed = TTSTimestampsResponse.model_validate(response_raw)

In [86]:
t = response_parsed.alignment.text_joined
t

'hello, this is the test when I am voicing 123 different phrases (some in parentheses),\nwith newlines\nsome unreadable characters: #!@%*&\nLooLLL123\nhow is it??? going!!\nand some smiles: :))\n'

In [89]:
from src.schemas import TTSTimestampsAlignemnt

In [94]:
# Three toy alignments that share the same per-character timings and differ
# only in their characters; each instance gets its own fresh lists.
a1, a2, a3 = (
    TTSTimestampsAlignemnt(
        characters=list(chars),
        character_start_times_seconds=[0.1, 0.2, 0.3],
        character_end_times_seconds=[0.15, 0.25, 0.35],
    )
    for chars in ("abc", "def", "ghi")
)

In [108]:
a = TTSTimestampsAlignemnt.combine_alignments(alignments=[a1, a2, a3])

In [109]:
a.to_dataframe()

Unnamed: 0,char,start,end
0,a,0.1,0.15
1,b,0.2,0.25
2,c,0.3,0.35
3,d,0.45,0.5
4,e,0.55,0.6
5,f,0.65,0.7
6,g,0.8,0.85
7,h,0.9,0.95
8,i,1.0,1.05


In [110]:
a.get_start_time_by_char_ix(3)

0.44999999999999996

In [111]:
a.get_end_time_by_char_ix(6)

0.85

## sound effects

### map span indices to original text indices

In [None]:
import re
from src import utils

In [None]:
text_w_effects = """\
Hello<effect prompt="soft wind blowing"> - she said softly, gasping the fresh air from the window</effect>
What are you next plans?
<effect prompt="sound of a cars passing by">Frankly, I don't know. I need more time</effect>
"""

In [None]:
text = re.sub(r'<.+?>', '', text_w_effects)
print(text)

Hello - she said softly, gasping the fresh air from the window
What are you next plans?
Frankly, I don't know. I need more time



In [None]:
# Matches one <effect prompt="...">...</effect> region:
#   group(1) - the effect prompt, group(2) - the text wrapped by the tags.
# re.DOTALL lets the wrapped text span several lines; without it such effects
# are skipped here while their tags are still stripped by the generic
# `<.+?>` removal, which would shift every later index mapping.
pat = re.compile(r'<effect prompt=\"(.*?)\">(.*?)</effect>', re.DOTALL)

In [None]:
all_matches = list(pat.finditer(text_w_effects))
all_matches

[<re.Match object; span=(5, 106), match='<effect prompt="soft wind blowing"> - she said so>,
 <re.Match object; span=(132, 224), match='<effect prompt="sound of a cars passing by">Frank>]

In [None]:
class SoundEffectSpan(BaseModel):
    """A single sound-effect region extracted from effect-tagged text.

    Holds the effect prompt, the text enclosed by the effect tags, and the
    position of the region in two coordinate systems (see field comments).
    """

    # value of the effect tag's `prompt` attribute
    prompt: str
    # text enclosed by the opening and closing effect tags
    text_between_tags: str
    # indices relative to the LLM response (the text that still contains the tags)
    ix_start: int
    ix_end: int
    # indices relative to the original text passed to the LLM (no tags)
    ix_start_orig_text: int
    ix_end_orig_text: int

In [None]:
# Convert every effect match from tagged-text coordinates into positions
# inside the tag-free text.
spans = []

# Number of tag characters belonging to the matches processed so far.
rm_chars_running_total = 0
for m in all_matches:
    prompt, text_between_tags = m.group(1, 2)
    mstart, mend = m.start(), m.end()

    # Shift left by all markup that preceded this match.
    ix_start_orig = mstart - rm_chars_running_total

    span = SoundEffectSpan(
        prompt=prompt,
        text_between_tags=text_between_tags,
        ix_start=mstart,
        ix_end=mend,
        ix_start_orig_text=ix_start_orig,
        ix_end_orig_text=ix_start_orig + len(text_between_tags),
    )
    spans.append(span)

    # Everything in the match except the wrapped text is markup.
    rm_chars_running_total += (mend - mstart) - len(text_between_tags)

In [None]:
spans

[SoundEffectSpan(prompt='soft wind blowing', text_between_tags=' - she said softly, gasping the fresh air from the window', ix_start=5, ix_end=106, ix_start_orig_text=5, ix_end_orig_text=62),
 SoundEffectSpan(prompt='sound of a cars passing by', text_between_tags="Frankly, I don't know. I need more time", ix_start=132, ix_end=224, ix_start_orig_text=88, ix_end_orig_text=127)]

In [None]:
text[5:62]

' - she said softly, gasping the fresh air from the window'

In [None]:
text[88:127]

"Frankly, I don't know. I need more time"

## compare audio quality for different formats

In [5]:
text = samples.ARCH_WIKI_1[:200]
pprint(text)

('This document is an annotated index of popular articles and important '
 'information for improving and adding functionalities to the installed Arch '
 'system. Readers are assumed to have read and followed t')


In [12]:
params_base = TTSParams(
    voice_id="8opUN7sGOKbyojnjvNdl",
    text="hello, how are you doing? this is the test aiming to decide which audio quality option to use",
    # text=text,
)

In [13]:
# Disabled experiment: synthesize the same text once per AudioOutputFormat
# (skipping ULAW_8000) and save each result for a side-by-side quality comparison.
# NOTE(review): the client is referenced via `tts.` here to match the live cell
# that calls `tts.ELEVEN_CLIENT_ASYNC`; a bare `ELEVEN_CLIENT_ASYNC` is not imported
# in the visible import cell.
# out_dp = "data/compare_audio_quality2"
# os.makedirs(out_dp, exist_ok=True)

# for audio_format in AudioOutputFormat:
#     if audio_format is AudioOutputFormat.ULAW_8000:
#         continue

#     params = params_base.model_copy(deep=True)
#     params.output_format = audio_format

#     response_raw = await tts.ELEVEN_CLIENT_ASYNC.text_to_speech.convert_with_timestamps(
#         **params.to_dict()
#     )
#     response_parsed = TTSTimestampsResponse.model_validate(response_raw)

#     filepath_no_ext = os.path.join(out_dp, f"compare.{audio_format}")
#     out_fp = response_parsed.write_audio_to_file(
#         filepath_no_ext=filepath_no_ext, audio_format=audio_format
#     )

  Expected `str` but got `ellipsis` with value `Ellipsis` - serialized value may not be as expected
  Expected `str` but got `ellipsis` with value `Ellipsis` - serialized value may not be as expected
  Expected `VoiceSettings` but got `ellipsis` with value `Ellipsis` - serialized value may not be as expected
  Expected `int` but got `ellipsis` with value `Ellipsis` - serialized value may not be as expected
  Expected `str` but got `ellipsis` with value `Ellipsis` - serialized value may not be as expected
  Expected `str` but got `ellipsis` with value `Ellipsis` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
2024-10-27 18:16:54,779 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/8opUN7sGOKbyojnjvNdl/with-timestamps?output_format=mp3_22050_32 "HTTP/1.1 200 OK"
2024-10-27 18:16:54,807 [INFO] audio-books (utils.py): saving to: "data/compare_audio_quality2/compare.mp3_22050_32.mp3"
2024-10-27 18:16:56,094 

In [11]:
!ffprobe data/compare_audio_quality/compare.mp3_44100_64.mp3

ffprobe version 7.0.1 Copyright (c) 2007-2024 the FFmpeg developers
  built with Apple clang version 15.0.0 (clang-1500.3.9.4)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.0.1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex -