<a href="https://colab.research.google.com/github/pdrobny/MonReader/blob/main/tts_chatterbox.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install chatterbox-tts

Collecting chatterbox-tts
  Downloading chatterbox_tts-0.1.2-py3-none-any.whl.metadata (7.2 kB)
Collecting s3tokenizer (from chatterbox-tts)
  Downloading s3tokenizer-0.2.0.tar.gz (225 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.2/225.2 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers==4.46.3 (from chatterbox-tts)
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting diffusers==0.29.0 (from chatterbox-tts)
  Downloading diffusers-0.29.0-py3-none-any.whl.metadata (19 kB)
Collecting resemble-perth==1.0.1 (from chatterbox-tts)
  Downloading resemble_perth-1.0.1-py3-none-any.whl.metadata (4.8 kB)
Collecting conformer==0.3.2 (from chatterbox-tts)
  Downloading conformer-0.3.2-py3-none-any.whl.metadata (631 bytes)
Collecting nvidia-cuda-n

In [None]:
import os
import re
import textwrap

import IPython.display as ipd
import numpy as np
import pandas as pd
import PIL.Image
import torch
import torchaudio as ta
from chatterbox.tts import ChatterboxTTS
from chatterbox.vc import ChatterboxVC
from google.colab import userdata
from pydub import AudioSegment

# NOTE(review): presumably meant to silence TensorFlow's native (C++) log output.
# It is set after the imports above, so confirm it still takes effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import warnings
# Ignore all Python warnings from this point on.
warnings.filterwarnings('ignore')
print("Imported libraries.")

Imported libraries.


In [None]:
# Mount Google Drive at /content/drive; the voice prompt and extracted-text
# files read later in this notebook live under /content/drive/MyDrive.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Chatterbox TTS

In [None]:
# Choose the compute backend in order of preference: CUDA GPU, then Apple
# Metal (MPS), then plain CPU as the fallback.
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

print(f"Using device: {device}")

Using device: cuda


## Example text

In [None]:
# Short sample sentence passed to model.generate() in the example and
# voice-cloning cells below.
text = "Testing how well chatterbox works!"

In [None]:
# Load the pretrained Chatterbox TTS model onto the device selected above.
model = ChatterboxTTS.from_pretrained(device=device)

# Synthesize the example sentence without an audio prompt and save it,
# passing model.sr as the sample rate of the written file.
wav = model.generate(text)
ta.save("test-1.wav", wav, model.sr)

In [None]:
# Reload the saved clip and embed an audio player as the cell output
# (autoplay=True starts playback as soon as the player is rendered).
audio_data, sample_rate = ta.load("test-1.wav")
ipd.Audio(audio_data, autoplay=True, rate=sample_rate)

## Voice cloning

In [None]:
# If you want to synthesize with a different voice, specify the audio prompt
# (here a reference recording stored on the mounted Google Drive).
AUDIO_PROMPT_PATH = "/content/drive/MyDrive/Apziva/MonReader/voice/Paul_earth.wav"
# Same example sentence as before, this time generated with the audio prompt.
wav = model.generate(text, audio_prompt_path=AUDIO_PROMPT_PATH)
ta.save("test-2.wav", wav, model.sr)


Sampling:   6%|▌         | 61/1000 [00:01<00:29, 31.43it/s]


In [None]:
# Reload the voice-cloned clip and embed an audio player as the cell output
# (autoplay=True starts playback as soon as the player is rendered).
audio_data, sample_rate = ta.load("test-2.wav")
ipd.Audio(audio_data, autoplay=True, rate=sample_rate)

## Extracted text

In [None]:
def chunk_text(text, max_chars=200):
    """Split ``text`` into chunks of at most ``max_chars`` characters for TTS.

    Chunks end on sentence boundaries (``.``, ``!`` or ``?`` followed by
    whitespace) whenever possible, so each synthesized segment is a run of
    complete sentences instead of being cut wherever the character limit
    happens to fall. Only a single sentence that is itself longer than
    ``max_chars`` is wrapped on word boundaries with ``textwrap.wrap``.

    Args:
        text: The text to split. Runs of whitespace (including newlines) are
            collapsed to single spaces before chunking.
        max_chars: Maximum length of each returned chunk; must be positive.

    Returns:
        A list of non-empty strings, each no longer than ``max_chars``.
        Empty or whitespace-only input yields an empty list.

    Raises:
        ValueError: If ``max_chars`` is not greater than zero.
    """
    if max_chars <= 0:
        raise ValueError(f"invalid max_chars {max_chars!r} (must be > 0)")

    # Measure lengths on the text as it will be spoken: single-spaced, trimmed.
    normalized = " ".join(text.split())
    if not normalized:
        return []

    chunks = []
    current = ""
    for sentence in re.split(r"(?<=[.!?])\s+", normalized):
        candidate = f"{current} {sentence}" if current else sentence
        if len(candidate) <= max_chars:
            # The sentence still fits: keep growing the current chunk.
            current = candidate
            continue
        if current:
            chunks.append(current)
        if len(sentence) <= max_chars:
            current = sentence
        else:
            # A single over-long sentence: fall back to word wrapping. The last
            # piece stays open so following sentences can be packed onto it.
            *full_pieces, current = textwrap.wrap(sentence, width=max_chars)
            chunks.extend(full_pieces)
    chunks.append(current)
    return chunks

In [None]:
def flatten_page_text(raw_text):
    """Turn line-wrapped page text into a single line suitable for TTS.

    Words hyphenated across a line break ("believ-" / "ing") are re-joined
    into one word; every remaining newline is replaced by a space.

    Args:
        raw_text: Page text as read from the extract file.

    Returns:
        The text with end-of-line hyphenation removed and no newlines.
    """
    # A hyphen sitting directly between a word character and a line break is
    # treated as a typesetting hyphen and dropped together with the break.
    # NOTE: a genuine compound that happens to break at the hyphen
    # ("well-" / "known") loses its hyphen as well.
    dehyphenated = re.sub(r"(?<=\w)-[ \t]*\n[ \t]*(?=\w)", "", raw_text)
    return dehyphenated.replace('\n', ' ')


# Read the extracted page text from the mounted Drive.
text_file_path = "/content/drive/MyDrive/Apziva/MonReader/extract/p3_extract.txt"
try:
    # Explicit encoding so decoding does not depend on the platform default.
    with open(text_file_path, 'r', encoding='utf-8') as f:
        book_page_text = f.read().strip()
except FileNotFoundError:
    print(f"Error: Text file not found at {text_file_path}")
    book_page_text = "" # Set empty string if file not found
cleaned_text = flatten_page_text(book_page_text)
cleaned_text

"IDEAS ABOUT THE UNIVERSE 3  As long ago as 340 B.C. Aristotle, in his book On the Heavens, was able to put forward two good arguments for believ- ing that the Earth was a round ball rather than a flat plate. First, he realized that eclipses of the moon were caused by the Earth coming between the sun and the moon. The Earth's shadow on the moon was always round, which would be true only if the Earth was spherical. If the Earth had been a flat disk, the shadow would have been elongated and elliptical, unless the eclipse always occurred at a time when the sun was di- rectly above the center of the disk. Second, the Greeks knew from their travels that the Pole Star appeared lower in the sky when viewed in the south than it did in more northerly regions. From the difference in the ap- parent position of the Pole Star in Egypt and Greece, Aristotle even quoted an estimate that the distance around the Earth was four hundred thousand stadia. It is not known exactly what length a stadium was, 

In [None]:
# Synthesize each text chunk to its own WAV file under segments/ and record
# the file paths in generation order.
# Sampling settings - tuned: exaggeration=0.3, cfg_weight=0.8, temperature=0.4
# (original: exaggeration=0.5, cfg_weight=0.8, temperature=0.6).
generation_settings = dict(exaggeration=0.3, cfg_weight=0.8, temperature=0.4)

chunks = chunk_text(cleaned_text)
os.makedirs("segments", exist_ok=True)
segment_files = []

for index, piece in enumerate(chunks):
    wav = model.generate(piece, **generation_settings)
    segment_path = f"segments/segment_{index}.wav"
    ta.save(segment_path, wav, model.sr)
    segment_files.append(segment_path)

In [None]:
# Stitch the per-chunk WAV files, in generation order, into a single track.
final_audio = AudioSegment.empty()
for segment_path in segment_files:
    final_audio += AudioSegment.from_wav(segment_path)

# export() hands back the file object it wrote to; close it so the handle is
# not leaked and its repr is not echoed as the cell's output.
final_audio.export("final_output.wav", format="wav").close()

<_io.BufferedRandom name='final_output.wav'>

In [None]:
# Reload the stitched file and embed an audio player as the cell output
# (autoplay=True starts playback as soon as the player is rendered).
audio_data, sample_rate = ta.load("final_output.wav")
ipd.Audio(audio_data, autoplay=True, rate=sample_rate)