#### Set OpenAI API key

In [1]:
import os

# Prefer the OPENAI_API_KEY environment variable so the secret never has to be
# saved inside the notebook. If the variable is not set the value falls back to
# the empty string; a key may be pasted in place of the "" default as a last
# resort (clear it again before sharing the notebook).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

#### Install dependencies

In [None]:
!pip install -q torchaudio omegaconf openai

In [2]:
import os
import json
import openai
import torch
import numpy as np
from pprint import pprint
from omegaconf import OmegaConf
from IPython.display import Audio, display

#### Load model

In [None]:
# Hand the key defined in the first cell to the OpenAI client.
openai.api_key = OPENAI_API_KEY

# Switch off the TorchScript profiling mode before the model is loaded.
torch._C._jit_set_profiling_mode(False)

# Download the current Silero model index and parse it. Nothing in the cells
# shown here reads `models`; it is available for manual inspection.
torch.hub.download_url_to_file(
    'https://raw.githubusercontent.com/snakers4/silero-models/master/models.yml',
    'latest_silero_models.yml',
    progress=True,
)
models = OmegaConf.load('latest_silero_models.yml')

# Language and speaker-model identifier passed to the hub entry point below.
language = 'en'
model_id = 'v3_en'

device = torch.device('cpu')  # cpu or gpu (cuda)

# Load the text-to-speech model from torch.hub; the entry point also returns an
# example text, which is kept but not used by the cells shown here.
model, example_text = torch.hub.load(
    repo_or_dir='snakers4/silero-models',
    model='silero_tts',
    language=language,
    speaker=model_id,
)
model.to(device)

#### Prompt (no need to modify)

In [4]:
# Few-shot instructions sent ahead of every request built by parse_dialog.
#
# This must be a RAW string: in a normal triple-quoted literal every \" collapses
# to a bare ", so the examples would reach the model as malformed JSON
# (e.g. ""Yes, Your Majesty." ...") and would not match the properly escaped
# input that json.dumps produces. The examples also avoid trailing commas so that
# each example object/array is valid JSON that json.loads can parse.
prompt_instruct = r"""
given json input, i want you to analyze the text and segment it based on the speaker.

this is the input/output format:
```json
{
    "previous speaker": "Name or Unknown",
    "text": "sentence or paragraph"
}
[
    ["text segment", "speaker name", "gender"],
    ["text segment", "speaker name", "gender"],
    etc.
]
```

here are input/output examples:

```json
{
    "previous speaker": "Unknown",
    "text": "\"Yes, Your Majesty.\" Barov nodded, and then asked, \"I don't know if your trip to Longsong...\""
}
[
    ["\"Yes, Your Majesty.\"", "Barov", "Male"],
    ["Barov nodded, and then asked, ", "Narrator", "Female"],
    ["\"I don't know if your trip to Longsong...\"", "Barov", "Male"]
]
```
```json
{
    "previous speaker": "Barov",
    "text": "Roland was amused by the cautious look on Barov's face. He shook his head reluctantly. \"Even if they're your disciples, I wouldn't put the blame on you, so you can rest assured.\""
}
[
    ["Roland was amused by the cautious look on Barov's face. He shook his head reluctantly. ", "Narrator", "Female"],
    ["\"Even if they're your disciples, I wouldn't put the blame on you, so you can rest assured.\"", "Roland", "Male"]
]
```
```json
{
    "previous speaker": "Roland",
    "text": "\"Summon every one of City Hall to gather in the castle hall tomorrow. I'll personally give them a briefing to talk about the importance of discipline and responsibility.\""
}
[
    ["\"Summon every one of City Hall to gather in the castle hall tomorrow. I'll personally give them a briefing to talk about the importance of discipline and responsibility.\"", "Roland", "Male"]
]
```

Now you try:
"""

In [5]:
# Sample rate handed to the TTS model and to the notebook audio player.
sample_rate = 48000
# Voice used when neither the speaker name nor the gender selects one.
fallback_voice = 'en_67'
# Most recent successfully parsed segment list (filled in by parse_dialog).
seg_cache = None

# decent voices
# male: 103, 98, 89, 77, 73, 71, 66, 31
# female: 98, 95, 94, 72, 75, 62, 60, 55, 51, 47, 44, 43, 41, 39, 37, 33, 28, 25, 21, 14, 10

# add character names
# (keys are matched against the lower-cased speaker name first, then against
# the lower-cased gender, so new keys should be written in lower case)
voice_map = dict(
    male="en_89",      # 103, 31, 71
    female="en_55",
    narrator="en_43",  # 67, 41
)


def parse_dialog(text, prev_speaker):
    """Ask the OpenAI completion model to split ``text`` into speaker segments.

    The request consists of ``prompt_instruct`` followed by a JSON object that
    carries the previous speaker and the text. The completion text is decoded
    as JSON; on success the decoded value is also stored in the global
    ``seg_cache`` so it can be replayed without another API call.

    Args:
        text: Passage to segment.
        prev_speaker: Name of the previous speaker (the prompt uses "Unknown"
            when there is none).

    Returns:
        The decoded JSON value (the prompt asks for a list of
        ``[text, speaker, gender]`` entries), or ``None`` when the completion
        cannot be decoded. In that case the error is printed and ``seg_cache``
        keeps its previous value.
    """
    global seg_cache

    payload = json.dumps({"previous speaker": prev_speaker, "text": text})
    completion = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt_instruct + payload,
        temperature=0,
        max_tokens=1000,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0,
    )
    raw_reply = completion.choices[0].text

    try:
        parsed = json.loads(raw_reply)
    except Exception as err:
        print(err)
        return None

    seg_cache = parsed
    return parsed


def segs2audio(segs):
    """Synthesize every segment and join the results into one waveform.

    Voice selection per segment: the lower-cased speaker name is looked up in
    ``voice_map`` first; otherwise a "male"/"female" gender selects the
    corresponding entry; otherwise ``fallback_voice`` is used. A segment that
    lacks the speaker or gender field no longer raises ``IndexError``; it simply
    falls through to the next rule.

    Args:
        segs: Iterable of ``[text, speaker, gender]`` entries as returned by
            ``parse_dialog``.

    Returns:
        A 1-D NumPy array in which each synthesized segment is preceded by
        ``sample_rate // 2`` zero samples. An empty array is returned when
        there is nothing to synthesize.
    """
    # Collect the pieces and stack them once at the end instead of re-copying
    # the whole buffer on every iteration.
    chunks = []

    for seg in segs:
        speaker = str(seg[1]).lower() if len(seg) > 1 else ""
        gender = str(seg[2]).lower() if len(seg) > 2 else ""

        if speaker in voice_map:
            voice = voice_map[speaker]
        elif gender in ("male", "female"):
            voice = voice_map.get(gender, fallback_voice)
        else:
            voice = fallback_voice

        text = seg[0].strip().replace("...", ".")
        if not text:
            # Nothing to pronounce; skip rather than hand the model empty text.
            continue

        audio = model.apply_tts(text=text,
                                speaker=voice,
                                sample_rate=sample_rate,
                                put_accent=True,
                                put_yo=True)

        # NOTE(review): stacking assumes `audio` converts to a 1-D NumPy array
        # (e.g. a CPU tensor) - confirm before moving the model to cuda.
        chunks.append(np.zeros(sample_rate // 2))
        chunks.append(audio)

    if not chunks:
        return np.array([])
    return np.hstack(chunks)

def read_segs(segs):
    """Synthesize already-parsed segments and show an audio player for them.

    Args:
        segs: List of ``[text, speaker, gender]`` entries, e.g. ``seg_cache``.
            When it is ``None`` or empty (nothing has been parsed yet, or the
            last parse failed) a message is printed and nothing is played.
    """
    if not segs:
        print("No text segments to read - run read_text() successfully first.")
        return
    display(Audio(segs2audio(segs), rate=sample_rate))

def read_text(text, prev_speaker="Unknown"):
    """Segment ``text`` by speaker, synthesize it and show an audio player.

    Args:
        text: Passage to read aloud.
        prev_speaker: Speaker of the preceding passage, passed to
            ``parse_dialog``; defaults to "Unknown".

    ``parse_dialog`` returns ``None`` when the model reply cannot be parsed.
    In that case (or when the reply holds no segments) a message is printed
    and nothing is played, instead of failing inside ``segs2audio``.
    """
    segs = parse_dialog(text, prev_speaker)
    if not segs:
        print("Could not split the text into speaker segments; nothing to read.")
        return
    audio = segs2audio(segs)
    display(Audio(audio, rate=sample_rate))

#### Generate audio

In [6]:
# Speaker sent to the model as "previous speaker" for this passage.
previous_speaker = "Unknown"

# Passage to segment by speaker and read aloud.
text = """
"Her magic power looks like a gray ball..." Nightingale whispered in his ear, "but the shape isn't fixed."
"What's the total amount of magic power?"
"The level of improvement is good. At the moment, she's between Soraya and Maggie."
Roland nodded and looked at Evelyn. "What's inside the bottles?"
"""

read_text(text, prev_speaker=previous_speaker)

##### Regenerate audio from the cached text segments (avoids making another API call)

In [None]:
# Show the segments stored by the last successful parse_dialog call, then
# synthesize and play them again without contacting the OpenAI API.
pprint(seg_cache)
read_segs(seg_cache)