## Clone Repo

In [1]:
!cd /content
!rm -rf sample_data ChatTTS
!git clone https://github.com/2noise/ChatTTS.git

Cloning into 'ChatTTS'...
remote: Enumerating objects: 2349, done.[K
remote: Counting objects: 100% (425/425), done.[K
remote: Compressing objects: 100% (234/234), done.[K
remote: Total 2349 (delta 225), reused 282 (delta 190), pack-reused 1924[K
Receiving objects: 100% (2349/2349), 7.94 MiB | 8.62 MiB/s, done.
Resolving deltas: 100% (1395/1395), done.


## Install Requirements

In [None]:
!pip install -r /content/ChatTTS/requirements.txt
!ldconfig /usr/lib64-nvidia

## Import Packages

In [3]:
import torch

# Raise TorchDynamo's recompilation cache limit to 64 entries.
torch._dynamo.config.cache_size_limit = 64
# On a TorchDynamo compile error, fall back to eager execution instead of raising.
torch._dynamo.config.suppress_errors = True
# Allow float32 matmuls to use faster reduced-precision kernels (e.g. TF32)
# where the hardware supports them.
torch.set_float32_matmul_precision("high")

from ChatTTS import ChatTTS
from ChatTTS.tools.logger import get_logger
from ChatTTS.tools.normalizer import normalizer_en_nemo_text, normalizer_zh_tn
from IPython.display import Audio

## Load Models
**默认的无法识别数字例如手机号这种，汉字夹杂数字的情况**

In [92]:
logger = get_logger("ChatTTS", format_root=True)
chat = ChatTTS.Chat(logger)

# Try to register the optional text normalizers. Registration is best-effort:
# a failure is logged and the notebook continues without that normalizer.
# `except Exception` is used instead of a bare `except` so that
# KeyboardInterrupt and SystemExit are not swallowed.
try:
    chat.normalizer.register("en", normalizer_en_nemo_text())
except ValueError as e:
    logger.error(e)
except Exception:
    logger.warning("Package nemo_text_processing not found!")
    logger.warning(
        "Run: conda install -c conda-forge pynini=2.1.5 && pip install nemo_text_processing",
    )
try:
    chat.normalizer.register("zh", normalizer_zh_tn())
except ValueError as e:
    logger.error(e)
except Exception:
    logger.warning("Package WeTextProcessing not found!")
    logger.warning(
        "Run: conda install -c conda-forge pynini=2.1.5 && pip install WeTextProcessing",
    )


[+0000 20240811 04:23:31] [[33mWARN[0m] ChatTTS | <ipython-input-92-530df9823386> | Package nemo_text_processing not found!
[+0000 20240811 04:23:31] [[33mWARN[0m] ChatTTS | <ipython-input-92-530df9823386> | Run: conda install -c conda-forge pynini=2.1.5 && pip install nemo_text_processing
2024-08-11 04:23:31,870 WETEXT INFO found existing fst: /usr/local/lib/python3.10/dist-packages/tn/zh_tn_tagger.fst
2024-08-11 04:23:31,870 WETEXT INFO found existing fst: /usr/local/lib/python3.10/dist-packages/tn/zh_tn_tagger.fst
[+0000 20240811 04:23:31] [[37mINFO[0m] wetext-zh_normalizer | processor | found existing fst: /usr/local/lib/python3.10/dist-packages/tn/zh_tn_tagger.fst
2024-08-11 04:23:31,876 WETEXT INFO                     /usr/local/lib/python3.10/dist-packages/tn/zh_tn_verbalizer.fst
2024-08-11 04:23:31,876 WETEXT INFO                     /usr/local/lib/python3.10/dist-packages/tn/zh_tn_verbalizer.fst
[+0000 20240811 04:23:31] [[37mINFO[0m] wetext-zh_normalizer | processor |

## 1. Load models from Hugging Face (recommended)
### 且处理上述汉字夹杂数字情况

In [4]:
# use force_redownload=True if the weights have been updated.
def init_chat():
    """Create a ChatTTS ``Chat``, load its models and register normalizers.

    The instance is loaded with ``load(source="local", compile=True)``. The
    "en" and "zh" text normalizers are then registered on a best-effort
    basis: a failure is logged and does not abort the function.

    Returns:
        The ``ChatTTS.Chat`` instance.
    """
    # Use a logger owned by this function instead of relying on a global
    # `logger` that only exists if another cell has been executed first.
    logger = get_logger("ChatTTS")
    chat = ChatTTS.Chat(logger)
    chat.load(source="local", compile=True)
    try:
        chat.normalizer.register("en", normalizer_en_nemo_text())
    except ValueError as e:
        logger.error(e)
    except Exception:
        # Not a bare `except`, so KeyboardInterrupt/SystemExit still propagate.
        logger.warning("Package nemo_text_processing not found!")
        logger.warning(
            "Run: conda install -c conda-forge pynini=2.1.5 && pip install nemo_text_processing",
        )
    try:
        chat.normalizer.register("zh", normalizer_zh_tn())
    except ValueError as e:
        logger.error(e)
    except Exception:
        logger.warning("Package WeTextProcessing not found!")
        logger.warning(
            "Run: conda install -c conda-forge pynini=2.1.5 && pip install WeTextProcessing",
        )
    return chat
# Build the Chat instance; init_chat() already calls
# chat.load(source="local", compile=True) internally.
chat=init_chat()
# Load again, this time from the "huggingface" source. As the last expression
# of the cell, the return value of load() is displayed as the cell output.
chat.load(source="huggingface",compile=True)

INFO:ChatTTS:checking assets...
INFO:ChatTTS:/content/asset/Decoder.pt not exist.
INFO:ChatTTS:downloading https://github.com/fumiama/RVC-Models-Downloader/releases/download/v0.2.6/rvcmd_linux_amd64.tar.gz
INFO:ChatTTS:downloaded.
INFO:ChatTTS:extracted into /tmp/tmpp2k7y0g8
INFO:ChatTTS:checking assets...
INFO:ChatTTS:all assets are already latest.
INFO:ChatTTS:use device cuda:0
INFO:ChatTTS:vocos loaded.
INFO:ChatTTS:dvae loaded.
INFO:ChatTTS:gpt loaded.
INFO:ChatTTS:decoder loaded.
INFO:ChatTTS:tokenizer loaded.
INFO:ChatTTS:all models has been initialized.
 NeMo-text-processing :: INFO     :: Creating ClassifyFst grammars.
INFO:NeMo-text-processing:Creating ClassifyFst grammars.
2024-08-11 04:26:16,324 WETEXT INFO found existing fst: /usr/local/lib/python3.10/dist-packages/tn/zh_tn_tagger.fst
INFO:wetext-zh_normalizer:found existing fst: /usr/local/lib/python3.10/dist-packages/tn/zh_tn_tagger.fst
2024-08-11 04:26:16,328 WETEXT INFO                     /usr/local/lib/python3.10/dist

True

#### 2. Load models from local directories 'asset' and 'config'

In [None]:
chat.load()
# chat.load(source='local') same as above

#### 3. Load models from a custom path

In [None]:
# write the model path into custom_path
chat.load(source="custom", custom_path="YOUR CUSTOM PATH")

### You can also unload models to save the memory

In [25]:
chat.unload()

### Batch infer

In [None]:
# Build a batch of six prompts: each sample sentence repeated three times in a
# row (indices 0-2 hold the English sentence, indices 3-5 the Chinese one).
texts = [
    sentence
    for sentence in (
        "So we found being competitive and collaborative was a huge way of staying motivated towards our goals, so one person to call when you fall off, one person who gets you back on then one person to actually do the activity with.",
        "我觉得像我们这些写程序的人，他，我觉得多多少少可能会对开源有一种情怀在吧我觉得开源是一个很好的形式。现在其实最先进的技术掌握在一些公司的手里的话，就他们并不会轻易的开放给所有的人用。",
    )
    for _ in range(3)
]

# Run inference on the whole batch in one call.
wavs = chat.infer(texts)

In [9]:
Audio(wavs[0], rate=24_000, autoplay=True)

In [10]:
Audio(wavs[3], rate=24_000, autoplay=True)

### Custom params

In [6]:
# Fix the speaker timbre
import lzma
import numpy as np
import pybase16384 as b14
def compress_and_encode(tensor):
    """Serialize a tensor into a compact text string.

    The tensor is converted to a float16 NumPy array, its raw bytes are
    compressed as a raw LZMA2 stream (preset 9 | PRESET_EXTREME), and the
    compressed bytes are encoded with ``pybase16384``.

    Args:
        tensor: A ``torch.Tensor``; it may live on any device and may
            require grad.

    Returns:
        The value returned by ``b14.encode_to_string`` for the compressed bytes.
    """
    # detach() and cpu() are no-ops for a plain CPU tensor, but they make the
    # function work for tensors that require grad or are stored on a GPU,
    # where a direct .numpy() call raises.
    np_array = tensor.detach().cpu().numpy().astype(np.float16)
    compressed = lzma.compress(
        np_array.tobytes(),
        format=lzma.FORMAT_RAW,
        filters=[{"id": lzma.FILTER_LZMA2, "preset": 9 | lzma.PRESET_EXTREME}],
    )
    encoded = b14.encode_to_string(compressed)
    return encoded

# Load the saved speaker embedding onto the CPU and detach it from autograd.
# NOTE(review): torch.load may unpickle arbitrary objects — only load .pt files
# from a trusted source.
spk = torch.load("/content/ChatTTS/myVoice/seed_181_restored_emb.pt", map_location=torch.device('cpu')).detach()
# Encode the embedding as a string; it is passed as `spk_emb` in the next cell.
spk_emb_str = compress_and_encode(spk)

In [7]:
# Load the speaker embedding on the CPU (same file as in the previous cell).
spk = torch.load("/content/ChatTTS/myVoice/seed_181_restored_emb.pt", map_location=torch.device('cpu')).detach()
# spk_emb_str=chat.encode_spk_embspk(spk)  (no such method exists)
# Inference parameters: `spk_emb_str` is the encoded embedding produced by
# compress_and_encode() in the previous cell.
params_infer_code = ChatTTS.Chat.InferCodeParams(
    prompt='[speed_2]',
    temperature=0.1,
    spk_emb=spk_emb_str
)

params_refine_text = ChatTTS.Chat.RefineTextParams(
    prompt="[oral_0][laugh_0][break_4]",
)
wav = chat.infer(
    "我的手机号是13279257893，我的QQ号是1079830405",
    params_refine_text=params_refine_text,
    params_infer_code=params_infer_code,
    skip_refine_text=True
    # Testing showed that the refine step pre-splits the text; skip it so the
    # text is used exactly as specified.
)
# Merge the audio clips
# finally_wavs = torch. tensor (np. concatenate (wavs, axis=-1))
# Save the generated speech to an audio file
# torchaudio.save ("output.wav", finally_wavs, 24000)

INFO:ChatTTS:all models has been initialized.
code:   0%|          | 0/2048(max) [00:00, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
code:  17%|█▋        | 356/2048(max) [02:31,  2.35it/s]


In [8]:
Audio(wav[0], rate=24_000, autoplay=True)

In [None]:
!pip install dashscope
from http import HTTPStatus
from dashscope import Application
# Never store API keys in the notebook: read the key from the DASHSCOPE_API_KEY
# environment variable, or prompt for it when the variable is not set.
# NOTE: the key that used to be hard-coded on this line must be treated as
# leaked and should be revoked/rotated.
import os
from getpass import getpass
my_api_key = os.environ.get("DASHSCOPE_API_KEY") or getpass("DashScope API key: ")
def extract_script(your_api_key,text):
    """Send ``text`` as the prompt to DashScope app c87a403c3125436ba2c5f0d882c74b3b.

    Presumably this is the script-extraction application, judging by the
    function name — confirm against the DashScope console.

    Args:
        your_api_key: DashScope API key forwarded to ``Application.call``.
        text: Prompt forwarded to the application.

    Returns:
        ``response.output`` when the status code is ``HTTPStatus.OK``;
        otherwise the request id, status code and message are printed and
        ``None`` is returned.
    """
    reply = Application.call(
        app_id='c87a403c3125436ba2c5f0d882c74b3b',
        prompt=text,
        api_key=your_api_key,
    )

    # Success path first; everything below handles a failed request.
    if reply.status_code == HTTPStatus.OK:
        return reply.output

    failure_fields = (reply.request_id, reply.status_code, reply.message)
    print('request_id=%s, code=%s, message=%s\n' % failure_fields)
    return None


def script_to_json(your_api_key,text):
    """Send ``text`` as the prompt to DashScope app 07440d3dd5e34f1ba3feec3f31a1b7a0.

    Presumably this application converts a script into JSON, judging by the
    function name — confirm against the DashScope console.

    Args:
        your_api_key: DashScope API key forwarded to ``Application.call``.
        text: Prompt forwarded to the application.

    Returns:
        ``response.output`` when the status code is ``HTTPStatus.OK``;
        otherwise the request id, status code and message are printed and
        ``None`` is returned.
    """
    call_kwargs = dict(
        app_id='07440d3dd5e34f1ba3feec3f31a1b7a0',
        prompt=text,
        api_key=your_api_key,
    )
    result = Application.call(**call_kwargs)

    succeeded = result.status_code == HTTPStatus.OK
    if not succeeded:
        # Report the failure; the caller receives None.
        print('request_id=%s, code=%s, message=%s\n' % (result.request_id, result.status_code, result.message))
        return None
    return result.output

In [61]:
# Instruction prefix and the novel excerpt that together form the first prompt.
add_head='这是一段虚拟的小说故事，请按照要求帮我提取。\n'
novels='胡国华一想留着她晚上再变成活人怎么办，不如我一把火烧了它干净。把白纸人抗到院子里，取出火摺子，就想动手烧了纸人，这时纸人忽然开口说话：“你个死没良心的，我好心好意帮你，却想烧了我！'
# Step 1: extract the script from the excerpt.
response_output_json1=extract_script(my_api_key,add_head+novels)
print(response_output_json1.text)
# Step 2: convert the script to JSON. Pass the extracted text (a str) as the
# prompt rather than the whole output object.
response_output_json2=script_to_json(my_api_key,response_output_json1.text)
print(response_output_json2.text)

【旁白】胡国华心里盘算着。
【胡国华#心理#】留着她晚上再变成活人怎么办，不如我一把火烧了它干净。
【旁白】于是，他把白纸人扛到了院子里，从怀中摸出火摺子，刚准备点燃纸人，意想不到的事情发生了。
【纸人#意外#】“你个死没良心的，我好心好意帮你，你居然想烧了我！”
```json
{
    "1": {
        "characters": "旁白",
        "emotion": "",
        "lines": "胡国华心里盘算着。"
    },
    "2": {
        "characters": "胡国华",
        "emotion": "心理",
        "lines": "留着她晚上再变成活人怎么办，不如我一把火烧了它干净。"
    },
    "3": {
        "characters": "旁白",
        "emotion": "",
        "lines": "于是，他把白纸人扛到了院子里，从怀中摸出火摺子，刚准备点燃纸人，意想不到的事情发生了。"
    },
    "4": {
        "characters": "纸人",
        "emotion": "意外",
        "lines": "“你个死没良心的，我好心好意帮你，你居然想烧了我！”"
    }
}
```


### fix random speaker

In [None]:
rand_spk = chat.sample_random_speaker()
print(rand_spk)  # save it for later timbre recovery

params_infer_code = ChatTTS.Chat.InferCodeParams(
    spk_emb=rand_spk,
)

wav = chat.infer(
    "四川美食确实以辣闻名，但也有不辣的选择。比如甜水面、赖汤圆、蛋烘糕、叶儿粑等，这些小吃口味温和，甜而不腻，也很受欢迎。",
    params_infer_code=params_infer_code,
)

In [None]:
Audio(wav[0], rate=24_000, autoplay=True)

### Zero shot (simulate speaker)

In [None]:
# Import through the `ChatTTS.tools` package path, matching the other
# `ChatTTS.tools.*` imports in this notebook; a bare `tools` package is not
# importable from /content.
from ChatTTS.tools.audio import load_audio

# Derive a speaker sample from the reference audio, loaded at 24000 Hz.
spk_smp = chat.sample_audio_speaker(load_audio("sample.mp3", 24000))
print(spk_smp)  # save it in order to load the speaker without sample audio next time

params_infer_code = ChatTTS.Chat.InferCodeParams(
    spk_smp=spk_smp,
    txt_smp="与sample.mp3内容完全一致的文本转写。",
)

wav = chat.infer(
    "四川美食确实以辣闻名，但也有不辣的选择。比如甜水面、赖汤圆、蛋烘糕、叶儿粑等，这些小吃口味温和，甜而不腻，也很受欢迎。",
    params_infer_code=params_infer_code,
)

In [None]:
Audio(wav[0], rate=24_000, autoplay=True)

### Two stage control

In [65]:
text="于是，他把白纸人扛到了院子里，从怀中摸出火摺子，刚准备点燃纸人，意想不到的事情发生了。"
refined_text = chat.infer(text, refine_text_only=True)
refined_text

[+0000 20240811 04:00:37] [[37mINFO[0m] ChatTTS | core | all models has been initialized.
[+0000 20240811 04:00:37] [[37mINFO[0m] ChatTTS | norm | replace homophones: 摺->哲
text:  14%|█▍        | 54/384(max) [00:01, 38.33it/s]


['于 是 ， 他 把 白 纸 人 扛 到 了 院 子 里 [uv_break] ， 从 怀 中 摸 出 火 [uv_break] 这 个 哲 子 啊 ， 刚 准 备 点 燃 纸 人 [uv_break] ， 意 想 不 到 的 事 情 就 发 生 了 [uv_break] 。']

**From xia:Two stage control这部分用来给text加break，测试了一下效果不好。**

In [66]:
wav = chat.infer(text, skip_refine_text=True)

[+0000 20240811 04:00:44] [[37mINFO[0m] ChatTTS | core | all models has been initialized.
code:  20%|█▉        | 405/2048(max) [00:11, 36.60it/s]


In [67]:
Audio(wav[0], rate=24_000, autoplay=True)

## LLM Call

In [None]:
import os

from ChatTTS.tools.llm import ChatOpenAI

# Read the key from the DEEPSEEK_API_KEY environment variable so it never has
# to be written into the notebook; falls back to the empty placeholder when
# the variable is not set.
API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
client = ChatOpenAI(
    api_key=API_KEY, base_url="https://api.deepseek.com", model="deepseek-chat"
)

In [None]:
user_question = "四川有哪些好吃的美食呢?"

In [None]:
text = client.call(user_question, prompt_version="deepseek")
text

In [None]:
text = client.call(text, prompt_version="deepseek_TN")
text

In [None]:
wav = chat.infer(text)

In [None]:
Audio(wav[0], rate=24_000, autoplay=True)