# Fish Speech

### For Windows User / win用户

In [None]:
# Windows only: switch the console code page to 65001 (UTF-8).
!chcp 65001

### For Linux User / Linux 用户

In [1]:
import locale
# Linux only: select the en_US.UTF-8 locale for all categories (LC_ALL).
# setlocale returns the locale string it set, which is what this cell displays.
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

'en_US.UTF-8'

### Prepare Model

In [2]:
# For Chinese users, you probably want to use a mirror to accelerate downloading.
# Note: `!set ...` / `!export ...` each run in a throwaway subshell, so the variable would
# NOT be visible to the download command below. `%env` sets it on the kernel process itself,
# and every later `!command` inherits it (works on both Windows and Linux).
# %env HF_ENDPOINT=https://hf-mirror.com

# !huggingface-cli download fishaudio/fish-speech-1.5 --local-dir checkpoints/fish-speech-1.5/

## WebUI Inference

> You can use --compile to fuse CUDA kernels for faster inference (10x).

In [None]:
# Start the WebUI, pointing it at the LLaMA checkpoint directory and the decoder (generator) weights.
# The last line is joined onto the shell command by the trailing backslash; the `#` keeps
# `--compile` disabled. Remove the `# ` to enable it.
!python tools/run_webui.py \
    --llama-checkpoint-path checkpoints/fish-speech-1.5 \
    --decoder-checkpoint-path checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth \
    # --compile

## Break-down CLI Inference

### 1. Encode reference audio: / 从语音生成 prompt: 

You should get a `fake.npy` file.

你应该能得到一个 `fake.npy` 文件.

In [None]:
# Enter the path to the audio file here
# Runs the VQGAN inference script on the reference clip passed with -i; per the notes above,
# this should leave a `fake.npy` file behind.
# NOTE(review): --checkpoint-path is an absolute path on the author's machine; point it at
# your own copy of the generator weights.
!python fish_speech/models/vqgan/inference.py \
    -i ./Surtr-001.ogg \
    --checkpoint-path "/media/sam/pain/models/fishaudio/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"

  @autocast(enabled = False)
  @autocast(enabled = False)
  @autocast(enabled = False)
  @autocast(enabled = False)
[32m2025-04-19 23:09:22.735[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m47[0m - [1mLoaded model: <All keys matched successfully>[0m
[32m2025-04-19 23:09:22.736[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m76[0m - [1mProcessing in-place reconstruction of Surtr-001.ogg[0m
[32m2025-04-19 23:09:22.739[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m87[0m - [1mLoaded audio with 6.30 seconds[0m
  with autocast(enabled = False):
[32m2025-04-19 23:09:22.915[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m95[0m - [1mGenerated indices of shape torch.Size([8, 136])[0m
decoding start
[32m2025-04-19 23:09:23.078[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m118[0m - [1mGenerated audio of shape torch.Size([1, 1, 278528]), equivalent to 6.32 seconds from 136 features, features/second:

In [6]:
from IPython.display import Audio, display
# Embed an audio player for fake.wav in the cell output
# (presumably the reconstruction written by the previous cell; confirm).
audio = Audio(filename="fake.wav")
display(audio)

### 2. Generate semantic tokens from text: / 从文本生成语义 token:

> This command will create a codes_N file in the working directory, where N is an integer starting from 0.

> You may want to use `--compile` to fuse CUDA kernels for faster inference (~30 tokens/second -> ~300 tokens/second).

> 该命令会在工作目录下创建 codes_N 文件, 其中 N 是从 0 开始的整数.

> 您可以使用 `--compile` 来融合 cuda 内核以实现更快的推理 (~30 tokens/秒 -> ~300 tokens/秒)

In [12]:
# Generate semantic tokens for --text, conditioned on the reference transcript (--prompt-text)
# and the reference audio tokens (--prompt-tokens, the fake.npy from step 1).
# --num-samples 3 presumably requests three generations (codes_0..codes_2); confirm against the script.
# The final `# --compile` line is a plain Python comment (the flag is disabled).
!python fish_speech/models/text2semantic/inference.py \
    --text "他人の指導役はもうごめんだ。一般人たちと雁首揃えてアーツごっこするなんて興味ない。" \
    --prompt-text "あんた、自分の仕事も全うできないからって、私に助けろっていうの？" \
    --prompt-tokens "fake.npy" \
    --checkpoint-path "/media/sam/pain/models/fishaudio/fish-speech-1.5/" \
    --num-samples 3
    # --compile

[32m2025-04-19 23:44:05.579[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m1056[0m - [1mLoading model ...[0m
[32m2025-04-19 23:44:10.358[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m681[0m - [1mRestored model from checkpoint[0m
[32m2025-04-19 23:44:10.358[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m687[0m - [1mUsing DualARTransformer[0m
[32m2025-04-19 23:44:10.367[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m1070[0m - [1mTime to load model: 4.79 seconds[0m
starting generation
[32m2025-04-19 23:44:10.378[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_long[0m:[36m788[0m - [1mEncoded text: 他人の指導役はもうごめんだ。[0m
[32m2025-04-19 23:44:10.381[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_long[0m:[36m788[0m - [1mEncoded text: 一般人たちと雁首揃えてアーツごっこするなんて興味ない。[0m
[32m2025-04-19 23:44:10.381[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_long[0m:[36m806[0m - [1mGen

### 3. Generate speech from semantic tokens: / 从语义 token 生成人声:

In [24]:
# Decode the semantic tokens in codes_0.npy to audio with the VQGAN generator, then play fake.wav.
# NOTE(review): the recorded log reads temp/codes_0.npy while this command reads
# ./temp_0/codes_0.npy; the saved output looks stale, so re-run to confirm.
!python fish_speech/models/vqgan/inference.py \
    -i "./temp_0/codes_0.npy" \
    --checkpoint-path "/media/sam/pain/models/fishaudio/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"

from IPython.display import Audio, display
audio = Audio(filename="fake.wav")
display(audio)

  @autocast(enabled = False)
  @autocast(enabled = False)
  @autocast(enabled = False)
  @autocast(enabled = False)
[32m2025-04-20 04:58:58.258[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m47[0m - [1mLoaded model: <All keys matched successfully>[0m
[32m2025-04-20 04:58:58.258[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m100[0m - [1mProcessing precomputed indices from temp/codes_0.npy[0m
decoding start
Indices shape of: torch.Size([8, 149])
[32m2025-04-20 04:58:58.632[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m119[0m - [1mGenerated audio of shape torch.Size([1, 1, 305152]), equivalent to 6.9195464853 seconds from 149 features, features/second: 21.53[0m
decoding end: time taken: 0.36461329460144043
[32m2025-04-20 04:58:58.639[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m129[0m - [1mSaved audio to fake.wav[0m


In [None]:
# 305152 audio samples / 149 tokens = 2048 samples per token

In [None]:
# Decode the semantic tokens in codes_1.npy to audio with the VQGAN generator, then play fake.wav.
# NOTE(review): the recorded log reads temp/codes_1.npy while this command reads
# ./temp_0/codes_1.npy; the saved output looks stale, so re-run to confirm.
!python fish_speech/models/vqgan/inference.py \
    -i "./temp_0/codes_1.npy" \
    --checkpoint-path "/media/sam/pain/models/fishaudio/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"

from IPython.display import Audio, display
audio = Audio(filename="fake.wav")
display(audio)

  @autocast(enabled = False)
  @autocast(enabled = False)
  @autocast(enabled = False)
  @autocast(enabled = False)
[32m2025-04-19 23:59:55.388[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m47[0m - [1mLoaded model: <All keys matched successfully>[0m
[32m2025-04-19 23:59:55.388[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m100[0m - [1mProcessing precomputed indices from temp/codes_1.npy[0m
decoding start
Indices shape of: torch.Size([8, 138])
[32m2025-04-19 23:59:55.672[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m119[0m - [1mGenerated audio of shape torch.Size([1, 1, 282624]), equivalent to 6.4087074830 seconds from 138 features, features/second: 21.53[0m
decoding end: time taken: 0.2750844955444336
[32m2025-04-19 23:59:55.690[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m129[0m - [1mSaved audio to fake.wav[0m


In [None]:
# 282624 audio samples / 138 tokens = 2048 samples per token

In [None]:
# Decode the semantic tokens in codes_2.npy to audio with the VQGAN generator, then play fake.wav.
# NOTE(review): the recorded log reads temp/codes_2.npy while this command reads
# ./temp_0/codes_2.npy; the saved output looks stale, so re-run to confirm.
!python fish_speech/models/vqgan/inference.py \
    -i "./temp_0/codes_2.npy" \
    --checkpoint-path "/media/sam/pain/models/fishaudio/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"

from IPython.display import Audio, display
audio = Audio(filename="fake.wav")
display(audio)

  @autocast(enabled = False)
  @autocast(enabled = False)
  @autocast(enabled = False)
  @autocast(enabled = False)
[32m2025-04-19 23:59:59.041[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m47[0m - [1mLoaded model: <All keys matched successfully>[0m
[32m2025-04-19 23:59:59.041[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m100[0m - [1mProcessing precomputed indices from temp/codes_2.npy[0m
decoding start
Indices shape of: torch.Size([8, 153])
[32m2025-04-19 23:59:59.325[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m119[0m - [1mGenerated audio of shape torch.Size([1, 1, 313344]), equivalent to 7.1053061224 seconds from 153 features, features/second: 21.53[0m
decoding end: time taken: 0.27536630630493164
[32m2025-04-19 23:59:59.343[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m129[0m - [1mSaved audio to fake.wav[0m


In [None]:
# 313344 audio samples / 153 tokens = 2048 samples per token

In [26]:
# Decode the semantic tokens in codes_3.npy to audio with the VQGAN generator, then play fake.wav.
# NOTE(review): the recorded log reads temp/codes_3.npy while this command reads
# ./temp_0/codes_3.npy; the saved output looks stale, so re-run to confirm.
!python fish_speech/models/vqgan/inference.py \
    -i "./temp_0/codes_3.npy" \
    --checkpoint-path "/media/sam/pain/models/fishaudio/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"

from IPython.display import Audio, display
audio = Audio(filename="fake.wav")
display(audio)

  @autocast(enabled = False)
  @autocast(enabled = False)
  @autocast(enabled = False)
  @autocast(enabled = False)
[32m2025-04-20 05:00:52.413[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m47[0m - [1mLoaded model: <All keys matched successfully>[0m
[32m2025-04-20 05:00:52.414[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m100[0m - [1mProcessing precomputed indices from temp/codes_3.npy[0m
decoding start
Indices shape of: torch.Size([8, 168])
[32m2025-04-20 05:00:52.697[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m119[0m - [1mGenerated audio of shape torch.Size([1, 1, 344064]), equivalent to 7.8019047619 seconds from 168 features, features/second: 21.53[0m
decoding end: time taken: 0.27561259269714355
[32m2025-04-20 05:00:52.705[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m129[0m - [1mSaved audio to fake.wav[0m


In [None]:
# 344064 audio samples / 168 tokens = 2048 samples per token

In [None]:
# so i'm currently using fish speech 1.5, and i'm trying to think of a way to get real time streaming. currently there's two main components. the llm that takes in text + audio tokens, and outputs audio tokens, and there's also a GAN decoder that takes in those audio tokens and generates a .wav file from it. the GAN decoding process takes like 0.2 seconds, whereas the LLM decoding needs 3+ seconds (definitely the bottleneck). 
# currently the system waits until the LLM decoding finishes before moving on to the GAN decoding, which makes sense: if you have, say, a paragraph that you want to generate audio tokens for, the tone in the later parts of the paragraph can be highly dependent on the earlier parts, so it makes sense to wait until all the audio tokens are generated before doing the GAN decoding. the issue is that this prevents real-time output. the only way i can think of to get real-time audio generation is by going sentence by sentence or something, but that doesn't feel great.
# or... i'm not sure how exactly it's done, but assuming it's like eva-gan (though slightly different, since eva-gan takes in mel spectrogram histogram bins, whereas this GAN decoder takes in latent vectors), and each token always maps up through 1d transposed convolutions to the same amount of output time, then you might be able to pass overlapping chunks of audio tokens to the GAN, and just calculate the locations at which to splice the audio generations together?



# ok yeah. seems like it is like eva-gan with 1d convs and fixed upscaling. numerator is the audio length (last dim of the audio shape), and denominator is the token count (last dim of the indices shape):
# 305152 / 149 = 2048
# 282624 / 138 = 2048
# 313344 / 153 = 2048
# i think the overlapping chunk idea might work



# or maybe even. if the gan generation time scales significantly better than the token generation time, might be worth just passing in all previous tokens in along with the 100 new tokens, and send the spliced 100 new tokens to the client. (?)
# might be worth me testing on already generated indices, and just do a cumsum kinda generations, and see if the idea even works in the first place in terms of quality and speed.

# going to make a cumulative-inference script (`inference_cum.py`) in the vqgan folder



In [42]:
# Run the experimental cumulative decoder (inference_cum.py) on codes_2.npy; the log shows it
# decoding growing token prefixes and saving ./temp/cum-<N>.wav. Then play shit.wav.
# NOTE(review): shit.wav is presumably written by inference_cum.py; the visible log only
# mentions ./temp/cum-*.wav, so confirm.
!python fish_speech/models/vqgan/inference_cum.py \
    -i "./temp_0/codes_2.npy" \
    --checkpoint-path "/media/sam/pain/models/fishaudio/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"

from IPython.display import Audio, display
audio = Audio(filename="shit.wav")
display(audio)

  @autocast(enabled = False)
  @autocast(enabled = False)
  @autocast(enabled = False)
  @autocast(enabled = False)
[32m2025-04-20 06:02:46.902[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m48[0m - [1mLoaded model: <All keys matched successfully>[0m
[32m2025-04-20 06:02:46.902[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m101[0m - [1mProcessing precomputed indices from temp_0/codes_2.npy[0m
torch.Size([8, 153])
10
torch.Size([8, 10])
decoding start
Indices shape of: torch.Size([8, 10])
[32m2025-04-20 06:02:47.193[0m | [1mINFO    [0m | [36m__main__[0m:[36mrestore[0m:[36m123[0m - [1mGenerated audio of shape torch.Size([1, 1, 20480]), equivalent to 0.4643990930 seconds from 10 features, features/second: 21.53[0m
decoding end: time taken: 0.2819359302520752
0 10
saving cum to ./temp/cum-10.wav
[32m2025-04-20 06:02:47.208[0m | [1mINFO    [0m | [36m__main__[0m:[36mrestore[0m:[36m139[0m - [1mSaved audio to ./temp/cum-10.wav

In [None]:
# Stitch the per-chunk "new-<N>.wav" files back into a single clip, in token order.
import os

from IPython.display import Audio, display
from pydub import AudioSegment

# Directory holding the chunk wavs. It is used both for listing and for building the
# paths; the previous version listed "./temp" but prefixed the names with "./temp0/",
# so the files could not be found.
chunk_dir = "./temp"

# Keep only "new-<N>.wav" files and sort them numerically by N (a plain string sort
# would put new-100 before new-20).
chunk_names = sorted(
    (name for name in os.listdir(chunk_dir) if name.startswith("new")),
    key=lambda name: int(name.split(".")[0].split("-")[1]),
)
wavs = [os.path.join(chunk_dir, name) for name in chunk_names]

combined = AudioSegment.empty()
for wav in wavs:
  print(wav)
  combined += AudioSegment.from_wav(wav)  # append this chunk as-is, no cross-fade

combined.export("combined_output.wav", format="wav")

# Play the stitched result.
audio = Audio(filename="combined_output.wav")
display(audio)

./temp/new-10.wav
./temp/new-20.wav
./temp/new-30.wav
./temp/new-40.wav
./temp/new-50.wav
./temp/new-60.wav
./temp/new-70.wav
./temp/new-80.wav
./temp/new-90.wav
./temp/new-100.wav
./temp/new-110.wav
./temp/new-120.wav
./temp/new-130.wav
./temp/new-140.wav
./temp/new-150.wav
./temp/new-153.wav


In [None]:
# OK YEP. IT WORKS. splicing it together works!

# ok yep. i think it actually works. guess the vq-vae codebook for the decoder/encoder gan actually maps pretty consistently to the same sound, even when considering cross-context in gan. i don't even need to do any processing afterwards. just the raw spliced wav files, can be recombined. 

# i guess to check the codebook mapping idea, i can remove prior context for the gan generation and see if the audio still matches up.

In [34]:
# Re-run inference_cum.py on codes_2.npy. Per the note above, this run is meant to test decoding
# without prior context. The command line is identical to the earlier run, so the behavior was
# presumably switched inside the script; confirm.
!python fish_speech/models/vqgan/inference_cum.py \
    -i "./temp_0/codes_2.npy" \
    --checkpoint-path "/media/sam/pain/models/fishaudio/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"

  @autocast(enabled = False)
  @autocast(enabled = False)
  @autocast(enabled = False)
  @autocast(enabled = False)
[32m2025-04-20 05:48:07.973[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m48[0m - [1mLoaded model: <All keys matched successfully>[0m
[32m2025-04-20 05:48:07.973[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m101[0m - [1mProcessing precomputed indices from temp_0/codes_2.npy[0m
torch.Size([8, 153])
10
torch.Size([8, 10])
decoding start
Indices shape of: torch.Size([8, 10])
[32m2025-04-20 05:48:08.279[0m | [1mINFO    [0m | [36m__main__[0m:[36mrestore[0m:[36m123[0m - [1mGenerated audio of shape torch.Size([1, 1, 20480]), equivalent to 0.4643990930 seconds from 10 features, features/second: 21.53[0m
decoding end: time taken: 0.29726624488830566
0 10
saving cum to ./temp/cum-10.wav
[32m2025-04-20 05:48:08.285[0m | [1mINFO    [0m | [36m__main__[0m:[36mrestore[0m:[36m139[0m - [1mSaved audio to ./temp/cum-10.wav

In [1]:
# Stitch the per-chunk "ONLYnew-<N>.wav" files (decoded without prior context) back into
# a single clip, in token order.
import os

from IPython.display import Audio, display
from pydub import AudioSegment

# Directory holding the chunk wavs. It is used both for listing and for building the
# paths; the previous version listed "./temp" but prefixed the names with "./temp_0/",
# so the files could not be found.
chunk_dir = "./temp"

# Keep only "ONLYnew-<N>.wav" files and sort them numerically by N (a plain string sort
# would put ONLYnew-100 before ONLYnew-20).
chunk_names = sorted(
    (name for name in os.listdir(chunk_dir) if name.startswith("ONLYnew")),
    key=lambda name: int(name.split(".")[0].split("-")[1]),
)
wavs = [os.path.join(chunk_dir, name) for name in chunk_names]

combined = AudioSegment.empty()
for wav in wavs:
  print(wav)
  combined += AudioSegment.from_wav(wav)  # append this chunk as-is, no cross-fade

combined.export("combined_output-2.wav", format="wav")

# Play the stitched result.
audio = Audio(filename="combined_output-2.wav")
display(audio)

./temp/ONLYnew-10.wav
./temp/ONLYnew-20.wav
./temp/ONLYnew-30.wav
./temp/ONLYnew-40.wav
./temp/ONLYnew-50.wav
./temp/ONLYnew-60.wav
./temp/ONLYnew-70.wav
./temp/ONLYnew-80.wav
./temp/ONLYnew-90.wav
./temp/ONLYnew-100.wav
./temp/ONLYnew-110.wav
./temp/ONLYnew-120.wav
./temp/ONLYnew-130.wav
./temp/ONLYnew-140.wav
./temp/ONLYnew-150.wav
./temp/ONLYnew-153.wav


In [None]:
# VERY INTERESTING. THERE ARE ARTIFACTS WITHOUT LOOKING AT PRIOR CONTEXT. 

# OK very interesting. the prior context for the GAN does matter. without it, it still sounds pretty decent, but there are moments where the audio cuts out or has a very abrupt transition. rather amusing to think about. the earlier audio portions don't seem to be affected much if at all by later portions, but later portions are affected quite heavily by earlier portions. 
# considering the "multi-receptive fields" for the convolutions going on, i would've imagined that earlier upsamples would look ahead towards later audio tokens and be notably informed by it. guess it's not a significant effect.

In [None]:
# Test the cumulative ("cum") inference script for the dual-AR LLM stage: same text and prompt
# transcript as the earlier text2semantic run, but with surtr.npy as prompt tokens and one sample.
# The log shows the script loading both the VQGAN and the LLM.

# https://arknights.wikiru.jp/?%E3%82%B9%E3%83%AB%E3%83%88#voice
!python fish_speech/models/text2semantic/inference_cum.py \
    --text "他人の指導役はもうごめんだ。一般人たちと雁首揃えてアーツごっこするなんて興味ない。" \
    --prompt-text "あんた、自分の仕事も全うできないからって、私に助けろっていうの？" \
    --prompt-tokens "surtr.npy" \
    --checkpoint-path "/media/sam/pain/models/fishaudio/fish-speech-1.5/" \
    --num-samples 1
    # --compile

[32m2025-04-20 06:04:13.077[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m1197[0m - [1mLoading model VQGAN ...[0m
  @autocast(enabled = False)
  @autocast(enabled = False)
  @autocast(enabled = False)
  @autocast(enabled = False)
[32m2025-04-20 06:04:13.752[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model_vqgan[0m:[36m50[0m - [1mLoaded model: <All keys matched successfully>[0m
[32m2025-04-20 06:04:13.752[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m1208[0m - [1mLoading model LLM ...[0m
[32m2025-04-20 06:04:18.114[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model_llm[0m:[36m820[0m - [1mRestored model from checkpoint[0m
[32m2025-04-20 06:04:18.114[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model_llm[0m:[36m826[0m - [1mUsing DualARTransformer[0m
[32m2025-04-20 06:04:18.122[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m1222[0m - [1mTime to load model: 4.37 seconds[0m
starting generat

In [1]:
# Test the cumulative ("cum") inference script for the dual-AR LLM stage.
# NOTE(review): this command is identical to the previous cell; the log timestamps show it is a
# later re-run. Consider keeping only one copy.

# https://arknights.wikiru.jp/?%E3%82%B9%E3%83%AB%E3%83%88#voice
!python fish_speech/models/text2semantic/inference_cum.py \
    --text "他人の指導役はもうごめんだ。一般人たちと雁首揃えてアーツごっこするなんて興味ない。" \
    --prompt-text "あんた、自分の仕事も全うできないからって、私に助けろっていうの？" \
    --prompt-tokens "surtr.npy" \
    --checkpoint-path "/media/sam/pain/models/fishaudio/fish-speech-1.5/" \
    --num-samples 1
    # --compile

[32m2025-04-21 00:05:23.750[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m1197[0m - [1mLoading model VQGAN ...[0m
  @autocast(enabled = False)
  @autocast(enabled = False)
  @autocast(enabled = False)
  @autocast(enabled = False)
[32m2025-04-21 00:05:24.525[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model_vqgan[0m:[36m50[0m - [1mLoaded model: <All keys matched successfully>[0m
[32m2025-04-21 00:05:24.525[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m1208[0m - [1mLoading model LLM ...[0m
[32m2025-04-21 00:05:28.617[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model_llm[0m:[36m820[0m - [1mRestored model from checkpoint[0m
[32m2025-04-21 00:05:28.617[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model_llm[0m:[36m826[0m - [1mUsing DualARTransformer[0m
[32m2025-04-21 00:05:28.625[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m1222[0m - [1mTime to load model: 4.10 seconds[0m
starting generat

In [22]:
# Test the API variant of the cumulative inference script (inference_cum_api.py) with
# compilation disabled and output written under ./output/temp.
# The indented lines below are arguments from the earlier inference_cum.py runs, kept only as
# commented-out reference; they are NOT passed to this command.

    # --text "他人の指導役はもうごめんだ。一般人たちと雁首揃えてアーツごっこするなんて興味ない。" \
    # --prompt-text "あんた、自分の仕事も全うできないからって、私に助けろっていうの？" \
    # --prompt-tokens "surtr.npy" \
    # --checkpoint-path "/media/sam/pain/models/fishaudio/fish-speech-1.5/" \
# https://arknights.wikiru.jp/?%E3%82%B9%E3%83%AB%E3%83%88#voice
!python fish_speech/models/text2semantic/inference_cum_api.py \
    --checkpoint-path /media/sam/pain/models/fishaudio/fish-speech-1.5/ \
    --output-base-dir ./output/temp \
    --no-compile

Namespace(checkpoint_path='/media/sam/pain/models/fishaudio/fish-speech-1.5/', device='cuda', compile=False, output_base_dir='./output/temp', half=False)
sanity check
[32m2025-04-21 01:23:19.532[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m95[0m - [1mLoading VQGAN ...[0m
[32m2025-04-21 01:23:20.078[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model_vqgan[0m:[36m91[0m - [1mLoaded model: <All keys matched successfully>[0m
[32m2025-04-21 01:23:20.078[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m102[0m - [1mLoaded VQGAN in 0.546730[0m
[32m2025-04-21 01:23:20.084[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1215[0m - [1mLoading model LLM ...[0m
[32m2025-04-21 01:23:24.508[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model_llm[0m:[36m888[0m - [1mRestored model from checkpoint[0m
[32m2025-04-21 01:23:24.508[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model_llm[0m:[36m894[0m - [

In [33]:
# Run inference_cum_api.py with --compile and no --output-base-dir (the recorded Namespace shows
# compile=True and output_base_dir='./output').
# Everything down to the blank line is the previous invocation and its old arguments, kept only
# as commented-out reference; none of it is executed.

    # --text "他人の指導役はもうごめんだ。一般人たちと雁首揃えてアーツごっこするなんて興味ない。" \
    # --prompt-text "あんた、自分の仕事も全うできないからって、私に助けろっていうの？" \
    # --prompt-tokens "surtr.npy" \
    # --checkpoint-path "/media/sam/pain/models/fishaudio/fish-speech-1.5/" \
# https://arknights.wikiru.jp/?%E3%82%B9%E3%83%AB%E3%83%88#voice
# !python fish_speech/models/text2semantic/inference_cum_api.py \
#     --checkpoint-path /media/sam/pain/models/fishaudio/fish-speech-1.5/ \
#     --output-base-dir ./output/temp \
#     --no-compile

!python fish_speech/models/text2semantic/inference_cum_api.py \
  --checkpoint-path /media/sam/pain/models/fishaudio/fish-speech-1.5/ \
  --compile

Namespace(checkpoint_path='/media/sam/pain/models/fishaudio/fish-speech-1.5/', device='cuda', compile=True, output_base_dir='./output', half=False)
[32m2025-04-21 21:31:00.988[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m68[0m - [1mLoading VQGAN ...[0m
[32m2025-04-21 21:31:01.655[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model_vqgan[0m:[36m64[0m - [1mLoaded model: <All keys matched successfully>[0m
[32m2025-04-21 21:31:01.656[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m75[0m - [1mLoaded VQGAN in 0.667742[0m
[32m2025-04-21 21:31:01.662[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m1196[0m - [1mLoading model ...[0m
[32m2025-04-21 21:31:06.060[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m846[0m - [1mRestored model from checkpoint[0m
[32m2025-04-21 21:31:06.061[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m852[0m - [1mUsing DualARTransformer[0m
[32m2

In [54]:
# Run the VQGAN inference script on a second reference clip (voice/chen/aceship/CN_010.mp3);
# the log reports an in-place reconstruction and the generated indices shape.
!python fish_speech/models/vqgan/inference.py \
    -i "./voice/chen/aceship/CN_010.mp3" \
    --checkpoint-path "/media/sam/pain/models/fishaudio/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"

  @autocast(enabled = False)
  @autocast(enabled = False)
  @autocast(enabled = False)
  @autocast(enabled = False)
[32m2025-04-21 22:08:56.956[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m47[0m - [1mLoaded model: <All keys matched successfully>[0m
[32m2025-04-21 22:08:56.957[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m76[0m - [1mProcessing in-place reconstruction of voice/chen/aceship/CN_010.mp3[0m
[32m2025-04-21 22:08:56.961[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m87[0m - [1mLoaded audio with 6.89 seconds[0m
  with autocast(enabled = False):
[32m2025-04-21 22:08:57.148[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m95[0m - [1mGenerated indices of shape torch.Size([8, 149])[0m
decoding start
Indices shape of: torch.Size([8, 149])
[32m2025-04-21 22:08:57.326[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m119[0m - [1mGenerated audio of shape torch.Size([1, 1, 305152]), equival