In [None]:
%cd /content
!git lfs install
!git clone https://huggingface.co/spaces/ASLP-lab/DiffRhythm
%cd /content/DiffRhythm
!ls
!mkdir example
!apt install espeak-ng

In [None]:
!pip install -r requirements.txt

In [22]:
config_text = """
import argparse
import torch
import torchaudio
import os

# DiffRhythm関連
from diffrhythm.infer.infer_utils import (
    prepare_model,
    get_reference_latent,
    get_lrc_token,
    get_style_prompt,
    get_negative_style_prompt
)
from diffrhythm.infer.infer import inference

def infer_music(
    cfm_model,
    vae_model,
    tokenizer,
    muq,
    device,
    lrc_text,
    ref_audio_path,
    steps=32,
    file_type="wav",
    max_frames=2048
):
    '''Build every conditioning input and run a single DiffRhythm inference.

    Args:
        cfm_model: Model forwarded to inference() as cfm_model.
        vae_model: Model forwarded to inference() as vae_model.
        tokenizer: Tokenizer handed to get_lrc_token().
        muq: Object handed to get_style_prompt() together with the reference audio.
        device: Device identifier forwarded to the helper functions.
        lrc_text: Lyrics text (LRC format) to tokenize.
        ref_audio_path: Path of the reference audio used for the style prompt.
        steps: Number of sampling steps forwarded to inference().
        file_type: Output format name forwarded to inference().
        max_frames: Frame count used for the reference latent and as duration.

    Returns:
        Whatever inference() returns, unchanged.
    '''
    # sway_sampling_coef is -1 when fewer than 32 steps are requested, otherwise None.
    if steps < 32:
        sway_coef = -1
    else:
        sway_coef = None

    # Prepare the conditioning inputs (helpers are called in the same order as before).
    lyric_tokens, lyric_start = get_lrc_token(lrc_text, tokenizer, device)
    style_embedding = get_style_prompt(muq, ref_audio_path)
    negative_embedding = get_negative_style_prompt(device)
    reference_latent = get_reference_latent(device, max_frames)

    # Collect the keyword arguments first, then hand them to inference() in one go.
    sampler_kwargs = dict(
        cfm_model=cfm_model,
        vae_model=vae_model,
        cond=reference_latent,
        text=lyric_tokens,
        duration=max_frames,
        style_prompt=style_embedding,
        negative_style_prompt=negative_embedding,
        steps=steps,
        sway_sampling_coef=sway_coef,
        start_time=lyric_start,
        file_type=file_type,
    )
    return inference(**sampler_kwargs)


def _resolve_device(requested):
    '''Return the device string to run on, falling back to "cpu" when needed.

    Args:
        requested: Value of the --device option ("cuda", "mps" or "cpu").

    Returns:
        "mps" or "cuda" when that backend was requested and torch reports it
        as available; otherwise "cpu".
    '''
    if requested == "mps":
        if torch.backends.mps.is_available():
            print("MPSを使用します。")
            return "mps"
        print("Warning: MPSがサポートされていない環境のようです。CPUを使用します。")
        return "cpu"
    if requested == "cuda":
        if torch.cuda.is_available():
            return "cuda"
        print("Warning: CUDAがサポートされていない環境のようです。CPUを使用します。")
        return "cpu"
    if requested != "cpu":
        # Unknown names used to fall back silently; tell the user what happened.
        print(f"Warning: unknown device '{requested}'; falling back to CPU.")
    return "cpu"


def _write_result(result, output_path):
    '''Write the value returned by infer_music() to output_path.

    Args:
        result: Either a bytes-like payload, or a sequence whose first item is
            the sample rate and whose second item is the waveform (a torch
            tensor or a numpy array).
        output_path: Destination file name.
    '''
    # NOTE(review): for non-wav file types the upstream inference() appears to
    # return already-encoded audio bytes instead of a (sample_rate, waveform)
    # pair - confirm against diffrhythm.infer.infer. A bytes payload is
    # written to disk verbatim.
    if isinstance(result, (bytes, bytearray)):
        with open(output_path, "wb") as f:
            f.write(result)
        return

    sample_rate = result[0]
    waveform_data = result[1]

    # Convert the waveform to a torch.Tensor when it is not one already
    # (for example a numpy array).
    if not isinstance(waveform_data, torch.Tensor):
        waveform_tensor = torch.from_numpy(waveform_data).float()
    else:
        waveform_tensor = waveform_data

    # torchaudio expects (channels, samples), so transpose,
    # e.g. (4194304, 2) -> (2, 4194304).
    waveform_tensor = waveform_tensor.T

    torchaudio.save(output_path, waveform_tensor, sample_rate)


def main():
    '''Command-line entry point: generate music from lyrics and a reference clip.

    Parses the command-line options, resolves the compute device, loads the
    models through prepare_model(), runs infer_music() and writes the result
    to the path given by --output.

    Exits through argparse (status 2) when the lyrics file or the reference
    audio file does not exist.
    '''
    parser = argparse.ArgumentParser(
        description="DiffRhythmを用いて歌詞と参照音声から音楽を生成するスクリプト"
    )
    parser.add_argument("--lyrics", type=str, default="example/eg.lrc",
                        help="歌詞ファイル（LRC形式）のパス")
    parser.add_argument("--ref_audio", type=str, default="example/pray.mp3",
                        help="参照音声ファイルのパス（10秒以上推奨）")
    parser.add_argument("--output", type=str, default="output.wav",
                        help="生成結果を保存するファイル名 (wav/mp3/oggなど拡張子は適宜変更)")
    parser.add_argument("--steps", type=int, default=32,
                        help="Diffusionのステップ数（推奨: 32以上）")
    parser.add_argument("--file_type", type=str, default="wav",
                        choices=["wav", "mp3", "ogg"],
                        help="出力フォーマット")
    parser.add_argument("--max_frames", type=int, default=2048,
                        help="最大フレーム数（曲の長さに相当）")
    parser.add_argument("--device", type=str, default="cuda",
                        help="推論に使用するデバイス (cuda, mps, or cpu)")

    args = parser.parse_args()

    # Fail fast on missing inputs, before the (slow) model load.
    for option, path in (("--lyrics", args.lyrics), ("--ref_audio", args.ref_audio)):
        if not os.path.isfile(path):
            parser.error(f"{option}: file not found: {path}")

    # Check whether the device requested with --device is actually usable.
    device = _resolve_device(args.device)
    print(f"Using device: {device}")

    # Load the models.
    print("Loading models...")
    cfm, tokenizer, muq, vae = prepare_model(device)

    # Optimise with torch.compile where it is available (PyTorch 2.0+).
    # This stays best-effort, but only ordinary exceptions are absorbed now
    # (not KeyboardInterrupt/SystemExit) and the reason is reported.
    try:
        cfm = torch.compile(cfm)
    except Exception as exc:
        print(f"Warning: torch.compile is unavailable ({exc}); continuing without it.")

    # Read the lyrics (LRC).
    with open(args.lyrics, 'r', encoding='utf-8') as f:
        lrc_text = f.read()

    # Generate the music.
    print("Generating music...")
    generated_waveform = infer_music(
        cfm_model=cfm,
        vae_model=vae,
        tokenizer=tokenizer,
        muq=muq,
        device=device,
        lrc_text=lrc_text,
        ref_audio_path=args.ref_audio,
        steps=args.steps,
        file_type=args.file_type,
        max_frames=args.max_frames
    )

    # Save the output.
    _write_result(generated_waveform, args.output)
    print("Done.")


# Run the command-line entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


"""

# Materialise the script text held in config_text as main.py in the current
# working directory (UTF-8, overwriting any existing file).
with open("main.py", mode="w", encoding="utf-8") as script_file:
    script_file.write(config_text)

In [23]:
# Example lyrics in LRC form: one "[mm:ss.xx] text" entry per line.
# NOTE: this rebinds config_text, which held the main.py source in the previous cell.
config_text = """
[00:10.00] The cold wind pierces through my heart
[00:13.20] The blurry streetlights hide my tears
[00:16.85] The moment I knew what loneliness was
[00:20.40] I felt like I found a piece of the future
[00:24.15] Beyond the locked door
[00:27.65] I hear a whispering dream
[00:31.30] I want to believe, but I'm so afraid
[00:34.90] Reaching out with trembling hands
[00:38.55] Under the stardust night, I make a wish
[00:42.10] That your smile will never fade away
[00:45.75] Holding onto strength within the fleeting moments
[00:49.25] I will keep chasing the light, again and again
[00:52.00] Guided by the signpost soaked in rain
[00:55.30] I trace back the memories from afar
[00:58.90] The unseen future makes me anxious
[01:02.50] But a small flame flickers deep in my heart
[01:06.25] There's no dream that’s out of reach
[01:09.75] Because you were the one who showed me
[01:13.40] I won’t forget, no matter when
[01:16.95] Your voice will always lead me

"""

# Write the lyrics to example/eg.lrc, the default value of main.py's --lyrics option.
with open("example/eg.lrc", "w", encoding="utf-8") as f:
  f.write(config_text)

In [None]:
# Run the generated script with its argparse defaults
# (lyrics: example/eg.lrc, reference audio: example/pray.mp3, output: output.wav).
# NOTE(review): no cell visible here creates example/pray.mp3 - confirm it exists before running.
!python main.py