<a href="https://colab.research.google.com/github/rmnrnm/TNS-Pull-Request-Practice/blob/master/demo_riffusion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# riffusion colab demo

Run [riffusion](https://www.riffusion.com/about) in a gradio demo with a colab host

Riffusion project by [Seth Forsgren](https://twitter.com/sethforsgren) and [Hayk Martiros](https://github.com/hmartiro), colab notebook by [Jasper Gilley](https://twitter.com/0xjasper)

Feel free to DM Jasper on Twitter if you have any problems with the notebook

Some cool prompt ideas can be found at https://ai-art-wiki.com/wiki/Riffusion#Prompts

In [None]:
!nvidia-smi

In [None]:
#@title Clone the inference repo
!git clone https://github.com/hmartiro/riffusion-inference
%cd riffusion-inference

In [None]:
#@title Install requirements (you may need to restart the kernel after this)
!pip install -r requirements.txt
!pip install gradio
!pip install --upgrade pillow
!pip install Pillow==9.0.0

In [None]:
#@title Imports
from diffusers import DiffusionPipeline
from riffusion.spectrogram_image_converter import b
from riffusion.spectrogram_params import SpectrogramParams
from io import BytesIO
from IPython.display import Audio

pipe = DiffusionPipeline.from_pretrained("riffusion/riffusion-model-v1")
pipe = pipe.to("cuda")

In [None]:
#@title Define a `predict` function

params = SpectrogramParams()
converter = SpectrogramImageConverter(params)

def predict(prompt, negative_prompt=None):
    spec = pipe(
        prompt,
        #negative_prompt=negative_prompt,
        width=512,
    ).images[0]
    
    wav = converter.audio_from_spectrogram_image(image=spec)
    wav.export(f"{prompt.replace(' ','_')}.wav", format='wav')
    return 'output.wav', spec

In [None]:
#@title Run with Colab interface
prompt = "ariana grande"#@param {type:"string"}
negative_prompt = "drums"#@param {type:"string"}

path, spec = predict(prompt)

#display(spec)
Audio(f"{prompt.replace(' ','_')}.wav")

In [None]:
import PIL.Image    
if not hasattr(PIL.Image, 'Transpose'):  
    PIL.Image.Transpose = PIL.Image  

In [None]:
#@title Run a Gradio demo
import gradio as gr

gr.Interface(
    predict,
    inputs=["text", "text"],
    outputs=[gr.outputs.Audio(type='filepath'), gr.outputs.Image(type='pil')],
    title="Riffusion",
).launch(share=True, debug=True)

In [None]:
from riffusion.spectrogram_image_converter import 


In [None]:
#@title Upload your own files for style transfer
#@markdown #### Drop your audio files (.wav is best) in Colab's file uploader, then type the filename below and run

from google.colab import files
# uploaded = files.upload()

from scipy.io import wavfile
import numpy as np
from PIL import Image

filename = "typing.wav"#@param {type:"string"}

# read uploaded file to wav
rate, data = wavfile.read(f'../{filename}')

# resample from 48000 to 44100
# from scipy.signal import resample
# data = resample(data, int(data.shape[0] * 44100 / 48000))

# convert to mono
#data = np.mean(data, axis=1)

# convert to float32
data = data.astype(np.float32)

# take a random 7 second slice of the audio
data = data[rate*7:rate*14]

spectrogram = spectrogram_from_waveform(
    waveform=data,
    sample_rate=rate,
    # width=768,
    n_fft=8192,
    hop_length=512,
    win_length=8192,
)

def image_from_spectrogram(
    spectrogram: np.ndarray, max_volume: float = 50, power_for_image: float = 0.25
) -> Image.Image:
    """
    Compute a spectrogram image from a spectrogram magnitude array.
    """
    # Apply the power curve
    data = np.power(spectrogram, power_for_image)

    # Rescale to 0-255
    data = data * 255 / max_volume

    # Invert
    data = 255 - data

    # Convert to a PIL image
    image = Image.fromarray(data.astype(np.uint8))

    # Flip Y
    image = image.transpose(Image.FLIP_TOP_BOTTOM)

    # Convert to RGB
    image = image.convert("RGB")

    return image

spec = image_from_spectrogram(spectrogram)
# img.save('../rondo_alla_turca.png')
# display(img)

# Audio(data, rate=rate)

# wav2 = wav_bytes_from_spectrogram_image(img)
# with open("../inverse-spectro.wav", "wb") as f:
#     f.write(wav2[0].getbuffer())

# Audio('../inverse-spectro.wav')

In [None]:
#@title Audio-to-audio based on the previous generated sound: define new pipeline

import torch
from diffusers import StableDiffusionImg2ImgPipeline

# load the pipeline
device = "cuda"
MODEL_ID = "riffusion/riffusion-model-v1"
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(MODEL_ID, torch_dtype=torch.float16)

pipe = pipe.to(device)

#### NOTE: Colab doesn't have enough memory to simultaneously load both the base riffusion pipeline and the audio2audio pipeline. You'll need to re-run the first 'imports' cell if you want to run regular riffusion after instantiating the audio2audio pipeline in in this cell

#### you can run the audio2audio inference cell (below) as many times as you like, though

In [None]:
#@title Audio-to-audio: run inference
prompt = "epic orchestra symphony" #@param {type:"string"}

images = pipe(
    prompt=prompt,
    image=spec,
    strength=0.5,
    guidance_scale=7
).images

wav = converter.audio_from_spectrogram_image(images[0])
wav.export('audio2audio.wav', format='wav')
Audio('audio2audio.wav')