<a id='ContentList'></a>
# Content List

## Single to Single Generation 

### 1. [Text To Image](#TextToImage)

### 2. [Image To Text](#ImageToText)

### 3. [Text To Audio](#TextToAudio)

### 4. [Audio To Text](#AudioToText)

### 5. [Image To Audio](#ImageToAudio)

### 6. [Audio To Image](#AudioToImage)

### 7. [Text To Video](#TextToVideo)

## Multi-Conditioning Generation

### 1. [Text + Image + Audio To Image](#TextImageAudioToImage)

## Joint Multimodal Generation

### 1. [Text To Image+Text](#TextToImageText)

### 2. [Text To Video+Audio](#TextToVideoAudio)

<a id='LoadModel'></a>
# Load Model

In [None]:
import os
from core.models.model_module_infer import model_module

# Checkpoint file to load. Note that data_dir below is a hard-coded absolute
# path; adjust it to wherever the data lives on your machine.
model_load_path = 'model_alldms_sd_vd_largeaudioenc.pth'

# Build the inference wrapper, then call .cuda() and .eval() on it in turn;
# the final object is what every later cell uses as `inference_tester`.
inference_tester = model_module(
    data_dir='/data1/terrantang/mmdif-data/',
    pth=model_load_path,
).cuda().eval()


<a id='TextToImage'></a>
# Text To Image
### [Back to Menu](#ContentList)

In [None]:
# Text prompt that conditions the generation.
prompt = "Surfer on his surfboard in a wave"

# Number of samples to request; this variable is what is passed to the call
# below, so changing it here actually takes effect.
n_samples = 1

# Generate image(s) from the prompt.
# NOTE(review): this call passes the prompt as `ctx`, whereas the other
# single-condition cells in this notebook use `cin=...` together with
# `ctype=...` -- confirm which keyword `inference` expects.
images = inference_tester.inference(
    xtype='image',
    ctx=prompt,
    n_samples=n_samples,
    image_size=512,
)

# Show the first result as the cell output.
images[0]

<a id='TextToAudio'></a>
# Text To Audio
### [Back to Menu](#ContentList)

In [None]:
# Text prompt describing the sound to generate.
prompt = 'heavy raining.'

# Run inference with the prompt as the condition (ctype='prompt').
audio_spec = inference_tester.inference(
    xtype='audio',
    cin=prompt,
    ctype='prompt',
    scale=7.5,
    n_samples=1,
    ddim_steps=50,
)

# Convert the first returned spectrogram into a waveform.
first_spec = audio_spec[0]
audio_wavs = inference_tester.mel_spectrogram_to_waveform(first_spec)

# Plot the spectrogram with its first two dimensions swapped.
import matplotlib.pyplot as plt
spec_image = first_spec.squeeze().transpose(0, 1).cpu().numpy()
plt.imshow(spec_image[:, :])
plt.show()

# Audio player widget as the cell output (16 kHz playback rate).
from IPython.display import Audio
Audio(audio_wavs.squeeze(), rate=16000)

<a id='ImageToAudio'></a>
# Image To Audio
### [Back to Menu](#ContentList)

In [None]:
from PIL import Image
from core.common.utils import regularize_image

# Open the demo image that will condition the audio generation.
image_path = './assets/demo_files/rain_on_tree.jpg'
im = Image.open(image_path)

# Display it as the cell output.
im

In [None]:
# Number of audio samples to request.
n_samples = 1

# Generate a spectrogram conditioned on the image `im` (ctype='vision').
audio_spec = inference_tester.inference(
    xtype='audio',
    cin=im,
    ctype='vision',
    scale=7.5,
    n_samples=n_samples,
    ddim_steps=50,
)

# Convert the spectrogram output into a waveform.
audio_wavs = inference_tester.mel_spectrogram_to_waveform(audio_spec)

# Plot the spectrogram (first two dimensions swapped, at most 512 columns).
import matplotlib.pyplot as plt
spec_image = audio_spec.squeeze().transpose(0, 1).cpu().numpy()
plt.imshow(spec_image[:, :512])
plt.show()

# Audio player widget as the cell output (16 kHz playback rate).
from IPython.display import Audio
Audio(audio_wavs.squeeze(), rate=16000)

<a id='AudioToImage'></a>
# Audio To Image
### [Back to Menu](#ContentList)

In [None]:
# Load the input audio, prepare it for conditioning, and play it.
import torchaudio
import torch
from IPython.display import Audio

sample_rate = 16000  # target sampling rate in Hz
pad_time = 10.23     # clip length in seconds
target_len = int(sample_rate * pad_time)

path = './assets/demo_files/wind_chimes.wav'

audio_wavs, sr = torchaudio.load(path)

# Resample to 16 kHz, average over dim 0 (channels, per torchaudio.load's
# default layout), and keep at most `target_len` samples.
audio_wavs = torchaudio.functional.resample(
    waveform=audio_wavs, orig_freq=sr, new_freq=sample_rate,
).mean(0)[:target_len]

# Zero-pad shorter clips up to exactly `target_len` samples; `new_zeros`
# keeps the padding on the same dtype/device as the waveform.
padding = audio_wavs.new_zeros(target_len - audio_wavs.size(0))
audio_wavs = torch.cat([audio_wavs, padding], 0)

# Play the processed waveform (what the model will actually receive), as the
# other audio-loading cells do. Passing the file path instead would play the
# untouched file and silently ignore `rate`.
Audio(audio_wavs.squeeze(), rate=sample_rate)


In [None]:
# Number of images to request.
n_samples = 1

# Arguments for an image generation conditioned on the waveform
# `audio_wavs` (ctype='audio').
inference_kwargs = dict(
    xtype='image',
    cin=audio_wavs,
    ctype='audio',
    scale=7.5,
    n_samples=n_samples,
    image_size=512,
    ddim_steps=50,
)
images = inference_tester.inference(**inference_kwargs)

# Show the first result as the cell output.
images[0]

<a id='ImageToText'></a>
# Image To Text
### [Back to Menu](#ContentList)

In [None]:
from PIL import Image

# Open the demo image to be described.
image_path = './assets/demo_files/cat.jpg'
im = Image.open(image_path)

# Display it as the cell output.
im

In [None]:
# Number of text samples to request for the image.
n_samples = 4

# Arguments for a text generation conditioned on the image `im`
# (ctype='vision').
inference_kwargs = dict(
    xtype='text',
    cin=im,
    ctype='vision',
    n_samples=n_samples,
    ddim_steps=50,
    scale=7.5,
)
text = inference_tester.inference(**inference_kwargs)

# Show the generated text as the cell output.
text

<a id='AudioToText'></a>
# Audio To Text
### [Back to Menu](#ContentList)

In [None]:
import torchaudio
import torch
from IPython.display import Audio

path = './assets/demo_files/train_sound.flac'

# Load the clip, resample it to 16 kHz, average over dim 0, and keep at most
# 10.23 seconds' worth of samples.
audio_wavs, sr = torchaudio.load(path)
resampled = torchaudio.functional.resample(waveform=audio_wavs, orig_freq=sr, new_freq=16000)
max_samples = int(16000 * 10.23)
audio_wavs = resampled.mean(0)[:max_samples]

# Audio player widget for the processed waveform as the cell output.
Audio(audio_wavs.squeeze(), rate=16000)

In [None]:
# Number of text samples to request for the audio clip.
n_samples = 4

# Arguments for a text generation conditioned on the waveform `audio_wavs`
# (ctype='audio').
inference_kwargs = dict(
    xtype='text',
    cin=audio_wavs,
    ctype='audio',
    n_samples=n_samples,
    ddim_steps=50,
    scale=7.5,
)
text = inference_tester.inference(**inference_kwargs)

# Show the generated text as the cell output.
text

<a id='TextImageAudioToImage'></a>
#  Text + Image + Audio To Image

### [Back to Menu](#ContentList)

In [None]:
# Load the audio condition
import torchaudio
import torch
from IPython.display import Audio

path = './assets/demo_files/classic_music.flac'

# Load the clip, resample it to 16 kHz, average over dim 0, and keep at most
# 10.23 seconds' worth of samples.
audio_wavs, sr = torchaudio.load(path)
resampled = torchaudio.functional.resample(waveform=audio_wavs, orig_freq=sr, new_freq=16000)
max_samples = int(16000 * 10.23)
audio_wavs = resampled.mean(0)[:max_samples]

# Audio player widget for the processed waveform as the cell output.
Audio(audio_wavs.squeeze(), rate=16000)


In [None]:
from PIL import Image

# Text condition.
prompt = 'dawn, dawn scenery, beautiful lighting.'

# Image condition: open the demo image and resize it to 512x512.
image_path = './assets/demo_files/van_gogh_image.jpg'
im = Image.open(image_path).resize((512, 512))

# Display the resized image as the cell output.
im

In [None]:
# Number of images to request.
n_samples = 1

# Keyword arguments: the three conditions (audio waveform, text prompt,
# image) plus the two mixing weights.
dualguided_kwargs = dict(
    cad=audio_wavs,
    ctx=prompt,
    cim=im,
    n_samples=n_samples,
    image_size=512,
    mixing=0.25,
    mixing_c2=0.45,
)

# Generate an image guided by all three conditions.
images = inference_tester.application_dualguided('image', **dualguided_kwargs)

# Show the first result as the cell output.
images[0]

<a id='TextToImageText'></a>
# Text To Image + Text

### [Back to Menu](#ContentList)

In [None]:
# Text prompt: the only condition for this joint generation.
prompt = 'deep diving in coral reef underwater.'

# Jointly generate an image and a text from the prompt.
# The earlier version of this cell was copied from the multi-conditioning
# example: it requested only 'image' and also fed the stale `audio_wavs` and
# `im` from previous cells, which does not match the (image, text) unpacking
# below or the section title.
# NOTE(review): the ['image', 'text'] output spec mirrors the
# ['video', 'audio'] call used for joint generation elsewhere in this
# notebook -- confirm against `application_dualguided`'s signature.
outputs = inference_tester.application_dualguided(
    ['image', 'text'],
    ctx=prompt,
    n_samples=1,
    image_size=512,
)

# The first sample is unpacked as an (image, text) pair.
image, text = outputs[0]

In [None]:
# Display the `image` unpacked from `outputs[0]` as the cell output.
image

In [None]:
# Display the `text` unpacked from `outputs[0]` as the cell output.
text

<a id='TextToVideoAudio'></a>
# Text To Video + Audio

### [Back to Menu](#ContentList)

In [None]:
# Text prompt: the only condition for this joint generation.
prompt = 'deep diving in coral reef underwater.'

# Number of samples to request; this variable is what is passed to the call
# below, so changing it here actually takes effect.
n_samples = 1

# Jointly generate a video and an audio spectrogram from the prompt.
outputs = inference_tester.application_dualguided(
    ['video', 'audio'],
    ctx=prompt,
    n_samples=n_samples,
    image_size=256,
    ddim_steps=50,
    num_frames=8,
    scale=7.5,
)

# The first sample is unpacked as a (video, audio spectrogram) pair.
video, audio_spec = outputs[0]

# Convert the spectrogram into a waveform for playback.
audio_wavs = inference_tester.mel_spectrogram_to_waveform(audio_spec)

# Plot the spectrogram (first two dimensions swapped, at most 512 columns).
import matplotlib.pyplot as plt
plt.imshow(audio_spec.squeeze().transpose(0, 1).cpu().numpy()[:, :512])
plt.show()

# Audio player widget as the cell output (16 kHz playback rate).
from IPython.display import Audio
Audio(audio_wavs.squeeze(), rate=16000)

In [None]:
# Save the generated frames as an animated GIF and display it.
from PIL import Image
from IPython import display
# Import IPython's Image under an alias so it does not shadow PIL's `Image`,
# which other cells rely on for `Image.open(...)`.
from IPython.display import Image as IPyImage

path = "./generated_video.gif"

# Write every frame into one looping GIF; the per-frame duration is chosen so
# the whole clip lasts about 2000 ms.
frame_one = video[0]
frame_one.save(path, format="GIF", append_images=video[1:],
               save_all=True, duration=2000/len(video), loop=0)

# Read the file back with a context manager so the handle is closed, and
# label the bytes as GIF (they were previously declared as PNG).
with open(path, 'rb') as gif_file:
    gif_bytes = gif_file.read()
IPyImage(data=gif_bytes, format='gif')