In [7]:
from datasets import Dataset
import torch
from torch.utils.data import DataLoader
from diffusers import UNet2DModel, DDPMScheduler, DiffusionPipeline
from diffusers.training_utils import EMAModel
from accelerate import Accelerator
from tqdm import tqdm
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from torch.cuda.amp import autocast, GradScaler
from pathlib import Path
import numpy as np
import soundfile as sf
import matplotlib.pyplot as plt
import os
import librosa
import seaborn as sns
from typing import List
from diffusers import DDIMScheduler

In [2]:
import torch
from diffusers import UNet2DModel, DDPMScheduler, DDPMPipeline
import json

# Paths to the locally saved checkpoint files.
# Forward slashes are used everywhere: they work on every platform, whereas the
# previous "unet\config.json" only worked because "\c" is an unrecognised
# escape sequence (which recent Python versions warn about) and only on Windows.
ckpt_path           = "ddpm-spectogram-256/unet/diffusion_pytorch_model.bin"
config_path         = "ddpm-spectogram-256/unet/config.json"
scheduler_config    = "ddpm-spectogram-256/scheduler/scheduler_config.json"

# 1) Load model config & instantiate UNet2DModel
with open(config_path, "r", encoding="utf-8") as f:
    unet_cfg = json.load(f)
unet = UNet2DModel(**unet_cfg)

# 2) Load scheduler config & instantiate DDPMScheduler
with open(scheduler_config, "r", encoding="utf-8") as f:
    sched_cfg = json.load(f)
scheduler = DDPMScheduler(**sched_cfg)

# 3) Load weights into UNet
# weights_only=True restricts unpickling to tensor data, so a tampered
# checkpoint file cannot execute arbitrary code while being loaded.
state_dict = torch.load(ckpt_path, map_location="cpu", weights_only=True)
unet.load_state_dict(state_dict)

# 4) Assemble pipeline
pipeline = DDPMPipeline(unet=unet, scheduler=scheduler)
# Use the GPU when torch can see one; otherwise stay on the CPU instead of failing.
pipeline.to("cuda" if torch.cuda.is_available() else "cpu")
# Last expression of the cell: switches the UNet to eval mode and displays it.
pipeline.unet.eval()

UNet2DModel(
  (conv_in): Conv2d(1, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (time_proj): Timesteps()
  (time_embedding): TimestepEmbedding(
    (linear_1): Linear(in_features=128, out_features=512, bias=True)
    (act): SiLU()
    (linear_2): Linear(in_features=512, out_features=512, bias=True)
  )
  (down_blocks): ModuleList(
    (0-1): 2 x DownBlock2D(
      (resnets): ModuleList(
        (0-1): 2 x ResnetBlock2D(
          (norm1): GroupNorm(32, 128, eps=1e-05, affine=True)
          (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          (time_emb_proj): Linear(in_features=512, out_features=128, bias=True)
          (norm2): GroupNorm(32, 128, eps=1e-05, affine=True)
          (dropout): Dropout(p=0.0, inplace=False)
          (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          (nonlinearity): SiLU()
        )
      )
      (downsamplers): ModuleList(
        (0): Downsample2D(
          (conv): Con

In [None]:
import os
from pathlib import Path

# Configuration
num_samples = 10      # total number of images to generate
batch_size  = 2       # images requested from the pipeline per call
steps       = 1000    # passed to the pipeline as num_inference_steps
output_dir  = Path("generated")

if batch_size < 1:
    raise ValueError(f"batch_size must be >= 1, got {batch_size}")

# Make sure output directory exists
output_dir.mkdir(parents=True, exist_ok=True)

# Find existing sample files and determine next index.
# Filenames look like "sample_005.png"; anything whose suffix is not purely
# numeric is ignored.
existing = list(output_dir.glob("sample_*.png"))
existing_idxs = [
    int(p.stem.split("_")[1])
    for p in existing
    if p.stem.split("_")[1].isdigit()
]
# default=-1 covers both "no files yet" and "files exist but none is numbered",
# so numbering starts at 0 in either case instead of max() raising on an
# empty list.
next_idx = max(existing_idxs, default=-1) + 1

# Generate images.
# The final batch is shrunk to whatever is still missing, so exactly
# num_samples images are produced even when batch_size does not divide it.
all_images = []
remaining = num_samples
while remaining > 0:
    current_batch = min(batch_size, remaining)
    images = pipeline(batch_size=current_batch, num_inference_steps=steps).images
    all_images.extend(images)
    remaining -= current_batch

# Save with non-colliding filenames
for idx, img in enumerate(all_images, start=next_idx):
    filename = output_dir / f"sample_{idx:03d}.png"
    img.save(filename)

print(f"Saved {len(all_images)} images, starting at index {next_idx:03d}.")


In [8]:
from PIL import Image
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Fetch the pretrained pipeline first, then move it onto the chosen device.
# Later cells use its `mel` component to turn spectrogram images into audio.
pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-instrumental-hiphop-256")
pipe = pipe.to(device)

Loading pipeline components...:   0%|          | 0/3 [00:00<?, ?it/s]An error occurred while trying to fetch C:\Users\mharu\.cache\huggingface\hub\models--teticio--audio-diffusion-instrumental-hiphop-256\snapshots\a63b0c7e794925f74f021d356c13bce47cb69264\unet: Error no file named diffusion_pytorch_model.safetensors found in directory C:\Users\mharu\.cache\huggingface\hub\models--teticio--audio-diffusion-instrumental-hiphop-256\snapshots\a63b0c7e794925f74f021d356c13bce47cb69264\unet.
Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead.
Loading pipeline components...: 100%|██████████| 3/3 [00:00<00:00, 16.97it/s]
Expected types for unet: (<class 'diffusers.models.unets.unet_2d_condition.UNet2DConditionModel'>,), got <class 'diffusers.models.unets.unet_2d.UNet2DModel'>.


In [9]:
from PIL import Image
from IPython.display import Audio

# Convert one generated spectrogram image back into a waveform and play it.
# The path uses a forward slash so it also resolves outside Windows.
im = Image.open("generated-e6/sample_003.png")
a = pipe.mel.image_to_audio(im)

# Play back at the sample rate the mel converter itself is configured with,
# rather than a hand-typed constant (the previous 22100 would play the audio
# at the wrong speed whenever it differs from the converter's rate).
display(Audio(a, rate=pipe.mel.get_sample_rate()))

In [10]:
# Inspect the shape of `a`, the array returned by pipe.mel.image_to_audio.
a.shape

(130560,)

In [47]:
# Convert every PNG in the folder into an audio clip and collect the clips.
# Names are sorted because os.listdir() returns entries in arbitrary order;
# without sorting, the order of the clips (and therefore of the concatenated
# track built from them) would not be reproducible.
music = []
folder_path = "generated_augmented"
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.png'):
        continue
    image_file = os.path.join(folder_path, filename)
    # The context manager releases the file handle as soon as the image has
    # been converted, instead of leaving one open handle per file.
    with Image.open(image_file) as im:
        a = pipe.mel.image_to_audio(im)
    music.append(a)


In [48]:
# Pack the list of per-image clips into a single NumPy array.
music_np = np.asarray(music)

In [49]:
# Inspect the shape of the stacked clip array.
music_np.shape

(16, 130560)

In [50]:
# Play all clips back to back. The rate comes from the mel converter that
# produced the audio instead of a hand-typed 22100, so playback speed always
# matches the converter's configuration.
display(Audio(music_np.flatten(), rate=pipe.mel.get_sample_rate()))

In [42]:
# Peek at the first sample of the flattened array.
music_np.flatten()[0]

np.float32(-0.00020155688)

In [37]:
# Join all clips end to end and write them out as one WAV file.
waveform = music_np.flatten()

# Amplify by up to 10x, but cap the gain so the loudest sample never exceeds
# full scale (+/-1.0); a blind x10 can push samples out of range and distort
# the written file.
peak = float(np.abs(waveform).max()) if waveform.size else 0.0
gain = min(10.0, 1.0 / peak) if peak > 0 else 10.0

# Write at the mel converter's own sample rate rather than a hand-typed 22100.
sf.write('the_final.wav', waveform * gain, pipe.mel.get_sample_rate())