<a href="https://colab.research.google.com/github/eloimoliner/CQTdiff/blob/main/notebooks/demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Solving Audio Inverse Problems with a Diffusion Model

This notebook is a demo of the diffusion-based method for solving audio inverse problems proposed in:

> E. Moliner, J. Lehtinen and V. Välimäki, "Solving audio inverse problems with a diffusion model", submitted to IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP) 2023,
Rhodes, Greece, May 2023.

Listen to our [audio samples](http://research.spa.aalto.fi/publications/papers/icassp23-cqt-diff/)

### Instructions for running:

* Make sure to use a GPU runtime, click:  __Runtime >> Change Runtime Type >> GPU__
* Press ▶️ on the left of each of the cells
* View the code: Double-click any of the cells
* Hide the code: Double click the right side of the cell


In [1]:
# !git clone https://github.com/eloimoliner/CQTdiff.git
# %cd CQTdiff
# !bash download_weights_and_examples.sh

In [3]:
#@title #Setup environment

#@markdown Execute this cell to setup the environment
#! git clone git@github.com:eloimoliner/CQTdiff.git
#%cd gramophone_noise_synth
#! wget https://github.com/eloimoliner/gramophone_noise_synth/releases/download/gramophonediff/weights-750000.pt
#! mkdir experiments
#! mkdir experiments/trained_model
#! mv weights-750000.pt experiments/trained_model/

# !pip install omegaconf
# ! pip install dotmap
# ! pip install Ninja

import os
#import hydra
import logging
import os
# These environment variables are set BEFORE `import torch` below, so torch sees them
# when it is first imported.
# NOTE(review): an empty CUDA_VISIBLE_DEVICES presumably hides every GPU from this
# process, which conflicts with the "use a GPU runtime" instruction at the top of the
# notebook and makes the later `torch.cuda.is_available()` check fall back to CPU —
# confirm that forcing CPU is intended, otherwise remove these two lines.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
os.environ["TORCH_CUDA_ARCH_LIST"] = ""


import torch
# Provisional default; `device` is reassigned further down in this cell.
device = torch.device("cpu")

import torchaudio
# torch.cuda.empty_cache()
import soundfile as sf

from omegaconf import OmegaConf
from omegaconf.omegaconf import open_dict
import numpy as np
from datetime import date

#from learner import Learner
#from model import UNet
import IPython

from tqdm import tqdm

import scipy.signal


import yaml
from pathlib import Path
from dotmap import DotMap

import glob
from IPython.display import Audio 

# args = yaml.safe_load(Path('conf/conf.yaml').read_text())
# args = DotMap(args)


# --- locate repo root and load conf.yaml safely (works in notebooks) ---
from pathlib import Path
import yaml

def find_repo_root(start: Path) -> Path:
    """Return the nearest directory at or above ``start`` that holds ``conf/conf.yaml``.

    Args:
        start: Directory from which the upward search begins; it is resolved to an
            absolute path before searching.

    Returns:
        The first directory (``start`` itself or one of its ancestors, nearest first)
        for which ``<dir>/conf/conf.yaml`` exists.

    Raises:
        FileNotFoundError: If neither ``start`` nor any ancestor contains the file.
    """
    start = start.resolve()
    # Candidates are visited from the starting directory outwards to the filesystem root.
    candidates = (start, *start.parents)
    found = next(
        (folder for folder in candidates if (folder / "conf" / "conf.yaml").exists()),
        None,
    )
    if found is None:
        raise FileNotFoundError(f"Couldn't find conf/conf.yaml starting from {start}")
    return found

import sys

# Locate the repository (the directory holding conf/conf.yaml) starting from the
# kernel's working directory, and make it importable so `src.*` modules resolve.
repo_root = find_repo_root(Path.cwd())
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))
print("Repo root added to sys.path:", repo_root)

# Parse the YAML configuration and wrap it in a DotMap for attribute-style access.
conf_path = repo_root / "conf" / "conf.yaml"
print("Using conf:", conf_path)
args = DotMap(yaml.safe_load(conf_path.read_text(encoding="utf-8")))
# ----------------------------------------------------------------------

# Select the compute device.
# NOTE(review): CUDA_VISIBLE_DEVICES is set to "" earlier in this cell, so this
# presumably always resolves to CPU — confirm that is intended.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

dirname = os.getcwd()

# Define the path where audio samples and other logs will be saved. It is resolved
# against the current working directory, not against repo_root.
args.model_dir = os.path.join(dirname, str(args.model_dir))
# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(args.model_dir, exist_ok=True)

# Network architecture and the checkpoint to load (path is relative to the repo root).
args.architecture = "unet_CQT"
args.inference.checkpoint = os.path.join(repo_root, "experiments", "cqt", "cqt_weights.pt")

args.sample_rate = 22050
args.resample_factor = 1
args.inference.load.load_mode = "from_directory"

#mkdir examples_dir
#copy the files there from somewhere
# This default is overwritten by the "Select audio example" cell.
args.inference.load.data_directory = os.path.join(dirname, "data_dir")
args.inference.load.seg_idx = 0

args.inference.load.seg_size = 65536

# CQT and diffusion settings applied on top of conf.yaml.
args.cqt.numocts = 7
args.diffusion_parameters.sigma_data = 0.057
args.cqt.use_norm = False


#import src.utils.setup as utils_setup
#test_set = utils_setup.get_test_set_for_sampling(args)

Repo root added to sys.path: C:\Projects\CQTdiffOFDM
Using conf: C:\Projects\CQTdiffOFDM\conf\conf.yaml


In [4]:
#@title Select audio example

import glob
import os
import soundfile as sf
from IPython.display import Audio
from pathlib import Path
import torch

# Build the example directory from `repo_root` (found by find_repo_root in the setup
# cell) instead of assuming the kernel's working directory is <repo>/notebooks.
# REPO_ROOT is kept as an alias for any code that still refers to it.
REPO_ROOT = repo_root
args.inference.load.data_directory = str(REPO_ROOT / "examples" / "data_dir")
print("Using data_directory:", args.inference.load.data_directory)

# glob returns files in an OS-dependent order; sorting makes the chosen example
# reproducible across machines and runs.
files = sorted(
    glob.glob(
        os.path.join(args.inference.load.data_directory, "**", "*.wav"),
        recursive=True,
    )
)

print(f"Found {len(files)} wav files")

if len(files) == 0:
    raise RuntimeError("No .wav files found — check data_directory")

# Use the third file when available, otherwise the last one.
example = min(2, len(files) - 1)
audio_file = files[example]

segnp, fs = sf.read(audio_file)
# soundfile returns (frames, channels) for multichannel files; downmix to mono so
# that `seg` is (1, samples) and `seg.shape[-1]` is the length in samples.
if segnp.ndim > 1:
    segnp = segnp.mean(axis=1)
# Keep only the first 8192 samples of the file.
segnp = segnp[:8192]

args.audio_len = segnp.shape[0]
seg = torch.Tensor(segnp).unsqueeze(0)

Audio(data=segnp, rate=fs)


Using data_directory: C:\Projects\CQTdiffOFDM\examples\data_dir
Found 149 wav files


In [7]:
#@title Audio Declipping
#@markdown Execute this cell to run audio declipping experiments
#@markdown ## Diffusion schedule
args.inference.mode = 'declipping'
mode=args.inference.mode

# Length of the selected example; `seg` comes from the "Select audio example" cell.
audio_len=seg.shape[-1]
#@markdown Number of discretization steps (recommended: 35)
num_steps = 35 #@param {type:"slider", min:0, max:100, step:1}
args.inference.T=num_steps

#@markdown minimum noise level (recommended: 0.0001)
sigma_min = 0.0001 #@param {type:"number"}
args.diffusion_parameters.sigma_min=sigma_min

#@markdown maximum  noise level (recommended: 1)
sigma_max= 1 #@param {type:"number"}
args.diffusion_parameters.sigma_max=sigma_max

#@markdown noise schedule parameter (recommended 13)
rho=13 #@param{type:"slider", min:1, max:20, step:1}
args.diffusion_parameters.ro=rho

#@markdown stochasticity parameter (recommended 5)
Schurn=5 #@param{type:"slider", min:0, max:40, step:0.1}
args.diffusion_parameters.Schurn=Schurn

#@markdown ## Conditioning parameters
#@markdown guidance scaling parameter (recommended 0.25).
xi=0.26 #@param{type:"slider", min:0, max:1, step:0.01}
args.inference.xi=xi

#@markdown This time it is not possible to use data consistency
data_consistency = False
args.inference.data_consistency=data_consistency
plot_animation=False


#@markdown ## Declipping details
#@markdown Specify the Signal-to-Distortion Ratio (in dB) of the clipping distortion
SDR=1 #@param{type:"slider", min:-10, max:30, step:0.1}
args.inference.declipping.SDR=SDR


# Imported here (not at the top) because `src` only becomes importable once the
# setup cell has added the repo root to sys.path.
from src.experimenters.exp_declipping import Exp_Declipping
exp=Exp_Declipping(args, plot_animation)

# conduct_experiment additionally returns a figure when plot_animation is enabled.
if plot_animation:
    path_degraded, path_result, fig=exp.conduct_experiment(seg,"1")
    fig.show()
else:
    path_degraded, path_result=exp.conduct_experiment(seg, "1")


print("")
print("clipped:")
IPython.display.display(Audio(path_degraded))
print("reconstructed")
IPython.display.display(Audio(path_result))


  warn("Q-factor too high for frequencies %s"%",".join("%.2f"%fi for fi in f[q >= qneeded]))
  checkpoint = torch.load(model_dir, map_location=model.device)


8192
C:\Projects\CQTdiff\notebooks\experiments/cqt\declipping23_12_2025\original/1.wav
C:\Projects\CQTdiff\notebooks\experiments/cqt\declipping23_12_2025\original/1.wav


100%|██████████| 35/35 [01:33<00:00,  2.67s/it]

C:\Projects\CQTdiff\notebooks\experiments/cqt\declipping23_12_2025\original/1.wav

clipped:



  y_lpf=torch.nn.functional.conv1d(y,B,padding="same")


reconstructed


In [None]:
#@title Unconditional synthesis
#@markdown Execute this cell to run unconditional synthesis experiments

args.inference.mode = 'unconditional'
mode = args.inference.mode
args.inference.unconditional.num_samples = 1

#@markdown Length of the generated samples (in seconds)
audio_len = 4 #@param {type:"slider", min:0.5, max:40, step:0.1}
# Convert the requested duration to a sample count at the configured sample rate.
args.audio_len = int(audio_len * args.sample_rate)

#@markdown Number of discretization steps (recommended: 35)
num_steps = 35 #@param {type:"slider", min:0, max:100, step:1}
args.inference.T = num_steps

#@markdown minimum noise level (recommended: 0.0001)
sigma_min = 0.0001 #@param {type:"number"}
args.diffusion_parameters.sigma_min = sigma_min

#@markdown maximum  noise level (recommended: 1)
sigma_max = 1 #@param {type:"number"}
args.diffusion_parameters.sigma_max = sigma_max

#@markdown noise schedule parameter (recommended 13)
rho = 12 #@param{type:"slider", min:5, max:20, step:1}
args.diffusion_parameters.ro = rho

#@markdown Stochasticity parameter (recommended 5)
Schurn = 8.5 #@param{type:"slider", min:0, max:40, step:0.1}
args.diffusion_parameters.Schurn = Schurn

plot_animation = True

from src.experimenters.exp_unconditional import Exp_Unconditional
exp = Exp_Unconditional(args, plot_animation)

# With plot_animation enabled the experiment yields (audio path, figure);
# otherwise it yields only the audio path.
if not plot_animation:
    audio_path = exp.conduct_experiment("1")
else:
    audio_path, fig = exp.conduct_experiment("1")
    fig.show()

Audio(audio_path) # load the saved file

In [None]:
#@title Bandwidth Extension
#@markdown Execute this cell to run bandwidth extension experiments
#@markdown ## Diffusion schedule
args.inference.mode = 'bandwidth_extension'
mode = args.inference.mode

# Length of the selected example; `seg` comes from the "Select audio example" cell.
audio_len = seg.shape[-1]
#@markdown Number of discretization steps (recommended: 35)
num_steps = 35 #@param {type:"slider", min:0, max:100, step:1}
args.inference.T = num_steps

#@markdown minimum noise level (recommended: 0.0001)
sigma_min = 0.0001 #@param {type:"number"}
args.diffusion_parameters.sigma_min = sigma_min

#@markdown maximum  noise level (recommended: 1)
sigma_max = 1 #@param {type:"number"}
args.diffusion_parameters.sigma_max = sigma_max

#@markdown noise schedule parameter (recommended 13)
rho = 13 #@param{type:"slider", min:5, max:20, step:1}
args.diffusion_parameters.ro = rho

#@markdown stochasticity parameter (recommended 5)
Schurn = 5 #@param{type:"slider", min:0, max:40, step:0.1}
args.diffusion_parameters.Schurn = Schurn

#@markdown ## Conditioning parameters
#@markdown guidance scaling parameter (recommended 0.25).
#@markdown Leave as 0 for no reconstruction guidance, but make sure to activate data consistency
xi = 0.26 #@param{type:"slider", min:0, max:1, step:0.01}
args.inference.xi = xi

#@markdown Choose if you want to apply data consistency steps (only for "firwin" filters)
data_consistency = False #@param {type:"boolean"}
args.inference.data_consistency = data_consistency
plot_animation = False


#@markdown ## Lowpass filter parameters
#filt_type = "firwin" #@param ["firwin", "cheby1", "resample", "decimate"]
#@markdown In this cell, the filter is an FIR, designed using the window method
#@markdown Specify the cutoff frequency (in Hz)
fc = 1054 #@param{type:"slider", min:0, max:10000, step:1}
args.inference.bandwidth_extension.filter.fc = fc
#@markdown Specify the order of the filter
order = 403 #@param{type:"slider", min:0, max:1000, step:1}
args.inference.bandwidth_extension.filter.order = order

from src.experimenters.exp_bandwidth_extension import Exp_BWE
exp = Exp_BWE(args, plot_animation)

# The experiment returns an extra figure only when plot_animation is enabled.
if not plot_animation:
    path_degraded, path_result = exp.conduct_experiment(seg, "1")
else:
    path_degraded, path_result, fig = exp.conduct_experiment(seg,"1")
    fig.show()


print("")
print("lowpass filtered:")
IPython.display.display(Audio(path_degraded))
print("bandwidth-extended:")
IPython.display.display(Audio(path_result))


In [None]:
#@title Audio Inpainting
#@markdown Execute this cell to run audio inpainting experiments
#@markdown ## Diffusion schedule
args.inference.mode = 'inpainting'
mode=args.inference.mode

# Length of the selected example; `seg` comes from the "Select audio example" cell.
audio_len=seg.shape[-1]
#@markdown Number of discretization steps (recommended: 35)
num_steps = 35 #@param {type:"slider", min:0, max:100, step:1}
args.inference.T=num_steps

#@markdown minimum noise level (recommended: 0.0001)
sigma_min = 0.0001 #@param {type:"number"}
args.diffusion_parameters.sigma_min=sigma_min

#@markdown maximum  noise level (recommended: 1)
sigma_max= 1 #@param {type:"number"}
args.diffusion_parameters.sigma_max=sigma_max

#@markdown noise schedule parameter (recommended 13)
rho=13 #@param{type:"slider", min:1, max:20, step:1}
args.diffusion_parameters.ro=rho

#@markdown stochasticity parameter (recommended 5)
Schurn=5 #@param{type:"slider", min:0, max:40, step:0.1}
args.diffusion_parameters.Schurn=Schurn

#@markdown ## Conditioning parameters
#@markdown guidance scaling parameter (recommended 0.25).
#@markdown Leave as 0 for no reconstruction guidance, but make sure to activate data consistency
xi=0.26 #@param{type:"slider", min:0, max:1, step:0.01}
args.inference.xi=xi

#@markdown Choose if you want to apply data consistency steps
data_consistency = False #@param {type:"boolean"}
args.inference.data_consistency=data_consistency
plot_animation=False


#@markdown ## Inpainting details
#@markdown length of the gap (in ms)
gap_length=1000 #@param {type:"number"}
args.inference.inpainting.gap_length=gap_length
#@markdown start of the gap (in ms)
start_gap_idx=1000 #@param {type:"number"}
args.inference.inpainting.start_gap_idx=start_gap_idx

# Sanity check: the gap is given in ms, so warn (without aborting) when it reaches
# past the end of the loaded excerpt.
# NOTE(review): assumes the last axis of `seg` is time, sampled at args.sample_rate — confirm.
clip_ms = 1000 * seg.shape[-1] / args.sample_rate
if start_gap_idx + gap_length > clip_ms:
    print(f"WARNING: the gap ({start_gap_idx}-{start_gap_idx + gap_length} ms) extends beyond the {clip_ms:.0f} ms example")


# Imported here (not at the top) because `src` only becomes importable once the
# setup cell has added the repo root to sys.path.
from src.experimenters.exp_inpainting import Exp_Inpainting
exp=Exp_Inpainting(args, plot_animation)

# conduct_experiment additionally returns a figure when plot_animation is enabled.
if plot_animation:
    path_degraded, path_result, fig=exp.conduct_experiment(seg,"1")
    fig.show()
else:
    path_degraded, path_result=exp.conduct_experiment(seg, "1")


print("")
print("masked:")
IPython.display.display(Audio(path_degraded))
print("reconstructed")
IPython.display.display(Audio(path_result))


In [None]:
#@title Compressive Sensing
#@markdown Execute this cell to run audio compressive sensing experiments
#@markdown ## Diffusion schedule


# Was 'declipping', copy-pasted from the declipping cell, which labels this run as a
# declipping experiment.
# NOTE(review): 'comp_sens' mirrors the args.inference.comp_sens key and the
# exp_comp_sens module name — confirm the exact mode string the repo expects.
args.inference.mode = 'comp_sens'
mode=args.inference.mode

# Length of the selected example; `seg` comes from the "Select audio example" cell.
audio_len=seg.shape[-1]
#@markdown Number of discretization steps (recommended: 35)
num_steps = 35 #@param {type:"slider", min:0, max:100, step:1}
args.inference.T=num_steps

#@markdown minimum noise level (recommended: 0.0001)
sigma_min = 0.0001 #@param {type:"number"}
args.diffusion_parameters.sigma_min=sigma_min

#@markdown maximum  noise level (recommended: 1)
sigma_max= 1 #@param {type:"number"}
args.diffusion_parameters.sigma_max=sigma_max

#@markdown noise schedule parameter (recommended 13)
rho=13 #@param{type:"slider", min:1, max:20, step:1}
args.diffusion_parameters.ro=rho

#@markdown stochasticity parameter (recommended 5)
Schurn=5 #@param{type:"slider", min:0, max:40, step:0.1}
args.diffusion_parameters.Schurn=Schurn

#@markdown ## Conditioning parameters
#@markdown guidance scaling parameter (recommended 0.25).
xi=0.26 #@param{type:"slider", min:0, max:1, step:0.01}
args.inference.xi=xi

#@markdown This time it is not possible to use data consistency
data_consistency = False
args.inference.data_consistency=data_consistency
plot_animation=False


#@markdown ## Compressed sensing details
#@markdown Specify the compression ratio. The percentage of samples that are dropped out from the example audio file. (Suggestion: use high values)
percentage=96 #@param{type:"slider", min:0, max:100, step:0.1}
# The form asks for the percentage of samples dropped; the config receives its complement.
args.inference.comp_sens.percentage=100-percentage


# Imported here (not at the top) because `src` only becomes importable once the
# setup cell has added the repo root to sys.path.
from src.experimenters.exp_comp_sens import Exp_CompSens
exp=Exp_CompSens(args, plot_animation)

# conduct_experiment additionally returns a figure when plot_animation is enabled.
if plot_animation:
    path_degraded, path_result, fig=exp.conduct_experiment(seg,"1")
    fig.show()
else:
    path_degraded, path_result=exp.conduct_experiment(seg, "1")


print("")
print("compressed:")
IPython.display.display(Audio(path_degraded))
print("reconstructed")
IPython.display.display(Audio(path_result))
