In [1]:
# try:
#     from google.colab import drive
#     drive.mount('/content/drive', force_remount=True)
#     COLAB = True
#     print("Note: using Google CoLab")
# except:
#     print("Note: not using Google CoLab")
#     COLAB = False

PATH = '/content/drive/MyDrive'

! pip install -U kaleido

# Configuration
FPS = 30
FFT_WINDOW_SECONDS = 0.25 # how many seconds of audio make up an FFT window

# Note range to display
FREQ_MIN = 10
FREQ_MAX = 1000

# Notes to display
TOP_NOTES = 3

# Names of the notes
NOTE_NAMES = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]

# Output size. Generally use SCALE for higher res, unless you need a non-standard aspect ratio.
RESOLUTION = (1920, 1080)
SCALE = 2 # 0.5=QHD(960x540), 1=HD(1920x1080), 2=4K(3840x2160)



In [2]:
import matplotlib.pyplot as plt
from scipy.fftpack import fft
from scipy.io import wavfile # get the api
import os

# Get a WAV file from GDrive, such as:
AUDIO_FILE = os.path.join(PATH,'musicgen_out.wav')

# Or download my sample audio
# !wget https://github.com/jeffheaton/present/raw/master/youtube/video/sample_audio/piano_c_major_scale.wav
# AUDIO_FILE = "/content/piano_c_major_scale.wav"

fs, data = wavfile.read(AUDIO_FILE) # sample rate (Hz) and the raw sample array

# wavfile.read returns a 1-D array for a mono file and a (samples, channels)
# array for a multi-channel file. Keep a single track either way; the previous
# `data.T[0]` produced a scalar for mono input.
audio = data if data.ndim == 1 else data[:, 0]

FRAME_STEP = int(fs / FPS) # audio samples per video frame
FFT_WINDOW_SIZE = int(fs * FFT_WINDOW_SECONDS) # audio samples per FFT window
AUDIO_LENGTH = len(data) / fs # duration of the recording in seconds

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive\\musicgen_out.wav'

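Define several utility functions: one plots a frequency spectrum, one extracts the audio window for a video frame, and one finds the strongest notes in a spectrum.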

In [4]:
import plotly.graph_objects as go

def plot_fft(p, xf, fs, notes, dimensions=(960,540)):
  """Build a plotly figure of one frequency spectrum with note labels.

  Args:
    p: magnitudes to draw on the y axis (the y range is fixed to [0, 1]).
    xf: x coordinate of every entry of ``p``.
    fs: accepted for call compatibility; not used by this function.
    notes: iterable of ``[x, label, y]`` entries; each label is drawn
      10 x-units to the right of ``x`` at height ``y``.
    dimensions: ``(width, height)`` of the figure in pixels.

  Returns:
    The assembled ``plotly.graph_objects.Figure``.
  """
  width, height = dimensions[0], dimensions[1]

  fig = go.Figure(layout=go.Layout(
      title="frequency spectrum",
      autosize=False,
      width=width,
      height=height,
      xaxis_title="Frequency (note)",
      yaxis_title="Magnitude",
      font={'size' : 24},
  ))

  # Fixed axis limits keep every frame of the animation on the same scale.
  fig.update_layout(xaxis_range=[FREQ_MIN,FREQ_MAX], yaxis_range=[0,1])

  fig.add_trace(go.Scatter(x=xf, y=p))

  for note in notes:
    fig.add_annotation(
        x=note[0]+10,
        y=note[2],
        text=note[1],
        font={'size' : 48},
        showarrow=False)

  return fig

def extract_sample(audio, frame_number, frame_offset=None, window_size=None):
    """Return the FFT window of audio that ends at the given video frame.

    The window covers the ``window_size`` samples immediately before sample
    index ``frame_number * frame_offset``. Where the window would start before
    the beginning of the recording, the missing part is filled with zeros.

    Args:
        audio: 1-D array of samples, or a 2-D ``(samples, channels)`` array,
            in which case only the first channel is used.
        frame_number: index of the video frame.
        frame_offset: audio samples per video frame; defaults to the
            module-level ``FRAME_OFFSET``.
        window_size: samples per FFT window; defaults to the module-level
            ``FFT_WINDOW_SIZE``.

    Returns:
        A 1-D array holding the window (``window_size`` samples whenever the
        window lies inside the recording).
    """
    if frame_offset is None:
        frame_offset = FRAME_OFFSET
    if window_size is None:
        window_size = FFT_WINDOW_SIZE

    # Callers may hand over the raw WAV data; reduce multi-channel input to one
    # track so the result can be multiplied by the 1-D window function.
    audio = np.asarray(audio)
    if audio.ndim > 1:
        audio = audio[:, 0]

    end = frame_number * frame_offset
    begin = int(end - window_size)

    if end <= 0:
        # We have no audio yet, return all zeros (very beginning)
        return np.zeros(window_size, dtype=float)

    if begin < 0:
        # We have some audio, pad the front with zeros. Slice the `audio`
        # argument here, not the notebook-global `data`.
        padded_audio = np.concatenate([np.zeros(-begin, dtype=float), audio[:end]])
        return padded_audio[-window_size:]

    # Usually this happens: the whole window lies inside the recording
    return audio[begin:end]


def find_top_notes(fft,num):
  """Return up to ``num`` of the strongest, distinctly named notes in a spectrum.

  Args:
    fft: array of spectrum magnitudes; entry ``i`` belongs to the frequency
      ``xf[i]`` (``xf`` is a module-level array).
    num: maximum number of notes to return.

  Returns:
    A list of ``[frequency, note_name, magnitude]`` lists ordered from the
    strongest bin downwards. The list is empty when the largest magnitude is
    below 0.001.
  """
  if np.max(fft.real)<0.001:
    return []

  # (bin index, magnitude) pairs, strongest first.
  lst = sorted(enumerate(fft.real), key=lambda x: x[1], reverse=True)

  found = []
  found_note = set()
  for bin_index, y in lst:
    if len(found) >= num:
      break

    f = xf[bin_index]
    if f <= 0:
      # The 0 Hz bin has no pitch: its note number is -inf and converting
      # that to an int raises OverflowError, so skip it.
      continue

    name = note_name(int(round(freq_to_number(f))))

    # Report every note name once, at its strongest bin.
    if name not in found_note:
      found_note.add(name)
      found.append([f, name, y])

  return found

Run the FFT on individual samples of the audio and generate video frames of the frequency chart.

In [5]:
import numpy as np
import tqdm

# ! rm /content/*.png

# See https://newt.phys.unsw.edu.au/jw/notes.html
def freq_to_number(f):
    """Convert a frequency in Hz to a fractional note number (440 Hz maps to 69)."""
    return 69 + 12*np.log2(f/440.0)


def number_to_freq(n):
    """Convert a note number to its frequency in Hz (69 maps to 440 Hz)."""
    return 440 * 2.0**((n-69)/12.0)


def note_name(n, names=None):
    """Return the name of an integer note number, e.g. 69 -> "A4".

    Args:
        n: integer note number.
        names: optional sequence of the 12 pitch-class names starting at C;
            defaults to the module-level ``NOTE_NAMES``.

    Returns:
        The pitch-class name followed by the octave number.
    """
    if names is None:
        names = NOTE_NAMES
    # Floor division keeps the octave right below C0 as well: int(n/12 - 1)
    # truncates toward zero and labelled note 11 as "B0" instead of "B-1".
    return names[n % 12] + str(n // 12 - 1)

# Hanning window function, one coefficient per sample of an FFT window
window = 0.5 * (1 - np.cos(np.linspace(0, 2*np.pi, FFT_WINDOW_SIZE, endpoint=False)))

# Frequency (Hz) of every bin produced by rfft for this window size
xf = np.fft.rfftfreq(FFT_WINDOW_SIZE, d=1/fs)

# Number of video frames, and the audio samples advanced per frame
FRAME_COUNT = int(AUDIO_LENGTH*FPS)
FRAME_OFFSET = int(len(data)/FRAME_COUNT)

# Pass 1, find out the maximum amplitude so we can scale.
mx = 0
for frame_number in range(FRAME_COUNT):
  windowed = extract_sample(data, frame_number) * window
  magnitudes = np.abs(np.fft.rfft(windowed))
  # Running maximum over every frame's spectrum
  mx = max(np.max(magnitudes), mx)

print(f"Max amplitude: {mx}")

Max amplitude: 146.85384916340084


In [6]:
def freq_to_number(f):
    """Convert a frequency in Hz to a fractional note number (440 Hz maps to 69).

    A frequency of exactly 0 returns negative infinity, so the logarithm is
    never evaluated at zero.
    """
    # Handle the zero-frequency case up front to avoid a divide-by-zero in log2.
    return float('-inf') if f == 0 else 69 + 12*np.log2(f/440.0)

def find_top_notes(fft, num):
    """Return up to ``num`` of the strongest, distinctly named notes in a spectrum.

    Args:
        fft: array of spectrum magnitudes; entry ``i`` belongs to the
            frequency ``xf[i]`` (``xf`` is a module-level array).
        num: maximum number of notes to return.

    Returns:
        A list of ``[frequency, note_name, magnitude]`` lists ordered from the
        strongest bin downwards. The list is empty when the largest magnitude
        is below 0.001.
    """
    if np.max(fft.real) < 0.001:
        return []

    # (bin index, magnitude) pairs, strongest first.
    ranked = sorted(enumerate(fft.real), key=lambda pair: pair[1], reverse=True)

    found = []
    seen_names = set()
    for bin_index, magnitude in ranked:
        if len(found) >= num:
            break

        f = xf[bin_index]
        n = freq_to_number(f)
        # Only finite note numbers can be converted to an int; skipping the
        # others avoids an overflow error.
        if not np.isfinite(n):
            continue

        name = note_name(int(round(n)))
        if name in seen_names:
            continue

        seen_names.add(name)
        found.append([f, name, magnitude])

    return found

In [7]:
# Pass 2, produce the animation

# Scale by the pass-1 maximum. If the whole recording is silent (mx == 0),
# fall back to 1 so the normalisation never divides by zero.
norm = mx if mx > 0 else 1.0

for frame_number in tqdm.tqdm(range(FRAME_COUNT)):
    sample = extract_sample(data, frame_number)

    # Magnitude spectrum of the windowed sample, normalised by the overall maximum.
    fft = np.abs(np.fft.rfft(sample * window)) / norm

    # Render every frame, silent ones included: the image sequence must be
    # gap-free and aligned with the audio. Non-finite values are drawn as zero
    # instead of dropping the frame.
    fft = np.nan_to_num(fft, nan=0.0, posinf=0.0, neginf=0.0)

    s = find_top_notes(fft, TOP_NOTES)

    fig = plot_fft(fft, xf, fs, s, RESOLUTION)
    # The file name must match the "frame%d.png" pattern that ffmpeg reads.
    fig.write_image(f"frame{frame_number}.png", scale=SCALE)


  0%|          | 0/307 [00:00<?, ?it/s]

Use [ffmpeg](https://ffmpeg.org/) to combine the input audio WAV and the individual frame images into an MP4 video.

In [8]:
!ffmpeg -y -r {FPS} -f image2 -s 1920x1080 -i frame%d.png -i {AUDIO_FILE} -c:v libx264 -pix_fmt yuv420p movie.mp4

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

Download the generated movie.

In [9]:
# The browser download helper exists only inside Google Colab. Elsewhere the
# import fails, so report where the file is instead of raising.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab; movie.mp4 is in the current working directory.")
else:
    files.download('movie.mp4')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>