<a href="https://colab.research.google.com/github/olaviinha/SloppyButchery/blob/main/util_librosa_functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#<font face="Trebuchet MS" size="6">Librosa utils <font color="#999" size="3">v 0.0.1<font color="#999" size="4">&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;</font><font size="4">Sloppy Butchery @</font> <a href="https://github.com/olaviinha/SloppyButchery" target="_blank"><font color="#999" size="4">Github</font></a><font color="#999" size="4">&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;</font><font size="3" color="#999"><a href="https://inha.se" target="_blank"><font color="#999">O. Inha</font></a></font></font>

Cheatsheet of basic audio-processing functions built around Librosa and Numpy.

Please note:
- Some of these functions contain residue from original code, as they are copy/pastes from other Sloppy Butchery notebooks, where they actually serve purposes. I will try to clean it up as a functioning, tiny utility library in the future.
- Hence, functions are not guaranteed to work "as is", but should be easy to modify for development purposes.
- Some of these functions also rely on other third-party tools, such as FFmpeg and Deezer Spleeter. `!pip install ffmpeg spleeter`

<hr size="1" color="#666"/>

In [None]:
import librosa
import numpy as np

## General

In [None]:
# Default sample rate (Hz) used by the functions below whenever no `sr` is passed
global_sr = 44100
# Default fade length; fade_audio feeds it to librosa.time_to_samples, so it is in seconds (0.003 s = 3 ms)
global_fade = 0.003

# Convert a file list with FFMPEG
# Returns nothing
def convert(file_list, output_dir, sr=global_sr):
  """Convert every file in file_list to a 16-bit stereo WAV with FFmpeg.

  Each output is written to output_dir + slug(path_leaf(basename(audiofile))) + '.wav',
  so output_dir is concatenated as-is and should end with a path separator.
  Returns nothing.

  NOTE(review): this block reads `reverse`, `normalize` and `ffmpeg_q`, which are
  not defined here - presumably notebook-level globals. `normalize` is also the
  name of a function defined further down in this file, so confirm the flag is
  not shadowed by it. `slug`, `path_leaf` and `basename` are defined elsewhere.
  """
  for i, audiofile in enumerate(file_list):
    output = output_dir+slug(path_leaf(basename(audiofile)))+'.wav'
    # Feed both output channels from input channel 0
    filter = "pan=stereo|c0=c0|c1=c0"
    if reverse == True:
      # Optionally play the audio backwards
      filter = filter+", areverse"
    if normalize == True:
      # Optionally apply FFmpeg's dynamic audio normalizer
      filter = filter+", dynaudnorm=p=1/sqrt(2):m=100:s=12:g=15"
    # IPython shell escape: only valid inside a notebook/IPython session
    !ffmpeg {ffmpeg_q} -y -i "{audiofile}" -c:a pcm_s16le -ar {sr} -ac 2 -af "{filter}" "{output}"

# Clip a file list
# Returns nothing
def clip_list(file_list, output_dir, duration, slice_duration, sr=global_sr):
  """Cut a clip of `duration` seconds out of every file and save it to output_dir.

  file_list: paths of the audio files to clip.
  output_dir: target directory; concatenated as-is with the file name, so it
    should end with a path separator.
  duration: clip length in seconds.
  slice_duration: files longer than twice this value are clipped from their
    midpoint; shorter files are clipped from the beginning.
  sr: sample rate used for loading and saving.
  Returns nothing.

  NOTE(review): the original used the full file duration as the start position
  for short files (always yielding an empty clip) and added `sr` to the end
  time in seconds (so the clip always ran to the end of the file). Both are
  treated as bugs here - confirm against the calling notebook.
  """
  for audiofile in file_list:
    audio_data, file_sr = librosa.load(audiofile, sr=sr, mono=False)
    a_duration = librosa.get_duration(y=audio_data, sr=file_sr)
    if a_duration > slice_duration*2:
      # randrange(2, 3) can only yield 2, i.e. the clip starts at the midpoint
      start_time = a_duration/4 * random.randrange(2, 3)
    else:
      start_time = 0
    start = librosa.time_to_samples(start_time, sr=file_sr)
    end = librosa.time_to_samples(start_time+duration, sr=file_sr)
    output = output_dir+path_leaf(audiofile)
    # Ellipsis indexing keeps this working for mono (1-D) as well as multi-channel audio
    save(audio_data[..., start:end], output, sr=file_sr)
    audio_data = None  # release the decoded audio before loading the next file

# Get total duration of audio files in a directory
# Returns duration in seconds
def get_duration(dir, sr=global_sr):
  """Return the combined duration, in seconds, of every file that list_audio(dir) yields.

  Returns 0 when list_audio(dir) yields nothing.
  NOTE(review): librosa 0.10 deprecates the `filename` keyword in favour of
  `path`; kept as-is for compatibility with older librosa versions.
  """
  return sum(librosa.get_duration(filename=audio_path, sr=sr) for audio_path in list_audio(dir))

# Slice audio signal
# Returns slices as audio
def slice_to_frames(audio_data, slice_duration, fade_in=global_fade, fade_out=global_fade, fx=[], sr=global_sr):
  """Cut an audio signal into consecutive slices of slice_duration seconds.

  audio_data: channel-first audio signal (it is sliced as [:, start:end] by clip_audio).
  slice_duration: length of each slice in seconds.
  fade_in, fade_out: currently unused; fading is handled inside clip_audio.
  fx: effect settings forwarded to clip_audio (never mutated here).
  sr: sample rate of audio_data.
  Returns a list of audio slices. The first slice and the last (possibly
  partial) slice are deliberately left out, as in the original implementation.
  """
  # Keyword `y=` is required by librosa >= 0.10 and accepted by older versions
  a_duration = librosa.get_duration(y=audio_data, sr=sr)
  clips = math.ceil(a_duration/slice_duration)
  frames = [
    clip_audio(audio_data, i*slice_duration, slice_duration, fx, sr=sr)
    for i in range(1, clips-1)
  ]
  show_mem()
  return frames

# Clip audio signal
# Returns clipped audio signal
def clip_audio(audio_data, start, duration, fx=[], oneshots=False, sr=global_sr):
  """Cut `duration` seconds out of a channel-first audio signal, starting at `start` seconds.

  audio_data: 2-D audio signal indexed as [channel, sample].
  start, duration: clip position and length in seconds.
  fx: effect settings for apply_fx ([tremolo, release, autotune, pitch]); an
    empty list means "no effects". Never mutated here.
  oneshots: unused, kept for interface compatibility.
  sr: sample rate of audio_data.
  Returns the clipped audio. The default short fade is applied whenever
  neither tremolo nor release is requested.
  """
  xstart = librosa.time_to_samples(start, sr=sr)
  xend = librosa.time_to_samples(start+duration, sr=sr)
  audio_data = audio_data[:, xstart:xend]
  if len(fx) > 0:
    # apply_fx returns a new signal; the result must be kept or the effects are lost
    audio_data = apply_fx(audio_data, duration, fx)
  # With no fx at all there is nothing to index, so fall back to the plain fade
  if len(fx) == 0 or (fx[0] == False and fx[1] == 0):
    audio_data = fade_audio(audio_data, sr=sr)
  show_mem()
  return audio_data

# Split stereo audio to left and right
# Returns left audio signal, right audio signal
def split_channels(audio_data):
  """Return the first two channels of audio_data as a (left, right) tuple."""
  left = audio_data[0]
  right = audio_data[1]
  return left, right

# Merge two mono audio signals into stereo audio signal
# Returns stereo audio signal
def merge_channels(left_data, right_data):
  """Stack two mono signals into one array with the left signal in row 0 and the right in row 1."""
  channels = [left_data, right_data]
  return np.array(channels)

# Detect pitch of audio signal
# Returns pitch in Hz
def detect_pitch(audio_data, t, sr=global_sr):
  """Return the pitch (Hz) of the strongest piptrack bin in frame `t`.

  audio_data: audio signal passed to librosa's piptrack as `y`.
  t: frame (column) index into the piptrack output.
  sr: sample rate of audio_data.
  Only frequencies between 50 and 900 Hz are considered.
  """
  pitch_bins, strengths = librosa.core.piptrack(y=audio_data, sr=sr, fmin=50, fmax=900)
  # Pick the frequency bin with the largest magnitude in the requested frame
  strongest_bin = strengths[:, t].argmax()
  return pitch_bins[strongest_bin, t]

# Generate silence
# Returns silent audio signal
def generate_silence(duration, sr=global_sr):
  """Return `duration` seconds of two-channel silence as a float32 array of shape (2, samples)."""
  sample_count = librosa.time_to_samples(duration, sr=sr)
  # A negative duration yields an empty signal, as list repetition did before
  return np.zeros((2, max(sample_count, 0)), dtype=np.float32)

# Get spatial periods of audio signal. cycles=return a grain of N spatial periods instead, x_threshold=minimum number of samples between consecutive start times of spatial periods
# Returns spatial period start times, spatial period durations
def get_spatial_periods(audio, cycles=1, x_threshold=20):
  """Find upward zero crossings in a mono signal and use them as period boundaries.

  audio: indexable sequence of samples.
  cycles: when > 1, keep only every `cycles`-th entry of both result lists.
  x_threshold: minimum number of samples between accepted crossings; the same
    margin is skipped at both ends of the signal.
  Returns (start indices, "durations").
  NOTE(review): the second list holds, for each period, the next start index
  minus one (and len(audio) for the last one) - i.e. end positions rather than
  lengths. Confirm what callers expect.
  """
  drop_limit = 0.0005
  upper_bound = len(audio) - x_threshold
  starts = []
  previous_start = 0
  for idx, sample in enumerate(audio):
    # Ignore the margins at both ends of the signal
    if not (x_threshold < idx < upper_bound):
      continue
    # Sample must round to zero at one decimal place
    if round(sample, 1) != 0:
      continue
    # Enforce the minimum spacing from the previously accepted crossing
    if idx <= previous_start + x_threshold:
      continue
    # Signal must rise through the drop limit around this sample
    if audio[idx-1] < drop_limit and audio[idx+1] > drop_limit:
      starts.append(idx)
      previous_start = idx
  ends = [following - 1 for following in starts[1:]]
  ends.append(len(audio))
  if cycles > 1:
    starts = starts[::cycles]
    ends = ends[::cycles]
  return starts, ends

# Downsample audio signal to shorter duration
# Returns downsampled audio
def downsample(audio, new_duration, ends_to_zero=True):
  """Reduce a signal to `new_duration` points by averaging equally sized chunks.

  audio: signal to reduce.
  new_duration: number of points in the result.
  ends_to_zero: when True, the first and last output points are forced to 0.
  Returns the reduced signal as a numpy array.
  """
  # Chunk boundaries, evenly spread over the signal and truncated to whole indices
  edges = np.linspace(0, len(audio), num=new_duration+1, dtype=int)
  chunk_means = [np.mean(audio[lo:hi]) for lo, hi in zip(edges[:-1], edges[1:])]
  downsampled = np.array(chunk_means)
  if ends_to_zero == True:
    downsampled[0] = 0
    downsampled[-1] = 0
  return downsampled

# Normalize audio signal
# Returns normalized audio
def normalize(audio):
  """Linearly rescale audio so its minimum maps to -1 and its maximum to 1."""
  lowest = audio.min()
  highest = audio.max()
  return np.interp(audio, (lowest, highest), (-1, 1))

# Save stereo audio as file
def save(audio_data, save_as='frank', sr=global_sr):
  """Write channel-first audio to disk with soundfile.

  audio_data: audio signal; it is transposed before writing.
  save_as: output path. The default 'frank' is replaced by a generated name
    made of 'frank', a 4-character random string, a timestamp and the global
    `bpm` (which must be a string).
  sr: sample rate written to the file.
  """
  target = save_as
  if target == 'frank':
    stamp = datetime.datetime.today().strftime('%Y%m%d-%H%M%S')
    target = ''.join([target, '_', rnd_str(4), '_', stamp, '__', bpm, 'bpm.wav'])
  soundfile.write(target, audio_data.T, sr)

# Save audio as randomly named WAV in TEMP dir, encode to MP3 and create an audio player
# Returns audio player in output console
def test_audio(audio_data):
  """Render audio to a temporary WAV, encode it to MP3 and show an audio player.

  audio_data: either a numpy audio signal or something librosa.load can open
    (anything that is not a numpy array/scalar is loaded with sr=global_sr).
  NOTE(review): relies on `dir_tmp`, `ffmpeg_q`, `mp3_192`, `rnd_str` and
  `audio_player`, none of which are defined in this file - presumably
  notebook-level globals/helpers.
  """
  global dir_tmp
  if not isinstance(audio_data, (np.ndarray, np.generic)):
    global global_sr
    audio_data, sr = librosa.load(audio_data, mono=False, sr=global_sr)
  # Random suffix keeps repeated previews from overwriting each other
  out = dir_tmp+'test_'+rnd_str(8)
  save(audio_data, out+'.wav')
  # IPython shell escape: only valid inside a notebook/IPython session
  !ffmpeg {ffmpeg_q} -i {out}.wav {mp3_192} {out}.mp3
  audio_player(out+'.mp3')

# Show RAM usage. Librosa can be quite the memory consumer.
# Returns used/available
def show_mem():
  """Print RAM usage when the global flag `extra_verbose_performance` is True.

  Prints the percentage of memory in use and the percentage still available.
  Does nothing (instead of raising NameError) when the flag has not been
  defined, so callers such as clip_audio keep working outside the notebook.
  Returns nothing.
  """
  if globals().get('extra_verbose_performance', False) is not True:
    return
  # Take one snapshot so both figures describe the same moment
  memory = psutil.virtual_memory()
  print('mem:', memory.percent, '/', memory.available * 100 / memory.total)

## Effects

In [None]:
# Remove click from between two concatenated audio clips (to=amplitude of first sample in the following audio signal)
# Returns audio signal with end "morphed" to amplitude
def declick(audio_data, samples=50, to=None):
  """Bend the last `samples` samples of a mono signal towards the amplitude `to`.

  audio_data: 1-D audio signal (list or numpy array).
  samples: number of trailing samples to reshape; clamped to the signal length.
  to: target amplitude, typically the first sample of the signal that follows.
    Defaults to the first sample of audio_data itself.
  Returns the processed signal as a plain list.
  """
  audio = np.asarray(audio_data)
  samples = min(int(samples), len(audio))
  if samples <= 0:
    # Nothing to reshape (empty signal or zero-length tail)
    return audio.ravel().tolist()
  split_at = len(audio) - samples
  head, tail = audio[:split_at], audio[split_at:]
  if to is None:
    to = audio[0]
  # Straight line from the tail's first sample to the target amplitude
  linear = np.linspace(tail[0], to, samples)
  # Crossfade weight grows from 0 towards 1 across the tail
  weight = np.arange(samples) / samples
  new_tail = tail + weight * linear - tail * weight
  return np.concatenate([head, new_tail]).ravel().tolist()

# Apply fade in and/or fade out to audio signal
# Returns faded audio signal
def fade_audio(audio_data, fade_in=global_fade, fade_out=global_fade, sr=global_sr):
  """Apply a linear fade-in and/or fade-out to an audio signal.

  audio_data: audio signal with samples on the last axis (mono or channel-first).
  fade_in, fade_out: fade lengths in seconds; 0 (or less) disables that fade.
    Fades longer than the signal are clamped to the signal length.
  sr: sample rate of audio_data.
  Returns a new array of the same length as the input; the input is not
  modified. (The previous implementation dropped one sample per applied fade.)
  """
  faded = np.array(audio_data, copy=True)
  if not np.issubdtype(faded.dtype, np.floating):
    # Integer input cannot hold the scaled values
    faded = faded.astype(np.float64)
  total = faded.shape[-1]
  if fade_in > 0:
    fade_in_to = min(int(librosa.time_to_samples(fade_in, sr=sr)), total)
    if fade_in_to > 0:
      # Gain rises linearly from 0 towards 1 over the fade-in region
      faded[..., :fade_in_to] *= np.arange(fade_in_to) / fade_in_to
  if fade_out > 0:
    # Keyword `y=` is required by librosa >= 0.10 and accepted by older versions
    a_duration = librosa.get_duration(y=faded, sr=sr)
    fade_out_start = int(librosa.time_to_samples(a_duration-fade_out, sr=sr))
    fade_out_start = min(max(fade_out_start, 0), total)
    fade_out_len = total - fade_out_start
    if fade_out_len > 0:
      # Gain falls linearly from 1 towards 0 over the fade-out region
      faded[..., fade_out_start:] *= 1 - np.arange(fade_out_len) / fade_out_len
  return faded

# Time-stretch audio signal
# Returns time-stretched audio signal
def time_stretch_audio(audio, to_length, sr=global_sr):
  """Time-stretch a stereo signal so that it lasts `to_length` seconds.

  audio: stereo audio signal (split into two channels with split_channels).
  to_length: desired duration in seconds.
  sr: sample rate of audio.
  Returns the stretched channels stacked into one array.
  """
  # Keyword arguments (`y=`, `rate=`) are required by librosa >= 0.10 and
  # accepted by older versions as well
  dur = librosa.get_duration(y=audio, sr=sr)
  rate = dur/to_length
  return np.array([librosa.effects.time_stretch(channel, rate=rate) for channel in split_channels(audio)])

# Change pitch of audio signal +/- N semitones
# Returns pitched audio signal
def pitch(audio_data, semitones, sr=global_sr):
  """Shift the pitch of a stereo signal by `semitones` (positive or negative).

  audio_data: stereo audio signal (split into two channels with split_channels).
  semitones: number of 12-tone-scale steps to shift by.
  sr: sample rate of audio_data.
  Returns the shifted channels stacked into one array.
  """
  shifted_channels = []
  for channel in split_channels(audio_data):
    shifted = librosa.effects.pitch_shift(channel, sr=sr, n_steps=semitones, bins_per_octave=12)
    shifted_channels.append(shifted)
  return np.array(shifted_channels)

# Change pitch of audio signal to note
# Returns pitched audio signal
def autotune_audio(audio_data, note='C', sr=global_sr, t=10):
  """Pitch-shift a stereo signal so that its detected pitch lands on `note`.

  audio_data: stereo audio signal.
  note: target note name; only its pitch class matters, octaves are ignored.
  sr: sample rate of audio_data.
  t: frame index at which detect_pitch samples the pitch.
  Returns the tuned audio, or audio_data unchanged when no pitch is detected
  or the signal already sits on the target pitch class.

  The shift is the smallest number of semitones (within -6..+5) that reaches
  the target pitch class. The previous octave-folding logic inverted the sign
  when wrapping (e.g. B -> C produced -1 instead of +1) and could restore a
  multi-octave shift for large upward differences.
  """
  target_note = librosa.note_to_midi(note)
  mono_audio = librosa.to_mono(audio_data)
  detected_hz = detect_pitch(mono_audio, t, sr=sr)
  mono_audio = None  # release the mono mix early
  # 0 Hz means "nothing detected"; hz_to_midi(0) is -inf and cannot be rounded
  if not detected_hz > 0:
    return audio_data
  source_note = int(round(float(librosa.hz_to_midi(detected_hz))))
  if source_note <= 0:
    return audio_data
  # Fold the difference into a single octave centred on zero
  diff = (target_note - source_note + 6) % 12 - 6
  if diff == 0:
    return audio_data
  return np.array([librosa.effects.pitch_shift(channel, sr=sr, n_steps=diff, bins_per_octave=12) for channel in split_channels(audio_data)])

# Apply effects to audio signal, fx=[tremolo, release, autotune, pitch]
# Returns audio signal with effects applied
def apply_fx(audio_data, duration, fx=[]):
  """Apply the effects described by `fx` to an audio signal.

  audio_data: audio signal to process.
  duration: length of the signal in seconds; used to scale fades.
  fx: [tremolo, release, autotune, pitch]
    tremolo  - True fades in and out over half the duration each
    release  - percentage of the duration used as fade-out (only when tremolo is off)
    autotune - target note name, or the string "None" to disable
    pitch    - semitones to shift by (only when autotune is disabled)
    An empty list (the default) means "no effects". Never mutated here.
  Returns the processed audio signal.
  """
  if len(fx) == 0:
    # The declared default used to crash with IndexError on fx[0]
    return audio_data
  # tremolo
  if fx[0] == True:
    xtremolo = duration/2
    audio_data = fade_audio(audio_data, xtremolo, xtremolo)
  # release
  elif fx[1] > 0:
    xrelease = fx[1]/100*duration
    audio_data = fade_audio(audio_data, global_fade, xrelease)
  # autotune
  if fx[2] != "None":
    # Frame index handed to autotune_audio for pitch detection
    t = math.ceil(10*duration)
    if t < 2:
      t = 3
    audio_data = autotune_audio(audio_data, note=fx[2], t=t)
  # pitch
  elif fx[3] != 0:
    audio_data = pitch(audio_data, fx[3])
  return audio_data

## Beat-slicing related

In [None]:
# Separate drum track with Deezer Spleeter CNN
# Returns drum track as mono audio
def separate_drums(audio_track):
  """Extract the drum stem of a channel-first audio signal with Spleeter.

  audio_track: audio signal; it is transposed before being handed to the separator.
  Returns the 'drums' stem mixed down to mono.
  NOTE(review): `Separator` and `warnings` are not imported in this file, and
  the configuration path '/content/cfg.json' is hard-coded - confirm both in
  the calling notebook. Warnings are silenced process-wide as a side effect.
  """
  warnings.filterwarnings('ignore')
  separator = Separator('/content/cfg.json')
  # The unused audio adapter the original created here has been dropped
  stems = separator.separate(audio_track.T)
  return librosa.to_mono(stems['drums'].T)

# Separate vocal track with Deezer Spleeter CNN
# Returns vocal track as mono audio
def separate_vocals(audio_track):
  """Extract the vocal stem of a channel-first audio signal with Spleeter.

  audio_track: audio signal; it is transposed before being handed to the separator.
  Returns the 'vocals' stem mixed down to mono.
  NOTE(review): `Separator` and `warnings` are not imported in this file, and
  the configuration path '/content/cfg.json' is hard-coded - confirm both in
  the calling notebook. Warnings are silenced process-wide as a side effect.
  """
  warnings.filterwarnings('ignore')
  separator = Separator('/content/cfg.json')
  # The unused audio adapter the original created here has been dropped
  stems = separator.separate(audio_track.T)
  return librosa.to_mono(stems['vocals'].T)

# Detect beat positions in audio signal
# Returns starting positions of beats in seconds as a list of numbers
def get_beats(audio, sr=global_sr):
  """Run BeatTrackerMultiFeature on `audio` and return its first output (the beat positions).

  audio: audio signal handed to the tracker as-is.
  sr: unused; kept for interface compatibility.
  NOTE(review): BeatTrackerMultiFeature is not defined in this file -
  presumably imported from Essentia in the calling notebook.
  """
  tracker = BeatTrackerMultiFeature()
  beat_positions, _unused = tracker(audio)
  return beat_positions

# Calculate differences between a list of numbers (e.g. beat positions)
# Returns differences as a list of numbers (e.g. beat durations)
def get_differences(blist, round=0):
  """Return the differences between consecutive values of a numpy array.

  blist: numpy array (it must provide .tolist()); flattened before use.
  round: when > 0, each difference is returned as a string formatted with two
    decimals; otherwise the numeric differences are returned.
  """
  values = list(np.array(blist.tolist()).flatten())
  deltas = [later - earlier for earlier, later in zip(values, values[1:])]
  if round > 0:
    return ['%.2f' % delta for delta in deltas]
  return deltas

# Get most frequent value from list
# Returns most frequent value
def most_frequent(list):
  """Return the value that occurs most often in `list` (which must provide .count())."""
  distinct_values = set(list)
  return max(distinct_values, key=list.count)

# Get modal value of list
# Returns mode
def get_mode(lst):
  """Return every value that shares the highest occurrence count in `lst`.

  lst: iterable of hashable values.
  Returns a list of the modal values in order of first appearance; an empty
  list for empty input.
  """
  counts = {}
  for item in lst:
    counts[item] = counts.get(item, 0) + 1
  if not counts:
    return []
  # Computed once instead of once per distinct value
  highest = max(counts.values())
  return [item for item, count in counts.items() if count == highest]

# Filter list with a rounded range
# Returns filtered list
def filter_durations(durations, range, decs):
  """Keep the durations that equal `range` once rounded to `decs` decimals.

  durations: iterable of numbers or numeric strings.
  range: value to match (number or numeric string).
  decs: number of decimals used for rounding each duration.
  Returns the matching durations as floats.
  """
  return [value for value in map(float, durations) if float(range) == round(value, decs)]

# Detect peaks at the end of audio signal
# Returns first tail-peak position (nudgeTime=True), peak amplitudes (wat='amp'), or dur-(pos*dur) for all detected peaks
def get_tail_peaks(audio, nudgeTime=False, wat='', sr=global_sr):
  """Detect peaks near the end of an audio signal.

  audio: mono audio signal (numpy array); converted to float32 for detection.
  nudgeTime: when True, return the position of the first detected peak whose
    amplitude exceeds the threshold, or 0 when there is none.
  wat: when nudgeTime is off, 'amp' returns the peak amplitudes; anything else
    returns dur-(pos*dur) for every peak position.
  sr: sample rate of audio (new, optional; the original read an undefined `sr`).
  NOTE(review): PeakDetection is not defined in this file - presumably
  Essentia's algorithm, whose positions look normalised to 0..1 given the
  minPosition of 0.96; confirm.
  """
  minPos = 0.96
  minPeakDis = 0.001
  th = 0.4
  pos, amp = PeakDetection(threshold=th, minPeakDistance=minPeakDis, minPosition=minPos)(audio.astype(np.float32))
  if nudgeTime == True:
    # First peak above the threshold wins
    for peak, peak_amp in zip(pos, amp):
      if peak_amp > th:
        return peak
    return 0
  if wat == 'amp':
    return amp
  # Keyword `y=` is required by librosa >= 0.10 and accepted by older versions
  dur = librosa.get_duration(y=audio, sr=sr)
  return dur-(pos*dur)

# Nudge start times (beat positions) backwards in time until tail peaks are in subsequent slice
# Returns new (beat) start positions or audio signals of slices
def nudge(beats, timing_track, anal_duration, real_duration, timetravel, return_type='time', starting_beat=0, sr=global_sr):
  """Move beat start times backwards when a peak is found at the tail of the analysed slice.

  beats: beat start times in seconds.
  timing_track: mono signal that is sliced per beat and searched for tail peaks.
  anal_duration: length in seconds of the slice analysed for each beat.
  real_duration: length in seconds of the slice reported for each beat.
  timetravel: offset in seconds added to every beat before analysis.
  return_type: 'samples' returns [start, end] in samples, anything else in seconds.
  starting_beat: beats with an index up to and including this one are skipped.
  sr: sample rate of timing_track.
  Returns a list of [start, end] pairs.
  NOTE(review): in 'samples' mode the start has `timetravel` subtracted, in
  time mode it does not - kept exactly as in the original; confirm intent.
  """
  offset = 0  # constant in the original as well; kept so the formulas read the same
  want_samples = return_type == 'samples'
  positions = []
  for index, beat in enumerate(beats):
    if not index > starting_beat:
      continue
    # Slice of the timing track that belongs to this beat
    slice_start = beat+timetravel
    slice_length = anal_duration-offset
    first_sample = librosa.time_to_samples(slice_start, sr=sr)
    last_sample = librosa.time_to_samples(slice_start+slice_length, sr=sr)
    tail_peak = get_tail_peaks(timing_track[first_sample:last_sample], True)
    # Pull the start back only when a tail peak was found
    if tail_peak > 0:
      nudged_start = slice_start-(anal_duration-anal_duration*tail_peak)-offset
    else:
      nudged_start = slice_start
    nudged_end = nudged_start+(real_duration-offset)
    start_sample = librosa.time_to_samples(nudged_start-timetravel, sr=sr)
    end_sample = librosa.time_to_samples(nudged_end, sr=sr)
    if want_samples:
      positions.append([start_sample, end_sample])
    else:
      positions.append([nudged_start, nudged_end])
  return positions