# Pitch-discretization with librosa

## load libs and input signal

In [None]:
# Wrap librosa in a Preset and register notebook-wide defaults on it:
# a sample rate of 44100 Hz (instead of librosa's own 22050 Hz default),
# plus the FFT length and hop size.
# NOTE(review): every librosa call below goes through this wrapper, so any
# function that has an `sr`, `n_fft` or `hop_length` keyword presumably gets
# these values unless the call passes its own -- confirm against `presets`.
from presets import Preset
import librosa as _librosa
import librosa.display as _display
# attach the display submodule so it is reachable as `librosa.display`
_librosa.display = _display
librosa = Preset(_librosa)

librosa['sr'] = 44100
librosa['n_fft'] = 4096
# kept in its own variable because it is reused later as the processing block size
librosa_hop_len = 1024
librosa['hop_length'] = librosa_hop_len

# other needed modules
import matplotlib.pyplot as plt
import numpy as np
import IPython

# alternative input files
#x, sr = librosa.load('../samples/Toms_diner.wav')
#x, sr = librosa.load('../samples/sweep_20Hz_20kHz_10s.wav')

# excerpt to analyse: start offset and duration, both in seconds
pos = 5
dur = 10
x, sr = librosa.load("../samples/ave-maria.wav", offset=pos, duration=dur)

# plot and play
# Sample k lies at k / sr seconds. np.linspace(0, x.size/sr, x.size) would
# include the end point and stretch the axis by a factor of N / (N - 1).
t = np.arange(x.size) / sr
plt.plot(t, x)
# must stay the last expression of the cell so the audio player is displayed
IPython.display.Audio(x, rate=sr)

## pitch-tracking

In [None]:
# parameters
fmin = 60    # lower bound of the f0 search range in Hz
fmax = 1000  # upper bound of the f0 search range in Hz
frame_len = 2048 # default is 2048

# f0 is NaN for frames that pYIN classifies as unvoiced
f0, voiced_flag, voiced_probs = librosa.pyin(x, fmin=fmin, fmax=fmax, sr=sr, frame_length=frame_len)

# n_fft=None is passed explicitly: the Preset would otherwise inject its
# n_fft default here, and times_like() then adds an offset of n_fft // 2
# samples to every timestamp. pYIN frames are centred, so that offset would
# shift the f0 curve against the spectrogram drawn below.
times = librosa.times_like(f0, n_fft=None)

# magnitude spectrogram in dB relative to its maximum, used as plot background
D = librosa.amplitude_to_db(np.abs(librosa.stft(x)), ref=np.max)

plt.rcParams['figure.figsize'] = [15, 8]
fig, ax = plt.subplots()
img = librosa.display.specshow(D, x_axis='time', y_axis='log', ax=ax)
ax.set(title='pYIN fundamental frequency estimation')
fig.colorbar(img, ax=ax, format="%+2.f dB")
# overlay the estimated fundamental frequency on the spectrogram
ax.plot(times, f0, label='f0', color='cyan', linewidth=3)
ax.legend(loc='upper right')

## pitch-shifting to constant pitch

We are given
- an audio signal
- the analyzed pitch signal
- the pitch analysis hop length (librosa_hop_len)
- the desired pitch

Now we want to generate a signal with constant pitch, so the pitch shifting factor depends on the pitch of the input signal.

- Determine the block size
- Calculate the pitch-shifting factor for each block
- Pitch-shift each block (leave unvoiced frames unprocessed, or keep the shift constant)

In [None]:
f_out = 200  # target pitch (Hz) that every voiced block is shifted to

semitone_ratio = 2 ** (1 / 12)

# one processing block per pitch-analysis hop; a trailing partial block is dropped
blocksize = librosa_hop_len
n_blocks = x.size // blocksize

# NOTE: n_blocks is expected to be one less than f0.size,
# so n_blocks = f0.size - 1 could be used instead later on

y = []
# walk over the start offsets of all complete blocks
for frame_idx, start in enumerate(range(0, n_blocks * blocksize, blocksize)):
    block = x[start:start + blocksize]
    pitch = f0[frame_idx]
    if np.isnan(pitch):
        # unvoiced frame: pass the samples through untouched
        y.append(block)
        continue
    # shift needed to move the analysed pitch onto f_out, expressed in semitones
    semitones = 12 * np.log2(f_out / pitch)
    y.append(librosa.effects.pitch_shift(block, n_steps=semitones))
y = np.concatenate(y)

In [None]:
# Time axis of the processed signal: sample k lies at k / sr seconds.
# (np.linspace with the end point included would space samples N/(N-1) too wide.)
t = np.arange(y.size) / sr
plt.plot(t, y)
# must stay the last expression of the cell so the audio player is displayed
IPython.display.Audio(y, rate=sr)

## appearing problems

* Discontinuities, because the pitch-shifting algorithm is not designed for processing small blocks but rather for shifting one large chunk of continuous audio data.

* The block size cannot be smaller than 2048 samples because the phase vocoder is hard-coded to this FFT size.

So, as the simplest alternative, implement our own OLA (overlap-add) pitch shifter.

## Pitch discretization

In [None]:
def scale(f0, n_tones=12, tune=440):
    """
    nonlinear (equal-tempered) frequency scale: snap f0 to the nearest scale tone
    f0...input frequency (scalar or array-like); NaN entries stay NaN
    n_tones...make a scale with n_tones tones per octave
    tune...tuning frequency; the scale tones are tune * 2**(k / n_tones), k integer
    return discrete frequencies (the scale tone closest to f0 on a log-frequency axis)
    """
    # position of f0 on the scale, in fractional tone steps relative to tune
    tone = n_tones * np.log2(np.asarray(f0, dtype=float) / tune)
    # Round to the nearest step. Truncating with int() rounded toward zero,
    # which made the bin around `tune` two steps wide (e.g. 460 Hz -> 440 Hz
    # instead of 466.16 Hz) and failed on NaN and on array input.
    discrete = np.round(tone)
    return tune * (2 ** (discrete / n_tones))

semitone_ratio = 2 ** (1 / 12)

# one processing block per pitch-analysis hop; a trailing partial block is dropped
blocksize = librosa_hop_len
n_blocks = x.size // blocksize

# NOTE: n_blocks is expected to be one less than f0.size,
# so n_blocks = f0.size - 1 could be used instead later on

y = []
# walk over the start offsets of all complete blocks
for frame_idx, start in enumerate(range(0, n_blocks * blocksize, blocksize)):
    block = x[start:start + blocksize]
    pitch = f0[frame_idx]
    if np.isnan(pitch):
        # unvoiced frame: pass the samples through untouched
        y.append(block)
        continue
    # target for this block: the analysed pitch mapped onto the 12-tone scale
    f_out = scale(pitch, 12)
    # shift needed to move the analysed pitch onto f_out, expressed in semitones
    semitones = 12 * np.log2(f_out / pitch)
    y.append(librosa.effects.pitch_shift(block, n_steps=semitones))
y = np.concatenate(y)

In [None]:
# Time axis of the processed signal: sample k lies at k / sr seconds.
# (np.linspace with the end point included would space samples N/(N-1) too wide.)
t = np.arange(y.size) / sr
plt.plot(t, y)
# must stay the last expression of the cell so the audio player is displayed
IPython.display.Audio(y, rate=sr)

## pitch-shift OLA

This method takes

- the audio signal
- the analyzed pitch signal
- the pitch analysis hop length (librosa_hop_len)
- and the wanted pitch signal

and outputs the pitch-shifted signal
(maybe employ some smoothing of pitch transitions?)

In [None]:
def pitch_shift_OLA(x, f_0, f_out, f_hop_size, N, Sa):
    """
    dynamically pitch shifting x to the desired frequencies f_out to output y

    NOTE: not implemented yet. The body holds only this docstring and the
    planned steps as comments, so a call currently returns None.

    x...input signal
    f_0...fundamental frequencies of x
    f_out...desired fundamental frequencies of y
    f_hop_size...hop size of the fundamental frequency estimation
    N...block size of OLA algorithm (analysis and synthesis)
    Sa...analysis hop size of OLA algorithm
    return y...pitch-shifted output signal (intended; nothing is returned yet)
    """
    # planned step 1: do TSM by OLA (TSM presumably = time-scale modification)
    
    # planned step 2: resampling

## Upgrade to SOLA?

## Do pitch discretization