# Rollers Pitch-Shifting Algorithm

The "Rollers" pitch-shifting algorithm is based on narrow subband frequency shifting.
To achieve low latency, an IIR filter bank is used.


## Prototype Design

* constant Q IIR filter bank
    - third-octave butterworth filter bank
    - with 28 bands

* frequency shifting per band using single sideband modulation
    - analytic signal via `scipy.signal.hilbert()`
    - modulation with a complex sinusoid

In [None]:
# import the needed modules
import numpy as np
from scipy import signal
from scipy.io import wavfile
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 3]
from scipy.fft import rfft, fft, rfftfreq, fftfreq
import IPython.display as ipd

In [None]:
# let's define a function for plotting the magnitude spectrum of a signal
def plot_mag_spec(sig, fs, name="", negative=False, f_range=(20, 20000), db_range=(-40, 0), f_log=True):
    """
    Plot the magnitude spectrum of the signal 'sig'.

    The signal is weighted with a Hann window of sig.size samples, transformed
    with an FFT of the same length, scaled by 1/sig.size and shown in dB.

    sig:    signal to be analyzed (numpy array)
    fs:     sampling rate of sig
    name:   name of the plotted spectrum (used as the plot title)
    negative: show negative frequencies (default: False)
    f_range: define a frequency range for the spectrum (default: 20Hz to 20kHz)
    db_range: define a magnitude range in dB for the spectrum (default: -40dB to 0dB)
    f_log:  logarithmic or linear frequency axis? (default: True is logarithmic)
    """
    n_samples = sig.size
    # window functions live in scipy.signal.windows; the old scipy.signal.hann
    # alias is no longer available in current SciPy releases
    windowed = sig * signal.windows.hann(n_samples)

    if negative:
        # positive and negative frequencies
        freq = fftfreq(n_samples, 1 / fs)
        spectrum = fft(windowed)
        # fftfreq lists the non-negative bins first; sort ascending so that the
        # line plot does not draw a stroke from the highest to the lowest bin
        ascending = np.argsort(freq)
        freq = freq[ascending]
        spectrum = spectrum[ascending]
        if f_log:
            nonzero = freq != 0  # omit 0Hz
            freq = freq[nonzero]
            spectrum = spectrum[nonzero]
    else:
        # just positive frequencies
        freq = rfftfreq(n_samples, 1 / fs)
        spectrum = rfft(windowed)

    # empty bins give -inf dB; that is fine for plotting, so silence the warning
    with np.errstate(divide="ignore"):
        mag = 20 * np.log10(np.abs(spectrum / n_samples))

    if negative:
        plt.plot(freq, mag)
        plt.xlim((-f_range[1], f_range[1]))
        if f_log:
            plt.xscale('symlog')
    else:
        if f_log:
            plt.semilogx(freq, mag)
        else:
            plt.plot(freq, mag)
        plt.xlim(f_range)

    plt.ylim(db_range)
    plt.xlabel("f [Hz]")
    plt.ylabel("amplitude [dB]")
    plt.title(name)
    plt.show()

Let's define classes for an object oriented design for the filter bank and the frequency shifter.

In [None]:
class Filterbank:
    """
    A constant Q IIR butterworth filter bank for audio signal processing.

    Attributes:
    n:       number of bands
    order:   order passed to the butterworth design of every band
    fs:      sampling rate
    fcs:     center frequency of every band; fcs[i] belongs to filters[i]
    filters: list of bandpass filters in 'sos' form, one per band
    """

    def __init__(self, n, order, fs, filt_type=""):
        """
        Design the band filters.
        n:         number of bands
        order:     order of the butterworth bandpass design
        fs:        sampling rate
        filt_type: "third octave" for a third-octave bank around 1kHz,
                   anything else for a log spaced bank from 16Hz to about 20kHz
        """
        self.n = n
        self.order = order
        self.fs = fs

        if filt_type == "third octave":
            # third-octave filter bank
            freq_offset = 2
            k = np.arange(n + 2) - n // 2 - freq_offset

            # center frequencies are defined relative to a bandpass with center frequency at 1kHz;
            # one extra center on each side is only needed to compute the outer cutoffs
            f_grid = np.power(2, k / 3) * 1000

            f_cs = f_grid[1:-1]
            # cutoffs are the geometric means of neighbouring center frequencies
            f_cls = np.sqrt(f_grid[:-2] * f_grid[1:-1])
            f_chs = np.sqrt(f_grid[1:-1] * f_grid[2:])
        else:
            # log spaced filter bank
            # 2**4 = 16Hz
            # 2**14.3 = 20171Hz
            f = np.logspace(4, 14.3, n*3, base=2)

            # every band takes three consecutive grid points:
            # low cutoff, center, high cutoff
            f_cls = f[::3]
            f_cs = f[1::3]
            f_chs = f[2::3]

        # One filter per band, in the same order as the center frequencies.
        # (Building only n-2 filters while slicing the centers with [1:-1]
        # paired every log spaced filter with the center of the next band.)
        self.fcs = f_cs
        self.filters = [self.butter_bp(low, high) for low, high in zip(f_cls, f_chs)]

    def butter_bp(self, lowcut, highcut):
        """
        Design a butterworth bandpass filter and return the 'sos' filter.
        lowcut: low cutoff frequency
        highcut: high cutoff frequency
        """
        # scipy expects the cutoffs normalized to the Nyquist frequency
        f_nyq = 0.5 * self.fs
        low = lowcut / f_nyq
        high = highcut / f_nyq
        return signal.butter(self.order, [low, high], btype='band', output='sos')

    def plot_filters(self):
        """Plot the magnitude response of all filters of the filter bank."""
        for sos in self.filters:
            w, h = signal.sosfreqz(sos, worN=10000)
            # skip the first bin (0Hz), convert rad/sample to Hz
            plt.semilogx((self.fs * 0.5 / np.pi) * w[1:], 20*np.log10(np.abs(h[1:])))
        plt.ylim((-100, 5))
        plt.xlim((10, 20000))
        plt.ylabel('H [dB]')
        plt.xlabel('f [Hz]')
        plt.title('third-octave filter bank')
        plt.show()

    def filt(self, in_sig):
        """Filter the in_sig and return a list of the filtered signals, one per band."""
        return [signal.sosfilt(sos, in_sig) for sos in self.filters]

# test the Filterbank class
#filtb = Filterbank(28, 4, 44100)
#filtb.plot_filters()
#noise = np.random.normal(0, 1, 10*1024)
#filtered_noise = filtb.filt(noise)
#plot_mag_spec(filtered_noise[10], 44100, db_range=[-100, -30])

In [None]:
class Frequency_shifter:
    """A frequency shifter for audio processing using single sideband modulation"""

    def shift(self, in_sig, fs, f_shift):
        """
        Do frequency shifting of the signal 'in_sig' by the frequency 'f_shift'.
        in_sig:  signal to be shifted (numpy array)
        fs:      sampling rate of in_sig
        f_shift: frequency shift in the unit of fs (negative values shift down)
        Returns the real part of the modulated analytic signal.
        """
        # Sample instants n/fs. np.linspace(0, size/fs, size) would include the
        # end point and stretch the spacing to size/((size-1)*fs), which makes
        # the carrier frequency slightly too high.
        t = np.arange(in_sig.size) / fs
        # the carrier is a complex exponential
        carrier = np.exp(1j*2*np.pi*f_shift*t)
        return (signal.hilbert(in_sig) * carrier).real

# test the Frequency_shifter class
#filtb = Filterbank(28, 4, 44100)
#noise = np.random.normal(0, 1, 10*1024)
#filtered_noise = filtb.filt(noise)
#plot_mag_spec(filtered_noise[20], 44100, "original", db_range=[-100, -30], f_log=False)
#
#f_shifter = Frequency_shifter()
#shifted_filtered_noise = f_shifter.shift(filtered_noise[20], 44100, 5000)
#plot_mag_spec(shifted_filtered_noise, 44100, "shifted", db_range=[-100, -30], f_log=False)

Now to the actual signal processing using the defined classes from above.

In [None]:
# load audio material
fs, x = wavfile.read("../../samples/Toms_diner.wav")

# work in floating point: integer PCM data would overflow in np.abs() for the
# most negative sample value
x = x.astype(float)
if x.ndim > 1:
    # the processing below treats x as a single channel, so mix down to mono
    x = x.mean(axis=1)

# normalize to a peak magnitude of 1 (the largest |sample|, which is not
# necessarily the largest positive sample)
x = x / np.max(np.abs(x))

plt.plot(x)
plt.title("original")
ipd.Audio(x, rate=fs)

In [None]:
# pitch shift ratio
psr = 1.25

# prototype: 28 third-octave bands, butterworth order 4
filt_bank = Filterbank(28, 4, fs, filt_type="third octave")
freq_shifter = Frequency_shifter()

# split the input into its subband signals
x_filtered = filt_bank.filt(x)

# shift every subband by the offset that moves its center frequency to fc * psr
out_signals = []
for i, band_sig in enumerate(x_filtered):
    fc = filt_bank.fcs[i]
    f_shift = fc * psr - fc
    shifted = freq_shifter.shift(band_sig, fs, f_shift)
    out_signals.append(shifted)

# sum the shifted subbands to get the output signal
out_sig = np.zeros(out_signals[0].size)
for sig in out_signals:
    out_sig = out_sig + sig

In [None]:
# plot the waveform of the summed subband signals
plt.plot(out_sig)
plt.title("'Rollers' pitch shifted")
# audio player for the result; kept as the last expression of the cell
ipd.Audio(out_sig, rate=fs)

With the very coarse filter bank for this application, the result is shifted in pitch, but it also has inharmonic frequency content and sounds like a chorus effect, since the frequency bands are too wide.
This is an interesting artifact, but to increase the sound quality, we have to design a filter bank with a lot more filters.
The paper suggests $n=73$ filters for low quality and $n=1500$ for perfect quality.
*Good results* can be achieved with $n=200$ bands.


## A design with a larger filter bank



In [None]:
# define the rollers algorithm
def rollers(x, psr, fs, n, order, filt_type=""):
    """
    Rollers pitch-shifting algorithm.

    The input is split into n subbands, every subband is frequency shifted so
    that its center frequency fc moves to fc * psr, and the shifted subbands
    are summed.

    x: input signal to be pitch-shifted
    psr: pitch shifting ratio (1: unison, 2: octave up, 0.5: octave down, etc.)
    fs:  sampling rate of x
    n: number of frequency bands of the filter bank
    order: order of the butterworth filters in the filter bank
    filt_type: filter bank layout, passed on to Filterbank
               (default "": log spaced bank, "third octave": third-octave bank)

    Returns the pitch-shifted signal.
    """
    filt_bank = Filterbank(n, order, fs, filt_type=filt_type)
    freq_shifter = Frequency_shifter()

    # filtering by the filter bank
    x_filtered = filt_bank.filt(x)

    # frequency shifting in every band; the shifted bands are accumulated
    # right away so that only one of them is held in memory at a time
    y = np.zeros(np.size(x))
    for band, fc in zip(x_filtered, filt_bank.fcs):
        f_shift = fc * psr - fc
        y += freq_shifter.shift(band, fs, f_shift)
    return y

In [None]:
# parameters: pitch shift ratio, butterworth order, number of bands
psr, order, n = 1.25, 4, 200

out_large = rollers(x, psr=psr, fs=fs, n=n, order=order)

In [None]:
# plot the waveform of the result of the preceding rollers() run (order 4)
plt.plot(out_large)
plt.title("'Rollers' pitch shifted")
# audio player for the result; kept as the last expression of the cell
ipd.Audio(out_large, rate=fs)

There are stronger downward chirp artifacts with $n=200$, but the *detuning* artifact and the chorus-type effect are far less prominent. There is also a kind of metallic reverb, which might also be a result of the filter resonances.

In [None]:
# Let's try a lower filter order (2 instead of 4), everything else unchanged
psr, order, n = 1.25, 2, 200

out_large2 = rollers(x, psr=psr, fs=fs, n=n, order=order)

In [None]:
# plot the waveform of the result of the preceding rollers() run (order 2)
plt.plot(out_large2)
plt.title("'Rollers' pitch shifted")
# audio player for the result; kept as the last expression of the cell
ipd.Audio(out_large2, rate=fs)

With order 2 filters, the chirp and reverb artifacts are less prominent, but the *detuning* artifact is audible.


## Further improvement

### sound quality

* Formant preservation (adds latency)

* Allow notches between the bands to reduce _detuning_ at the expense of perfect reconstruction

* To reduce resonance force frequency width of the lower bands to some fixed minimum width, and use a logarithmic scale only for the higher bands.


### performance

* IIR allpass filters instead of true Hilbert transform

* polyphase filterbank (downsampling)?


Let's check the spectrogram of the impulse response of the filter bank to identify the resonance frequencies.

In [None]:
# excite the filter bank with a unit impulse; psr = 1 gives a zero shift in every band
x = signal.unit_impulse(10 * 1024)
psr, order, n = 1, 2, 200

y = rollers(x, psr=psr, fs=fs, n=n, order=order)

# plot spectrogram
f, t, Sxx = signal.spectrogram(y, fs=fs, nperseg=256)
plt.pcolormesh(t, f, Sxx, shading='gouraud')
plt.yscale('symlog')
plt.ylim(100, 20000)
plt.ylabel('f [Hz]')
plt.xlabel('t [sec]')
plt.show()

In [None]:
# let's try it with white noise to get some energy:
# a noise burst followed by silence, so the decay of the bands becomes visible
noise_burst = np.random.normal(loc=0, scale=1, size=10 * 1024)
silence = np.zeros(25 * 1024)
x = np.concatenate((noise_burst, silence))
psr, order, n = 1, 2, 200

y = rollers(x, psr=psr, fs=fs, n=n, order=order)

# plot spectrogram
f, t, Sxx = signal.spectrogram(y, fs=fs, nperseg=512)
plt.pcolormesh(t, f, Sxx, shading='gouraud')
plt.yscale('symlog')
plt.ylim(10, 20000)
plt.ylabel('f [Hz]')
plt.xlabel('t [sec]')
plt.show()

In [None]:
# audio player for the filter bank output y computed in the preceding cell
ipd.Audio(y, rate=fs)

## Time variant pitch-shifting

For using the Rollers algorithm in a pitch-discretization effect, time variant pitch-shifting is needed.
A pitch-tracking algorithm like pYIN provides

- an estimated fundamental frequency for every block

- the pitch-tracking analysis block size

In [None]:
# analysis block size
N = 10*1000

# pitch estimates
f0 = np.asarray([100, 200, 300, 400, 500, 600, 1000, 1500, 2000, 1000])

# generate input signal: one Hann-windowed sine tone of N samples per pitch estimate
fs = 44100
dur = N / fs
# sample instants n/fs (an end-point inclusive linspace would stretch the
# spacing and detune the tones slightly)
t = np.arange(N) / fs
# window functions live in scipy.signal.windows; the old scipy.signal.hann
# alias is no longer available in current SciPy releases
tone_window = signal.windows.hann(N)
# build all tones first and join them once instead of growing x in a loop
x = np.concatenate([np.sin(2*np.pi*f_tone*t) * tone_window for f_tone in f0])

plt.plot(x)
ipd.Audio(x, rate=fs)

Now we do time variant pitch shifting to a constant target pitch.
This leads to a time variant _pitch-shifting ratio_.

In [None]:
# target pitch
ft = 500

# filter bank parameters
order, n = 2, 200


# Rollers
filt_bank = Filterbank(n, order, fs)
freq_shifter = Frequency_shifter()

# split the input into its subband signals
x_filtered = filt_bank.filt(x)

# one pitch shifting ratio per pitch analysis block
psr = ft / f0
print("pitch shift ratios:", psr)

# frequency shifting in every band
out_signals = []
# NOTE(review): linspace includes the end point, so the spacing of t is
# slightly larger than 1/fs
n_samples = x_filtered[0].size
t = np.linspace(0, n_samples/fs, n_samples)
for i, x_band in enumerate(x_filtered):
    # one carrier frequency per analysis block for this band
    fc = filt_bank.fcs[i]
    f_shift = fc * psr - fc

    # piecewise carrier: block j of N samples uses the frequency f_shift[j],
    # evaluated on the absolute time axis
    carrier = np.zeros(x_band.size, dtype=complex)
    for j, f in enumerate(f_shift):
        # discontinuous carrier frequency causes cracks
        block = slice(j*N, (j+1)*N)
        carrier[block] = np.exp(1j*2*np.pi*f*t[block])
    band = (signal.hilbert(x_band) * carrier).real

    out_signals.append(band)

# add bands together
y = np.zeros(out_signals[0].size)
for sig in out_signals:
    y += sig

In [None]:
# waveform of the time variant pitch-shifted signal from the preceding cell
plt.plot(y)
# audio player for the result; kept as the last expression of the cell
ipd.Audio(y, rate=fs)

The time variant pitch shifting to a constant pitch works, since the pitch stays constant.
The amplitude fluctuations are from the detuning artifact, which is most prominent on sine tones.
The discontinuities are from the discontinuous change of carrier frequencies.
To remove these cracks, the carrier frequencies might be interpolated or filtered.

In [None]:
# taller figure for the four stacked panels
plt.rcParams['figure.figsize'] = [15, 9]
fig, ax = plt.subplots(4)
ax_real, ax_imag, ax_orig, ax_shift = ax

# carrier of the band processed last in the preceding loop
ax_real.plot(carrier.real)
ax_real.set_ylabel("real")
ax_real.set_title("carrier")
ax_imag.plot(carrier.imag)
ax_imag.set_ylabel("imag")

# input and output signal
ax_orig.plot(x)
ax_orig.set_title("original")
ax_shift.plot(y)
ax_shift.set_title("shifted")
plt.show()

# back to the flat figure size for the following plots
plt.rcParams['figure.figsize'] = [15, 3]

### Fixing the carrier signal with upsampling

Now let's get rid of the crackling caused by the discontinuous frequency for the carrier signal by _upsampling_ and interpolating it.

From the pitch estimates $f_0$, the pitch shifting ratio $psr$ is calculated by $psr = \frac{f_{target}}{f_0}$.
Then for every band with center frequency $f_c$, the frequency shift is calculated by $f_{shift} = f_c \cdot psr - f_c$

In [None]:
# pitch estimates, one per analysis block
f0 = np.array([100, 200, 300, 400, 500, 600, 1000, 1500, 2000, 1000])

# take the band at 1kHz
fc = 1000

# target pitch and the resulting pitch shifting ratio per block
ft = 500
psr = ft / f0

# frequency shift of the example band per block
f_shift = fc * psr - fc

plt.stem(f_shift)
plt.title("f_shift")
plt.show()

Since the pitch analysis block size $N$ is much bigger than the audio sampling period, for every frequency step, a click appears, which is audible as a crackling sound.
If we upsample this signal and interpolate it, we get a smooth transition between the frequencies and there should be no audible click or crackling in the output signal.
We have to investigate, what the interpolation of the frequency does to the overall algorithm.

In [None]:
# resample the control signal to 20 times as many points with scipy's resample()
n_resampled = 20 * f_shift.size
f_shift_resampled = signal.resample(f_shift, n_resampled)

plt.stem(f_shift_resampled)
plt.title("f_shift_resampled")
plt.show()

The `scipy.signal.resample()` function uses the fourier method, which leads to artifacts in the frequency signal.
So let's do the resampling ourselves with linear interpolation or, equivalently, with a moving average filter.
(see Oppenheim and Schafer)

In [None]:
# sampling rate expansion: keep every original value and put L-1 zeros after it
L = 10 # upsampling factor
n_expanded = f_shift.size * L
f_expanded = np.zeros(n_expanded)
f_expanded[0::L] = f_shift
plt.stem(f_expanded)
plt.show()

In [None]:
# moving average FIR
# the impulse response is a triangular window with 2*L - 1 taps
h = signal.windows.triang(2*L - 1)
plt.stem(h)
plt.title("moving average impulse response")
plt.show()

In [None]:
# filtering (mode="same" truncates the convolution tail, so the output has
# as many samples as the expanded input)
f_upsampled = signal.convolve(f_expanded, h, mode="same")

# taller figure for the three stacked panels
plt.rcParams['figure.figsize'] = [15, 9]
fig, ax = plt.subplots(3)
ax_orig, ax_expanded, ax_filtered = ax

ax_orig.stem(f_shift)
ax_orig.set_ylabel("original")
ax_orig.set_title("upsampling")

ax_expanded.stem(f_expanded)
ax_expanded.set_ylabel("expanded")

ax_filtered.stem(f_upsampled)
ax_filtered.set_ylabel("moving average filtered")
plt.show()

So, this works fine, but we want a different value succession.
We want the values to stay constant and vary smoothly at the changes.

In [None]:
# hold every value for L samples instead of inserting zeros
f_repeated = np.repeat(f_shift, L)
# h is the triangular kernel from the cell above. For a held (repeated) signal
# the kernel has to be scaled to unity DC gain, otherwise the result is the
# smoothed signal multiplied by the sum of the taps.
f_upsampled = signal.convolve(f_repeated, h / h.sum(), mode="same")

fig, ax = plt.subplots(3)
ax[0].stem(f_shift)
ax[0].set_ylabel("original")
ax[0].set_title("upsampling")
ax[1].stem(f_repeated)
ax[1].set_ylabel("repeated")
ax[2].stem(f_upsampled)
ax[2].set_ylabel("moving average filtered")
plt.show()

TODO: either insert zeros and apply the moving average filter, or repeat the last value!
A moving average filter for a signal with repeated values has to be scaled by a factor (the IR length) in order to smooth it!

It looks like there is too much smoothing going on.
Since we just want to smooth the edges, an IR of length 3 should be right.
This can be adjusted and varies according to the upsampling factor L, if there are still audible clicks.

In [None]:
# short triangular kernel, scaled to unity DC gain so that the constant
# stretches of the held signal keep their value and only the edges are smoothed
h = signal.windows.triang(3)
h = h / h.sum()
f_upsampled = signal.convolve(f_repeated, h, mode="same")

fig, ax = plt.subplots(3)
ax[0].stem(f_shift)
ax[0].set_ylabel("original")
ax[0].set_title("upsampling")
ax[1].stem(f_repeated)
ax[1].set_ylabel("repeated")
ax[2].stem(f_upsampled)
ax[2].set_ylabel("moving average filtered")
plt.show()

So let's add the upsampling to the Rollers algorithm and check the audio quality of the result.

In [None]:
# target pitch
ft = 500

# filter bank parameters
order = 2
n = 200

# control signal upsampling factor
L = 500
# length of control signal smoothing filter
l = 160
# triangular smoothing kernel, scaled to unity DC gain so that smoothing does
# not change the level of the control signal
h_smooth = signal.windows.triang(l)
h_smooth = h_smooth / h_smooth.sum()
M = N // L
print("upsampling L:", L, "smoothing l:", l, "block length M:", M)

# Rollers
filt_bank = Filterbank(n, order, fs)

# divide input into frequency bands
x_filtered = filt_bank.filt(x)

# calculate the pitch shifting ratio for every pitch analysis block
psr = ft / f0

# The shift of a band is f_shift = fc * (psr - 1). Repeating, smoothing and
# integrating are all linear, so the control signal (psr - 1) is processed
# once and only scaled by fc per band.

# control signal upsampling: hold every value for L control samples
ratio_ctrl = np.repeat(psr - 1, L)

# smoothing; the signal is extended with its edge values first, so that the
# zero padding of the convolution does not pull both ends towards zero
pad = l // 2
ratio_padded = np.pad(ratio_ctrl, pad, mode="edge")
ratio_smooth = signal.convolve(ratio_padded, h_smooth, mode="same")[pad:pad + ratio_ctrl.size]

# bring the control signal to the audio rate (M audio samples per control
# sample) and make it exactly as long as the audio signal
n_samples = x.size
ratio_inst = np.repeat(ratio_smooth, M)
if ratio_inst.size < n_samples:
    ratio_inst = np.pad(ratio_inst, (0, n_samples - ratio_inst.size), mode="edge")
ratio_inst = ratio_inst[:n_samples]

# Frequency modulation: the carrier phase is the running integral of the
# instantaneous frequency. Evaluating exp(j*2*pi*f*t) on the absolute time axis
# with a changing f is not a valid frequency modulation: the phase jumps at
# every change of f and the jumps grow with t.
unit_phase = 2*np.pi * np.concatenate(([0.0], np.cumsum(ratio_inst[:-1]))) / fs

# frequency shifting in every band
out_signals_smooth = []
for band_sig, fc in zip(x_filtered, filt_bank.fcs):
    # phase of this band's carrier: fc times the integrated (psr - 1)
    carrier = np.exp(1j * fc * unit_phase)
    band = (signal.hilbert(band_sig) * carrier).real

    out_signals_smooth.append(band)

# add bands together
y_smooth = np.zeros(n_samples)
for sig in out_signals_smooth:
    y_smooth += sig

In [None]:
# waveform of the result computed with the smoothed control signal
plt.plot(y_smooth)
# audio player for the result; kept as the last expression of the cell
ipd.Audio(y_smooth, rate=fs)

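Evaluating the carrier as $e^{j 2 \pi f t}$ with a time-varying $f$ does not work, because that is not a correct frequency modulation: the carrier phase has to be the running integral of the instantaneous frequency.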