# Voice transformation algorithm

# Introduction

This notebook describes practical approaches to transforming one speaker's voice into another's. The goal is to create a synthetic voice that sounds as natural as possible.

## Table of contents
1. [User-defined library](#userdefinedlibrary)
2. [Pitch control by resampling](#pitchctrlresample)
3. [Pitch control by PSOLA algorithm](#pitchctrlpsola)
4. Pitch & frequency stretch control by PSOLA algorithm

In [3]:
import numpy as np
import librosa 
from scipy.io import wavfile
import IPython.display as ipd
from copy import copy
import samplerate
import matplotlib.pyplot as plt

# User-defined library <a name="userdefinedlibrary"></a>

In [4]:
def frameize(x: np.array, N: int, H_a: int, hfilt: np.array) -> list:
    """Cut an audio signal into overlapping, windowed frames.

    Frame ``k`` is ``hfilt * x[k*H_a : k*H_a + N]``.  Only complete frames
    are produced; trailing samples that do not fill a whole frame are
    dropped.

    Params
    ------
    x: audio array (1-D)
    N: segment size in samples (must be positive)
    H_a: analysis hop size in samples (must be positive)
    hfilt: windowing filter, multiplied element-wise with each segment

    Returns
    -------
    frames: list of windowed segments of audio sample (empty when
        ``len(x) < N``)

    Raises
    ------
    ValueError: if ``N`` or ``H_a`` is not positive
    """
    if N <= 0 or H_a <= 0:
        # A non-positive size or hop would never reach the end of ``x``.
        raise ValueError("N and H_a must be positive")

    # Count the complete frames explicitly instead of looping until NumPy
    # raises a broadcasting error: a trailing slice of length 1 broadcasts
    # silently against the window and would yield a bogus extra frame.
    n_samples = len(x)
    n_frames = (n_samples - N) // H_a + 1 if n_samples >= N else 0

    return [hfilt * x[H_a * k:H_a * k + N] for k in range(n_frames)]


def find_hfilt_norm(hfilt: np.array, H_s: int, delta: int=0) -> np.array:
    """Compute normalization filter array for windowing effect.

    When windowed frames are overlap-added every ``H_s + delta`` samples,
    each position inside a frame is covered by the frame's own window plus
    the shifted windows of its neighbours.  This returns that per-sample sum
    so a frame can be divided by it to undo the windowing.

    All neighbours that overlap the frame are included (shifts of
    ``k * (H_s + delta)`` for every ``k >= 1`` that is still smaller than the
    window length), on both the left and the right side.

    Params
    ------
    hfilt: filter window used for our purpose
    H_s: synthesis hop size
    delta: small shift for synchronization

    Returns
    -------
    hf_norm: normalization filter array (same length as ``hfilt``; the
        input is not modified)
    """
    hfilt = np.asarray(hfilt)
    hf_norm = hfilt.copy()
    N = len(hfilt)
    hop = H_s + delta

    if hop == 0:
        # Degenerate zero hop: the two neighbours sit exactly on top of the
        # frame, so the sum is three copies of the window.
        hf_norm += 2 * hfilt
    elif 0 < hop < N:
        # Walk outwards over every neighbour that still overlaps the frame.
        for shift in range(hop, N, hop):
            # neighbour on the left: its tail overlaps our right-hand part
            hf_norm[shift:] += hfilt[:N - shift]
            # neighbour on the right: its head overlaps our left-hand part
            hf_norm[:N - shift] += hfilt[shift:]
    # Negative hop or hop >= N: no overlap is accounted for, so the
    # normalization is just the window itself.

    return hf_norm


def find_min_delta(frame1: np.array, frame2:np.array,
                   H_a: int, H_s: int, interval: int) -> int:
    """Find minimum difference lag within interval.

    For every candidate lag ``i`` in
    ``[-interval//2, -interval//2 + interval)`` the right frame is placed
    ``H_s + i`` samples after the left frame and the sum of absolute
    differences over the overlapping samples is computed.  The lag with the
    smallest sum is returned.  The sum is not divided by the overlap length,
    so larger shifts contribute fewer terms.

    Params
    ------
    frame1: left frame
    frame2: right frame (same length as ``frame1``)
    H_a: analysis hop size (not used by the search; kept for the interface)
    H_s: synthetic hop size
    interval: interval size to search for minimum difference

    Returns
    -------
    min_delta: minimum difference lag; 0 when the interval is empty or no
        candidate lag leaves the two frames overlapping
    """
    frame1 = np.asarray(frame1)
    frame2 = np.asarray(frame2)
    N = len(frame1)

    search_interval = np.arange(-interval//2, -interval//2+interval)
    if len(search_interval) == 0:
        return 0

    deltas = np.full(len(search_interval), np.inf)
    for k, lag in enumerate(search_interval):
        shift = H_s + lag
        # Only shifts that leave at least one overlapping sample are
        # comparable; everything else keeps an infinite cost.
        if 0 <= shift < N:
            deltas[k] = np.sum(np.abs(frame1[shift:] - frame2[:N - shift]))

    if not np.isfinite(deltas).any():
        return 0

    return int(search_interval[np.argmin(deltas)])

    
def distort_time(x: np.array, N: int, H_a: int,
                 hfilt: np.array, alpha: float) -> np.array:
    """Distort time of audio sample by given ratio.

    The signal is cut into windowed frames taken every ``H_a`` samples and
    the frames are overlap-added every ``H_s = round(H_a * alpha)`` samples,
    after dividing each frame by the summed window so the windowing is
    undone.  No per-frame synchronization shift is applied.

    Params
    ------
    x: audio data
    N: segment size
    H_a: analysis hop size
    hfilt: windowing filter
    alpha: time-scaling factor (synthesis hop / analysis hop)

    Returns
    -------
    out_x: time-scaled data of length ``len(frames) * H_s + N``
    """
    # put into frames
    frames = frameize(x, N, H_a, hfilt)

    H_s = int(np.round(H_a*alpha))

    # Every frame is placed on the regular H_s grid, so the normalization is
    # the same for all of them and is computed once.
    hfilt_norm = find_hfilt_norm(hfilt, H_s)
    # Positions where the summed window is zero (e.g. the end points of a
    # Hann window when frames do not overlap) would give 0/0 = NaN; those
    # samples are left at zero instead.
    usable = hfilt_norm != 0

    out_x = np.zeros(len(frames)*H_s+N)

    # time-distorting
    for i, frame in enumerate(frames):
        start = i*H_s
        normalized = np.divide(frame, hfilt_norm,
                               out=np.zeros(len(hfilt_norm)), where=usable)
        out_x[start:start+N] += normalized

    return out_x
    

def synthesize_pitch(x: np.array, sr: int, N: int, H_a: int,
                      hfilt: np.array, alpha: float) -> np.array:
    """Synthesize sound sample into new one with different pitch.

    The signal is first time-scaled by ``alpha`` with ``distort_time`` and
    then resampled by the ratio ``1/alpha``.  The result is finally scaled
    so its largest absolute sample is 1.

    Params
    ------
    x: audio data
    sr: sampling rate (not used by the computation; kept for the interface)
    N: segment size
    H_a: analysis hop size
    hfilt: windowing filter
    alpha: pitch factor

    Returns
    -------
    syn_data: synthesized data, peak-normalized to 1 (returned unscaled when
        it is empty or entirely zero)
    """
    stretched = distort_time(x, N, H_a, hfilt, alpha)

    # resampling
    syn_data = samplerate.resample(stretched, 1/alpha, 'sinc_best')

    # Peak-normalize, but never divide by zero: a silent signal would
    # otherwise turn into NaNs.
    if syn_data.size == 0:
        return syn_data
    peak = np.max(np.abs(syn_data))
    if peak > 0:
        syn_data = syn_data/peak

    return syn_data


def warp_spectrum(S, factor: float) -> np.array:
    """Frequency stretching of spectrogram.

    Every column of ``S`` is treated as samples on the normalized grid
    ``k / n_rows`` (``k = 0 .. n_rows-1``) and is re-read, by linear
    interpolation, at the grid positions multiplied by ``factor``.  Query
    positions beyond the last grid point take the value of the last row.

    Params
    ------
    S: 2-D array whose columns are warped along axis 0
    factor: multiplier applied to the normalized grid positions

    Returns
    -------
    out_S: complex array with the same shape as ``S``
    """
    warped_columns = []
    for column in S.T:
        n_bins = len(column)
        grid = np.arange(0, n_bins) / n_bins
        warped_columns.append(np.interp(grid * factor, grid, column))

    # columns were collected as rows, so transpose back
    return np.array(warped_columns, dtype=complex).T

# Pitch control by resampling <a name="pitchctrlresample"></a>

In [6]:
# NOTE: these two imports duplicate the import cell at the top of the
# notebook; `samplerate` used below still comes from that first cell.
import librosa 
import IPython.display as ipd

# sr=None asks librosa to keep the file's own sampling rate.
data, sr = librosa.load('example.wav', sr=None)
# The second argument is the resampling ratio: 0.5 yields about half as many
# samples, 1.5 about one and a half times as many.
out_data1 = samplerate.resample(data, 0.5, 'sinc_best')
out_data2 = samplerate.resample(data, 1.5, 'sinc_best')

# Both clips are played back at the original rate `sr`, so the shorter clip
# sounds higher (and faster) and the longer clip sounds lower (and slower).
print('Higher pitch:')
ipd.display(ipd.Audio(out_data1, rate=sr))
print('Lower pitch:')
ipd.display(ipd.Audio(out_data2, rate=sr))

Higher pitch:


Lower pitch:


# Pitch control by PSOLA algorithm <a name="pitchctrlpsola"></a>

In [8]:
# Framing parameters (1024 samples is roughly 23 ms at 44100 Hz)
N = 1024 # segment size in samples, chosen for a 44100 Hz sampling rate
H_a = int(N*0.5) # analysis hop size: half a segment, i.e. 50 % overlap
hfilt = np.hanning(N) # Hann window applied to every segment

# input: load at the file's own sampling rate and play it unmodified
data, sr = librosa.load('example.wav', sr=None)
ipd.display(ipd.Audio(data, rate=sr, normalize=False))

# synthesize: alpha is both the time-stretch factor and the inverse of the
# resampling ratio inside synthesize_pitch, so alpha > 1 raises the pitch
# and alpha < 1 lowers it
out_data1 = synthesize_pitch(data, sr, N, H_a, hfilt, alpha=1.5)
out_data2 = synthesize_pitch(data, sr, N, H_a, hfilt, alpha=0.9)

print('Higher pitch:')
ipd.display(ipd.Audio(out_data1, rate=sr, normalize=True))
print('Lower pitch:')
ipd.display(ipd.Audio(out_data2, rate=sr, normalize=True))

Higher pitch:


Lower pitch:


# Pitch & frequency stretch control by PSOLA algorithm

In [12]:
%%time
# Framing parameters (1024 samples is roughly 23 ms at 44100 Hz)
N = 1024 # segment size in samples, chosen for a 44100 Hz sampling rate
H_a = int(N*0.5) # analysis hop size: half a segment, i.e. 50 % overlap
hfilt = np.hanning(N) # Hann window applied to every segment

# input: load at the file's own sampling rate and play it unmodified
data, sr = librosa.load('example.wav', sr=None)
ipd.display(ipd.Audio(data, rate=sr, normalize=False))
alpha = 1.6 # pitch factor (> 1 raises the pitch)

# pitch increase (note: `data` now holds the pitch-shifted signal, not the
# loaded file)
data = synthesize_pitch(data, sr, N, H_a, hfilt, alpha=alpha)

# frequency stretching: warp every STFT column with the cube root of alpha,
# then go back to the time domain with the same hop and window length
S1 = librosa.stft(data, n_fft=512, hop_length=64)
S2 = warp_spectrum(S1, alpha**(1/3))
data = librosa.istft(S2, hop_length=64, win_length=512)

ipd.display(ipd.Audio(data, rate=sr, normalize=True))

CPU times: user 2.2 s, sys: 78.1 ms, total: 2.28 s
Wall time: 2.29 s
