In [1]:
import sys
import os

SOURCE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__name__)))
sys.path.insert(0, SOURCE_DIR)

In [2]:
import torch
import librosa
import malaya_speech
import scipy
import numpy as np
from packaging import version

In [3]:
import tensorflow as tf

# tf.compat.v1.enable_eager_execution()

In [4]:
y, sr = malaya_speech.load('../speech/example-speaker/shafiqah-idayu.wav')
len(y) / sr, sr

(3.518625, 16000)

In [5]:
class STFT(torch.nn.Module):
    """computes the Short-Term Fourier Transform (STFT).
    This class computes the Short-Term Fourier Transform of an audio signal.
    It supports multi-channel audio inputs (batch, time, channels).
    Arguments
    ---------
    sample_rate : int
        Sample rate of the input audio signal (e.g 16000).
    win_length : float
        Length (in ms) of the sliding window used to compute the STFT.
    hop_length : float
        Length (in ms) of the hope of the sliding window used to compute
        the STFT.
    n_fft : int
        Number of fft point of the STFT. It defines the frequency resolution
        (n_fft should be <= than win_len).
    window_fn : function
        A function that takes an integer (number of samples) and outputs a
        tensor to be multiplied with each window before fft.
    normalized_stft : bool
        If True, the function returns the  normalized STFT results,
        i.e., multiplied by win_length^-0.5 (default is False).
    center : bool
        If True (default), the input will be padded on both sides so that the
        t-th frame is centered at time t×hop_length. Otherwise, the t-th frame
        begins at time t×hop_length.
    pad_mode : str
        It can be 'constant','reflect','replicate', 'circular', 'reflect'
        (default). 'constant' pads the input tensor boundaries with a
        constant value. 'reflect' pads the input tensor using the reflection
        of the input boundary. 'replicate' pads the input tensor using
        replication of the input boundary. 'circular' pads using  circular
        replication.
    onesided : True
        If True (default) only returns nfft/2 values. Note that the other
        samples are redundant due to the Fourier transform conjugate symmetry.
    Example
    -------
    >>> import torch
    >>> compute_STFT = STFT(
    ...     sample_rate=16000, win_length=25, hop_length=10, n_fft=400
    ... )
    >>> inputs = torch.randn([10, 16000])
    >>> features = compute_STFT(inputs)
    >>> features.shape
    torch.Size([10, 101, 201, 2])
    """

    def __init__(
        self,
        sample_rate,
        win_length=25,
        hop_length=10,
        n_fft=400,
        window_fn=torch.hamming_window,
        normalized_stft=False,
        center=True,
        pad_mode="constant",
        onesided=True,
    ):
        super().__init__()
        self.sample_rate = sample_rate
        self.win_length = win_length
        self.hop_length = hop_length
        self.n_fft = n_fft
        self.normalized_stft = normalized_stft
        self.center = center
        self.pad_mode = pad_mode
        self.onesided = onesided

        # Convert win_length and hop_length from ms to samples
        self.win_length = int(
            round((self.sample_rate / 1000.0) * self.win_length)
        )
        self.hop_length = int(
            round((self.sample_rate / 1000.0) * self.hop_length)
        )

        self.window = window_fn(self.win_length)

    def forward(self, x):
        """Returns the STFT generated from the input waveforms.
        Arguments
        ---------
        x : tensor
            A batch of audio signals to transform.
        """

        # Managing multi-channel stft
        or_shape = x.shape
        if len(or_shape) == 3:
            x = x.transpose(1, 2)
            x = x.reshape(or_shape[0] * or_shape[2], or_shape[1])

        if version.parse(torch.__version__) <= version.parse("1.6.0"):
            stft = torch.stft(
                x,
                self.n_fft,
                self.hop_length,
                self.win_length,
                self.window.to(x.device),
                self.center,
                self.pad_mode,
                self.normalized_stft,
                self.onesided,
            )
        else:
            stft = torch.stft(
                x,
                self.n_fft,
                self.hop_length,
                self.win_length,
                self.window.to(x.device),
                self.center,
                self.pad_mode,
                self.normalized_stft,
                self.onesided,
                return_complex=False,
            )
        # return stft

        # Retrieving the original dimensionality (batch,time, channels)
        if len(or_shape) == 3:
            stft = stft.reshape(
                or_shape[0],
                or_shape[2],
                stft.shape[1],
                stft.shape[2],
                stft.shape[3],
            )
            stft = stft.permute(0, 3, 2, 4, 1)
        else:
            # (batch, time, channels)
            stft = stft.transpose(2, 1)

        return stft
    
stft = STFT(sr)

In [10]:
y_pt = torch.from_numpy(np.expand_dims(y, 0).astype(np.float32))
s_pt = stft(y_pt)
s_pt.shape

torch.Size([1, 352, 201, 2])

In [11]:
win_length = int(
    round((sr / 1000.0) * 25)
)
hop_length = int(
    round((sr / 1000.0) * 10)
)

In [12]:
s = librosa.stft(y, n_fft = 400, hop_length=hop_length, win_length=win_length, window=stft.window.detach().numpy(),
            pad_mode = 'constant')
s = np.concatenate([np.expand_dims(s.real, -1), np.expand_dims(s.imag, -1)], -1)
s.shape

(201, 352, 2)

In [13]:
np.transpose(s, (1, 0, 2)).shape

(352, 201, 2)

In [14]:
ECAPA_TDNN_WINDOWS = np.array(
    [
        0.08000001311302185,
        0.08005675673484802,
        0.08022701740264893,
        0.08051067590713501,
        0.08090773224830627,
        0.08141803741455078,
        0.08204150199890137,
        0.0827779769897461,
        0.08362725377082825,
        0.08458912372589111,
        0.08566337823867798,
        0.08684971928596497,
        0.08814787864685059,
        0.08955749869346619,
        0.09107831120491028,
        0.09270986914634705,
        0.09445175528526306,
        0.09630361199378967,
        0.0982649028301239,
        0.10033521056175232,
        0.10251399874687195,
        0.1048007607460022,
        0.10719487071037292,
        0.10969573259353638,
        0.11230283975601196,
        0.11501544713973999,
        0.11783286929130554,
        0.12075451016426086,
        0.12377956509590149,
        0.1269073486328125,
        0.13013699650764465,
        0.13346782326698303,
        0.1368989646434784,
        0.1404295265674591,
        0.14405867457389832,
        0.14778554439544678,
        0.15160918235778809,
        0.15552863478660583,
        0.15954294800758362,
        0.16365116834640503,
        0.16785219311714172,
        0.1721450686454773,
        0.1765287220478058,
        0.18100205063819885,
        0.18556392192840576,
        0.19021326303482056,
        0.1949489414691925,
        0.1997697353363037,
        0.20467448234558105,
        0.20966193079948425,
        0.2147308886051178,
        0.2198801338672638,
        0.22510835528373718,
        0.2304142713546753,
        0.23579657077789307,
        0.24125391244888306,
        0.2467850148677826,
        0.2523884177207947,
        0.25806280970573425,
        0.26380670070648193,
        0.26961880922317505,
        0.2754976153373718,
        0.28144165873527527,
        0.28744953870773315,
        0.2935197353363037,
        0.29965072870254517,
        0.30584096908569336,
        0.31208905577659607,
        0.318393349647522,
        0.3247523307800293,
        0.3311644196510315,
        0.337628036737442,
        0.3441416025161743,
        0.35070347785949707,
        0.35731202363967896,
        0.36396563053131104,
        0.37066274881362915,
        0.37740159034729004,
        0.38418057560920715,
        0.3909980058670044,
        0.39785221219062805,
        0.40474146604537964,
        0.41166412830352783,
        0.41861844062805176,
        0.4256027042865753,
        0.4326151907444,
        0.4396541714668274,
        0.44671788811683655,
        0.4538046419620514,
        0.46091267466545105,
        0.46804019808769226,
        0.4751855134963989,
        0.482346773147583,
        0.4895222783088684,
        0.49671024084091187,
        0.5039088726043701,
        0.5111164450645447,
        0.5183310508728027,
        0.5255510807037354,
        0.5327746868133545,
        0.5400000214576721,
        0.5472254157066345,
        0.5544490218162537,
        0.5616689920425415,
        0.5688837170600891,
        0.5760912299156189,
        0.5832898616790771,
        0.5904778242111206,
        0.597653329372406,
        0.6048146486282349,
        0.6119599342346191,
        0.6190874576568604,
        0.62619549036026,
        0.6332822442054749,
        0.6403459906578064,
        0.6473849415779114,
        0.6543974280357361,
        0.661381721496582,
        0.6683359146118164,
        0.6752586364746094,
        0.6821478605270386,
        0.6890020370483398,
        0.6958194971084595,
        0.7025984525680542,
        0.7093373537063599,
        0.7160344123840332,
        0.7226880788803101,
        0.7292966842651367,
        0.7358585596084595,
        0.7423720955848694,
        0.7488356828689575,
        0.7552477717399597,
        0.761606752872467,
        0.7679110765457153,
        0.7741591334342957,
        0.7803494334220886,
        0.7864804267883301,
        0.7925505638122559,
        0.7985584735870361,
        0.8045024871826172,
        0.810381293296814,
        0.8161934018135071,
        0.8219373226165771,
        0.8276116847991943,
        0.8332151174545288,
        0.838746190071106,
        0.8442035913467407,
        0.8495858907699585,
        0.8548917770385742,
        0.8601199984550476,
        0.8652691841125488,
        0.8703380823135376,
        0.8753255605697632,
        0.8802303075790405,
        0.8850511312484741,
        0.8897868394851685,
        0.8944361209869385,
        0.8989980220794678,
        0.9034713506698608,
        0.9078550338745117,
        0.9121478796005249,
        0.916348934173584,
        0.920457124710083,
        0.9244714379310608,
        0.9283908605575562,
        0.9322144985198975,
        0.9359413981437683,
        0.9395705461502075,
        0.9431011080741882,
        0.9465322494506836,
        0.949863076210022,
        0.9530927538871765,
        0.9562205076217651,
        0.9592455625534058,
        0.9621672034263611,
        0.964984655380249,
        0.967697262763977,
        0.9703043103218079,
        0.9728052020072937,
        0.9751993417739868,
        0.9774860739707947,
        0.9796648621559143,
        0.9817351698875427,
        0.983696460723877,
        0.9855483174324036,
        0.9872902035713196,
        0.9889217615127563,
        0.9904425144195557,
        0.9918521642684937,
        0.9931503534317017,
        0.9943366646766663,
        0.9954109191894531,
        0.9963728189468384,
        0.9972220659255981,
        0.9979585409164429,
        0.9985820055007935,
        0.9990923404693604,
        0.9994893670082092,
        0.9997730255126953,
        0.9999432563781738,
        1.0,
        0.9999432563781738,
        0.9997730255126953,
        0.9994893670082092,
        0.9990923404693604,
        0.9985820055007935,
        0.9979585409164429,
        0.9972220659255981,
        0.9963728189468384,
        0.9954109191894531,
        0.9943366050720215,
        0.9931502938270569,
        0.9918521642684937,
        0.9904425144195557,
        0.9889217615127563,
        0.9872901439666748,
        0.9855482578277588,
        0.9836964011192322,
        0.981735110282898,
        0.9796648025512695,
        0.9774860143661499,
        0.9751992225646973,
        0.9728051424026489,
        0.9703042507171631,
        0.9676971435546875,
        0.9649845361709595,
        0.9621671438217163,
        0.959245502948761,
        0.9562203884124756,
        0.9530926942825317,
        0.9498629570007324,
        0.946532130241394,
        0.9431010484695435,
        0.939570426940918,
        0.9359412789344788,
        0.9322144985198975,
        0.9283908605575562,
        0.924471378326416,
        0.920457124710083,
        0.9163488745689392,
        0.9121478199958801,
        0.9078549146652222,
        0.9034712910652161,
        0.898997962474823,
        0.8944361209869385,
        0.8897867202758789,
        0.8850510716438293,
        0.8802303075790405,
        0.8753255605697632,
        0.8703380823135376,
        0.8652690649032593,
        0.8601198196411133,
        0.8548916578292847,
        0.8495857119560242,
        0.8442034721374512,
        0.8387460708618164,
        0.8332149982452393,
        0.8276115655899048,
        0.8219372034072876,
        0.8161932229995728,
        0.8103811740875244,
        0.8045023679733276,
        0.7985582947731018,
        0.7925504446029663,
        0.7864802479743958,
        0.7803492546081543,
        0.7741589546203613,
        0.767910897731781,
        0.7616065740585327,
        0.7552475929260254,
        0.748835563659668,
        0.7423719167709351,
        0.7358583807945251,
        0.7292965054512024,
        0.7226879000663757,
        0.7160342931747437,
        0.7093371748924255,
        0.7025983333587646,
        0.6958193182945251,
        0.6890019178390503,
        0.6821476817131042,
        0.6752583980560303,
        0.6683357954025269,
        0.6613814830780029,
        0.654397189617157,
        0.6473847031593323,
        0.6403457522392273,
        0.6332820057868958,
        0.6261952519416809,
        0.6190872192382812,
        0.61195969581604,
        0.6048144102096558,
        0.5976530909538269,
        0.5904775857925415,
        0.583289623260498,
        0.5760909914970398,
        0.5688834190368652,
        0.5616687536239624,
        0.5544487237930298,
        0.5472253561019897,
        0.5400000214576721,
        0.5327746272087097,
        0.5255510807037354,
        0.5183310508728027,
        0.5111163854598999,
        0.5039088129997253,
        0.4967101812362671,
        0.48952221870422363,
        0.48234671354293823,
        0.47518542408943176,
        0.4680401384830475,
        0.4609125852584839,
        0.4538045823574066,
        0.4467178285121918,
        0.43965408205986023,
        0.43261510133743286,
        0.42560261487960815,
        0.418618381023407,
        0.41166406869888306,
        0.40474140644073486,
        0.3978521227836609,
        0.3909979462623596,
        0.3841805160045624,
        0.37740153074264526,
        0.3706626296043396,
        0.36396557092666626,
        0.3573119044303894,
        0.3507033586502075,
        0.34414148330688477,
        0.33762791752815247,
        0.33116430044174194,
        0.32475221157073975,
        0.3183932304382324,
        0.3120889365673065,
        0.3058408796787262,
        0.2996505796909332,
        0.29351961612701416,
        0.2874494194984436,
        0.2814415693283081,
        0.2754974961280823,
        0.2696186900138855,
        0.26380661129951477,
        0.2580626606941223,
        0.2523882985115051,
        0.24678486585617065,
        0.2412538230419159,
        0.23579645156860352,
        0.23041415214538574,
        0.22510823607444763,
        0.21988001465797424,
        0.21473079919815063,
        0.2096618115901947,
        0.20467433333396912,
        0.19976958632469177,
        0.19494882225990295,
        0.1902131736278534,
        0.1855638027191162,
        0.1810019314289093,
        0.17652860283851624,
        0.17214497923851013,
        0.16785207390785217,
        0.16365104913711548,
        0.15954285860061646,
        0.15552854537963867,
        0.15160906314849854,
        0.14778542518615723,
        0.14405855536460876,
        0.14042943716049194,
        0.13689884543418884,
        0.13346773386001587,
        0.13013693690299988,
        0.12690722942352295,
        0.12377947568893433,
        0.1207544207572937,
        0.11783277988433838,
        0.11501532793045044,
        0.1123027503490448,
        0.1096956729888916,
        0.10719478130340576,
        0.10480067133903503,
        0.10251393914222717,
        0.10033515095710754,
        0.09826484322547913,
        0.09630352258682251,
        0.09445169568061829,
        0.09270983934402466,
        0.09107828140258789,
        0.08955749869346619,
        0.0881478488445282,
        0.08684971928596497,
        0.08566337823867798,
        0.08458912372589111,
        0.08362725377082825,
        0.0827779769897461,
        0.08204150199890137,
        0.08141803741455078,
        0.08090773224830627,
        0.08051067590713501,
        0.08022701740264893,
        0.08005675673484802,
    ]
)


In [15]:
import math

class Filterbank(torch.nn.Module):
    """computes filter bank (FBANK) features given spectral magnitudes.
    Arguments
    ---------
    n_mels : float
        Number of Mel filters used to average the spectrogram.
    log_mel : bool
        If True, it computes the log of the FBANKs.
    filter_shape : str
        Shape of the filters ('triangular', 'rectangular', 'gaussian').
    f_min : int
        Lowest frequency for the Mel filters.
    f_max : int
        Highest frequency for the Mel filters.
    n_fft : int
        Number of fft points of the STFT. It defines the frequency resolution
        (n_fft should be<= than win_len).
    sample_rate : int
        Sample rate of the input audio signal (e.g, 16000)
    power_spectrogram : float
        Exponent used for spectrogram computation.
    amin : float
        Minimum amplitude (used for numerical stability).
    ref_value : float
        Reference value used for the dB scale.
    top_db : float
        Top dB valu used for log-mels.
    freeze : bool
        If False, it the central frequency and the band of each filter are
        added into nn.parameters. If True, the standard frozen features
        are computed.
    param_change_factor: bool
        If freeze=False, this parameter affects the speed at which the filter
        parameters (i.e., central_freqs and bands) can be changed.  When high
        (e.g., param_change_factor=1) the filters change a lot during training.
        When low (e.g. param_change_factor=0.1) the filter parameters are more
        stable during training
    param_rand_factor: float
        This parameter can be used to randomly change the filter parameters
        (i.e, central frequencies and bands) during training.  It is thus a
        sort of regularization. param_rand_factor=0 does not affect, while
        param_rand_factor=0.15 allows random variations within +-15% of the
        standard values of the filter parameters (e.g., if the central freq
        is 100 Hz, we can randomly change it from 85 Hz to 115 Hz).
    Example
    -------
    >>> import torch
    >>> compute_fbanks = Filterbank()
    >>> inputs = torch.randn([10, 101, 201])
    >>> features = compute_fbanks(inputs)
    >>> features.shape
    torch.Size([10, 101, 40])
    """

    def __init__(
        self,
        n_mels=80,
        log_mel=True,
        filter_shape="triangular",
        f_min=0,
        f_max=8000,
        n_fft=400,
        sample_rate=16000,
        power_spectrogram=2,
        amin=1e-10,
        ref_value=1.0,
        top_db=80.0,
        param_change_factor=1.0,
        param_rand_factor=0.0,
        freeze=True,
    ):
        super().__init__()
        self.n_mels = n_mels
        self.log_mel = log_mel
        self.filter_shape = filter_shape
        self.f_min = f_min
        self.f_max = f_max
        self.n_fft = n_fft
        self.sample_rate = sample_rate
        self.power_spectrogram = power_spectrogram
        self.amin = amin
        self.ref_value = ref_value
        self.top_db = top_db
        self.freeze = freeze
        self.n_stft = self.n_fft // 2 + 1
        self.db_multiplier = math.log10(max(self.amin, self.ref_value))
        self.device_inp = torch.device("cpu")
        self.param_change_factor = param_change_factor
        self.param_rand_factor = param_rand_factor

        if self.power_spectrogram == 2:
            self.multiplier = 10
        else:
            self.multiplier = 20

        # Make sure f_min < f_max
        if self.f_min >= self.f_max:
            err_msg = "Require f_min: %f < f_max: %f" % (
                self.f_min,
                self.f_max,
            )
            logger.error(err_msg, exc_info=True)

        # Filter definition
        mel = torch.linspace(
            self._to_mel(self.f_min), self._to_mel(self.f_max), self.n_mels + 2
        )
        hz = self._to_hz(mel)

        # Computation of the filter bands
        band = hz[1:] - hz[:-1]
        self.band = band[:-1]
        self.f_central = hz[1:-1]

        # Adding the central frequency and the band to the list of nn param
        if not self.freeze:
            self.f_central = torch.nn.Parameter(
                self.f_central / (self.sample_rate * self.param_change_factor)
            )
            self.band = torch.nn.Parameter(
                self.band / (self.sample_rate * self.param_change_factor)
            )

        # Frequency axis
        all_freqs = torch.linspace(0, self.sample_rate // 2, self.n_stft)

        # Replicating for all the filters
        self.all_freqs_mat = all_freqs.repeat(self.f_central.shape[0], 1)

    def forward(self, spectrogram):
        """Returns the FBANks.
        Arguments
        ---------
        x : tensor
            A batch of spectrogram tensors.
        """
        # Computing central frequency and bandwidth of each filter
        f_central_mat = self.f_central.repeat(
            self.all_freqs_mat.shape[1], 1
        ).transpose(0, 1)
        band_mat = self.band.repeat(self.all_freqs_mat.shape[1], 1).transpose(
            0, 1
        )

        # Uncomment to print filter parameters
        # print(self.f_central*self.sample_rate * self.param_change_factor)
        # print(self.band*self.sample_rate* self.param_change_factor)

        # Creation of the multiplication matrix. It is used to create
        # the filters that average the computed spectrogram.
        if not self.freeze:
            f_central_mat = f_central_mat * (
                self.sample_rate
                * self.param_change_factor
                * self.param_change_factor
            )
            band_mat = band_mat * (
                self.sample_rate
                * self.param_change_factor
                * self.param_change_factor
            )

        # Regularization with random changes of filter central frequnecy and band
        elif self.param_rand_factor != 0 and self.training:
            rand_change = (
                1.0
                + torch.rand(2) * 2 * self.param_rand_factor
                - self.param_rand_factor
            )
            f_central_mat = f_central_mat * rand_change[0]
            band_mat = band_mat * rand_change[1]
        
        print(f_central_mat.shape, band_mat.shape)
        fbank_matrix = self._create_fbank_matrix(f_central_mat, band_mat).to(
            spectrogram.device
        )
        # return fbank_matrix
        

        sp_shape = spectrogram.shape
        print(sp_shape)

        # Managing multi-channels case (batch, time, channels)
        if len(sp_shape) == 4:
            spectrogram = spectrogram.reshape(
                sp_shape[0] * sp_shape[3], sp_shape[1], sp_shape[2]
            )
            print(spectrogram.shape)

        # FBANK computation
        fbanks = torch.matmul(spectrogram, fbank_matrix)
        #return fbanks
        if self.log_mel:
            fbanks = self._amplitude_to_DB(fbanks)

        # Reshaping in the case of multi-channel inputs
        if len(sp_shape) == 4:
            fb_shape = fbanks.shape
            fbanks = fbanks.reshape(
                sp_shape[0], fb_shape[1], fb_shape[2], sp_shape[3]
            )

        return fbanks

    @staticmethod
    def _to_mel(hz):
        """Returns mel-frequency value corresponding to the input
        frequency value in Hz.
        Arguments
        ---------
        x : float
            The frequency point in Hz.
        """
        return 2595 * math.log10(1 + hz / 700)

    @staticmethod
    def _to_hz(mel):
        """Returns hz-frequency value corresponding to the input
        mel-frequency value.
        Arguments
        ---------
        x : float
            The frequency point in the mel-scale.
        """
        return 700 * (10 ** (mel / 2595) - 1)

    def _triangular_filters(self, all_freqs, f_central, band):
        """Returns fbank matrix using triangular filters.
        Arguments
        ---------
        all_freqs : Tensor
            Tensor gathering all the frequency points.
        f_central : Tensor
            Tensor gathering central frequencies of each filter.
        band : Tensor
            Tensor gathering the bands of each filter.
        """

        # Computing the slops of the filters
        slope = (all_freqs - f_central) / band
        print(slope.shape)
        left_side = slope + 1.0
        right_side = -slope + 1.0

        # Adding zeros for negative values
        zero = torch.zeros(1, device=self.device_inp)
        fbank_matrix = torch.max(
            zero, torch.min(left_side, right_side)
        ).transpose(0, 1)

        return fbank_matrix

    def _create_fbank_matrix(self, f_central_mat, band_mat):
        """Returns fbank matrix to use for averaging the spectrum with
           the set of filter-banks.
        Arguments
        ---------
        f_central : Tensor
            Tensor gathering central frequencies of each filter.
        band : Tensor
            Tensor gathering the bands of each filter.
        smooth_factor: Tensor
            Smoothing factor of the gaussian filter. It can be used to employ
            sharper or flatter filters.
        """
        if self.filter_shape == "triangular":
            fbank_matrix = self._triangular_filters(
                self.all_freqs_mat, f_central_mat, band_mat
            )

        elif self.filter_shape == "rectangular":
            fbank_matrix = self._rectangular_filters(
                self.all_freqs_mat, f_central_mat, band_mat
            )

        else:
            fbank_matrix = self._gaussian_filters(
                self.all_freqs_mat, f_central_mat, band_mat
            )

        return fbank_matrix

    def _amplitude_to_DB(self, x):
        """Converts  linear-FBANKs to log-FBANKs.
        Arguments
        ---------
        x : Tensor
            A batch of linear FBANK tensors.
        """
        x_db = self.multiplier * torch.log10(torch.clamp(x, min=self.amin))
        x_db -= self.multiplier * self.db_multiplier

        # Setting up dB max
        new_x_db_max = torch.tensor(
            float(x_db.max()) - self.top_db, dtype=x_db.dtype, device=x.device,
        )
        # Clipping to dB max
        x_db = torch.max(x_db, new_x_db_max)

        return x_db
    
filterbank = Filterbank()

In [16]:
o = filterbank(s_pt)
o.shape

torch.Size([80, 201]) torch.Size([80, 201])
torch.Size([80, 201])
torch.Size([1, 352, 201, 2])
torch.Size([2, 352, 201])


torch.Size([1, 352, 80, 2])

In [17]:
class Deltas(torch.nn.Module):
    """Computes delta coefficients (time derivatives).
    Arguments
    ---------
    win_length : int
        Length of the window used to compute the time derivatives.
    Example
    -------
    >>> inputs = torch.randn([10, 101, 20])
    >>> compute_deltas = Deltas(input_size=inputs.size(-1))
    >>> features = compute_deltas(inputs)
    >>> features.shape
    torch.Size([10, 101, 20])
    """

    def __init__(
        self, input_size, window_length=5,
    ):
        super().__init__()
        self.n = (window_length - 1) // 2
        self.denom = self.n * (self.n + 1) * (2 * self.n + 1) / 3

        self.register_buffer(
            "kernel",
            torch.arange(-self.n, self.n + 1, dtype=torch.float32,).repeat(
                input_size, 1, 1
            ),
        )
        
    def forward(self, x):
        """Returns the delta coefficients.
        Arguments
        ---------
        x : tensor
            A batch of tensors.
        """
        # Managing multi-channel deltas reshape tensor (batch*channel,time)
        x = x.transpose(1, 2).transpose(2, -1)
        or_shape = x.shape
        if len(or_shape) == 4:
            x = x.reshape(or_shape[0] * or_shape[2], or_shape[1], or_shape[3])

        # Padding for time borders
        print(x.shape)
        x = torch.nn.functional.pad(x, (self.n, self.n), mode="replicate")
        print(x.shape, self.kernel.shape, x.shape[1])
        #return x, self.kernel

        # Derivative estimation (with a fixed convolutional kernel)
        delta_coeff = (
            torch.nn.functional.conv1d(x, self.kernel, groups=x.shape[1])
            / self.denom
        )
        print(delta_coeff.shape)

        # Retrieving the original dimensionality (for multi-channel case)
        if len(or_shape) == 4:
            delta_coeff = delta_coeff.reshape(
                or_shape[0], or_shape[1], or_shape[2], or_shape[3],
            )
        print(delta_coeff.shape)
        delta_coeff = delta_coeff.transpose(1, -1).transpose(2, -1)

        return delta_coeff

    
deltas = Deltas(80)

In [18]:
o.shape, deltas(o).shape

torch.Size([2, 80, 352])
torch.Size([2, 80, 356]) torch.Size([80, 1, 5]) 80
torch.Size([2, 80, 352])
torch.Size([1, 80, 2, 352])


(torch.Size([1, 352, 80, 2]), torch.Size([1, 352, 80, 2]))

In [19]:
d = deltas(o)

torch.Size([2, 80, 352])
torch.Size([2, 80, 356]) torch.Size([80, 1, 5]) 80
torch.Size([2, 80, 352])
torch.Size([1, 80, 2, 352])


In [20]:
d.shape

torch.Size([1, 352, 80, 2])

In [None]:
class ContextWindow(torch.nn.Module):
    """Computes the context window.
    This class applies a context window by gathering multiple time steps
    in a single feature vector. The operation is performed with a
    convolutional layer based on a fixed kernel designed for that.
    Arguments
    ---------
    left_frames : int
         Number of left frames (i.e, past frames) to collect.
    right_frames : int
        Number of right frames (i.e, future frames) to collect.
    Example
    -------
    >>> import torch
    >>> compute_cw = ContextWindow(left_frames=5, right_frames=5)
    >>> inputs = torch.randn([10, 101, 20])
    >>> features = compute_cw(inputs)
    >>> features.shape
    torch.Size([10, 101, 220])
    """

    def __init__(
        self, left_frames=0, right_frames=0,
    ):
        super().__init__()
        self.left_frames = left_frames
        self.right_frames = right_frames
        self.context_len = self.left_frames + self.right_frames + 1
        self.kernel_len = 2 * max(self.left_frames, self.right_frames) + 1

        # Kernel definition
        self.kernel = torch.eye(self.context_len, self.kernel_len)

        if self.right_frames > self.left_frames:
            lag = self.right_frames - self.left_frames
            self.kernel = torch.roll(self.kernel, lag, 1)

        self.first_call = True

    def forward(self, x):
        """Returns the tensor with the surrounding context.
        Arguments
        ---------
        x : tensor
            A batch of tensors.
        """

        x = x.transpose(1, 2)

        if self.first_call is True:
            self.first_call = False
            self.kernel = (
                self.kernel.repeat(x.shape[1], 1, 1)
                .view(x.shape[1] * self.context_len, self.kernel_len,)
                .unsqueeze(1)
            )

        # Managing multi-channel case
        or_shape = x.shape
        if len(or_shape) == 4:
            x = x.reshape(or_shape[0] * or_shape[2], or_shape[1], or_shape[3])

        # Compute context (using the estimated convolutional kernel)
        cw_x = torch.nn.functional.conv1d(
            x,
            self.kernel.to(x.device),
            groups=x.shape[1],
            padding=max(self.left_frames, self.right_frames),
        )

        # Retrieving the original dimensionality (for multi-channel case)
        if len(or_shape) == 4:
            cw_x = cw_x.reshape(
                or_shape[0], cw_x.shape[1], or_shape[2], cw_x.shape[-1]
            )

        cw_x = cw_x.transpose(1, 2)

        return cw_x


In [159]:
sess = tf.Session()

In [184]:
def log10(x):
    numerator = tf.math.log(x)
    denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype))
    return numerator / denominator

class ECAPA_TCNNFeaturizer:
    def __init__(
        self,
        sample_rate = 16000,
        win_length = 25,
        hop_length = 10,
        n_fft = 400,
        n_mels = 80,
        log_mel = True,
        f_min = 0,
        f_max = 8000,
        power_spectrogram = 2,
        amin = 1e-10,
        ref_value = 1.0,
        top_db = 80.0,
        param_change_factor = 1.0,
        param_rand_factor = 0.0,
        window_length = 5,
        **kwargs,
    ):
        self.sample_rate = sample_rate
        self.win_length = int(round((sample_rate / 1000.0) * win_length))
        self.hop_length = int(round((sample_rate / 1000.0) * hop_length))
        self.n_fft = n_fft
        self.n_mels = n_mels
        self.log_mel = log_mel
        self.f_min = f_min
        self.f_max = f_max
        self.n_stft = self.n_fft // 2 + 1
        self.amin = amin
        self.ref_value = ref_value
        self.db_multiplier = math.log10(max(self.amin, self.ref_value))
        self.power_spectrogram = power_spectrogram
        self.top_db = top_db

        if self.power_spectrogram == 2:
            self.multiplier = 10
        else:
            self.multiplier = 20

        mel = tf.linspace(
            self._to_mel(self.f_min), self._to_mel(self.f_max), self.n_mels + 2
        )
        hz = self._to_hz(mel)

        band = hz[1:] - hz[:-1]
        self.band = band[:-1]
        self.f_central = hz[1:-1]

        all_freqs = tf.linspace(0.0, tf.cast(self.sample_rate // 2, tf.float32),self.n_stft)
        all_freqs = tf.expand_dims(all_freqs, 0)
        self.all_freqs_mat = tf.tile(all_freqs, (self.f_central.shape[0], 1))

        self.n = (window_length - 1) // 2
        self.denom = self.n * (self.n + 1) * (2 * self.n + 1) / 3
        a = np.arange(-self.n, self.n + 1, dtype = np.float32)
        a = np.expand_dims(np.expand_dims(a, 0), 0)
        self.kernel = tf.tile(a, (self.n_mels, 1, 1))

    def _to_hz(self, mel):
        return 700 * (10 ** (mel / 2595) - 1)

    def _to_mel(self, hz):
        return 2595 * math.log10(1 + hz / 700)

    def _triangular_filters(self, all_freqs, f_central, band):
        slope = (all_freqs - f_central) / band
        left_side = slope + 1.0
        right_side = -slope + 1.0
        zero = tf.zeros(1,)
        fbank_matrix = tf.transpose(tf.maximum(zero, tf.minimum(left_side, right_side)))

        return fbank_matrix

    def _amplitude_to_DB(self, x):
        x_db = self.multiplier * log10(tf.clip_by_value(x, self.amin, tf.math.reduce_max(x)))
        x_db -= self.multiplier * self.db_multiplier
        new_x_db_max = tf.reduce_max(x_db) - self.top_db
        x_db = tf.maximum(x_db, new_x_db_max)
        return x_db

    def _group_conv(self, x, kernel):
        p = []
        for i in range(self.n_mels):
            c = tf.nn.conv1d(
                x[:, :, i : i + 1],
                kernel[:, :, i : i + 1],
                1,
                padding = 'VALID',
            )
            p.append(c)

        return tf.concat(p, axis = 2)
    
    def stft(self, y):
        return librosa.stft(
            y,
            n_fft = self.n_fft,
            hop_length = self.hop_length,
            win_length = self.win_length,
            window = ECAPA_TDNN_WINDOWS,
            pad_mode = 'constant',
        )

    def vectorize(self, signal):
        s = tf.compat.v1.numpy_function(self.stft, [y], tf.complex128)
        s.set_shape(((self.n_fft // 2) + 1, None))
        r = tf.cast(tf.real(s), tf.float32)
        i = tf.cast(tf.math.imag(s), tf.float32)
        s = tf.concat(
            [tf.expand_dims(r, -1), tf.expand_dims(i, -1)], -1
        )
        s = tf.transpose(s, (1, 0, 2))
        f_central_mat = tf.transpose(tf.tile(
            tf.expand_dims(self.f_central, 0), (self.all_freqs_mat.shape[1], 1)
        ))
        band_mat = tf.transpose(tf.tile(
            tf.expand_dims(self.band, 0), (self.all_freqs_mat.shape[1], 1)
        ))
        fbank_matrix = self._triangular_filters(
            self.all_freqs_mat, f_central_mat, band_mat
        )
        s = tf.expand_dims(s, 0)
        sp_shape = tf.shape(s)
        s = tf.reshape(s, (sp_shape[0] * sp_shape[3], sp_shape[1], sp_shape[2]))
        fbanks = tf.einsum('ijk,kl->ijl', s, fbank_matrix)
        fbanks = self._amplitude_to_DB(fbanks)
        fb_shape = tf.shape(fbanks)
        fbanks = tf.reshape(fbanks, (sp_shape[0], fb_shape[1], fb_shape[2], sp_shape[3]))
        x = tf.transpose(fbanks, (0, 2, 3, 1))
        or_shape = tf.shape(x)
        len_shape_x = x.shape
        if len(len_shape_x) == 4:
            x = tf.reshape(x, (or_shape[0] * or_shape[2], or_shape[1], or_shape[3]))
        x = tf.pad(x,((0, 0), (0, 0), (self.n, self.n)), mode = 'SYMMETRIC')
        return x
        x = tf.transpose(x, (0, 2, 1))
        k = tf.transpose(self.kernel, (2, 1, 0))
        conv = self._group_conv(x, k)
        conv = tf.transpose(conv, (0, 2, 1))
        delta_coeff = conv / self.denom
        if len(len_shape_x) == 4:
            delta_coeff = tf.reshape(delta_coeff, (or_shape[0], or_shape[1], or_shape[2], or_shape[3]))
        delta_coeff = tf.transpose(delta_coeff, (0, 3, 1, 2))
        return delta_coeff

In [185]:
class ECAPA_TCNNFeaturizer_np:
    def __init__(
        self,
        sample_rate=16000,
        win_length=25,
        hop_length=10,
        n_fft=400,
        n_mels=80,
        log_mel=True,
        f_min=0,
        f_max=8000,
        power_spectrogram=2,
        amin=1e-10,
        ref_value=1.0,
        top_db=80.0,
        param_change_factor=1.0,
        param_rand_factor=0.0,
        window_length=5,
        **kwargs,
    ):
        self.sample_rate = sample_rate
        self.win_length = int(round((sample_rate / 1000.0) * win_length))
        self.hop_length = int(round((sample_rate / 1000.0) * hop_length))
        self.n_fft = n_fft
        self.n_mels = n_mels
        self.log_mel = log_mel
        self.f_min = f_min
        self.f_max = f_max
        self.n_stft = self.n_fft // 2 + 1
        self.amin = amin
        self.ref_value = ref_value
        self.db_multiplier = math.log10(max(self.amin, self.ref_value))
        self.power_spectrogram = power_spectrogram
        self.top_db = top_db

        if self.power_spectrogram == 2:
            self.multiplier = 10
        else:
            self.multiplier = 20

        mel = np.linspace(
            self._to_mel(self.f_min), self._to_mel(self.f_max), self.n_mels + 2
        )
        hz = self._to_hz(mel)

        band = hz[1:] - hz[:-1]
        self.band = band[:-1]
        self.f_central = hz[1:-1]

        all_freqs = np.linspace(0, self.sample_rate // 2, self.n_stft)
        all_freqs = np.expand_dims(all_freqs, 0)
        self.all_freqs_mat = np.tile(all_freqs, (self.f_central.shape[0], 1))

        self.n = (window_length - 1) // 2
        self.denom = self.n * (self.n + 1) * (2 * self.n + 1) / 3
        a = np.arange(-self.n, self.n + 1, dtype=np.float32)
        a = np.expand_dims(np.expand_dims(a, 0), 0)
        self.kernel = np.tile(a, (self.n_mels, 1, 1))

        if not tf.executing_eagerly():
            with tf.device('/cpu:0'):
                self._X = tf.compat.v1.placeholder(tf.float32, (None, None, 1))
                self._K = tf.compat.v1.placeholder(tf.float32, (None, 1, 1))
                self._conv = tf.nn.conv1d(
                    self._X, self._K, 1, padding='VALID'
                )
            config = tf.compat.v1.ConfigProto()
            config.gpu_options.allow_growth = True
            self._sess = tf.compat.v1.Session(config=config)

    def _to_hz(self, mel):
        return 700 * (10 ** (mel / 2595) - 1)

    def _to_mel(self, hz):
        return 2595 * math.log10(1 + hz / 700)

    def _triangular_filters(self, all_freqs, f_central, band):
        slope = (all_freqs - f_central) / band
        left_side = slope + 1.0
        right_side = -slope + 1.0
        zero = np.zeros(1)
        fbank_matrix = np.maximum(zero, np.minimum(left_side, right_side)).T

        return fbank_matrix

    def _amplitude_to_DB(self, x):
        x_db = self.multiplier * np.log10(
            np.clip(x, a_min=self.amin, a_max=None)
        )
        x_db -= self.multiplier * self.db_multiplier
        new_x_db_max = x_db.max() - self.top_db
        x_db = np.maximum(x_db, new_x_db_max)
        return x_db

    def _group_conv(self, x, kernel):
        x = x.astype(np.float32)
        kernel = kernel.copy().astype(np.float32)
        p = []
        for i in range(self.n_mels):
            if tf.executing_eagerly():
                c = tf.nn.conv1d(
                    x[:, :, i: i + 1],
                    kernel[:, :, i: i + 1],
                    1,
                    padding='VALID',
                )
            else:
                c = self._sess.run(
                    self._conv,
                    feed_dict={
                        self._X: x[:, :, i: i + 1],
                        self._K: kernel[:, :, i: i + 1],
                    },
                )
            p.append(c)

        return np.concatenate(p, axis=2)

    def vectorize(self, signal):
        s = librosa.stft(
            y,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            win_length=self.win_length,
            window=ECAPA_TDNN_WINDOWS,
            pad_mode='constant',
        )
        s = np.concatenate(
            [np.expand_dims(s.real, -1), np.expand_dims(s.imag, -1)], -1
        )
        s = np.transpose(s, (1, 0, 2))
        f_central_mat = np.tile(
            np.expand_dims(self.f_central, 0), (self.all_freqs_mat.shape[1], 1)
        ).T
        band_mat = np.tile(
            np.expand_dims(self.band, 0), (self.all_freqs_mat.shape[1], 1)
        ).T
        fbank_matrix = self._triangular_filters(
            self.all_freqs_mat, f_central_mat, band_mat
        )
        s = np.expand_dims(s, 0)
        sp_shape = s.shape
        s = s.reshape(sp_shape[0] * sp_shape[3], sp_shape[1], sp_shape[2])
        fbanks = np.einsum('ijk,kl->ijl', s, fbank_matrix)
        fbanks = self._amplitude_to_DB(fbanks)
        fb_shape = fbanks.shape
        fbanks = fbanks.reshape(
            sp_shape[0], fb_shape[1], fb_shape[2], sp_shape[3]
        )
        x = np.transpose(fbanks, (0, 2, 3, 1))
        or_shape = x.shape
        if len(or_shape) == 4:
            x = x.reshape(or_shape[0] * or_shape[2], or_shape[1], or_shape[3])
        x = np.pad(x, ((0, 0), (0, 0), (self.n, self.n)), mode='edge')
        return x
        x = np.transpose(x, (0, 2, 1))
        k = np.transpose(self.kernel, (2, 1, 0))
        conv = self._group_conv(x, k)
        conv = np.transpose(conv, (0, 2, 1))
        delta_coeff = conv / self.denom
        if len(or_shape) == 4:
            delta_coeff = delta_coeff.reshape(
                or_shape[0], or_shape[1], or_shape[2], or_shape[3]
            )
        delta_coeff = np.transpose(delta_coeff, (0, 3, 1, 2))
        return delta_coeff

In [186]:
ecapa = ECAPA_TCNNFeaturizer()

In [187]:
ecapa_np = ECAPA_TCNNFeaturizer_np()

In [188]:
y.shape

(56298,)

In [189]:
Y = tf.placeholder(tf.float32, (None,))

In [190]:
%%time
d_tf = ecapa.vectorize(Y)
d_tf.shape

CPU times: user 76.6 ms, sys: 2.24 ms, total: 78.9 ms
Wall time: 77.6 ms


TensorShape([Dimension(None), Dimension(None), Dimension(None)])

In [191]:
d_tf_ = sess.run(d_tf, feed_dict = {Y: y})

In [192]:
%%time
d_tf_np = ecapa_np.vectorize(y)
d_tf_np.shape

CPU times: user 8.29 ms, sys: 1.17 ms, total: 9.46 ms
Wall time: 8.37 ms


(2, 80, 356)

In [193]:
d_tf_.shape, d_tf_np.shape

((2, 80, 356), (2, 80, 356))

In [196]:
np.round(d_tf_, 3)

array([[[-62.128, -62.128, -62.128, ..., -62.128, -62.128, -62.128],
        [-62.128, -62.128, -62.128, ..., -62.128, -62.128, -62.128],
        [-62.128, -62.128, -62.128, ..., -13.739, -13.739, -62.128],
        ...,
        [-17.869, -19.218, -19.218, ..., -62.128, -62.128, -62.128],
        [-19.745, -23.208, -23.208, ..., -23.068, -23.068, -20.808],
        [-62.128, -62.128, -62.128, ..., -18.741, -18.741, -62.128]],

       [[-62.128, -62.128, -62.128, ..., -62.128, -62.128, -24.996],
        [-62.128, -62.128, -62.128, ..., -62.128, -62.128, -18.882],
        [-19.4  , -62.128, -62.128, ..., -62.128, -62.128, -19.697],
        ...,
        [-62.128, -26.995, -26.995, ..., -24.731, -24.731, -19.325],
        [-17.579, -62.128, -62.128, ..., -62.128, -62.128, -25.446],
        [-62.128, -22.034, -22.034, ..., -22.39 , -22.39 , -62.128]]],
      dtype=float32)

In [197]:
np.round(d_tf_np, 3)

array([[[-62.128, -62.128, -62.128, ..., -62.128, -62.128, -62.128],
        [-62.128, -62.128, -62.128, ..., -62.128, -62.128, -62.128],
        [-62.128, -62.128, -62.128, ..., -13.739, -13.739, -13.739],
        ...,
        [-19.217, -19.217, -19.217, ..., -62.128, -62.128, -62.128],
        [-23.209, -23.209, -23.209, ..., -23.067, -23.067, -23.067],
        [-62.128, -62.128, -62.128, ..., -18.741, -18.741, -18.741]],

       [[-62.128, -62.128, -62.128, ..., -62.128, -62.128, -62.128],
        [-62.128, -62.128, -62.128, ..., -62.128, -62.128, -62.128],
        [-62.128, -62.128, -62.128, ..., -62.128, -62.128, -62.128],
        ...,
        [-26.993, -26.993, -26.993, ..., -24.732, -24.732, -24.732],
        [-62.128, -62.128, -62.128, ..., -62.128, -62.128, -62.128],
        [-22.034, -22.034, -22.034, ..., -22.39 , -22.39 , -22.39 ]]])