# MFCCs 

In [1]:
import numpy as np
import scipy.io.wavfile
from scipy.fftpack import dct

Read the .wav file

In [2]:
# Load the recording: `sample_rate` is the sampling frequency reported by the
# file, `signal` holds its samples.
wavs_path = "/content/OSR_us_000_0010_8k.wav"
sample_rate, signal = scipy.io.wavfile.read(wavs_path)
# Only the leading 3.5 seconds are processed further.
signal = signal[:int(sample_rate * 3.5)]

Pre-Emphasis 

In [3]:
pre_emphasis = 0.97
# First-order pre-emphasis filter: the first sample is passed through as-is,
# every later sample has 0.97 times its predecessor subtracted from it.
emphasized_signal = np.append(
    signal[0],
    signal[1:] - signal[:-1] * pre_emphasis,
)

Framing

In [4]:
frame_size = 0.025   # frame duration, in seconds
frame_stride = 0.01  # hop between the starts of consecutive frames, in seconds

# Convert from seconds to samples
frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate
signal_length = len(emphasized_signal)
frame_length = int(round(frame_length))
frame_step = int(round(frame_step))

# One frame always starts at sample 0; each further hop needed to reach the end
# of the signal adds one more frame. (Counting only the hops would drop the
# last frame and with it the tail of the signal.) max(..., 0) keeps a signal
# shorter than a single frame at exactly one, zero-padded, frame.
num_frames = 1 + int(np.ceil(max(signal_length - frame_length, 0) / frame_step))

# Pad Signal to make sure that all frames have equal number of samples without
# truncating any samples from the original signal: the padded length is exactly
# the end of the last frame.
pad_signal_length = (num_frames - 1) * frame_step + frame_length
z = np.zeros(pad_signal_length - signal_length)
pad_signal = np.append(emphasized_signal, z)

# indices[i, j] = i * frame_step + j, i.e. sample j of frame i (built by
# broadcasting a column of frame starts against a row of in-frame offsets).
indices = (frame_step * np.arange(num_frames))[:, np.newaxis] + np.arange(frame_length)[np.newaxis, :]
frames = pad_signal[indices]

Windowing

In [5]:
# A Hamming window :
def w_hamming(k):
    """Return a k-point Hamming window.

    w[n] = 0.54 - 0.46 * cos(2 * pi * n / (k - 1)),  n = 0 .. k - 1

    Parameters
    ----------
    k : int
        Number of points in the window.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``k``. An empty array is returned for
        ``k < 1`` and ``[1.0]`` for ``k == 1``, the same edge-case results
        ``np.hamming`` gives.
    """
    if k < 1:
        return np.array([])
    if k == 1:
        # The general formula would divide by k - 1 == 0.
        return np.ones(1)
    n = np.arange(k)  # sample index within the window
    w = 0.54 - 0.46 * np.cos((2 * np.pi * n) / (k - 1))
    return w

#frames *= w_hamming(frame_length)

In [6]:
# Taper every frame with a single Hamming window. (Multiplying by a Hann window
# as well would apply the product of two tapers, which is neither window.)
# NOTE: this scales `frames` in place, so re-running this cell without first
# re-running the framing cell applies the window a second time.
frames *= np.hamming(frame_length)

Fourier-Transform and Power Spectrum

In [7]:
NFFT = 512
# Magnitude of the NFFT-point real FFT taken along each frame.
mag_frames = np.abs(np.fft.rfft(frames, n=NFFT))

# Power Spectrum: squared magnitude scaled by 1 / NFFT.
pow_frames = np.square(mag_frames) * (1.0 / NFFT)

Mel FilterBanks

In [8]:
nfilt = 40

# Filter edges: nfilt + 2 points spaced evenly on the Mel scale between 0 and
# the Mel value of sample_rate / 2, converted back to Hz and then to FFT bins.
low_freq_mel = 0
high_freq_mel = 2595 * np.log10(1 + (sample_rate / 2) / 700)  # Hz -> Mel
mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)
hz_points = 700 * (10 ** (mel_points / 2595) - 1)  # Mel -> Hz
# NOTE(review): this name shadows the builtin bin(); kept because it is a
# notebook-level variable.
bin = np.floor((NFFT + 1) * hz_points / sample_rate)

# One triangular filter per row: rises from 0 at its left edge to 1 at its
# centre bin, then falls back towards 0 at its right edge.
fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1))))
for row, (left, center, right) in enumerate(zip(bin[:-2], bin[1:-1], bin[2:])):
    for k in range(int(left), int(center)):      # rising slope
        fbank[row, k] = (k - left) / (center - left)
    for k in range(int(center), int(right)):     # falling slope
        fbank[row, k] = (right - k) / (right - center)

# Filterbank energies per frame.
filter_banks = pow_frames.dot(fbank.T)
# Numerical Stability: replace exact zeros so the logarithm stays finite.
filter_banks[filter_banks == 0] = np.finfo(float).eps
filter_banks = 20 * np.log10(filter_banks)  # dB

Cepstrum (inverse transform of the log filterbank energies, computed with a DCT)

In [10]:
num_ceps = 12

# Type-II orthonormal DCT along the filter axis; coefficient 0 is discarded and
# the following num_ceps coefficients (2nd through 13th) are kept.
mfcc = dct(filter_banks, type=2, axis=1, norm='ortho')[:, 1:num_ceps + 1]


dynamic features (delta)

Mean Normalization

In [11]:
# Subtract from every filterbank column its mean over axis 0 (plus a 1e-8
# offset), in place.
# NOTE(review): only `filter_banks` is normalized here; `mfcc`, computed in the
# earlier cell, is left unchanged — confirm this is intended.
filter_banks -= filter_banks.mean(axis=0) + 1e-8