In [1]:
import numpy as np
import pandas as pd
import scipy.io.wavfile as wav
from scipy.fftpack import dct

In [2]:
THRESHOLD = 0.05
ALPHA = 0.97
FRAME_SIZE = 0.025
FRAME_STRIDE = 0.01
SAMPLE_RATE = 16000
NUM_FILTERS = 26
MIN_FREQ = 0
N_FFT = 512
WINDOW_SIZE = 32768
NUM_CEPS = 13

In [3]:
def remove_silence(signal, threshold = THRESHOLD):
    signal[np.abs(signal) < threshold] = 0
    return signal

In [4]:
def pre_emphasis(signal, alpha = ALPHA):
    emphasized_signal = np.append(signal[0], signal[1:] - alpha * signal[:-1])
    return emphasized_signal


In [5]:
def framing(signal, frame_size = FRAME_SIZE, frame_stride = FRAME_STRIDE, sample_rate = SAMPLE_RATE):
    frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate
    signal_length = len(signal)
    frame_length = int(round(frame_length))
    frame_step = int(round(frame_step))
    num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step))

    padded_signal_length = num_frames * frame_step + frame_length
    z = np.zeros((padded_signal_length - signal_length))
    pad_signal = np.append(signal, z) 

    indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) 
    + np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
    frames = pad_signal[indices.astype(np.int32, copy = False)]
    return frames

In [6]:
def apply_window(frames, window_func = np.hamming):
    frames *= window_func(frames.shape[1])
    return frames

In [7]:
def create_mel_filterbank(sample_rate, num_filters = NUM_FILTERS, 
                        min_freq = MIN_FREQ, max_freq = None, n_fft = N_FFT):
    if max_freq is None:
        max_freq = sample_rate // 2
    #  The formula to convert frequency in Hertz to Mel is given by: Mel=2595×log_10(1+𝑓/700)
    mel_min = 2595 * np.log10(1 + min_freq / 700)
    mel_max = 2595 * np.log10(1 + max_freq / 700)
    mel_points = np.linspace(mel_min, mel_max, num_filters + 2)
    hz_points = 700 * (10**(mel_points / 2595) - 1)
    bin_points = np.floor((n_fft + 1) * hz_points / sample_rate).astype(int)
    
    filterbank = np.zeros((num_filters, n_fft // 2 + 1))
    for i in range(1, num_filters + 1):
        filterbank[i - 1, bin_points[i - 1]:bin_points[i]] =
        (bin_points[i] - bin_points[i - 1]) / (hz_points[i] - hz_points[i - 1])
        filterbank[i - 1, bin_points[i]:bin_points[i + 1]] = 
        (bin_points[i + 1] - bin_points[i]) / (hz_points[i + 1] - hz_points[i])
    return filterbank

In [8]:
def compute_mfcc(signal, sample_rate = SAMPLE_RATE, num_ceps = NUM_CEPS):
    frames = framing(signal, sample_rate = sample_rate)
    frames *= WINDOW_SIZE
    frames = apply_window(frames)
    magnitude_spectrum = np.abs(np.fft.rfft(frames, n = N_FFT))
    mel_filterbank = create_mel_filterbank(sample_rate, num_filters = NUM_FILTERS, n_fft = N_FFT)
    mel_spectrum = np.dot(magnitude_spectrum, mel_filterbank.T)
    log_mel_spectrum = np.log(mel_spectrum + 1e-10) 
    mfcc = dct(log_mel_spectrum, type = 2, axis = 1, norm = 'ortho')[:, 1 : (num_ceps + 1)]  
    return mfcc

In [9]:
hisb_60_warsh_transcript_df = pd.read_csv('warsh_transcript/hisb_60_and_Al_fatihah_for_all_recitors_audio_path_and_transcript.csv')
hisb_60_warsh_transcript_df

Unnamed: 0,sura_no,sura_name_en,sura_name_ar,aya_no,aya_text,recitor_en,recitor_ar,recitor_reciting_aya_wav_path
0,1.0,Al-Fātiḥah,الفَاتِحة,1.0,اِ۬لْحَمْدُ لِلهِ رَبِّ اِ۬لْعَٰلَمِينَ ١,Ibrahim_Aldosary,إبراهيم الدوسري,warsh_ibrahim_aldosary_wav_files_madani_hisb_6...
1,1.0,Al-Fātiḥah,الفَاتِحة,2.0,اَ۬لرَّحْمَٰنِ اِ۬لرَّحِيمِ ٢,Ibrahim_Aldosary,إبراهيم الدوسري,warsh_ibrahim_aldosary_wav_files_madani_hisb_6...
2,1.0,Al-Fātiḥah,الفَاتِحة,3.0,مَلِكِ يَوْمِ اِ۬لدِّينِۖ ٣,Ibrahim_Aldosary,إبراهيم الدوسري,warsh_ibrahim_aldosary_wav_files_madani_hisb_6...
3,1.0,Al-Fātiḥah,الفَاتِحة,4.0,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُۖ ٤,Ibrahim_Aldosary,إبراهيم الدوسري,warsh_ibrahim_aldosary_wav_files_madani_hisb_6...
4,1.0,Al-Fātiḥah,الفَاتِحة,5.0,اُ۪هْدِنَا اَ۬لصِّرَٰطَ اَ۬لْمُسْتَقِيمَ ٥,Ibrahim_Aldosary,إبراهيم الدوسري,warsh_ibrahim_aldosary_wav_files_madani_hisb_6...
...,...,...,...,...,...,...,...,...
889,114.0,An-Nās,النَّاس,2.0,مَلِكِ اِ۬لنَّاسِ ٢,Abdul Basit,عبد الباسط,warsh_Abdul_Basit_wav_files/114/002.wav
890,114.0,An-Nās,النَّاس,3.0,إِلَٰهِ اِ۬لنَّاسِ ٣,Abdul Basit,عبد الباسط,warsh_Abdul_Basit_wav_files/114/003.wav
891,114.0,An-Nās,النَّاس,4.0,مِن شَرِّ اِ۬لْوَسْوَاسِ اِ۬لْخَنَّاسِ ٤,Abdul Basit,عبد الباسط,warsh_Abdul_Basit_wav_files/114/004.wav
892,114.0,An-Nās,النَّاس,5.0,اِ۬لذِے يُوَسْوِسُ فِے صُدُورِ اِ۬لنَّاسِ ٥,Abdul Basit,عبد الباسط,warsh_Abdul_Basit_wav_files/114/005.wav


In [10]:
data = []
for index, row in hisb_60_warsh_transcript_df.iterrows():
    # Load audio file
    sample_rate, signal = wav.read(row['recitor_reciting_aya_wav_path'])
    # Remove silence
    signal = remove_silence(signal)
    # Apply pre-emphasis
    signal = pre_emphasis(signal)
    # Compute MFCCs
    mfcc = compute_mfcc(signal, sample_rate = sample_rate)
    data.append(mfcc.tolist())

mfcc_df = pd.DataFrame({'mfcc': data})
mfcc_df

Unnamed: 0,mfcc
0,"[[-2.491397213802345, 1.1010472775054547, -0.0..."
1,"[[-2.6060128460700507, 2.0262079519491634, 0.7..."
2,"[[-2.963568909731852, 1.5978105678210048, 0.53..."
3,"[[-1.7085122141038942, 2.050192216986418, 0.83..."
4,"[[-2.488483895713236, 2.168361353104937, 1.165..."
...,...
889,"[[0.0, 0.0, 0.0, 3.826852553242415e-15, 0.0, 0..."
890,"[[0.0, 0.0, 0.0, 3.826852553242415e-15, 0.0, 0..."
891,"[[0.0, 0.0, 0.0, 3.826852553242415e-15, 0.0, 0..."
892,"[[0.0, 0.0, 0.0, 3.826852553242415e-15, 0.0, 0..."


In [11]:
hisb_60_and_Al_fatihah_audio_with_transcript_and_MFCC = pd.concat([hisb_60_warsh_transcript_df, mfcc_df], axis = 1)
hisb_60_and_Al_fatihah_audio_with_transcript_and_MFCC

Unnamed: 0,sura_no,sura_name_en,sura_name_ar,aya_no,aya_text,recitor_en,recitor_ar,recitor_reciting_aya_wav_path,mfcc
0,1.0,Al-Fātiḥah,الفَاتِحة,1.0,اِ۬لْحَمْدُ لِلهِ رَبِّ اِ۬لْعَٰلَمِينَ ١,Ibrahim_Aldosary,إبراهيم الدوسري,warsh_ibrahim_aldosary_wav_files_madani_hisb_6...,"[[-2.491397213802345, 1.1010472775054547, -0.0..."
1,1.0,Al-Fātiḥah,الفَاتِحة,2.0,اَ۬لرَّحْمَٰنِ اِ۬لرَّحِيمِ ٢,Ibrahim_Aldosary,إبراهيم الدوسري,warsh_ibrahim_aldosary_wav_files_madani_hisb_6...,"[[-2.6060128460700507, 2.0262079519491634, 0.7..."
2,1.0,Al-Fātiḥah,الفَاتِحة,3.0,مَلِكِ يَوْمِ اِ۬لدِّينِۖ ٣,Ibrahim_Aldosary,إبراهيم الدوسري,warsh_ibrahim_aldosary_wav_files_madani_hisb_6...,"[[-2.963568909731852, 1.5978105678210048, 0.53..."
3,1.0,Al-Fātiḥah,الفَاتِحة,4.0,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُۖ ٤,Ibrahim_Aldosary,إبراهيم الدوسري,warsh_ibrahim_aldosary_wav_files_madani_hisb_6...,"[[-1.7085122141038942, 2.050192216986418, 0.83..."
4,1.0,Al-Fātiḥah,الفَاتِحة,5.0,اُ۪هْدِنَا اَ۬لصِّرَٰطَ اَ۬لْمُسْتَقِيمَ ٥,Ibrahim_Aldosary,إبراهيم الدوسري,warsh_ibrahim_aldosary_wav_files_madani_hisb_6...,"[[-2.488483895713236, 2.168361353104937, 1.165..."
...,...,...,...,...,...,...,...,...,...
889,114.0,An-Nās,النَّاس,2.0,مَلِكِ اِ۬لنَّاسِ ٢,Abdul Basit,عبد الباسط,warsh_Abdul_Basit_wav_files/114/002.wav,"[[0.0, 0.0, 0.0, 3.826852553242415e-15, 0.0, 0..."
890,114.0,An-Nās,النَّاس,3.0,إِلَٰهِ اِ۬لنَّاسِ ٣,Abdul Basit,عبد الباسط,warsh_Abdul_Basit_wav_files/114/003.wav,"[[0.0, 0.0, 0.0, 3.826852553242415e-15, 0.0, 0..."
891,114.0,An-Nās,النَّاس,4.0,مِن شَرِّ اِ۬لْوَسْوَاسِ اِ۬لْخَنَّاسِ ٤,Abdul Basit,عبد الباسط,warsh_Abdul_Basit_wav_files/114/004.wav,"[[0.0, 0.0, 0.0, 3.826852553242415e-15, 0.0, 0..."
892,114.0,An-Nās,النَّاس,5.0,اِ۬لذِے يُوَسْوِسُ فِے صُدُورِ اِ۬لنَّاسِ ٥,Abdul Basit,عبد الباسط,warsh_Abdul_Basit_wav_files/114/005.wav,"[[0.0, 0.0, 0.0, 3.826852553242415e-15, 0.0, 0..."


In [12]:
hisb_60_and_Al_fatihah_audio_with_transcript_and_MFCC.to_csv('warsh_transcript/hisb_60_and_Al_fatihah_audio_with_transcript_and_MFCC.csv', index = False)