In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile

In [None]:
def initialize_values():
    """Return the dataset locations and the framing / FFT settings.

    The training and test directories are looked up relative to the current
    working directory; only their immediate sub-directories are listed
    (plain files are ignored).

    Returns:
        tuple: ``(training_path, training_folders, test_path, test_folders,
        vowel_files, frame_duration, frame_shift, N_FFT_array)`` where

        * ``training_path`` / ``test_path`` are the dataset directory names,
        * ``training_folders`` / ``test_folders`` are the sorted names of the
          sub-directories found in each of them,
        * ``vowel_files`` lists the five vowel WAV file names,
        * ``frame_duration`` and ``frame_shift`` are in seconds,
        * ``N_FFT_array`` lists the FFT sizes.

    Raises:
        OSError: propagated from ``os.listdir`` when a dataset directory
            cannot be listed (e.g. it does not exist).
    """
    training_path = "NguyenAmHuanLuyen-16k"
    test_path = "NguyenAmKiemThu-16k"

    def list_subfolders(root):
        # os.listdir yields entries in arbitrary, platform-dependent order;
        # sorting keeps the folder order reproducible across runs/machines.
        return sorted(
            entry
            for entry in os.listdir(root)
            if os.path.isdir(os.path.join(root, entry))
        )

    training_folders = list_subfolders(training_path)
    test_folders = list_subfolders(test_path)
    vowel_files = ["a.wav", "e.wav", "i.wav", "o.wav", "u.wav"]
    frame_duration = 0.03   # seconds
    frame_shift = 0.015     # seconds
    N_FFT_array = [512, 1024, 2048]

    return training_path, training_folders, test_path, test_folders, vowel_files, frame_duration, frame_shift, N_FFT_array

In [None]:
import warnings

def _plot_speech_marks(signal, T, frame_shift_len, first, last, frame_start, frame_end, label):
    """Plot the raw waveform with the speech and stable-region boundaries."""
    fig, ax = plt.subplots()
    # Build the time axis from integer sample indices: np.arange with a float
    # step can return one point too many and break the plot on a shape mismatch.
    ax.plot(np.arange(len(signal)) * T, signal)  # Original signal
    ax.set_title("Signal and (STE) of the Signal " + label)
    # Vertical lines at the start time of each boundary frame
    ax.axvline(x=first * frame_shift_len * T, color='r', linestyle='--', label="Start of Speech")
    ax.axvline(x=last * frame_shift_len * T, color='r', linestyle='--', label="End of Speech")
    ax.axvline(x=frame_start * frame_shift_len * T, color='b', linestyle='--', label="Start of Stable Speech")
    ax.axvline(x=frame_end * frame_shift_len * T, color='b', linestyle='--', label="End of Stable Speech")
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Amplitude")
    ax.legend()
    fig.tight_layout()
    plt.show()


def process_speech_signal(path, folder, file, frame_duration, frame_shift):
    """Load a WAV file, frame it, find the stable middle of the speech, and plot it.

    The signal is peak-normalized to [-1, 1] and cut into overlapping frames.
    The short-time energy (STE) of every frame is computed and normalized by
    its maximum; frames with normalized STE >= 0.03 are treated as speech.
    The span between the first and the last speech frame is divided in three
    and the boundaries of the middle third are returned as the stable region.
    As a side effect a figure of the raw waveform with the four boundaries
    is displayed.

    Args:
        path: Root directory of the dataset.
        folder: Sub-folder of ``path`` containing the recording.
        file: WAV file name inside ``folder``.
        frame_duration: Frame length in seconds.
        frame_shift: Shift between the starts of consecutive frames, in seconds.

    Returns:
        tuple: ``(frames, frame_start, frame_end)``. ``frames`` is a float64
        array of shape ``(n_frames, frame_len)`` holding the normalized
        samples; ``frame_start`` and ``frame_end`` are the frame indices of
        the start and end of the stable region.

    Raises:
        ValueError: if the recording is not single-channel, is shorter than
            one frame, carries no energy, or if the frame length / shift
            round to less than one sample.
    """
    filepath = f"{path}/{folder}/{file}"
    # Silence WavFileWarning for this read only, instead of permanently
    # adding a process-wide warning filter on every call.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=wavfile.WavFileWarning)
        Fs, signal = wavfile.read(filepath)

    if signal.ndim != 1:
        raise ValueError(f"{filepath}: expected a mono signal, got array of shape {signal.shape}")

    T = 1 / Fs                                   # Sampling period (s)
    n = len(signal)                              # Number of samples
    frame_len = round(frame_duration * Fs)       # Frame length in samples
    frame_shift_len = round(frame_shift * Fs)    # Frame shift in samples
    if frame_len < 1 or frame_shift_len < 1:
        raise ValueError("frame_duration and frame_shift must each span at least one sample")
    if n < frame_len:
        raise ValueError(f"{filepath}: signal ({n} samples) is shorter than one frame ({frame_len} samples)")

    # Normalize by the largest *absolute* sample so the result really lies in
    # [-1, 1] even when the largest excursion is negative. Converting to float
    # first avoids integer overflow in abs() for PCM data.
    samples = signal.astype(np.float64)
    peak = np.max(np.abs(samples))
    if peak == 0:
        raise ValueError(f"{filepath}: signal is silent (all samples are zero)")
    data = samples / peak

    # Total number of complete frames
    n_f = (n - frame_len) // frame_shift_len + 1
    # Split the signal into frames: row i holds samples
    # [i * frame_shift_len, i * frame_shift_len + frame_len)
    frame_starts = frame_shift_len * np.arange(n_f)
    frames = data[frame_starts[:, None] + np.arange(frame_len)[None, :]]

    # Short-time energy of each frame, normalized by its maximum
    ste = np.sum(np.square(frames), axis=1)
    max_ste = np.max(ste)
    if max_ste == 0:
        # Non-zero samples exist only in the tail not covered by any frame.
        raise ValueError(f"{filepath}: no frame carries any energy")
    ste = ste / max_ste

    # Frames whose normalized energy reaches the 0.03 threshold; never empty,
    # because the frame holding the maximum has normalized energy 1.
    speech_frames = np.flatnonzero(ste >= 0.03)
    first, last = speech_frames[0], speech_frames[-1]
    # Middle third of the speech span is taken as the stable region
    distance = int(np.ceil((last - first) / 3))
    frame_start = first + distance
    frame_end = first + 2 * distance

    _plot_speech_marks(signal, T, frame_shift_len, first, last,
                       frame_start, frame_end, f"{folder}/{file}")
    return frames, frame_start, frame_end
# Example usage: analyse every vowel recording in folder 23MTL of the training
# set with 0.02 s frames and a 0.01 s shift. After the loop, frames /
# frame_start / frame_end hold the results for the last file ("i.wav").
for _vowel_file in ("a.wav", "o.wav", "u.wav", "e.wav", "i.wav"):
    frames, frame_start, frame_end = process_speech_signal(
        "NguyenAmHuanLuyen-16k", "23MTL", _vowel_file, 0.02, 0.01
    )
