In [2]:
# %%
import numpy as np
import librosa
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os

# speech-silence and voice-unvoiced
BASE_PATH = None


def read_lab(lab_file_name: str):
    """Read lab file
    lab_file_name: str, name of lab file
    Return: list of lists [start_time, end_time, label]
    """
    data = []
    with open(lab_file_name) as f:
        for line in f.readlines():
            data.append(line.split())
    return data


def get_closest(arr, values):
    """Get closest value in an sorted array
    arr: np.ndarray
    values: List of values to find
    Return: List of closest values to input values
    """
    arr = np.array(arr)
    values = np.array(values, dtype=np.float64)
    idx = np.searchsorted(arr, values)
    idx = np.array(idx)
    idx[arr[idx] - values > np.diff(arr).mean() * 0.5] -= 1
    return arr[idx]


def get_closest_idx(arr, values):
    """Get closest index in an sorted array
    arr: np.ndarray
    values: List of values to find
    Return: List of closest index to input values
    """
    arr = np.array(arr)
    values = np.array(values, dtype=np.float64)
    idx = np.searchsorted(arr, values, side='left')
    idx = np.array(idx)
    return idx


def array_norm(arr: np.ndarray):
    min_arr=np.min(arr)
    max_arr=np.max(arr)
    return (arr-min_arr)/(max_arr-min_arr)

def array_norm_by_threshold(arr: np.ndarray, threshold):
    """Normalize given function by threshold
    arr: np.ndarray
    T: threshold
    """
    min_arr = min(arr)
    max_arr = max(arr)
    return np.where(arr >= threshold, (arr-threshold)/(max_arr-threshold), (arr-threshold)/(threshold-min_arr))



In [3]:


def load_data(audio_name: str):
    signal, sr = librosa.load(os.path.join(BASE_PATH, f'{audio_name}.wav'))
    lab_data = read_lab(os.path.join(BASE_PATH, f'{audio_name}.lab'))
    mean_std = lab_data[-2:]
    timestamp_label = lab_data[:-2]
    t_i = 0
    t_f = signal.shape[0] / sr
    t = np.linspace(t_i, t_f, num=signal.shape[0])
    return signal, sr, t, timestamp_label


def separate_frames(signal, sr, t, frame_length=0.02):
    """Separate signal into frames
    signal: np.ndarray
    sr: sampling rate
    t: time array
    frame_length: length of frame
    Return: Array of frames
    """
    frame_size = int(sr * frame_length)
    frame_count = len(signal) // frame_size
    signal_frames = []
    for i in range(0, frame_count * frame_size, frame_size):
        signal_frames.append(signal[i:i + frame_size])
    return np.array(signal_frames), frame_size, frame_count

def separate_sp_sl(STE, timestamp_label, t):
    STE_sp = np.array([])
    STE_sl = np.array([])
    for line in timestamp_label:
        if line[2] == 'sp':
            try:
                idx1 = int(get_closest_idx(t, line[0]))
                idx2 = int(get_closest_idx(t, line[1]))
                STE_sp = np.append(STE_sp, STE[idx1:idx2])
            except:
                print(line)
        if line[2] == 'sl':
            try:
                idx1 = int(get_closest_idx(t, line[0]))
                idx2 = int(get_closest_idx(t, line[1]))
                STE_sl = np.append(STE_sl, STE[idx1:idx2])
            except:
                print(line)
    return np.array(STE_sp), np.array(STE_sl)

In [12]:
def main():


In [13]:
main()

['studio_M2.wav', 'studio_F2.wav', 'phone_M2.wav', 'phone_M2.lab', 'phone_F2.lab', 'studio_M2.lab', 'studio_F2.lab', 'README', 'phone_F2.wav']


In [24]:
global BASE_PATH
BASE_PATH = 'TinHieuHuanLuyen'
audio_name_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(BASE_PATH)))
audio_name_list = list(map(lambda x: x[:-4], audio_name_list))
signal_list = [0]*len(audio_name_list)
sr_list = [0]*len(audio_name_list)
t_list = [0]*len(audio_name_list)
timestamp_label_list = [0]*len(audio_name_list)
signal_frames_list = [0]*len(audio_name_list)
frame_size_list = [0]*len(audio_name_list)
frames_count_list = [0]*len(audio_name_list)
STE_list = [0]*len(audio_name_list)
STE_speech_list = [0]*len(audio_name_list)
STE_silence_list = [0]*len(audio_name_list)
T_STE_list = [0]*len(audio_name_list)
for audio in audio_name_list:
    signal_list[i], sr_list[i], t_list[i], timestamp_label_list[i] = load_data(
            audio_name_list[i])

    
    

['studio_M2', 'studio_F2', 'phone_M2', 'phone_F2']


In [26]:
i=0
signal_list[i], sr_list[i], t_list[i], timestamp_label_list[i] = load_data(
            audio_name_list[i])

In [27]:
signal_list[i], sr_list[i], t_list[i], timestamp_label_list[i]

(array([ 3.28162307e-04,  2.69860262e-04,  6.54201722e-05, ...,
        -3.36857338e-04, -9.89952678e-05,  1.19587174e-04], dtype=float32),
 22050,
 array([0.00000000e+00, 4.53523374e-05, 9.07046749e-05, ...,
        2.38176871e+00, 2.38181406e+00, 2.38185941e+00]),
 [['0.00', '0.45', 'sil'],
  ['0.45', '0.48', 'uv'],
  ['0.48', '0.77', 'v'],
  ['0.77', '0.79', 'uv'],
  ['0.79', '0.88', 'v'],
  ['0.88', '0.92', 'uv'],
  ['0.92', '1.32', 'v'],
  ['1.32', '1.37', 'uv'],
  ['1.37', '1.53', 'v'],
  ['1.53', '1.59', 'uv'],
  ['1.59', '1.93', 'v'],
  ['1.93', '2.38', 'sil']])