In [1]:
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

from glob import glob as glob_module
import os

import librosa
import librosa.display

from python_speech_features import mfcc

import logging
import soundfile as sf
import sounddevice as sd

from joblib import delayed, Parallel

import json
import pickle

In [2]:
# Root logger: INFO and above is written to "log.log".
logging.basicConfig(level=logging.INFO, filename="log.log", filemode="w")
logger = logging.getLogger(__name__)

# getLogger() hands back the same logger object every time this cell runs.
# Detach file handlers left over from an earlier execution before adding a new
# one; otherwise each record is written once per previous run of the cell.
for stale_handler in list(logger.handlers):
    if isinstance(stale_handler, logging.FileHandler):
        logger.removeHandler(stale_handler)
        stale_handler.close()

# Notebook-specific log file using a compact "<logger name> - <message>" layout.
handler = logging.FileHandler("dataprocessing_test.log", mode="w")
formatter = logging.Formatter("%(name)s - %(message)s")
handler.setFormatter(formatter)

logger.addHandler(handler)


In [3]:
# The data directory is resolved relative to the parent of the working directory.
curr_notebook_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(curr_notebook_dir, os.pardir))

# Patterns are assembled with os.path.join so the separator matches the host OS
# (hard-coded backslashes only act as separators on Windows). A single "**"
# already matches any directory depth when used with glob(..., recursive=True);
# repeating it is redundant and can make the same file show up more than once.
train_path = os.path.join(
    "data", "raw", "LibriSpeech", "train-clean-100",
    "LibriSpeech", "train-clean-100", "**", "*.flac",
)
test_path = os.path.join(
    "data", "raw", "LibriSpeech", "test-clean",
    "LibriSpeech", "test-clean", "**", "*.flac",
)
dev_path = os.path.join(
    "data", "raw", "LibriSpeech", "dev-clean",
    "LibriSpeech", "dev-clean", "**", "*.flac",
)

# Absolute glob patterns, one per dataset split.
train_path_whole = os.path.join(parent_dir, train_path)
test_path_whole = os.path.join(parent_dir, test_path)
dev_path_whole = os.path.join(parent_dir, dev_path)

In [4]:
# Later cells iterate these lists to build features and transcripts, which are
# then saved as plain positional lists, so the file order must be reproducible.
# glob() returns paths in filesystem order, and a pattern with consecutive "**"
# components can report the same path more than once; sorted(set(...)) gives
# each split a de-duplicated, deterministic list.
data_files_train = sorted(set(glob_module(train_path_whole, recursive=True)))
data_files_dev = sorted(set(glob_module(dev_path_whole, recursive=True)))
data_files_test = sorted(set(glob_module(test_path_whole, recursive=True)))

In [5]:
# audio_data, sample_rate = sf.read(data_files_dev[849])
# sd.play(audio_data,samplerate=sample_rate)
# sd.wait()

In [6]:
# plt.figure(figsize=(10, 5))
# Pxx, freqs, bins, im = plt.specgram(audio_data, NFFT=256, Fs=sample_rate, noverlap=128, cmap="jet")

# plt.colorbar(label='Intensity [dB]')
# plt.ylabel('Frequency [Hz]')
# plt.xlabel('Time [s]')
# plt.title('Spectrogram')
# plt.show()

In [7]:
# mfccs = mfcc(audio_data, sample_rate, numcep=13)
# mfccs_l = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=13)
# mfccs_normalized = (mfccs - np.mean(mfccs, axis=0)) / np.std(mfccs, axis=0)
# mfccs_l_normalized = (mfccs_l - np.mean(mfccs_l, axis=0)) / np.std(mfccs_l, axis=0)

In [8]:

# plt.figure(figsize=(10, 5))
# plt.imshow(mfccs_normalized, cmap="jet", origin='lower', aspect='auto')
# plt.colorbar()
# plt.title('MFCCs')
# plt.ylabel('Time')
# plt.xlabel('MFCC Coefficients')
# plt.show()

In [5]:
def process_audio(file_path):
    """Load one audio file and compute its spectrogram and normalized MFCCs.

    Parameters
    ----------
    file_path : str
        Path of an audio file readable by ``soundfile.read``.

    Returns
    -------
    tuple
        ``(Pxx, mfccs_normalized)``. ``Pxx`` is the spectrogram computed with
        ``NFFT=256`` and an overlap of 128 samples. ``mfccs_normalized`` holds
        the 13-coefficient MFCCs standardized along axis 0 (zero mean, unit
        standard deviation). ``(None, None)`` is returned when any step
        raises; the failure is logged together with its traceback.
    """
    try:
        audio_data, sample_rate = sf.read(file_path)
        logger.info("Audio data loaded successfully.")

        # mlab.specgram is the numeric routine that plt.specgram wraps. Calling
        # it directly yields the same spectrum without drawing an image onto
        # the current pyplot axes for every processed file.
        Pxx, _freqs, _times = mlab.specgram(audio_data, NFFT=256, Fs=sample_rate, noverlap=128)
        logger.info("Spectrogram computed successfully.")

        mfccs = mfcc(audio_data, samplerate=sample_rate, numcep=13)
        logger.info("MFCCs computed successfully.")

        # A coefficient that is constant over the whole file has zero standard
        # deviation; dividing by 1 instead keeps it at 0 rather than NaN/inf.
        mfcc_std = np.std(mfccs, axis=0)
        mfcc_std = np.where(mfcc_std == 0, 1.0, mfcc_std)
        mfccs_normalized = (mfccs - np.mean(mfccs, axis=0)) / mfcc_std
        logger.info("MFCCs normalized successfully.")

        return Pxx, mfccs_normalized
    except Exception as e:
        # Best-effort by design: one unreadable file must not abort the whole
        # batch. logger.exception keeps the message and appends the traceback.
        logger.exception(f"Error processing audio file: {file_path}. Error message: {str(e)}")
        return None, None

In [None]:
# Extract features for the train split with 8 parallel workers.
# process_audio returns (None, None) for a file it could not process.
results_train = Parallel(n_jobs=8)(delayed(process_audio)(file_path) for file_path in data_files_train)

# Keep only results where both the spectrogram and the MFCCs were produced.
# NOTE(review): dropping failures shifts positions relative to data_files_train,
# so anything matched to these lists by index would no longer line up — confirm.
valid_results_train = [result for result in results_train if result[0] is not None and result[1] is not None]
if valid_results_train:
    audio_feature_list_train, mfcc_list_train = zip(*valid_results_train)
else:
    # zip(*[]) yields nothing, so unpacking it into two names raises ValueError.
    logger.warning("No audio files were processed successfully for the train split.")
    audio_feature_list_train, mfcc_list_train = (), ()

In [None]:
# Extract features for the test split with 8 parallel workers.
# process_audio returns (None, None) for a file it could not process.
results_test = Parallel(n_jobs=8)(delayed(process_audio)(file_path) for file_path in data_files_test)

# Keep only results where both the spectrogram and the MFCCs were produced.
# NOTE(review): dropping failures shifts positions relative to data_files_test,
# so anything matched to these lists by index would no longer line up — confirm.
valid_results_test = [result for result in results_test if result[0] is not None and result[1] is not None]
if valid_results_test:
    audio_feature_list_test, mfcc_list_test = zip(*valid_results_test)
else:
    # zip(*[]) yields nothing, so unpacking it into two names raises ValueError.
    logger.warning("No audio files were processed successfully for the test split.")
    audio_feature_list_test, mfcc_list_test = (), ()

In [41]:
# Extract features for the dev split with 8 parallel workers.
# process_audio returns (None, None) for a file it could not process.
results_dev = Parallel(n_jobs=8)(delayed(process_audio)(file_path) for file_path in data_files_dev)

# Keep only results where both the spectrogram and the MFCCs were produced.
# NOTE(review): dropping failures shifts positions relative to data_files_dev,
# so anything matched to these lists by index would no longer line up — confirm.
valid_results_dev = [result for result in results_dev if result[0] is not None and result[1] is not None]
if valid_results_dev:
    audio_feature_list_dev, mfcc_list_dev = zip(*valid_results_dev)
else:
    # zip(*[]) yields nothing, so unpacking it into two names raises ValueError.
    logger.warning("No audio files were processed successfully for the dev split.")
    audio_feature_list_dev, mfcc_list_dev = (), ()

In [None]:
# audio_feature_list_train,mfcc_list_train = [], []
# for i, file_path in enumerate(data_files_train):
#     audio_feature, mfcc_feature = process_audio(file_path)
#     if audio_feature is not None and mfcc_feature is not None:
#         audio_feature_list_train.append(audio_feature)
#         mfcc_list_train.append(mfcc_feature)
#     else:
#         logging.error(f"Error processing audio file at index {i}: {file_path}")

In [12]:
# audio_feature_list_test,mfcc_list_test = [], []
# for i, file_path in enumerate(data_files_test):
#     audio_feature, mfcc_feature = process_audio(file_path)
#     if audio_feature is not None and mfcc_feature is not None:
#         audio_feature_list_test.append(audio_feature)
#         mfcc_list_test.append(mfcc_feature)
#     else:
#         logger.error(f"Error processing audio file at index {i}: {file_path}")

In [None]:
# audio_feature_list_dev,mfcc_list_dev = [], []
# for i, file_path in enumerate(data_files_dev):
#     audio_feature, mfcc_feature = process_audio(file_path)
#     if audio_feature is not None and mfcc_feature is not None:
#         audio_feature_list_dev.append(audio_feature)
#         mfcc_list_dev.append(mfcc_feature)
#     else:
#         logging.error(f"Error processing audio file at index {i}: {file_path}")

In [6]:
def corpus(file):
    """Look up the transcript that belongs to one audio file.

    The transcript is searched in the ``*.txt`` file(s) located in the same
    directory as ``file``. Every non-blank line is expected to start with an
    utterance ID followed by the spoken words; the line whose ID equals the
    audio file name (up to its first ``"."``) is the match.

    Parameters
    ----------
    file : str
        Path of the audio file.

    Returns
    -------
    dict
        ``{utterance_id: transcript}`` with the words joined by single spaces.

    Raises
    ------
    FileNotFoundError
        If the directory contains no ``*.txt`` transcript file.
    LookupError
        If no transcript line starts with the utterance ID.
    """
    utterance_id = os.path.basename(file).split(".")[0]
    cur_dir = os.path.dirname(file)

    transcript_files = sorted(glob_module(os.path.join(cur_dir, "*.txt")))
    if not transcript_files:
        raise FileNotFoundError(f"No transcript (*.txt) file found next to {file!r}")

    for transcript_file in transcript_files:
        with open(transcript_file, "r", encoding="utf-8") as f:
            # Stream the file and stop at the first match instead of reading
            # and indexing every line up front.
            for line in f:
                tokens = line.split()
                # Blank lines produce no tokens and are skipped.
                if tokens and tokens[0] == utterance_id:
                    return {utterance_id: " ".join(tokens[1:])}

    raise LookupError(
        f"No transcript entry for {utterance_id!r} in {transcript_files}"
    )
    

In [7]:
# One corpus() result per training file, kept in data_files_train order.
ordered_transcripts_train = [corpus(audio_path) for audio_path in data_files_train]
    

In [8]:
# One corpus() result per test file, kept in data_files_test order.
ordered_transcripts_test = [corpus(audio_path) for audio_path in data_files_test]

In [9]:
# One corpus() result per dev file, kept in data_files_dev order.
ordered_transcripts_dev = [corpus(audio_path) for audio_path in data_files_dev]

In [10]:
# Output directory for processed artefacts, relative to the parent of the
# working directory. Built with os.path.join so the separator matches the host
# OS; a literal backslash is only a path separator on Windows.
corpus_path = os.path.join("data", "processed")
curr_notebook_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(curr_notebook_dir, os.pardir))
corpus_path_whole = os.path.join(parent_dir, corpus_path)

In [14]:
# Write the dev-split transcripts as a JSON list.
dev_corpus_file = os.path.join(corpus_path_whole, "dev_corpus.json")
# open(..., "w") does not create missing parent directories, so make sure the
# output directory exists first.
os.makedirs(corpus_path_whole, exist_ok=True)
with open(dev_corpus_file, "w", encoding="utf-8") as f:
    json.dump(ordered_transcripts_dev, f)

In [15]:
# Write the train-split transcripts as a JSON list.
train_corpus_file = os.path.join(corpus_path_whole, "train_corpus.json")
# open(..., "w") does not create missing parent directories, so make sure the
# output directory exists first.
os.makedirs(corpus_path_whole, exist_ok=True)
with open(train_corpus_file, "w", encoding="utf-8") as f:
    json.dump(ordered_transcripts_train, f)

In [16]:
# Write the test-split transcripts as a JSON list.
test_corpus_file = os.path.join(corpus_path_whole, "test_corpus.json")
# open(..., "w") does not create missing parent directories, so make sure the
# output directory exists first.
os.makedirs(corpus_path_whole, exist_ok=True)
with open(test_corpus_file, "w", encoding="utf-8") as f:
    json.dump(ordered_transcripts_test, f)

In [None]:
# Persist the train-split spectrograms with pickle.
train_audio_features = os.path.join(corpus_path_whole, "train_audio_features.pickle")
# open(..., "wb") does not create missing parent directories, so make sure the
# output directory exists first.
os.makedirs(corpus_path_whole, exist_ok=True)
with open(train_audio_features, "wb") as f:
    pickle.dump(audio_feature_list_train, f)
        

In [62]:
# Persist the train-split normalized MFCCs with pickle.
train_mfcc_features = os.path.join(corpus_path_whole, "train_mfcc_features.pickle")
# open(..., "wb") does not create missing parent directories, so make sure the
# output directory exists first.
os.makedirs(corpus_path_whole, exist_ok=True)
with open(train_mfcc_features, "wb") as f:
    pickle.dump(mfcc_list_train, f)

In [60]:
# Persist the test-split spectrograms with pickle.
test_audio_features = os.path.join(corpus_path_whole, "test_audio_features.pickle")
# open(..., "wb") does not create missing parent directories, so make sure the
# output directory exists first.
os.makedirs(corpus_path_whole, exist_ok=True)
with open(test_audio_features, "wb") as f:
    pickle.dump(audio_feature_list_test, f)

In [61]:
# Persist the test-split normalized MFCCs with pickle.
test_mfcc_features = os.path.join(corpus_path_whole, "test_mfcc_features.pickle")
# open(..., "wb") does not create missing parent directories, so make sure the
# output directory exists first.
os.makedirs(corpus_path_whole, exist_ok=True)
with open(test_mfcc_features, "wb") as f:
    pickle.dump(mfcc_list_test, f)

In [58]:
# Persist the dev-split spectrograms with pickle.
dev_audio_features = os.path.join(corpus_path_whole, "dev_audio_features.pickle")
# open(..., "wb") does not create missing parent directories, so make sure the
# output directory exists first.
os.makedirs(corpus_path_whole, exist_ok=True)
with open(dev_audio_features, "wb") as f:
    pickle.dump(audio_feature_list_dev, f)

In [59]:
# Persist the dev-split normalized MFCCs with pickle.
dev_mfcc_features = os.path.join(corpus_path_whole, "dev_mfcc_features.pickle")
# open(..., "wb") does not create missing parent directories, so make sure the
# output directory exists first.
os.makedirs(corpus_path_whole, exist_ok=True)
with open(dev_mfcc_features, "wb") as f:
    pickle.dump(mfcc_list_dev, f)