# Data Exploration & Split

**Summary:** This notebook takes a closer look at the processing-blocks-master code created by Edge Impulse. Specifically, it examines what the data looks like before processing, during processing, and as the final expected output. It then splits the data into training and testing sets and saves the generated features.

### Dependencies

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from IPython.display import Audio
from scipy.io import wavfile

import sys, os
import pathlib
import base64
import shutil
from IPython.display import SVG
import pickle

import warnings
from scipy.io.wavfile import WavFileWarning
warnings.simplefilter("ignore", WavFileWarning)

# process_data script
# NOTE(review): ROOT is an absolute, machine-specific location; the data paths used
# further down in this notebook repeat the same prefix, so all of them must be
# changed together when the project is moved.
ROOT = pathlib.Path('/home/bukowskin/CSC_7901_ML_Capstone')
PROCESS_PATH = ROOT / 'functions' 
# Make the project's `functions` folder importable so that `process_data` resolves.
sys.path.append(str(PROCESS_PATH))
from process_data import Process_Audio_Data

# speechpy functions
# Load the vendored copy of speechpy directly from its __init__.py instead of
# relying on an installed package of the same name.
# The submodule is imported explicitly: a bare `import importlib` does not
# guarantee that `importlib.util` is available as an attribute.
import importlib.util

SPEECHPY_PATH = ROOT / 'functions' / 'edge-impulse-functions' / 'third_party' / 'speechpy' / '__init__.py'
MODULE_NAME = 'speechpy'
spec = importlib.util.spec_from_file_location(MODULE_NAME, SPEECHPY_PATH)
speechpy = importlib.util.module_from_spec(spec)
# Register the module before executing it, so that imports of `speechpy`
# triggered while the package initialises resolve to this same module object.
sys.modules[spec.name] = speechpy
spec.loader.exec_module(speechpy)

from sklearn.model_selection import train_test_split

## Path Variables

In [None]:
# Original data (not split, not processed). Built from ROOT so the project
# location is defined in one place only; the trailing slash is kept on purpose
# because this string is handed unchanged to Process_Audio_Data further down.
data_path = f'{ROOT}/data/all_data/'

## Data Exploration

### Single File 

In [None]:
# Sample used throughout the exploration section. os.path.join avoids the
# doubled slash produced by formatting a directory that already ends in '/'.
single_audio_sample = os.path.join(data_path, 'two.noise_0.female.iphone.SZQlO4oFWd.wav')

In [None]:
# Read the WAV file: the first returned value is the sampling rate, the second
# is the array of samples.
sampling_freq, raw_data = wavfile.read(single_audio_sample)

In [None]:
# Embedded audio player for the sample.
# NOTE(review): `data` receives the path of the WAV file (a string), not the
# decoded samples in `raw_data`; presumably IPython then loads the file itself
# and `rate` has no effect on playback — confirm.
Audio(data=single_audio_sample, rate=sampling_freq)

### Functions

In [None]:
def fft_spectrum(frames, fft_points=512):
    """Return the magnitude of the real-input FFT of every frame.

    Thin wrapper around ``numpy.fft.rfft`` applied along the last axis; see
    https://numpy.org/doc/stable/reference/generated/numpy.fft.rfft.html
    for details of the transform.
        NOTE: Function taken from functions/edge-impulse-functions/third-party/speechpy/processing

    Args:
        frames (array): The frame array in which each row is a frame.
        fft_points (int): The FFT length passed to ``rfft`` as ``n``. Frames
            shorter than this are zero-padded; per the numpy documentation,
            frames longer than this are cropped.

    Returns:
        array: The absolute value of the spectrum. If frames is a
        num_frames x samples_per_frame matrix, the output is
        num_frames x (fft_points // 2 + 1) for even fft_points, because
        only the non-negative frequency terms are returned.
    """
    complex_spectrum = np.fft.rfft(frames, n=fft_points, axis=-1)
    magnitude = np.abs(complex_spectrum)
    return magnitude

### Raw Audio 

In [None]:
# The unique ID is the last dot-separated field of the file name (before the
# ".wav" extension); deriving it keeps the title in sync with the chosen sample.
sample_id = pathlib.Path(single_audio_sample).stem.split('.')[-1]

# generating time axis: sample i is taken at i / sampling_freq seconds, so the
# last sample sits one sampling period before `duration`.
duration = len(raw_data) / sampling_freq
time_axis = np.arange(len(raw_data)) / sampling_freq

# Plotting raw audio signal
plt.figure(figsize=(10, 6))
plt.plot(time_axis, raw_data, color='blue')
plt.title(f'Raw Audio Signal - Unique ID: {sample_id}')
plt.xlabel('Time (s)')
plt.ylabel('Amplitude')
plt.grid(True)
plt.show()

### FFT-Spectrogram 

In [None]:
# The recording is handled as a list of signal axes with a single entry; the
# axis name itself is never used, only the number of axes.
axes = ['accY']

# One column per axis. A new name is used so `raw_data` keeps the shape that
# the earlier cells of this notebook were run with.
raw_data_by_axis = raw_data.reshape(len(raw_data) // len(axes), len(axes))

for ax in range(len(axes)):
    signal = raw_data_by_axis[:, ax]

    # Only the first returned value (the number of frames) is of interest; the
    # other two get explicit throwaway names instead of `_` / `__`, which
    # IPython uses for its output history.
    numframes, _unused_1, _unused_2 = speechpy.processing.calculate_number_of_frames(
        signal,
        implementation_version=4,
        sampling_frequency=sampling_freq,
        frame_length=0.02,
        frame_stride=0.02,
        zero_padding=False)


In [None]:
# Stack frames
# Note: this code segment is taken from the process_data script; it is the
# initial step required before the fft_spectrum function can be run.

# Window applied to every frame: an all-ones vector of the requested length,
# i.e. the samples are left unweighted.
rectangular_window = lambda length: np.ones((length,))

# frame_length equals frame_stride, so consecutive frames start exactly one
# frame length apart.
frames = speechpy.processing.stack_frames(
    signal,
    implementation_version=4,
    sampling_frequency=sampling_freq,
    frame_length=0.02,
    frame_stride=0.02,
    filter=rectangular_window,
    zero_padding=False)

In [None]:
# Magnitude spectrum of every frame, computed with a 256-point FFT.
# NOTE(review): numpy's rfft crops inputs longer than `n`, so if a frame holds
# more than 256 samples the excess samples are dropped rather than transformed
# — confirm that this is intended.
fft_spectrogram = fft_spectrum(frames,256)

In [None]:
%matplotlib inline
# Plot the spectrogram
plt.figure(figsize=(10, 6))
plt.imshow(fft_spectrogram.T, aspect='auto', origin='lower', cmap='viridis')
plt.colorbar(label='Spectrogram Amplitude')
plt.title('FFT-Spectrogram - Unique ID: 2eDgHfQz2u')
plt.ylabel('Frequency (Hz)')
plt.xlabel('Time')
plt.show()

### MFE-Spectrogram

In [None]:
# Parameters handed to Process_Audio_Data for the exploration run.
n_mels = 40            # number of filters
fft_length = 256
noise_floor_db = -52
window_size = 0.1
# Same folder as the raw-audio exploration above; reuse the existing variable
# instead of repeating the absolute path.
path = data_path

In [None]:
# Processor for the exploration run. The arguments are positional, so their
# order must stay exactly as listed.
preprocess_mel_40 = Process_Audio_Data(
    path,
    True,            # create mfe graphs
    n_mels,          # number of filters
    fft_length,
    noise_floor_db,
    window_size,
    0,               # low frequency
)

In [None]:
# Run the processing step; the cells below read `mfe_graphs` and `mfcc_graphs`
# from this object afterwards.
# NOTE(review): presumably this processes every file found under `path`, so it
# may take a while — confirm.
preprocess_mel_40.generate_features()

#### MFE Image

In [None]:
# Look up the graphs stored under the sample's unique ID and keep the 'image'
# field of the first entry (decoded as base64 in the next cell).
sample_mfe_graphs = preprocess_mel_40.mfe_graphs['SZQlO4oFWd']
image = sample_mfe_graphs[0]['image']

In [None]:
# base64 text -> raw bytes -> UTF-8 SVG markup, rendered as the cell output.
decoded_image = base64.b64decode(image)
svg_string = str(decoded_image, 'utf-8')
SVG(data=svg_string)

#### MFCC Image

In [None]:
# Look up the graphs stored under the sample's unique ID and keep the 'image'
# field of the first entry (decoded as base64 in the next cell).
sample_mfcc_graphs = preprocess_mel_40.mfcc_graphs['SZQlO4oFWd']
image = sample_mfcc_graphs[0]['image']

In [None]:
# base64 text -> raw bytes -> UTF-8 SVG markup.
decoded_image = base64.b64decode(image)
svg_string = decoded_image.decode('utf-8')
# The caption is printed as text: a matplotlib title cannot annotate an SVG
# display object and would only add an empty figure to the cell output. The ID
# is the key the image was looked up with.
print('MFCC-Spectrogram - Unique ID: SZQlO4oFWd')
SVG(svg_string)

# Data Split

Split the data into training and testing sets and save each set into its own folder, so that the splits can be easily reloaded and the same train/test split is used throughout.

### Parse Filenames

In [None]:
# File names follow the pattern
#   <label>.<noise_type>.<gender>.<device>.<id>.wav
parsed_data = []
# os.listdir returns entries in arbitrary order; sorting fixes the row order of
# the frame, which the seeded train/test split below depends on.
for filename in sorted(os.listdir(data_path)):
    if not filename.lower().endswith('.wav'):
        continue  # ignore stray entries that are not audio files

    file_parts = filename.split(".")
    if len(file_parts) < 6:
        # Fail with the offending name instead of an IndexError (or a row whose
        # "id" is really the file extension).
        raise ValueError(f'Unexpected file name format: {filename!r}')

    label, noise_type, gender, device, sample_id = file_parts[:5]
    parsed_data.append({
        "file_name": filename,
        "label": label,
        "noise_type": noise_type,
        "gender": gender,
        "device": device,
        "id": sample_id
    })

df = pd.DataFrame(parsed_data)
df

### Split

In [None]:
# Seed for train_test_split. It also selects the output folder through
# `data_save_path[random_state]`, so it has to be one of that dict's keys.
random_state = 42

In [None]:
# Stratification key: label, noise type, gender and device joined with dots, so
# that every such combination is represented in both splits.
df["groups"] = df["label"].str.cat(df[["noise_type", "gender", "device"]], sep=".")

# 80/20 split, stratified on the combined key and seeded for repeatability.
train, test = train_test_split(
    df,
    test_size=0.2,
    stratify=df["groups"],
    random_state=random_state,
)

### Save Train/Test Split Files 

In [None]:
# Built from ROOT so the project location is defined in one place only.
base_path = f'{ROOT}/data'

# One output directory per supported random state, keyed by the seed itself,
# for saving the raw data after each split.
data_save_path = {
    seed: f'{base_path}/data_split_random_state_{seed}'
    for seed in (42, 73, 13)
}

In [None]:
def save_split_files(df, destination_dir, source_dir=None):
    """Copy every file listed in ``df["file_name"]`` into ``destination_dir``.

    Args:
        df (pd.DataFrame): Split to save; must have a "file_name" column.
        destination_dir (str): Folder the files are copied into. It is created
            (including parents) when it does not exist yet.
        source_dir (str, optional): Folder the files are read from. Defaults
            to the notebook-level ``data_path``.
    """
    if source_dir is None:
        source_dir = data_path

    # shutil.copy does not create missing parent folders, so make sure the
    # target exists before copying anything.
    os.makedirs(destination_dir, exist_ok=True)

    # Only the file name is needed, so iterate over that column directly.
    for file_name in df["file_name"]:
        source_file = os.path.join(source_dir, file_name)
        destin_file = os.path.join(destination_dir, file_name)
        shutil.copy(source_file, destin_file)  # copying file to new path

In [None]:
# Copy the raw audio of each split into its own sub-folder of the directory
# that belongs to the current random state.
split_root = f'{data_save_path[random_state]}/raw_data_split'
for split_name, split_df in (('train', train), ('test', test)):
    save_split_files(split_df, f'{split_root}/{split_name}/')

## Preprocess Data

In [None]:
# Data Process Parameters
n_mels = 58
fft_length = 256
noise_floor_db = -52
window_size = 0.1

#### Training Data

In [None]:
# Folder holding the raw training files of the current split.
train_data_path = f'{data_save_path[random_state]}/raw_data_split/train/'

# The arguments are positional, so their order must stay exactly as listed.
preprocess_train = Process_Audio_Data(
    train_data_path,
    True,            # create mfe graphs
    n_mels,          # number of filters
    fft_length,
    noise_floor_db,
    window_size,
    0,               # low frequency
)

# Generate the MFE and MFCC features with the edited Edge Impulse Processing Block code [2]
preprocess_train.generate_features()

#### Testing Data

In [None]:
# Folder holding the raw testing files of the current split.
test_data_path = f'{data_save_path[random_state]}/raw_data_split/test/'

# The arguments are positional, so their order must stay exactly as listed.
preprocess_test = Process_Audio_Data(
    test_data_path,
    True,            # create mfe graphs
    n_mels,          # number of filters
    fft_length,
    noise_floor_db,
    window_size,
    0,               # low frequency
)

# Generate the MFE and MFCC features with the edited Edge Impulse Processing Block code [2]
preprocess_test.generate_features()

### Save Generated Features

In [None]:
def save_generated_feats(save_path, num_mel, split_type, feat_type, preprocess):
    '''
    Pickle the generated features of one split.

    The file is written to
    ``<save_path>/number_mel_filters_<num_mel>/<feat_type>_data_split/<split_type>_data.pkl``;
    missing folders are created.

    Args:
        save_path: Root folder of the current data split.
        num_mel: Number of mel filters, used in the folder name.
        split_type: Name of the split, used in the file name (e.g. 'train').
        feat_type: 'mfe' or 'mfcc'; selects which attributes of ``preprocess``
            are saved.
        preprocess: Object exposing ``<feat_type>_features``,
            ``<feat_type>_height``, ``<feat_type>_width`` and a ``data_info``
            frame with a 'label' column.

    Raises:
        ValueError: If ``feat_type`` is neither 'mfe' nor 'mfcc'.
    '''
    if feat_type not in ('mfe', 'mfcc'):
        # Without this check any other value would be stored as MFCC data
        # under a folder carrying the wrong name.
        raise ValueError(f"feat_type must be 'mfe' or 'mfcc', got {feat_type!r}")

    save_data = {
        'features': getattr(preprocess, f'{feat_type}_features'),
        'labels': preprocess.data_info['label'].reset_index(drop=True).values,
        # The 'mfe_' prefix is kept for both feature types so that the keys of
        # the saved dictionary stay the same as before.
        'mfe_height': getattr(preprocess, f'{feat_type}_height'),
        'mfe_width': getattr(preprocess, f'{feat_type}_width'),
    }

    # open() does not create missing folders, so make sure the target exists.
    out_dir = os.path.join(save_path, f'number_mel_filters_{num_mel}', f'{feat_type}_data_split')
    os.makedirs(out_dir, exist_ok=True)

    with open(os.path.join(out_dir, f'{split_type}_data.pkl'), 'wb') as f:
        pickle.dump(save_data, f)

In [None]:
# Pair every split name with the processor that was run on that split, so the
# train files hold training features and the test files hold testing features.
preprocess_by_split = {'train': preprocess_train, 'test': preprocess_test}

# MFE first, then MFCC; within each feature type: train, then test.
for feat_type in ('mfe', 'mfcc'):
    for split_type, preprocess in preprocess_by_split.items():
        save_generated_feats(data_save_path[random_state], n_mels, split_type, feat_type, preprocess)