# Feature extraction from Coswara dataset

In this notebook the first 15 MFCCs are extracted from the augmented Coswara dataset (see the [Coswara 2 notebook](./coswara_2_data_augmentation.ipynb)). The extracted features are saved in a JSON file along with the label. The augmented positive instances are labeled `augmented_p_data` and need to be relabeled at the data loading stage. The saved features can be loaded and prepared for later usage using the following data loader function:

```Python
import json
import numpy as np
from typing import Tuple

def load_data(data_path:str) -> Tuple[np.ndarray, np.ndarray]:
    '''
        Load the saved MFCC features and labels from a JSON file and prepare them for use.

        The file is expected to hold a 'mfcc' entry with the features and a 'label'
        entry with one label per sample.

        Parameters:
            data_path: String corresponding to the path to the saved JSON file to be loaded.

        Returns:
            A tuple (X, y) of NumPy arrays. X holds the features reshaped to
            (number of samples, -1, 15, 1), i.e. 15 coefficients per frame plus a
            trailing singleton axis. y holds the binary encoded labels as int32:
            'p' and 'augmented_p_data' become 1, 'n' becomes 0.
    '''

    with open(data_path, 'r') as json_file:
        content = json.load(json_file)

    features = np.array(content['mfcc'])
    labels = np.array(content['label'])

    # Keep the sample axis, infer the frame axis, and append a singleton axis
    n_samples = features.shape[0]
    features = features.reshape(n_samples, -1, 15, 1)

    # Binary encoding, applied in place; augmented positives count as positives.
    # The final cast turns the encoded entries into 32 bit integers.
    encoding = (('p', 1), ('n', 0), ('augmented_p_data', 1))
    for name, code in encoding:
        labels[labels == name] = code

    return features, labels.astype(np.int32)
```

In [None]:
import re
import os
import glob
import librosa
import json
from tqdm import tqdm
import pandas as pd

In [None]:
# Name of the sub-folder of data_dir that is searched for recordings; it is
# also reused as the prefix of the output JSON file name
suffix = 'shallow'

# Path to the extracted data folder
data_dir = '../../../Coswara-Data/data'

# Glob pattern (not a regular expression, despite the variable name): every
# .wav file located exactly one folder level below the suffix folder
recording_regex = r'*/*.wav'
search_path = os.path.join(data_dir, suffix, recording_regex)

In [None]:
# Collect paths to recordings to analyse. glob returns its matches in
# arbitrary, filesystem-dependent order, so they are sorted to keep the order
# of the extracted features reproducible across runs and machines.
paths = sorted(glob.glob(search_path))
len(paths)

In [None]:
# Sanity check for correct files gathered: fail with a clear message when the
# search pattern matched nothing, otherwise display the first matched path
if not paths:
    raise FileNotFoundError(f'No recordings found matching {search_path!r}')
paths[0]

In [None]:
# Extracting MFCCs
duration = 7         # length in seconds every recording is padded/cut to
sample_rate = 22050  # sampling rate in Hz the recordings are loaded with

data = {
    'mfcc': [],
    'label': []
    }

# (path, error) pairs of recordings that could not be processed, so that
# skipped files are reported instead of disappearing silently
failed_paths = []

# Number of samples per recording after length fixing (loop invariant)
target_length = sample_rate * duration

for path in tqdm(paths):
    try:
        y, sr = librosa.load(path, sr=sample_rate)
        # Force an identical length so every recording yields the same
        # number of MFCC frames
        y = librosa.util.fix_length(y, size=target_length)
        # Pass the sampling rate explicitly instead of relying on the
        # library default happening to match sample_rate
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=15, n_fft=2048, hop_length=512)
    except Exception as err:
        # Best effort: skip unreadable/corrupt recordings but remember them.
        # KeyboardInterrupt is deliberately not caught so the loop can be stopped.
        failed_paths.append((path, repr(err)))
        continue

    # Transpose so that each row holds the 15 coefficients of one frame
    mfcc = mfcc.T

    # The label is the name of the folder containing the recording
    # (works for both '/' and '\' path separators)
    label = re.split(r'/|\\', path)[-2]

    data['mfcc'].append(mfcc.tolist())
    data['label'].append(label)

if failed_paths:
    print(f'Skipped {len(failed_paths)} of {len(paths)} recordings:')
    for failed_path, reason in failed_paths:
        print(f'  {failed_path}: {reason}')

In [None]:
# Write the extracted features and labels to a JSON file placed inside the
# folder that was searched for recordings
json_path = os.path.join(data_dir, suffix, suffix + '_mfcc15_augdata.json')

with open(json_path, mode='w') as out_file:
    json.dump(data, out_file, indent=4)