### Feature: MelSpectogram, first order derivative and second order derivatives as 3-d matrix.

This work used cleaned_df created by Dharvi and train-test split as shown by Shibendra.


In [1]:
# import libraries
import os
import pandas as pd
import numpy as np
import librosa
import random

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler

import random
import tensorflow as tf
#from tensorflow_addons.image import sparse_image_warp

In [15]:
# import cleaned data file and check severity column
df_cleanedData =  pd.read_csv('cleaned_coughvid_data.csv')
print(df_cleanedData.shape)

# drop records where severity==unknown
df = df_cleanedData[df_cleanedData.severity != 'unknown']
print(df.shape)
df['severity'].value_counts()

(2587, 17)
(2449, 17)


mild           1691
pseudocough     526
severe          230
Name: severity, dtype: int64

In [16]:
# check for NaN values in Severity column
print(df['severity'].isnull().values.any())
df['severity'].isnull().values.sum()

True


2

In [18]:
# drop NaN values
withoutNan_df = df.dropna(subset=['severity'])
withoutNan_df.shape

(2447, 17)

In [19]:
withoutNan_df['severity'].isnull().values.sum()

0

In [20]:
# convert labels categoricals to numericals
severity_label_map = {"mild":0, "pseudocough":1, "severe": 2}
withoutNan_df = withoutNan_df.replace({"severity": severity_label_map})
withoutNan_df.head()

Unnamed: 0,uuid,cough_detected,age,gender,respiratory_condition,fever_muscle_pain,status,status_SSL,cough_type,dyspnea,wheezing,stridor,choking,congestion,nothing,diagnosis,severity
0,01567151-7bb2-45ee-9aa8-a1332b5941ea,0.982,,,,,,,dry,False,False,False,True,False,False,COVID-19,0
1,018b40a1-c109-459a-9e31-86cbd2cb3918,0.9869,,,,,,,wet,False,False,False,False,False,True,lower_infection,0
2,01ff40e8-63e6-4570-a463-9778ea30cad7,0.9686,24.0,other,False,False,symptomatic,,dry,False,False,False,False,False,True,healthy_cough,1
3,0379c586-c500-483c-83a6-95b63afe6931,0.9916,63.0,male,True,False,COVID-19,,dry,False,False,False,False,False,True,healthy_cough,1
4,038592cb-c8db-4f55-8052-e20059146cb5,0.9824,28.0,male,False,False,healthy,,dry,False,False,False,False,False,True,COVID-19,0


In [21]:
# get uuid and lebels as required-df 
reqdf = withoutNan_df[["uuid","severity"]]
reqdf.sample(6)

Unnamed: 0,uuid,severity
274,7876c549-066a-4ea1-a82e-45772114f964,0
1076,d801aaa2-a086-4213-a552-7dd6fbf53943,0
1820,f99a8315-236c-483b-8222-449dea88604a,0
925,99e55cd3-f001-4c4d-98e8-c9e4e7a3410b,0
2376,b986fda8-71e3-4de9-8702-5cc2c82b5faa,0
405,b4cb0e7d-ee40-4088-8e82-21a910caa86a,0


In [22]:
# assign uuid as X and severity as y

X = reqdf.drop(["severity"], axis=1)
y = reqdf["severity"]
X.shape, y.shape

((2447, 1), (2447,))

In [23]:
# generate train-validation-test sets as 70-15-15%

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)

# split test sets into validation and test sets
X_val,X_test,y_val,y_test=train_test_split(X_test,y_test,test_size=0.5,random_state=0)

print("training set size: ",X_train.shape, y_train.shape)
print("validation set size: ",X_val.shape, y_val.shape)
print("test set size: ",X_test.shape, y_test.shape)

training set size:  (1712, 1) (1712,)
validation set size:  (367, 1) (367,)
test set size:  (368, 1) (368,)


In [24]:
# prepare dataframe of splitted sets
df_train = X_train.assign(severity = y_train)
df_train.severity.value_counts()

0    1194
1     366
2     152
Name: severity, dtype: int64

In [25]:
df_validation = X_val.assign(severity = y_val)
df_validation.severity.value_counts()

0    251
1     79
2     37
Name: severity, dtype: int64

In [26]:
df_test = X_test.assign(severity = y_test)
df_test.severity.value_counts()

0    246
1     81
2     41
Name: severity, dtype: int64

Extract features for all three sets and save

In [97]:
# features to extract: Log-Melspectogram and first two order derivatives together as a 3-d matrix
def extract_audio_feature(path):
    # found maximum shape of MSP and created temp matrix to pad zeros
    feature_matrix = np.zeros((64,316,3))
   
    # load audio
    y, sr = librosa.load(path, sr=16000)
       
    # extract features and derivatives of selected features
    msp = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=64, fmax=8000)
    lg_msp = np.log(msp + 1e-9) # add small number to avoid log(0)
    msp_d1 = librosa.feature.delta(lg_msp)
    msp_d2 = librosa.feature.delta(lg_msp, order=2)

    # all MSP and derivatives will have exact 64 mel freq hence instead of "0:msp.shape[0]" we can consider default range
    feature_matrix[:, 0:msp.shape[1], 0] = lg_msp
    feature_matrix[:, 0:msp_d1.shape[1], 1] = msp_d1
    feature_matrix[:, 0:msp_d2.shape[1], 2] = msp_d2

    return feature_matrix

In [98]:
# generate MelSpectogram for all three sets and save those 

# df_train:

ftr_train = np.zeros((64,316,3,len(df_train)))

# load audio using uuid and extract features
opath =  "./wavFiles/"

labels = []

# progress counter
file_count = 0

for index, ids in enumerate(df_train.uuid):
    labels.append((df_train.severity).iloc[index])

    filepath = opath+ids+'.wav'
    msp_mat = extract_audio_feature(filepath)
    ftr_train[:,:,:,index]=msp_mat

    file_count += 1
    # keep track of data loader's progress
    print('\r'+f' Processed {file_count}/{len(df_train)} audio samples',end='')

 Processed 1712/1712 audio samples

In [99]:
# generate MelSpectogram for all three sets and save those 

# df_validation:

ftr_validation = np.zeros((64,316,3,len(df_validation)))

# load audio using uuid and extract features
opath =  "./wavFiles/"

labels = []

# progress counter
file_count = 0

for index, ids in enumerate(df_validation.uuid):
    labels.append((df_validation.severity).iloc[index])

    filepath = opath+ids+'.wav'
    msp_mat = extract_audio_feature(filepath)
    ftr_validation[:,:,:,index]=msp_mat

    file_count += 1
    # keep track of data loader's progress
    print('\r'+f' Processed {file_count}/{len(df_validation)} audio samples',end='')

 Processed 367/367 audio samples

In [100]:
# generate MelSpectogram for all three sets and save those 

# df_test:

ftr_test = np.zeros((64,316,3,len(df_test)))

# load audio using uuid and extract features
opath =  "./wavFiles/"

labels = []

# progress counter
file_count = 0

for index, ids in enumerate(df_test.uuid):
    labels.append((df_test.severity).iloc[index])

    filepath = opath+ids+'.wav'
    msp_mat = extract_audio_feature(filepath)
    ftr_test[:,:,:,index]=msp_mat

    file_count += 1
    # keep track of data loader's progress
    print('\r'+f' Processed {file_count}/{len(df_test)} audio samples',end='')

 Processed 368/368 audio samples

In [101]:
# save df_train, df_validation, df_test along with their respective features: log-mel-spectogram and derivatives

#df_train.to_csv('df_train.csv')
#df_validation.to_csv('df_validation.csv')
#df_test.to_csv('df_test.csv')

#np.save("ftr_train.npy",ftr_train)
#np.save("ftr_validation.npy",ftr_validation)
#np.save("ftr_test.npy",ftr_test)

### Load dataframes of train, validation and test sets and respective feature matrices

In [2]:
# load dataframes of train, validation and test sets 
df_train = pd.read_csv('df_train.csv')
df_validation = pd.read_csv('df_validation.csv')
df_test = pd.read_csv('df_test.csv')

# load respective feature matrices
feature_train = np.load("ftr_train.npy")
feature_validation = np.load("ftr_validation.npy")
feature_test = np.load("ftr_test.npy")

In [3]:
# check dimentions
print(df_train.shape, df_validation.shape, df_test.shape)
print(feature_train.shape, feature_validation.shape, feature_test.shape)

(1712, 3) (367, 3) (368, 3)
(64, 316, 3, 1712) (64, 316, 3, 367) (64, 316, 3, 368)


### Train set contains samples per labels:

1194 for mild (0), 366 for psuedocough (1), and 152 for severe (2).

### Let's augment log-mel-spectograms for psuedocough by 100% and for severe by 300% using SpecAugment method.

For SpecAugment method, 
Refer: SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition Park, Daniel S. and Chan, William and Zhang, Yu and Chiu, Chung-Cheng and Zoph, Barret and Cubuk, Ekin D. and Le, Quoc V. Interspeech 2019 
<https://www.isca-speech.org/archive/pdfs/interspeech_2019/park19e_interspeech.pdf>

Other resource: https://github.com/pyyush/SpecAugment/tree/master

In [38]:
# Feature augmentation functions:

# frequency masking
def freq_masking(msp):
    v = msp.shape[0] # no. of mel bins (=64)
    num_freq_mask = 1 
    F = 12   # freq_mask parameter
       
    # apply number of frequency masks to the log mel spectrogram
    for i in range(num_freq_mask):
        f = int(np.random.uniform(0, F)) # [0, F)
        f0 = random.randint(0, v - f) # [0, v - f)
        msp[f0:f0+f, :] = 0
            
    return msp


# time masking
def time_masking(msp, num_time_mask):
    tau = msp.shape[1] # time frames
    #num_time_mask = 1
    T = 15  #Time Mask parameter  
        
    # apply numbeer of time masks to the log mel spectrogram
    for i in range(num_time_mask):
        t = int(np.random.uniform(0,T)) # [0, T)
        t0 = random.randint(0, tau - t) # [0, tau - t)
        msp[:, t0:t0+t] = 0
            
    return msp

In [42]:
'''
Need to augment only for labels == 1 and 2. 
    if label == 1: augment once either time or freq masking
    if label == 2: augment with time, freq and third one with two times with time masking and once with freq masking,
    as time axis is almost 5 times larger than frequency axis
Input to augment features: log-mel-spectogram which is first dimentional matrix
Output: augmented log-mel-spectogram. Find derivatives and create new augmented matrix. 
Add as a new feature along with uuid and labels by creating new augment train df.
'''
aug_id = []
aug_lbl = []
counter = 0
augLen = 366+(152*3) # df_train.severity==1 + 2* df_train.severity==2
ftr_train_aug = np.zeros((64,316,3,augLen))

for index, ids in enumerate(df_train.uuid):

    lbl = (df_train.severity).iloc[index]
    
    if lbl==0:
        continue
    
    if lbl==1:
        lmsp = feature_train[:,:,1,index]
        
        choice = random.uniform(0, 1)
        if choice > 0.5:
            new_lmsp = time_masking(lmsp, num_time_mask=1)
        else:
            new_lmsp = freq_masking(lmsp)
        
        # find derivatives of masked lmsp
        lmsp_d1 = librosa.feature.delta(new_lmsp)
        lmsp_d2 = librosa.feature.delta(new_lmsp, order=2)
        
        ftr_train_aug[:, 0:new_lmsp.shape[1], 0, counter] = new_lmsp
        ftr_train_aug[:, 0:lmsp_d1.shape[1], 1, counter] = lmsp_d1
        ftr_train_aug[:, 0:lmsp_d2.shape[1], 2, counter] = lmsp_d2
        
        aug_name = ids+'-'+str(counter)
        aug_id.append(aug_name)
        aug_lbl.append(lbl)
        counter=counter+1
    
    if lbl==2:
        lmsp = feature_train[:,:,1,index]
        
        # apply time masking
        new_lmsp_tm = time_masking(lmsp, num_time_mask=1)
            
        # find derivatives of masked lmsp
        lmsp_d1 = librosa.feature.delta(new_lmsp_tm)
        lmsp_d2 = librosa.feature.delta(new_lmsp_tm, order=2)
        
        ftr_train_aug[:, 0:new_lmsp_tm.shape[1], 0, counter] = new_lmsp_tm
        ftr_train_aug[:, 0:lmsp_d1.shape[1], 1, counter] = lmsp_d1
        ftr_train_aug[:, 0:lmsp_d2.shape[1], 2, counter] = lmsp_d2
        
        aug_name = ids+'-'+str(counter)
        aug_id.append(aug_name)
        aug_lbl.append(lbl)
        counter=counter+1
        
        # apply freq masking
        new_lmsp_freq = freq_masking(lmsp)
        
        # find derivatives of masked lmsp
        lmsp_d1 = librosa.feature.delta(new_lmsp_freq)
        lmsp_d2 = librosa.feature.delta(new_lmsp_freq, order=2)
        
        ftr_train_aug[:, 0:new_lmsp_freq.shape[1], 0, counter] = new_lmsp_freq
        ftr_train_aug[:, 0:lmsp_d1.shape[1], 1, counter] = lmsp_d1
        ftr_train_aug[:, 0:lmsp_d2.shape[1], 2, counter] = lmsp_d2
        
        aug_name = ids+'-'+str(counter)
        aug_id.append(aug_name)
        aug_lbl.append(lbl)
        counter=counter+1
        
        # apply time masking two times and freq masking once
        new_lmsp_time = time_masking(lmsp, num_time_mask=2)
        new_lmsp_frequency = freq_masking(lmsp)
        new_mat = new_lmsp_time + new_lmsp_frequency
        
        # find derivatives of masked lmsp
        lmsp_d1 = librosa.feature.delta(new_mat)
        lmsp_d2 = librosa.feature.delta(new_mat, order=2)
        
        ftr_train_aug[:, 0:new_mat.shape[1], 0, counter] = new_mat
        ftr_train_aug[:, 0:lmsp_d1.shape[1], 1, counter] = lmsp_d1
        ftr_train_aug[:, 0:lmsp_d2.shape[1], 2, counter] = lmsp_d2
        
        aug_name = ids+'-'+str(counter)
        aug_id.append(aug_name)
        aug_lbl.append(lbl)
        counter=counter+1
        
    print('\r'+f' Processed {counter}/{augLen} audio samples',end='')
    
print(len(aug_id), len(aug_lbl))

 Processed 822/822 audio samples822 822


In [43]:
# save aug feature matrices
#np.save("ftr_train_aug.npy",ftr_train_aug)

In [50]:
# convert df of aug_id and aug_lbl

aug_df_train = pd.DataFrame({'uuid': aug_id,'severity':aug_lbl})
aug_df_train.head()

Unnamed: 0,uuid,severity
0,2ef79bd0-5d88-4583-a45d-a6570dfbd4c7-0,1
1,e4bbd7fa-3767-48e3-97cf-c5e14881314f-1,1
2,ffedc843-bfc2-4ad6-a749-2bc86bdac84a-2,1
3,c5f4b9fe-04cd-441b-801f-ae24db6b2fc7-3,1
4,a0e0ce03-2778-46f2-8b5f-aa46c503f392-4,1


In [51]:
# Save augmentated df
#aug_df_train.to_csv('aug_df_train.csv')

In [98]:
# does this unnamed first column are indices from train-test split?
df_train.head()

Unnamed: 0.1,Unnamed: 0,uuid,severity
0,149,47566b56-c2aa-48d5-a0be-92506fc53eab,0
1,1317,35647ec0-1566-4df0-ae5d-27b0357ece7c,0
2,1950,2ba1edda-3fc6-43f6-91b6-4e5c766425a3,0
3,265,74536842-b3dc-4f8f-b232-0333419d7ca2,0
4,1198,046545bc-7f2c-4425-9be2-ffc22bba515e,0


In [99]:
df_train = df_train.iloc[: , 1:]
df_train.head()

Unnamed: 0,uuid,severity
0,47566b56-c2aa-48d5-a0be-92506fc53eab,0
1,35647ec0-1566-4df0-ae5d-27b0357ece7c,0
2,2ba1edda-3fc6-43f6-91b6-4e5c766425a3,0
3,74536842-b3dc-4f8f-b232-0333419d7ca2,0
4,046545bc-7f2c-4425-9be2-ffc22bba515e,0


In [100]:
# do the same for validation and test sets df
df_validation = df_validation.iloc[: , 1:]
df_test = df_test.iloc[:, 1:]
df_validation.shape, df_test.shape

((367, 2), (368, 2))

I have appended augmented df and features below the original df and matrix.

In [110]:
merged_df = df_train.append(aug_df_train, ignore_index=True)
merged_df

  merged_df = df_train.append(aug_df_train, ignore_index=True)


Unnamed: 0,uuid,severity
0,47566b56-c2aa-48d5-a0be-92506fc53eab,0
1,35647ec0-1566-4df0-ae5d-27b0357ece7c,0
2,2ba1edda-3fc6-43f6-91b6-4e5c766425a3,0
3,74536842-b3dc-4f8f-b232-0333419d7ca2,0
4,046545bc-7f2c-4425-9be2-ffc22bba515e,0
...,...,...
2529,ce0c319b-da2c-4f90-9050-108a8156acb5-817,2
2530,f87d25cd-c9b0-409c-be18-39a7f80f2ff0-818,1
2531,e4242c29-1213-4960-b792-565fae299403-819,1
2532,2a49d183-60b0-4ccf-8147-876cc291670f-820,1


In [117]:
# Now need to arrange feature matrices as per shuffled array so labels will be consistent.
feature_train.shape, ftr_train_aug.shape

((64, 316, 3, 1712), (64, 316, 3, 822))

In [124]:
merged_ftr = np.zeros((64,316,3,2534))
merged_ftr[:,:,:,0:1712] = feature_train
merged_ftr[:,:,:,1712:] = ftr_train_aug

In [128]:
# save merged df and features
#np.save("merged_ftr.npy",merged_ftr)

#merged_df.to_csv('merged_df.csv')

In [None]:
#====================================================================

In [None]:
# For analysis:
X_train = merged_ftr
y_train = merged_df['severity']

X_val = feature_validation
y_val = df_validation['severity']

X_test = feature_test
y_test = df_test['severity']


#### Now need to prepare training set ready for analysis. Need to shuffle and mix together df_train nd aug_df_train. Also respective features. For this, one idea is to shuffle indices.

"aug_df_train" uuid ends with number that is 4th dimention for respective log-mel-spectogram feature matrix. 
Feature can be extracted from "ftr_train_aug" for "uuid" ends with '8' as ftr_train_aug[:,:,:,8]

Creating df with suffled indices is easy but creating feature matrix is tricky.


In [112]:
# create indices for combining df_train and augmented df. Shuffle indices and then join df and features
total_len = len(aug_df_train) + len(df_train)
indices = np.arange(total_len) # as end number isn't included in a sequence
shuffled_indices = random.sample(list(indices), len(indices))
shuffled_indices = np.asarray(shuffled_indices)
print(shuffled_indices.shape)

(2534,)


In [113]:
# as per shuffled_indices, newdf has shuffled merged_df
newdf=merged_df.iloc[shuffled_indices] #0:2534
newdf

Unnamed: 0,uuid,severity
39,f66386d5-f2e2-4b9b-8f00-35e51858446b,1
1347,6cd0a8d6-8e94-4ff2-8ad6-ed18ae2b207c,0
179,8396e93f-96ff-4fa3-8ef9-41c4d0ee191e,1
534,7c1b43d9-49f7-41ee-a347-2971c161ac7a,0
2083,d9b4a30f-682d-4889-b65f-b359f4ebc74d-371,2
...,...,...
1870,5d29532f-943b-4726-aca8-21191eeba327-158,1
221,98fb6294-d339-4c83-8ff5-2bbcf82e35e0,0
2504,a7197322-5d2f-4661-8c48-01e39f06a234-792,1
2034,d6ee9da5-ff1d-4677-9ab2-da397e3267c4-322,1
