data: coughvid dataset with status classification as labels
Features: Log-Mel-Spectogram and derivatives 
ML model:  
accuracy:   



In [1]:
# import libraries
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import librosa
import subprocess
from pathlib import Path
from scipy.stats import kurtosis, skew

In [2]:
# import metadata file and check status column
df = pd.read_csv('./coughvid_20211012/metadata_compiled.csv', index_col=[0])
print(df.shape)

(34434, 51)


In [3]:
# check for NaN values in Severity column
print(df['status'].isnull().values.any())
df['status'].isnull().values.sum()

True


13770

In [4]:
# drop NaN values
withoutNan_df = df.dropna(subset=['status'])
print(withoutNan_df.shape)

# confirming
print("confirming nanvalues: ",withoutNan_df['status'].isnull().values.sum())

(20664, 51)
confirming nanvalues:  0


In [5]:
withoutNan_df.status.value_counts()

healthy        15476
symptomatic     3873
COVID-19        1315
Name: status, dtype: int64

In [6]:
df_id_status = withoutNan_df[["uuid","status"]]
print(df_id_status.shape)
df_id_status.head()

(20664, 2)


Unnamed: 0,uuid,status
1,00039425-7f3a-42aa-ac13-834aaa2b6b92,healthy
2,0007c6f1-5441-40e6-9aaf-a761d8f2da3b,healthy
3,00098cdb-4da1-4aa7-825a-4f1b9abc214b,healthy
4,0009eb28-d8be-4dc1-92bb-907e53bc5c7a,healthy
6,001328dc-ea5d-4847-9ccf-c5aa2a3f2d0f,healthy


In [None]:
### load data and extract features

In [55]:
# features to extract
def extract_audio_feature(y,sr):
    features =[]
    
    # extract features  
    msp = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=64, fmax=8000)
    log_msp = librosa.power_to_db(msp)
    log_msp_d1 = librosa.feature.delta(log_msp)
    log_msp_d2 = librosa.feature.delta(log_msp, order=2)
        
    features = np.hstack([log_msp.mean(axis=1),log_msp_d1.mean(axis=1),log_msp_d2.mean(axis=1)])
    
    return features

In [57]:
# load audio using uuid and extract features
opath =  "./needAudio/"

# status labels
labels_dict = {
     '0':'COVID-19',
     '1':'healthy',
     '2':'symptomatic'
}
# key and val emotion list
key_l = list(labels_dict.keys())
val_l = list(labels_dict.values())

dfids, labels, features_stat = [], [], []

# progress counter
file_count = 0

for index, ids in enumerate(df_id_status.uuid):
       
    filepath = opath+ids+'.wav'
    # load audio
    y, sr = librosa.load(path, mono=True, sr=None)
    
    if y.shape[0]<1000:
        continue
        
    ftr = extract_audio_feature(y,sr)
    features_stat.append(ftr)
    
    dfids.append(ids)
     #labels.append((df1.status).iloc[index])  #string label values
    
    # get severity label from the sample file in numeric values
    lbl_str = (df_id_status.status).iloc[index]
    lbl_num = int(key_l[val_l.index(lbl_str)])
    labels.append(lbl_num)
    
    file_count += 1
    # keep track of data loader's progress
    print('\r'+f' Processed {file_count}/{len(withoutNan_df)} audio samples',end='')

 Processed 20664/20664 audio samples

In [58]:
dt = np.array(features_stat)
print("features np array shape: ",dt.shape)

features np array shape:  (20664, 192)


In [59]:
dfids = np.asarray(dfids)
print(dfids.shape)
df_ids = np.reshape(dfids,(len(dfids),1))

labels = np.asarray(labels)
print(labels.shape)
df_lbl = np.reshape(labels,(len(labels),1))

df_ids.shape, df_lbl.shape

(20664,)
(20664,)


((20664, 1), (20664, 1))

In [60]:
# be careful, first column is uuid for refernce and last column is severity labels
all_col = np.hstack(([df_ids, dt, df_lbl]))
print(all_col.shape)

df_dt = pd.DataFrame(all_col)
print(df_dt.shape)
df_dt.head()

(20664, 194)
(20664, 194)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,184,185,186,187,188,189,190,191,192,193
0,00039425-7f3a-42aa-ac13-834aaa2b6b92,-46.836376,-44.678955,-40.739243,-37.124218,-35.00893,-32.390892,-25.596382,-21.496511,-21.678574,...,2.5999844e-10,-6.499961e-10,7.7999535e-10,3.1199814e-09,-1.169993e-09,3.3799799e-09,5.199969e-10,2.5999844e-10,5.199969e-10,1
1,0007c6f1-5441-40e6-9aaf-a761d8f2da3b,-46.836376,-44.678955,-40.739243,-37.124218,-35.00893,-32.390892,-25.596382,-21.496511,-21.678574,...,2.5999844e-10,-6.499961e-10,7.7999535e-10,3.1199814e-09,-1.169993e-09,3.3799799e-09,5.199969e-10,2.5999844e-10,5.199969e-10,1
2,00098cdb-4da1-4aa7-825a-4f1b9abc214b,-46.836376,-44.678955,-40.739243,-37.124218,-35.00893,-32.390892,-25.596382,-21.496511,-21.678574,...,2.5999844e-10,-6.499961e-10,7.7999535e-10,3.1199814e-09,-1.169993e-09,3.3799799e-09,5.199969e-10,2.5999844e-10,5.199969e-10,1
3,0009eb28-d8be-4dc1-92bb-907e53bc5c7a,-46.836376,-44.678955,-40.739243,-37.124218,-35.00893,-32.390892,-25.596382,-21.496511,-21.678574,...,2.5999844e-10,-6.499961e-10,7.7999535e-10,3.1199814e-09,-1.169993e-09,3.3799799e-09,5.199969e-10,2.5999844e-10,5.199969e-10,1
4,001328dc-ea5d-4847-9ccf-c5aa2a3f2d0f,-46.836376,-44.678955,-40.739243,-37.124218,-35.00893,-32.390892,-25.596382,-21.496511,-21.678574,...,2.5999844e-10,-6.499961e-10,7.7999535e-10,3.1199814e-09,-1.169993e-09,3.3799799e-09,5.199969e-10,2.5999844e-10,5.199969e-10,1


In [61]:
# save df
#df_dt.to_csv('df_statusPred_mean-log-mel-spectogram-coughvid.csv', index=False)

In [96]:
# sklearn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn import svm, metrics
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, precision_score 
from sklearn.metrics import recall_score, f1_score, balanced_accuracy_score, accuracy_score
from sklearn.metrics import cohen_kappa_score, matthews_corrcoef


In [None]:
###  Analysis of Log-Mel-Spectogram + 1st & 2nd order derivatives with mean stat measures for "Status"

In [63]:
# ML analysis :  remember, first col is uuid
scaler = StandardScaler().fit(df_dt.iloc[:,1:-1].values)
X = scaler.transform(df_dt.iloc[:,1:-1].values)
y = df_dt.iloc[:,-1].values

skf = StratifiedKFold(n_splits=10, random_state=None, shuffle=True)

In [71]:
# check with SVM

clf = SVC(decision_function_shape='ovo',C=1, gamma='scale' ,kernel='rbf', class_weight=None)  

scores = cross_val_score(clf, X, y, scoring='accuracy', cv=skf)        
print('with SVM:', scores.mean())
scores

with SVM: 0.10075617819503553


array([0.18722787, 0.06337687, 0.06337687, 0.06337687, 0.18731849,
       0.18731849, 0.06389158, 0.06389158, 0.06389158, 0.06389158])

In [68]:
# check with SVM

clf = SVC(decision_function_shape='ovo',C=21, gamma='scale' ,kernel='poly', degree=5, class_weight=None)  

scores = cross_val_score(clf, X, y, scoring='accuracy', cv=skf)        
print('with SVM:', scores.mean())
scores

with SVM: 0.7489353511198659


array([0.74891147, 0.74891147, 0.74891147, 0.74891147, 0.74927396,
       0.74927396, 0.74878993, 0.74878993, 0.74878993, 0.74878993])

In [73]:
# check with SVM

clf = SVC(decision_function_shape='ovo',C=5, gamma='scale', kernel='poly', degree=3)  

scores = cross_val_score(clf, X, y, scoring='accuracy', cv=skf)        
print('with SVM:', scores.mean())
scores

with SVM: 0.7489353511198659


array([0.74891147, 0.74891147, 0.74891147, 0.74891147, 0.74927396,
       0.74927396, 0.74878993, 0.74878993, 0.74878993, 0.74878993])

In [70]:
# check with SVM

clf = SVC(decision_function_shape='ovo',C=3, gamma='scale' ,kernel='rbf', class_weight=None)  

scores = cross_val_score(clf, X, y, scoring='accuracy', cv=skf)        
print('with SVM:', scores.mean())
scores

with SVM: 0.7489353511198659


array([0.74891147, 0.74891147, 0.74891147, 0.74891147, 0.74927396,
       0.74927396, 0.74878993, 0.74878993, 0.74878993, 0.74878993])

In [84]:
# check with SVM using split sets to create model

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=1, shuffle=True)

clf = SVC(decision_function_shape='ovo',C=5, gamma='scale', kernel='poly', degree=3)  
 
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
val_acc = accuracy_score(y_test, preds)
print("with SVC, 20% test accuracy: ",val_acc)

with SVC, 20% test accuracy:  0.7524800387127994


In [86]:
# save model
import pickle
filename = 'Coughvid_statusPred_svc_model.sav'
#pickle.dump(clf, open(filename, 'wb'))

In [92]:
# finding f1-score
f1score = f1_score(y_true=y_test, y_pred=preds,average='weighted')
print(f1score)

0.6461998951806727


In [97]:
# finding balanced accuracy
bal_acc = balanced_accuracy_score(y_true=y_test, y_pred=preds)
print(bal_acc)

0.3333333333333333


In [None]:
#  -------------------- below is MelSpectogram+derivatives with mean

In [87]:
# apply MLP
mlpc = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(512,256,128,64,32),activation='relu')

scores = cross_val_score(mlpc, X, y, scoring='accuracy', cv=skf)        
print('with MLP:', scores.mean())
scores

with MLP: 0.7489353511198659


array([0.74891147, 0.74891147, 0.74891147, 0.74891147, 0.74927396,
       0.74927396, 0.74878993, 0.74878993, 0.74878993, 0.74878993])

In [88]:
# apply RF
rfc = RandomForestClassifier(bootstrap = True, max_depth=20, min_samples_split=3, min_samples_leaf=3, 
                             max_features='log2', n_estimators=1900)
scores = cross_val_score(rfc, X, y, scoring='accuracy', cv=skf)        
print('with RF:', scores.mean())
scores

with RF: 0.7489353511198659


array([0.74891147, 0.74891147, 0.74891147, 0.74891147, 0.74927396,
       0.74927396, 0.74878993, 0.74878993, 0.74878993, 0.74878993])

In [89]:
# apply KNN
knn = neighbors.KNeighborsClassifier(algorithm='kd_tree', leaf_size= 30, n_neighbors= 7, weights= 'uniform')
scores = cross_val_score(knn, X, y, scoring='accuracy', cv=skf)        
print('with KNN:', scores.mean())
scores

with KNN: 0.7489353511198659


array([0.74891147, 0.74891147, 0.74891147, 0.74891147, 0.74927396,
       0.74927396, 0.74878993, 0.74878993, 0.74878993, 0.74878993])