data: coughvid dataset with severity classification as labels
Features: Log-Mel-Spectogram and derivatives 
ML model: SVC 
accuracy: 68%  

In [1]:
# import libraries
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import librosa
import subprocess
from pathlib import Path
from scipy.stats import kurtosis, skew

In [2]:
# sklearn
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn import svm, metrics
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedStratifiedKFold

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score, balanced_accuracy_score
from sklearn.metrics import cohen_kappa_score, matthews_corrcoef


In [3]:
# features to extract
def extract_audio_feature(path):
    features =[]
    # load audio
    y, sr = librosa.load(path, mono=True, sr=None)
    # extract features  
    msp = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=64, fmax=9000)
    log_msp = librosa.power_to_db(msp)
    log_msp_d1 = librosa.feature.delta(log_msp)
    log_msp_d2 = librosa.feature.delta(log_msp, order=2)
        
    features = np.hstack([log_msp.mean(axis=1),log_msp_d1.mean(axis=1),log_msp_d2.mean(axis=1)])
    
    return features

In [9]:
# import cleaned data file and check severity column
df = pd.read_csv('cleaned_coughvid_data.csv')

# drop records where severity==unknown
df1 = df[df.severity != 'unknown']
print(df1.shape)
df1['severity'].value_counts()

(2449, 17)


mild           1691
pseudocough     526
severe          230
Name: severity, dtype: int64

In [10]:
# check for NaN values in Severity column
print(df1['severity'].isnull().values.any())
df['severity'].isnull().values.sum()

True


2

In [11]:
# drop NaN values
withoutNan_df = df1.dropna(subset=['severity'])
print(withoutNan_df.shape)

# confirming
print("confirming nanvalues: ",withoutNan_df['severity'].isnull().values.sum())

(2447, 17)
confirming nanvalues:  0


In [None]:
### load data and extract features

In [12]:
# load audio using uuid and extract features
opath =  "./wavFiles/"

labels_dict = {
     '0':'mild',
     '1':'pseudocough',
     '2':'severe'
}
# key and val emotion list
key_l = list(labels_dict.keys())
val_l = list(labels_dict.values())

dfids, labels, features_stat = [], [], []

# progress counter
file_count = 0

for index, ids in enumerate(withoutNan_df.uuid):
    dfids.append(ids)
    #labels.append((df1.severity).iloc[index])  #string label values
    
    # get severity label from the sample file
    lbl_str = (withoutNan_df.severity).iloc[index]
    lbl_num = int(key_l[val_l.index(lbl_str)])
    labels.append(lbl_num)
    
    filepath = opath+ids+'.wav'
    ftr = extract_audio_feature(filepath)
    features_stat.append(ftr)
    
    file_count += 1
    # keep track of data loader's progress
    print('\r'+f' Processed {file_count}/{len(withoutNan_df)} audio samples',end='')

 Processed 2447/2447 audio samples

In [13]:
dt = np.array(features_stat)
print("features np array shape: ",dt.shape)

features np array shape:  (2447, 192)


In [15]:
dfids = np.asarray(dfids)
print(dfids.shape)
df_ids = np.reshape(dfids,(len(dfids),1))

labels = np.asarray(labels)
print(labels.shape)
df_lbl = np.reshape(labels,(len(labels),1))

df_ids.shape, df_lbl.shape

(2447,)
(2447,)


((2447, 1), (2447, 1))

In [16]:
# be careful, first column is uuid for refernce and last column is severity labels
all_col = np.hstack(([df_ids, dt, df_lbl]))
print(all_col.shape)

df_dt = pd.DataFrame(all_col)
print(df_dt.shape)
df_dt.head()

(2447, 194)
(2447, 194)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,184,185,186,187,188,189,190,191,192,193
0,01567151-7bb2-45ee-9aa8-a1332b5941ea,-43.55661,-41.480846,-39.643368,-37.628273,-36.043907,-35.137222,-35.187443,-35.9417,-36.021473,...,0.0,-5.105323e-10,0.0,2.5526614e-09,-6.1263874e-09,-3.0631937e-09,0.0,-1.0210646e-09,3.573726e-09,0
1,018b40a1-c109-459a-9e31-86cbd2cb3918,-47.525455,-45.16427,-43.575325,-42.80686,-42.198696,-41.441845,-40.36228,-39.576687,-39.364807,...,-0.0105933435,-0.012727754,-0.010147403,-0.0088438075,-0.0063309865,-0.0011459514,-9.099946e-10,1.2999922e-10,-8.1249495e-12,0
2,01ff40e8-63e6-4570-a463-9778ea30cad7,-43.39546,-42.701763,-40.86465,-38.634872,-37.64142,-37.915543,-37.97829,-37.030876,-36.4715,...,-0.0020552296,0.0001683656,0.00077547267,-6.659739e-10,-7.991686e-10,1.5983366e-09,-2.663897e-10,1.0988561e-09,1.664871e-11,1
3,0379c586-c500-483c-83a6-95b63afe6931,-44.38546,-41.391323,-38.90425,-37.738556,-37.81213,-37.559288,-37.387947,-37.905132,-39.083096,...,5.3277865e-10,-6.6597394e-10,2.6638916e-10,5.3277865e-10,-2.1644175e-10,-1.997924e-10,1.165451e-10,-4.0828132000000006e-16,-4.0828132000000006e-16,1
4,038592cb-c8db-4f55-8052-e20059146cb5,-36.38642,-29.606733,-24.499537,-20.719791,-18.646845,-19.742687,-20.62541,-21.448437,-21.218958,...,-0.0037539268,-0.0015414153,-0.007497791,-0.009733099,-0.012369485,-0.0073576695,-0.0005173074,6.1846583e-10,-7.730821e-11,0


In [17]:
# save df
#df_dt.to_csv('df_mean-log-mel-spectogram-coughvid.csv', index=False)

In [None]:
###  Analysis of Log-Mel-Spectogram + 1st & 2nd order derivatives with mean stat measures

In [18]:
# ML analysis :  remember, first col is uuid
scaler = StandardScaler().fit(df_dt.iloc[:,1:-1].values)
X = scaler.transform(df_dt.iloc[:,1:-1].values)
y = df_dt.iloc[:,-1].values

skf = StratifiedKFold(n_splits=10, random_state=None, shuffle=True)

In [19]:
# check with SVM

clf = SVC(decision_function_shape='ovo',C=21, gamma='scale' ,kernel='poly', degree=5, class_weight=None)  

scores = cross_val_score(clf, X, y, scoring='accuracy', cv=skf)        
print('with SVM:', scores.mean())
scores

with SVM: 0.6661241217798596


array([0.66938776, 0.67346939, 0.65306122, 0.68163265, 0.68979592,
       0.64897959, 0.64081633, 0.69262295, 0.66803279, 0.64344262])

In [25]:
# check with SVM

clf = SVC(decision_function_shape='ovo',C=3, gamma='auto' ,kernel='rbf', class_weight=None)  

scores = cross_val_score(clf, X, y, scoring='accuracy', cv=skf)        
print('with SVM:', scores.mean())
scores

with SVM: 0.6877835396453664


array([0.68163265, 0.69795918, 0.68163265, 0.67346939, 0.68163265,
       0.68979592, 0.70204082, 0.68852459, 0.69262295, 0.68852459])

In [27]:
# check with SVM using split sets to create model

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=1, shuffle=True)

clf = SVC(decision_function_shape='ovo',C=3, gamma='auto' ,kernel='rbf', degree=3, class_weight=None)
 
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
val_acc = accuracy_score(y_test, preds)
print("with SVC, 20% test accuracy: ",val_acc)

with SVC, 20% test accuracy:  0.6979591836734694


In [28]:
# save model
import pickle
filename = 'Coughvid_severityPred_svc_model.sav'
pickle.dump(clf, open(filename, 'wb'))

In [None]:
#  -------------------- below is MelSpectogram+derivatives with mean

In [39]:
# check with SVM

clf = SVC(decision_function_shape='ovo',C=3, gamma='auto' ,kernel='rbf', class_weight=None)  

scores = cross_val_score(clf, X, y, scoring='accuracy', cv=skf)        
print('with SVM:', scores.mean())
scores

with SVM: 0.6869722315155571


array([0.68571429, 0.68163265, 0.66938776, 0.68571429, 0.69387755,
       0.68163265, 0.68979592, 0.67622951, 0.70081967, 0.70491803])

In [63]:
# apply MLP
mlpc = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(512,256,128,64,32),activation='relu')

scores = cross_val_score(knn, X, y, scoring='accuracy', cv=skf)        
print('with SVM:', scores.mean())
scores

with SVM: 0.6632619605219137


array([0.65306122, 0.66530612, 0.66530612, 0.64897959, 0.68163265,
       0.64489796, 0.68163265, 0.68032787, 0.63114754, 0.68032787])

In [59]:
# apply RF
rfc = RandomForestClassifier(bootstrap = True, max_depth=20, min_samples_split=3, min_samples_leaf=3, 
                             max_features='log2', n_estimators=1900)
scores = cross_val_score(knn, X, y, scoring='accuracy', cv=skf)        
print('with SVM:', scores.mean())
scores

with SVM: 0.6657042489126799


array([0.67346939, 0.66938776, 0.69795918, 0.64897959, 0.65714286,
       0.67755102, 0.65714286, 0.65983607, 0.6557377 , 0.65983607])

In [49]:
# apply KNN
knn = neighbors.KNeighborsClassifier(algorithm='kd_tree', leaf_size= 30, n_neighbors= 7, weights= 'uniform')
scores = cross_val_score(knn, X, y, scoring='accuracy', cv=skf)        
print('with SVM:', scores.mean())
scores

with SVM: 0.6645065239210439


array([0.66122449, 0.65306122, 0.65306122, 0.68571429, 0.64489796,
       0.64081633, 0.66530612, 0.67213115, 0.68032787, 0.68852459])