In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%load_ext autoreload
%autoreload 2
from google.cloud import storage
import io
import librosa
import os
import sys
sys.path.append(os.path.abspath('../coughvid'))
from segmentation import segment_cough, compute_SNR
from scipy.io import wavfile
from IPython.display import Audio
sys.path.append(os.path.abspath('../coughvid'))
from feature_class import features
from DSP import classify_cough
import pickle
import librosa.display

# First Analysis

In [None]:
df = pd.read_csv("../raw_data/metadata_compiled.csv",delimiter=',')
df.head()

In [None]:
df['status'].value_counts()

In [None]:
df['gender'].value_counts()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.columns

In [None]:
fill = df[['cough_detected','quality_1','cough_type_1', 'dyspnea_1', 'wheezing_1', 'stridor_1',
       'choking_1', 'congestion_1', 'nothing_1', 'diagnosis_1', 'severity_1']]

In [None]:
fill['quality_1'].unique()

In [None]:
df['quality_1'].value_counts()

In [None]:
df['cough_type_1'].value_counts()

In [None]:
df['dyspnea_1'].value_counts()

# Specific Investigation

## Considering only recordings that have a status

In [None]:
# Keep only the rows that carry a `status` label; work on a copy so `df` is untouched.
new_df = df.dropna(subset=['status']).copy()
new_df


## Disregarding the specialists' features at first

In [None]:
new_df.columns

In [None]:
# Narrow the frame to the eleven columns listed below and copy the result.
new_df = new_df.loc[:, [
    'uuid', 'datetime', 'cough_detected', 'SNR', 'latitude', 'longitude',
    'age', 'gender', 'respiratory_condition', 'fever_muscle_pain', 'status',
]].copy()
new_df

## Evaluating status

In [None]:
new_df['status'].value_counts()/new_df['status'].count()

In [None]:
new_df['status'].value_counts()

In [None]:
1155+12479

## Removing Symptomatic Cases

In [None]:
new_df

In [None]:
# Discard every row whose status is 'symptomatic'.
new_df = new_df.loc[new_df['status'].ne('symptomatic')].copy()

In [None]:
new_df['status'].value_counts()

## LabelEncoding features

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [None]:
new_df.columns

In [None]:
# Replace the two symptom columns with LabelEncoder integer codes and map the
# status strings to 0/1 (a status outside the mapping becomes NaN).
new_df = new_df.assign(
    respiratory_condition=LabelEncoder().fit_transform(new_df['respiratory_condition']),
    fever_muscle_pain=LabelEncoder().fit_transform(new_df['fever_muscle_pain']),
    status=new_df['status'].map({'healthy': 0, 'COVID-19': 1}),
)

Respiratory Condition and Fever Muscle Pain: 0 - False, 1 - True

Status: 0 - Healthy, 1 - COVID

In [None]:
new_df.isnull().sum()

In [None]:
# Share of missing values per column: nulls divided by the total number of rows.
# (Dividing by `count()` would use only the non-null rows as denominator and
# overstate the missing share.)
new_df.isnull().mean()

In [None]:
new_df.describe()

## Imputing Age Mean

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Fill missing ages with the mean of the observed ages (fit and apply in one step).
imputer = SimpleImputer(strategy="mean")
new_df['age'] = imputer.fit_transform(new_df[['age']])

In [None]:
new_df

## Defining Target and Features

In [None]:
# Target vector: the encoded status column.
y = new_df['status']
# Feature matrix: every column except the ones listed here (which include the target).
X = new_df.drop(
    ['uuid', 'datetime', 'SNR', 'latitude', 'longitude', 'gender', 'status'],
    axis=1,
)
X

In [None]:
X.describe()

## Model Evaluation

In [None]:
import statsmodels.formula.api as smf

In [None]:
new_df

In [None]:
# Logistic regression of status on the listed covariates; the trailing "- 1"
# removes the intercept term from the formula.
model = smf.logit(
    formula=(
        'status ~ cough_detected + age + C(gender) '
        '+ respiratory_condition + fever_muscle_pain - 1'
    ),
    data=new_df,
).fit()
model.summary()

# Cough Detection Threshold

In [None]:
# Minimum cough-detection score a recording must exceed to be kept.
# Defined here, before its first use, so the filter is not a magic number.
threshold = 0.6
new_df = new_df[new_df['cough_detected'] > threshold].copy()
new_df

In [None]:
new_df.shape[0]

In [None]:
threshold = 0.6

In [None]:
new_df[new_df['cough_detected'] > threshold].shape[0]

# Cough Detection Model

In [None]:
data_folder = '../sample_recordings'
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# model artefacts that come from a trusted source.
# `with` guarantees the file handles are closed once the objects are loaded.
with open(os.path.join('../models', 'cough_classifier'), 'rb') as model_file:
    loaded_model = pickle.load(model_file)
with open(os.path.join('../models', 'cough_classification_scaler'), 'rb') as scaler_file:
    loaded_scaler = pickle.load(scaler_file)

In [None]:
# Classify the sample recording at its native sampling rate (sr=None).
filename = 'cough.wav'
x, fs = librosa.load(data_folder+'/'+filename, sr=None)
probability = classify_cough(x, fs, loaded_model, loaded_scaler)
# A plain "%" is used: "\%" is not a valid escape and printed a stray backslash.
print("The file {0} has a {1}% probability of being a cough".format(filename,round(probability*100,2)))

In [None]:
# Classify the non-cough sample recording at its native sampling rate (sr=None).
filename = 'not_cough.wav'
x, fs = librosa.load(data_folder+'/'+filename, sr=None)
probability = classify_cough(x, fs, loaded_model, loaded_scaler)
# A plain "%" is used: "\%" is not a valid escape and printed a stray backslash.
print("The file {0} has a {1}% probability of being a cough".format(filename,round(probability*100,2)))

In [None]:
# Download one recording from the bucket and run the cough classifier on it.
blob_name = 'data/00039425-7f3a-42aa-ac13-834aaa2b6b92.wav'
client = storage.Client()
bucket = client.get_bucket('coughvid-650')
blob = bucket.get_blob(blob_name)
audio,rate = librosa.load(io.BytesIO(blob.download_as_string()), sr=None)
probability = classify_cough(audio, rate, loaded_model, loaded_scaler)
# Report the blob that was actually classified (the previous message reused the
# stale `filename` from an earlier cell) and print "%" without a backslash.
print("The file {0} has a {1}% probability of being a cough".format(blob_name,round(probability*100,2)))

In [None]:
Audio(audio, rate=rate)

In [None]:
rate

In [None]:
df

# SNR Calculator

In [None]:
client = storage.Client()

bucket = client.get_bucket('coughvid-650')

blob = bucket.get_blob('data/fff3ff61-2387-4139-938b-539db01e6be5.wav')

In [None]:
audio,rate = librosa.load(io.BytesIO(blob.download_as_string()), sr=None)

In [None]:
# Signal-to-noise ratio of the recording loaded above, via the project helper.
snr = compute_SNR(audio, rate)
print(f"The SNR of the cough signal is {snr}")

In [None]:
df

In [None]:
df.describe()

In [None]:
Audio(audio, rate=rate)

# Mel Spectrogram

In [None]:
client = storage.Client()
bucket = client.get_bucket('coughvid-650')
blob = bucket.get_blob('data/00039425-7f3a-42aa-ac13-834aaa2b6b92.wav')
audio,rate = librosa.load(io.BytesIO(blob.download_as_string()), sr=None)

In [None]:
Audio(audio, rate=rate)

In [None]:
librosa.feature.melspectrogram(y = audio, sr = rate)

In [None]:
librosa.feature.melspectrogram(y = audio, sr = rate).shape

In [None]:
S = librosa.feature.melspectrogram(y = audio, sr = rate, n_mels=128, fmax=8000)

In [None]:
# Convert the mel power spectrogram to dB relative to its maximum, then draw it
# with time / mel axes and a dB colour bar.
S_dB = librosa.power_to_db(S, ref=np.max)
fig, ax = plt.subplots()
img = librosa.display.specshow(
    S_dB, sr=rate, fmax=8000, x_axis='time', y_axis='mel', ax=ax
)
fig.colorbar(img, ax=ax, format='%+2.0f dB')
ax.set(title='Mel-frequency spectrogram')

In [None]:
client = storage.Client()
bucket = client.get_bucket('coughvid-650')
blob = bucket.get_blob('data/0009eb28-d8be-4dc1-92bb-907e53bc5c7a.wav')
audio,rate = librosa.load(io.BytesIO(blob.download_as_string()), sr=None)

In [None]:
librosa.feature.melspectrogram(y = audio, sr = rate)

In [None]:
librosa.feature.melspectrogram(y = audio, sr = rate).shape

In [None]:
Audio(audio, rate=rate)

In [None]:
# Mel power spectrogram (128 mel bands, fmax=8000) of the recording loaded above,
# converted to dB relative to its maximum and drawn with a dB colour bar.
S = librosa.feature.melspectrogram(y=audio, sr=rate, n_mels=128, fmax=8000)
S_dB = librosa.power_to_db(S, ref=np.max)
fig, ax = plt.subplots()
img = librosa.display.specshow(
    S_dB, sr=rate, fmax=8000, x_axis='time', y_axis='mel', ax=ax
)
fig.colorbar(img, ax=ax, format='%+2.0f dB')
ax.set(title='Mel-frequency spectrogram')

# Image Model

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf
from sklearn.metrics import confusion_matrix
import seaborn as sns
from keras.preprocessing import image
from tensorflow.keras.models import load_model

# Try

In [None]:
new_df.shape

In [None]:
new_df[new_df.index==16330]

In [None]:
df[df.index==11501].status

In [None]:
df.status.value_counts()

In [None]:
new_df

In [None]:
# for i in range(len(audios_test)):
#     S = librosa.feature.melspectrogram(y = audios_test[i], sr = rates_test[i], n_mels=128, fmax=8000)
#     fig, ax = plt.subplots()
#     S_dB = librosa.power_to_db(S, ref=np.max)
#     img = librosa.display.specshow(S_dB, sr=rates_test[i], fmax=8000, ax=ax)
    
#     if new_df[new_df.index==index_test[i]].status.values[0] == 1:
#         plt.savefig('./test/2-COVID/' + f'test{i}.png')
#     else:
#         plt.savefig('./test/1-Healthy/' + f'test{i}.png')

In [None]:
path_dir = './test2/'

In [None]:
# Rescale pixel values by 1/255 and reserve 20% of the images for validation.
datagen = ImageDataGenerator(
                    rescale=1./255,
                    validation_split = 0.2)
# `target_size` is (height, width) only. The channel count is decided by
# `color_mode` (default 'rgb', i.e. 3 channels), so a third element here would
# produce a wrong image shape.
train_generator = datagen.flow_from_directory(
    path_dir,
    target_size=(288, 432),
    shuffle=True,
    subset='training'
)
# shuffle=False keeps the batches in the same order as
# `validation_generator.labels`, so predictions can be compared with the labels.
validation_generator = datagen.flow_from_directory(
    path_dir,
    target_size=(288, 432),
    shuffle=False,
    subset='validation'
)

In [None]:
pd.DataFrame(train_generator.labels).value_counts()

In [None]:
pd.DataFrame(validation_generator.labels).value_counts()

In [None]:
validation_generator.image_shape

In [None]:
# CNN with four Conv2D + MaxPooling2D stages, followed by flatten, dropout,
# a 256-unit dense layer and a 2-unit softmax output.
model = tf.keras.models.Sequential()
# first convolution (also declares the expected input shape)
model.add(tf.keras.layers.Conv2D(16, (3,3), activation='relu', input_shape=(288, 432, 3)))
model.add(tf.keras.layers.MaxPooling2D(2, 2))
# second convolution
model.add(tf.keras.layers.Conv2D(32, (3,3), activation='relu'))
model.add(tf.keras.layers.MaxPooling2D(2,2))
# third convolution
model.add(tf.keras.layers.Conv2D(64, (3,3), activation='relu'))
model.add(tf.keras.layers.MaxPooling2D(2,2))
# fourth convolution
model.add(tf.keras.layers.Conv2D(64, (3,3), activation='relu'))
model.add(tf.keras.layers.MaxPooling2D(2,2))
# classification head
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dense(2, activation='softmax'))

In [None]:
# `metrics` is documented as a list of metrics, so the Recall instance is wrapped.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.Recall()])
# The generator already yields batches (flow_from_directory defaults to 32 images
# per batch), and Keras says `batch_size` must not be given for generator input.
model.fit(train_generator, epochs=50)

In [None]:
# evaluate() returns [loss, metric]; the metric compiled for this model is
# Recall, so it is reported as recall rather than accuracy.
scores = model.evaluate(validation_generator)
print('\nTest recall:', scores[1])

# Collect predictions and true labels batch by batch so the two stay aligned
# even if the generator shuffles its samples. The generator's default
# class_mode ('categorical') yields one-hot labels, hence the argmax.
y_true_parts, y_pred_parts = [], []
for batch_idx in range(len(validation_generator)):
    batch_images, batch_labels = validation_generator[batch_idx]
    batch_pred = model.predict(batch_images, verbose=0)
    y_pred_parts.append(np.argmax(batch_pred, axis=1))
    y_true_parts.append(np.argmax(batch_labels, axis=1))
y_pred = np.concatenate(y_pred_parts)
y_true = np.concatenate(y_true_parts)

# Compute the confusion matrix once and reuse it for the printout and the plot.
cm = confusion_matrix(y_true, y_pred)
print('confusion matrix')
print(cm)
f, ax = plt.subplots(figsize=(8,5))
sns.heatmap(cm, annot=True, fmt=".0f", ax=ax)
ax.set_xlabel("y_pred")
ax.set_ylabel("y_true")
plt.show()

In [None]:
import joblib
# Persist the network with Keras' own serializer. Pickling a Keras model
# (which is what joblib.dump does) is not a reliable way to store it.
# Reload it with `load_model('model.h5')`.
model.save('model.h5')

In [None]:
new_df.shape

In [None]:
def get_all_audios(df, start = 0, finish = 10, bucket_name = 'coughvid-650'):
    """Download and decode a positional slice of recordings from Cloud Storage.

    Parameters
    ----------
    df : DataFrame with a `uuid` column; each uuid names a blob
        `data/<uuid>.wav` in the bucket.
    start, finish : positional bounds (`iloc[start:finish]`) of the rows to fetch.
    bucket_name : name of the bucket to read from.

    Returns
    -------
    rates : list of sampling rates, one per decoded recording.
    audios : list of decoded signals, in the same order as `rates`.
    index : array of `df` index labels for the decoded recordings, aligned
        with `rates` and `audios`.

    Recordings whose blob is missing from the bucket are reported and skipped
    (previously this raised AttributeError on `None`), and their index labels
    are left out so the three return values stay aligned.
    """
    client = storage.Client()
    bucket = client.get_bucket(bucket_name)

    files = df.uuid.iloc[start:finish]

    rates = []
    audios = []
    found = []  # one flag per requested file: was its blob present?

    print("Good luck!")

    for position, uuid in enumerate(files):
        # get_blob returns None when the object does not exist.
        blob = bucket.get_blob(f'data/{uuid}.wav')
        if blob is None:
            print(f'skipping {position} - {uuid}.wav (not found in bucket)')
            found.append(False)
            continue

        print(f'getting {position} - {uuid}.wav...')

        # sr=None keeps the file's own sampling rate instead of resampling.
        audio, rate = librosa.load(io.BytesIO(blob.download_as_string()), sr=None)

        rates.append(rate)
        audios.append(audio)
        found.append(True)
    print("Done!")

    return rates, audios, files.index[np.array(found, dtype=bool)].values

In [None]:
def audios_to_images(df, audios, rates, index, sub_folders = ['COVID', 'Healthy']):
    """Render each recording as a mel-spectrogram PNG, sorted by status.

    Parameters
    ----------
    df : DataFrame with `status` and `uuid` columns, looked up by index label.
    audios : sequence of audio signals.
    rates : sequence of sampling rates, aligned with `audios`.
    index : sequence of `df` index labels, aligned with `audios`.
    sub_folders : output folders; images whose status equals 1 go to
        `sub_folders[0]`, all others to `sub_folders[1]`. Missing folders
        (including nested ones) are created.

    Each image is saved as `<folder>/<uuid>.png`. Returns None.
    """
    import os

    # makedirs also creates missing parent folders and tolerates existing ones.
    for folder in sub_folders:
        os.makedirs(f'{folder}', exist_ok=True)

    for i, audio in enumerate(audios):
        rate = rates[i]
        S = librosa.feature.melspectrogram(y = audio, sr = rate, n_mels=128, fmax=8000)
        S_dB = librosa.power_to_db(S, ref=np.max)
        fig, ax = plt.subplots()
        try:
            librosa.display.specshow(S_dB, sr=rate, fmax=8000, ax=ax)

            print(f'Fetching {i}')
            # Look the metadata row up once and reuse it for status and uuid.
            row = df[df.index == index[i]].iloc[0]
            target = sub_folders[0] if row['status'] == 1 else sub_folders[1]
            # os.path.join keeps absolute output folders working as well.
            fig.savefig(os.path.join(f'{target}', f"{row['uuid']}.png"))
        finally:
            # Always release the figure, even if rendering or saving fails.
            plt.close(fig)

    print('Done!')


In [None]:
# rate_info, audio_info, index_info = get_all_audios(new_df,3142,5191)

In [None]:
# audios_to_images(new_df,audio_info,rate_info,index_info)

In [6]:
# load numpy array from npy file
from numpy import load
# load array
X = load('data.npy')
X.shape

(10191, 288, 432, 4)

In [5]:
# load numpy array from npy file
from numpy import load
# load array
y = load('target.npy')
y.shape

(10191,)

In [7]:
from sklearn.model_selection import train_test_split
# Fixed seed makes the split reproducible across kernel restarts; stratify=y
# keeps the class proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [8]:
X_train.shape

(7133, 288, 432, 4)

In [9]:
X_test.shape

(3058, 288, 432, 4)

In [10]:
import pandas as pd
import numpy as np
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import metrics
from sklearn.metrics import confusion_matrix

2021-08-06 10:52:09.402697: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-08-06 10:52:09.402774: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
