### CoughVid dataset generation

Notebook to generate the CoughVid dataset for the attention-based CNN-LSTM model. The code here assumes that the [pitch_shift.py](../scripts/pitch_shift.py) and [spec_augment.py](../scripts/spec_augment.py) scripts have already been run, in that order. The datasets generated and saved here are used directly for training the attention-based CNN-LSTM baseline model.

In [None]:
import os
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, KFold

In [None]:
# Folder holding the augmented mel-spectrogram images.
path = '../coughvid-clean-silence-removed/augmented_melspectrogram'

# Order the files by the integer value of their base name (extension removed),
# so that e.g. "10.png" comes after "2.png" instead of before it.
names = os.listdir(path)
names.sort(key=lambda fname: int(os.path.splitext(fname)[0]))

# Target size handed to cv2.resize, which expects (width, height).
imgArraySize = (88, 39)

In [None]:
# Loading Images
# Every file listed in `names` is read, resized to `imgArraySize`, converted
# from OpenCV's BGR channel order to RGB and scaled to float32 in [0, 1].
images = []
for filename in tqdm(names):
    img_path = os.path.join(path, filename)
    img = cv2.imread(img_path)
    # cv2.imread signals an unreadable file by returning None instead of
    # raising, so the check has to happen before any further processing.
    # Skipping the file silently is not an option: labels are paired with
    # images by position, so a dropped image would shift every later label.
    if img is None:
        raise IOError(f'Could not read image file: {img_path}')
    img = cv2.resize(img, imgArraySize)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # 8-bit pixel values span 0..255, so divide by 255 to land in [0, 1].
    # NOTE(review): this used to divide by 225.0; archives generated with the
    # old factor (and models trained on them) use a slightly different scale.
    img = np.asarray(img, dtype=np.float32) / 255.0
    images.append(img)

# Stack into a single array with the image index as the first axis. Unlike
# np.squeeze, this keeps that axis even when only one image was loaded.
images = np.asarray(images, dtype=np.float32)

In [None]:
# Loading Labels
# Read the label table and keep only its "label" column as a NumPy array.
labels_path = '../coughvid-clean-silence-removed/labels.csv'
labels = pd.read_csv(labels_path)
covid_status = np.asarray(labels["label"])

In [None]:
# Save images and labels without shuffling
# Both arrays go into one .npz archive under the keys 'images' and
# 'covid_status'.
features = {
    'images': images,
    'covid_status': covid_status,
}

data_dir = '../coughvid_melspec'
# np.savez does not create missing folders, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs(data_dir, exist_ok=True)
features_path = os.path.join(data_dir, 'coughvid_melspec.npz')
np.savez(features_path, **features)

In [None]:
# Load saved features and labels
# The path is bound to `features_path`, the name that is actually passed to
# np.load (it was previously assigned to an unused, misspelled variable).
features_path = os.path.join(data_dir, 'coughvid_melspec.npz')
# np.load returns a lazily-read archive that keeps the file open; the `with`
# block closes it once both arrays have been read into memory.
with np.load(features_path) as features:
    images = features['images']
    covid_status = features['covid_status']

In [None]:
# Create splits
# Resulting proportions: 20% test, then 25% of the remaining 80% as
# validation, i.e. 60% train / 20% validation / 20% test overall.
#
# Reproducibility: the two train_test_split calls are given no random_state,
# so they draw from NumPy's global RNG seeded on the next line. Keep the seed
# and the statement order unchanged, otherwise different splits are produced.
#
# NOTE(review): the split is random per image. If the augmented folder holds
# several variants of the same recording, variants of one recording may end up
# in different sets -- confirm whether that is acceptable for evaluation.
np.random.seed(75)
images, covid_status = shuffle(images, covid_status, random_state=75)
X_train, X_test, y_train, y_test = train_test_split(images, covid_status, test_size=0.2, shuffle=True)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.25, shuffle=True)

In [None]:
# Show the shape of every split (images and labels) as a quick sanity check.
tuple(split.shape for split in (X_train, y_train, X_valid, y_valid, X_test, y_test))

In [None]:
# Save sets
def save_set(data_dir, set_name, images, covid_status):
    """Write one data split to ``<data_dir>/<set_name>_coughvid_melspec.npz``.

    The archive stores ``images`` under the key ``'images'`` and
    ``covid_status`` under the key ``'covid_status'``.

    Args:
        data_dir: Output directory; created (including parents) if missing.
        set_name: Prefix of the archive file name, e.g. ``'train'``.
        images: Array-like with the images of the split.
        covid_status: Array-like with the labels of the split.
    """
    # np.savez does not create missing folders; without this the call fails
    # with FileNotFoundError when data_dir does not exist yet.
    os.makedirs(data_dir, exist_ok=True)

    features_path = os.path.join(data_dir, f'{set_name}_coughvid_melspec.npz')
    np.savez(features_path, images=images, covid_status=covid_status)
    
# Write the train, validation and test splits, one archive per split.
save_set(data_dir=data_dir, set_name='train', images=X_train, covid_status=y_train)
save_set(data_dir=data_dir, set_name='valid', images=X_valid, covid_status=y_valid)
save_set(data_dir=data_dir, set_name='test', images=X_test, covid_status=y_test)