## Coswara data extraction notebook
This notebook is used to separate negative and positive recording instances into different folders for ease of processing.

Negative and positive instances are distinguished using the `combined_data.csv` file containing the metadata for the recordings. The target value is chosen to be the `covid_status` column, which corresponds to the self-declared Covid status of the participant.

The `covid_status` column values were grouped to form the negative and positive labels in the following manner:
* **Positive**
    * `positive_mild`
    * `positive_moderate`
    * `positive_asymp`
* **Negative**
    * `healthy`
    * `no_resp_illness_exposed`
    * `resp_illness_not_identified`
    * `recovered_full`

In [None]:
import os
import glob
import shutil
from tqdm import tqdm
import pandas as pd
from pathlib import Path
from typing import Union, List

In [None]:
# Root folder of the Coswara data, and the name of the sub-folder inside it
# that holds the extracted recordings (joined together when searching for files)
data_dir = '../../../Coswara-Data/'
extracted_data_dir = 'Extracted_data'

In [None]:
# Get cough paths
# `suffix` selects the cough recording variant; it is also used in the file name pattern
suffix = 'shallow'
# Glob pattern (not a regular expression):
# <folder starting with 202>/<id folder>/cough-<suffix>.wav
recording_regex = fr'202*/*/cough-{suffix}.wav'
search_path = os.path.join(data_dir, extracted_data_dir, recording_regex)
# glob returns matches in arbitrary order; sort so every run sees the same order
paths = sorted(glob.glob(search_path))
len(paths)

In [None]:
# Sanity check for correct files gathered: display the first matched path
paths[0]

### Copy positive and negative files to separate folders

In [None]:
def mkdir(path:Union[str, Path]) -> None:
    """Create the directory `path`, including any missing parent directories.

    Does nothing when the directory already exists.

    Raises:
        FileExistsError: if `path` exists but is not a directory.
    """
    # exist_ok avoids the race between checking for the folder and creating it
    os.makedirs(path, exist_ok=True)

In [None]:
# Output folders, one per class: <data_dir>/data/<suffix>/n and <data_dir>/data/<suffix>/p
n_dir, p_dir = (os.path.join(data_dir, 'data', suffix, label) for label in ('n', 'p'))
mkdir(n_dir)
mkdir(p_dir)

In [None]:
# Read metadata: load combined_data.csv from the data root into a DataFrame
meta_data_path = os.path.join(data_dir, 'combined_data.csv')
meta_data = pd.read_csv(meta_data_path)

In [None]:
# The id of a recording is the name of the folder that directly contains it.
# Path(...).parent.name is used instead of splitting on '/' so the lookup
# also works with the platform's native path separator.
ids_list = [Path(rec_path).parent.name for rec_path in paths]
len(ids_list)

In [None]:
# Show the distinct values found in the covid_status column
print(meta_data.covid_status.unique())

In [None]:
# Count covid_status values, restricted to metadata rows whose id has a recording
id_mask = meta_data['id'].isin(ids_list)
covid_status = meta_data.loc[id_mask, 'covid_status']
covid_status.value_counts()

# Count test_status values, restricted to metadata rows whose id has a recording
id_mask = meta_data.id.isin(ids_list)
test_status = meta_data[id_mask].test_status
test_status.value_counts()

# Split ids by the test_status column ('n' / 'p') rather than by covid_status.
# NOTE(review): n_ids / p_ids are assigned again from covid_status further down,
# which replaces these values if both snippets are run - confirm which is intended.
n_ids = meta_data[meta_data.test_status == 'n'].id.to_list()
p_ids = meta_data[meta_data.test_status == 'p'].id.to_list()
len(n_ids), len(p_ids)

In [None]:
# Group the covid_status values into the negative and positive classes
n_mask = meta_data['covid_status'].isin([
    'healthy',
    'no_resp_illness_exposed',
    'resp_illness_not_identified',
    'recovered_full',
])
p_mask = meta_data['covid_status'].isin([
    'positive_mild',
    'positive_moderate',
    'positive_asymp',
])
# Ids belonging to each class, followed by the class sizes
n_ids = meta_data.loc[n_mask, 'id'].to_list()
p_ids = meta_data.loc[p_mask, 'id'].to_list()
len(n_ids), len(p_ids)

In [None]:
# Map id to path
id_path_map = {}
for path in paths:
    # The id is the name of the folder directly containing the recording;
    # Path handles the platform's separator, unlike splitting on '/'.
    rec_id = Path(path).parent.name
    if rec_id in id_path_map:
        # Keep the first path seen for an id and only report later ones
        print(f'Duplicate id :: {rec_id}')
        continue

    id_path_map[rec_id] = path

In [None]:
def move_recordings(ids:List[str],
                    id_path_map:dict[str, Union[str, Path]],
                    target_dir:Union[str, Path],
                    rec_format:str='.wav'):
    """Copy the recording of every id in `ids` into `target_dir`.

    Despite the name, files are copied with `shutil.copy`, so the source
    files stay where they are. Each copy is named `<id><rec_format>`.

    Args:
        ids: ids whose recordings should be copied.
        id_path_map: maps an id to the path of its recording; ids that are
            missing from this mapping are skipped.
        target_dir: folder that receives the copies.
        rec_format: file extension appended to the id to form the file name.
    """
    for rec_id in tqdm(ids):
        if rec_id in id_path_map:
            destination = os.path.join(target_dir, f'{rec_id}{rec_format}')
            shutil.copy(id_path_map[rec_id], destination)

In [None]:
move_recordings(n_ids, id_path_map, n_dir)
move_recordings(p_ids, id_path_map, p_dir)

### Extract metadata for recordings

In [None]:
# Keep only the metadata rows whose id was labelled negative or positive,
# and write them to <data_dir>/data/<suffix>/meta_data.csv without the index
n_id_mask = meta_data['id'].isin(n_ids)
p_id_mask = meta_data['id'].isin(p_ids)
np_meta_data = meta_data.loc[p_id_mask | n_id_mask]
np_meta_data_path = os.path.join(data_dir, 'data', suffix, 'meta_data.csv')
np_meta_data.to_csv(np_meta_data_path, index=False)