# Aggregating respiratory sound data

This notebook explains how to collect the respiratory sound files from ICBHI 2017, CoughVID, and HF_Lung, and combine them into the single dataset that the paper refers to as `Resp`.

**Before you begin:**
- Please download the data files according to the OPERA instructions for ICBHI 2017, CoughVID, and HF_Lung in advance.
- Set the paths `OPERA` and `dest_wav` in the configuration cell below (the one marked `EDIT ME`).

Then, running all cells will aggregate the files into the `dest_wav` folder ('resp_audio/' by default), which constitutes the `Resp` dataset.


In [3]:
import IPython

from IPython import get_ipython

# Notebook housekeeping, issued through the IPython API instead of `%` magics:
# reload the autoreload extension, switch it to mode 2, and render matplotlib
# figures inline.
ipython = get_ipython()
for _magic_name, _magic_arg in (
    ('reload_ext', 'autoreload'),
    ('autoreload', '2'),
    ('matplotlib', 'inline'),
):
    ipython.run_line_magic(_magic_name, _magic_arg)

import shutil
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# EDIT ME: root folder of your OPERA checkout
OPERA = '/your/OPERA'

# EDIT ME (optional): folder that will receive the aggregated respiratory sounds
dest_wav = 'resp_audio'

# Dataset locations inside OPERA -- download the datasets and place them in
# these folders before running the cells below.
ICBHI2017 = f'{OPERA}/datasets/icbhi/ICBHI_final_database'
COUGHVID = f'{OPERA}/datasets/coughvid'
HF_Lung_V2 = f'{OPERA}/datasets/hf_lung/HF_Lung_V2'


## ICBHI 2017

In [None]:
# Download the metadata CSV file from the ICBHI2017 dataset repository
df = pd.read_csv('https://raw.githubusercontent.com/ilyassmoummad/scl_icbhi2017/refs/heads/main/data/ICBHI/metadata.csv')

# All we need is the training set.
df = df[df.split == 'train']
print(len(df))  # --> shall be 4142

# The metadata has several rows per recording (4142 rows vs. 539 files expected),
# so de-duplicate the file paths once and reuse the result below.
icbhi_files = sorted(set(df.filepath.values))

# Copy files to the destination folder, keeping each file's relative path
from_folder = Path(ICBHI2017)
to_folder = Path(dest_wav)/'ICBHI2017'

for f in icbhi_files:
    orgfile = from_folder/f
    assert orgfile.exists(), f'Missing ICBHI2017 file: {orgfile}'
    tofile = to_folder/f
    tofile.parent.mkdir(exist_ok=True, parents=True)
    shutil.copy(orgfile, tofile)

# Check if the files were copied correctly. The first check counts every .wav
# under the destination, so leftovers from an earlier run also trigger it.
n_copied = len(list(to_folder.rglob('*.wav')))
assert len(icbhi_files) == n_copied, f'{n_copied} .wav files found in {to_folder}, expected {len(icbhi_files)}'
assert len(icbhi_files) == 539, f'{len(icbhi_files)} is not 539'
print(f'Copied ICBHI2017 {len(icbhi_files)} files to {to_folder}.')

## CoughVID

In [None]:
# Download the metadata CSV file from the CoughVID dataset repository
df = pd.read_csv('https://raw.githubusercontent.com/evelyn0414/OPERA/refs/heads/main/datasets/coughvid/metadata_compiled.csv')
# We need files with cough_detected > 0.95
df = df[(df.cough_detected > 0.95)]
print(len(df))  # --> shall be 7054

# Copy files to the destination folder
from_folder = Path(COUGHVID)/'wav'
to_folder = Path(dest_wav)/'coughvid/wav'

for uuid in df.uuid.values:
    orgfile = from_folder/(uuid + '.wav')
    if not orgfile.exists():
        # Exceptional case: the file on disk is named with the uuid minus its
        # last character. The copy still gets the full uuid as its name.
        orgfile = from_folder/(uuid[:-1] + '.wav')
    assert orgfile.exists(), f'Missing CoughVID file for uuid {uuid}: {orgfile}'
    tofile = to_folder/(uuid + '.wav')
    tofile.parent.mkdir(exist_ok=True, parents=True)
    shutil.copy(orgfile, tofile)

# Check if the files were copied correctly. The first check counts every .wav
# under the destination, so leftovers from an earlier run also trigger it.
n_copied = len(list(to_folder.rglob('*.wav')))
assert len(df.uuid.values) == n_copied, f'{n_copied} .wav files found in {to_folder}, expected {len(df.uuid.values)}'
assert len(df.uuid.values) == 7054, f'{len(df.uuid.values)} is not 7054'
print(f'Copied CoughVID {len(df.uuid.values)} files to {to_folder}.')



## HF_Lung

In [None]:
# Make a list of audio files (stethoscopes) in the HF_Lung_V2 dataset
# NOTE(review): the search covers everything under HF_Lung_V2, not only its
# 'train' subfolder, while every match is copied into 'HF_Lung_V2/train' --
# confirm this is intended.
files = list(Path(HF_Lung_V2).rglob('steth*.wav'))
print(len(files)) # --> shall be 3839

# Copy files to the destination folder (flattened: only the file name is kept)
to_folder = Path(dest_wav)/'HF_Lung_V2/train'
to_folder.mkdir(exist_ok=True, parents=True)

for orgfile in files:
    # rglob() results already start with HF_Lung_V2, so they are used as-is.
    # Joining them onto another folder only worked when OPERA was an absolute
    # path; with a relative OPERA it produced a doubled, non-existent path.
    assert orgfile.exists(), f'Missing HF_Lung_V2 file: {orgfile}'
    shutil.copy(orgfile, to_folder/orgfile.name)

# Check if the files were copied correctly. A mismatch means duplicate file
# names among the sources or leftover .wav files in the destination.
n_copied = len(list(to_folder.rglob('*.wav')))
assert len(files) == n_copied, f'{n_copied} .wav files found in {to_folder}, expected {len(files)}'
assert len(files) == 3839, f'{len(files)} is not 3839'
print(f'Copied HF_Lung_V2 {len(files)} files to {to_folder}.')