## Add the `google-colab` and `respiratory-sound-database` datasets as notebook inputs before running

LT-2021-11-07: Shared the `google-colab` dataset and its contents (`*.mat` files) with collaborator

# Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.cm as cm
import pandas as pd

# Not immediately usable with this dataset as files contain 24-bit data
from scipy.io import wavfile
from scipy.io.wavfile import read, write

import IPython.display as ipd
from IPython.display import Audio
from numpy.fft import fft, ifft
from glob import glob

import librosa as lr
import librosa.display

import os
import pickle

import seaborn as sns

from scipy import signal as sig
from scipy.signal import butter, lfilter, sosfilt
from tqdm import tqdm
import pywt
import random
import skimage.data
from skimage.restoration import denoise_wavelet
from pywt import swt
import time as tm

# if ('pydub' in globals()) == False:
#   !pip install pydub
# from pydub import AudioSegment
# from pydub.utils import make_chunks


# LT-06-24: import io to save intermediate outputs; use "as" to shorten module.function name as "spio"
import scipy.io as spio


# LT-06-24: install modules if not found 
# if ( 'umap' in globals() ) == False: 
#   !pip install umap-learn

import umap.umap_ as umap

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, minmax_scale
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.dummy import DummyClassifier
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_samples, silhouette_score, classification_report, confusion_matrix, accuracy_score, plot_confusion_matrix

print('Importing done')

Importing done


# Root and Directory

In [2]:
# Folder that is listed below for the annotation (.txt) and audio (.wav) files
root = '../input/respiratory-sound-database/Respiratory_Sound_Database/Respiratory_Sound_Database/audio_and_txt_files/'
outdir = '/kaggle/working/'

# Base names (text before the first '.') of the annotation and audio files, sorted alphabetically.
# Match on the actual extension: a substring test such as `'.txt' in s` would also
# accept names like 'x.txt.bak'.
textnames = sorted(s.split('.')[0] for s in os.listdir(path = root) if s.endswith('.txt'))
filenames = sorted(s.split('.')[0] for s in os.listdir(path = root) if s.endswith('.wav'))

# Information & Annotations

In [3]:
# Diagnosis table: read with header=None, so the first line is treated as data
# and the two column names are supplied explicitly.
diag_df = pd.read_csv(
    '../input/respiratory-sound-database/Respiratory_Sound_Database/Respiratory_Sound_Database/patient_diagnosis.csv',
    header=None,
    names=['Subject ID', 'Diagnosis'],
    sep=',',
)

# Read the annotation table of one recording (Start, End, Crackles, Wheezes columns)
def extract_file(filename, root):
    """Load the tab-separated annotation file `<root>/<filename>.txt`.

    filename: base name of the recording, without extension
    root: directory containing the annotation files

    Returns a DataFrame with the columns 'Start', 'End', 'Crackles' and 'Wheezes'
    (the file itself is read as having no header row).
    """
    annotation_path = os.path.join(root, filename + '.txt')
    return pd.read_csv(annotation_path, names = ['Start', 'End', 'Crackles', 'Wheezes'], delimiter= '\t')

# One annotation table per recording, tagged with the fields encoded in its filename
def combine_anno_info(names, root):
  """Return a list with one annotated DataFrame per entry of `names`.

  Each name is split on '_'; fields 0, 2, 3 and 4 are stored as the constant
  columns 'Subject ID', 'Chest Location', 'Mode' and 'Equipment', and the full
  name is stored as 'Filename'. The annotation rows come from extract_file.
  """
  def tagged_annotations(file):
    anno = extract_file(file, root)
    parts = file.split('_')
    tags = (
      ('Subject ID', parts[0]),
      ('Chest Location', parts[2]),
      ('Mode', parts[3]),
      ('Equipment', parts[4]),
      ('Filename', file),
    )
    for column, value in tags:
      anno[column] = value
    return anno

  return [tagged_annotations(file) for file in names]

# Stack the per-recording annotation tables into one frame with a fresh 0..n-1 index
anno_info = combine_anno_info(textnames, root)
anno_df = pd.concat(anno_info, ignore_index=True)

# Give both 'Subject ID' columns the same integer dtype, then attach each
# subject's diagnosis to every annotation row of that subject.
diag_df['Subject ID'] = diag_df['Subject ID'].astype(np.int32)
anno_df['Subject ID'] = anno_df['Subject ID'].astype(np.int32)
sub_anno_df = diag_df.merge(anno_df, on='Subject ID')
sub_anno_df.head()

Unnamed: 0,Subject ID,Diagnosis,Start,End,Crackles,Wheezes,Chest Location,Mode,Equipment,Filename
0,101,URTI,0.036,0.579,0,0,Al,sc,Meditron,101_1b1_Al_sc_Meditron
1,101,URTI,0.579,2.45,0,0,Al,sc,Meditron,101_1b1_Al_sc_Meditron
2,101,URTI,2.45,3.893,0,0,Al,sc,Meditron,101_1b1_Al_sc_Meditron
3,101,URTI,3.893,5.793,0,0,Al,sc,Meditron,101_1b1_Al_sc_Meditron
4,101,URTI,5.793,7.521,0,0,Al,sc,Meditron,101_1b1_Al_sc_Meditron


In [4]:
# Get waveforms
def signal_and_sr(filename, root):
    """Load `<root>/<filename>.wav` with librosa and return (signal, sampling rate).

    filename: base name of the recording, without extension
    root: directory containing the audio files (with or without a trailing separator)

    sr=None is passed to librosa.load so the file is not resampled to librosa's default rate.
    """
    # os.path.join (as in extract_file) instead of string concatenation, so a root
    # without a trailing '/' still yields a valid path.
    path = os.path.join(root, filename + '.wav')
    return lr.load(path, sr=None)

def waveforms(files, root):
  """Load every recording in `files` and return (list of signals, list of sampling rates).

  Both lists follow the order of `files`. Each signal is whatever signal_and_sr
  returns for that name (an array with sr * duration values).
  """
  # Note: no even-length filtering is applied here (an even-length requirement for MODWT
  # was once considered in a commented-out line).
  loaded = [signal_and_sr(name, root) for name in files]
  signals = [waveform for waveform, _ in loaded]
  srs = [rate for _, rate in loaded]
  return signals, srs

In [5]:
# Load the stored signals and sampling rates from the .mat file instead of re-reading the audio;
# squeeze_me=True drops singleton dimensions from the loaded arrays.
sig_mat = spio.loadmat(
    '../input/google-colab/signals.mat',
    squeeze_me=True,
)
signals, srs = sig_mat['signals'], sig_mat['srs']

# Slicing

In [6]:
def audio_slice(signals, files, srs, timestamp_df):
  """
  segments the extracted audio signals using timestamp annotations

  signals: sequence of full-length signals
  files: filename of each signal (same order as `signals`)
  srs: sampling rate of each signal (same order as `signals`)
  timestamp_df: dataframe with 'Filename', 'Start' and 'End' columns

  Returns a flat list of slices: for each signal in order, one slice per
  matching row of `timestamp_df`. An empty `signals` yields an empty list.
  """
  slices = []
  for i, s in enumerate(signals):
    file_rows = timestamp_df.loc[timestamp_df['Filename'] == files[i]]

    for _, row in file_rows.iterrows():
      slices.append(slice_signal(row['Start'], row['End'], s, srs[i]))

  # Return only after ALL signals are processed; returning inside the outer loop
  # would stop after the first signal.
  return slices

def slice_signal(start, end, signal, sr):
    """Return the part of `signal` between `start` and `end` (both multiplied by `sr`).

    Each boundary is rounded to the nearest sample index and capped at len(signal),
    so a boundary past the end of the signal simply shortens (or empties) the slice.
    """
    n_samples = len(signal)
    first, last = (min(int(round(t * sr)), n_samples) for t in (start, end))
    return signal[first:last]

In [7]:
# Load the stored slices from the .mat file (squeeze_me=True drops singleton dimensions)
slice_mat = spio.loadmat(
    '../input/google-colab/slices.mat',
    squeeze_me=True,
)
slices = slice_mat['slices']

In [8]:
def slice_df(filenames, timestamp_df):
  """
  Build a dataframe with one row per annotated slice: its name and its diagnosis.

  For every name in `filenames` (in order), each row of `timestamp_df` whose
  'Filename' equals that name contributes one entry. The slice name is
  "<filename>_<row label>", where the row label is that row's index label in
  `timestamp_df` (not a per-file counter).
  """
  slice_names = []
  slice_diagnoses = []
  for recording in filenames:
    matching = timestamp_df.loc[timestamp_df['Filename'] == recording]
    for row_label, row in matching.iterrows():
      slice_names.append(recording + "_" + str(row_label))
      slice_diagnoses.append(row['Diagnosis'])
  return pd.DataFrame(data = {'Slice Name': slice_names, 'Diagnosis': slice_diagnoses})

In [9]:
# Slice-level table (name + diagnosis) and the plain list of slice names
slice_anno_df = slice_df(filenames, sub_anno_df)
slicenames = list(slice_anno_df['Slice Name'])
slice_anno_df.tail()

Unnamed: 0,Slice Name,Diagnosis
6893,226_1b1_Pl_sc_LittC2SE_6893,Pneumonia
6894,226_1b1_Pl_sc_LittC2SE_6894,Pneumonia
6895,226_1b1_Pl_sc_LittC2SE_6895,Pneumonia
6896,226_1b1_Pl_sc_LittC2SE_6896,Pneumonia
6897,226_1b1_Pl_sc_LittC2SE_6897,Pneumonia


In [10]:
def slice_rates(timestamp_df, filenames, srs):
    """
    Return one sampling rate per slice, in file order.

    For the i-th name in `filenames`, srs[i] is repeated once for every row of
    `timestamp_df` whose 'Filename' equals that name.
    """
    rates = []
    for position, recording in enumerate(filenames):
        belongs_to_recording = timestamp_df['Filename'] == recording
        rates += [srs[position]] * int(belongs_to_recording.sum())
    return rates

In [11]:
# Reuse the cached per-slice sampling rates when the file can be read; otherwise
# recompute them and write a local cache.
# NOTE(review): this cache is read from 'icbhi-preprocessing-v3' while the other caches
# in this notebook are read from '...-v4' — confirm this is intentional.
try:
    slice_srs = np.load("../input/icbhi-preprocessing-v3/slice_srs.npy")
    print('loaded')
except (OSError, ValueError):
    # Only fall back for a missing/unreadable or unloadable cache file; a bare `except:`
    # would also swallow KeyboardInterrupt and hide genuine programming errors.
    slice_srs = slice_rates(sub_anno_df, filenames, srs)
    np.save("slice_srs.npy", slice_srs)

set(slice_srs)

loaded


{4000, 10000, 44100}

# Classes

In [12]:
# Map every distinct diagnosis label to an integer class id (0, 1, 2, ...),
# following the order returned by .unique().
diagnoses = sub_anno_df['Diagnosis'].unique()
# Size the id range from the data: with a hard-coded range of 8, zip() would silently
# drop any label beyond the eighth.
nums = np.arange(len(diagnoses))
diag_dict = dict(zip(diagnoses, nums))
diag_dict

{'URTI': 0,
 'Healthy': 1,
 'Asthma': 2,
 'COPD': 3,
 'LRTI': 4,
 'Bronchiectasis': 5,
 'Pneumonia': 6,
 'Bronchiolitis': 7}

In [13]:
def slice_classes(names, slice_df):
    """Return an array with the integer class of every row of `slice_df`.

    The class is looked up in the module-level `diag_dict` from the row's
    'Diagnosis' value. `names` is accepted but not used.
    """
    return np.array([diag_dict[row['Diagnosis']] for _, row in slice_df.iterrows()])

In [14]:
# Reuse the cached class labels when the file can be read; otherwise rebuild and cache them.
try:
    class_list = np.load('../input/icbhi-preprocessing-v4/class_list.npy')
except (OSError, ValueError):
    # Only fall back for a missing/unreadable or unloadable cache file; a bare `except:`
    # would also swallow KeyboardInterrupt and hide genuine programming errors.
    class_list = slice_classes(slicenames, slice_anno_df)
    np.save('class_list.npy', class_list)

# Preprocessing
### Evaluation of features for classification of wheezes and normal respiratory sounds (Pramono et al., 2019)

In [15]:
def preprocessing(samples, orig_srs, files, annotation_df, max_length=6, new_sr=8000, lowcut=150, highcut=2000, order=5):
    """
    Cut, zero-pad, resample, band-pass filter and peak-normalise every annotated slice.

    samples: NON-SLICED signals
    orig_srs: sampling rates of each sample
    files: filenames of samples
    annotation_df: dataframe containing start and end times for each slice
    max_length: length (in seconds) every slice is truncated / zero-padded to
    new_sr: sampling rate (Hz) every slice is resampled to
    lowcut, highcut, order: band-pass settings forwarded to butter_bandpass_filter

    1) Cut each slice (at most max_length seconds) and zero-pad it to max_length seconds
    2) Resample to new_sr, then apply the band-pass filter
    3) Scale each slice so that its largest absolute amplitude is 1
       (slices that are entirely zero are left as zeros instead of becoming NaN)

    Returns an array of shape (number of slices, max_length * new_sr).
    """
    # Zero-padding
    padded_slices = []
    slice_srs = []
    for i, signal in enumerate(samples):
        sr = orig_srs[i]
        # Integer sample count, computed from Python ints so a small NumPy integer dtype cannot overflow
        required_length = int(max_length * int(sr))
        filerows = annotation_df.loc[annotation_df['Filename'] == files[i]]

        for _, row in filerows.iterrows():
            start = row['Start']
            end = row['End']

            # If length > maximum length, change to maximum length
            if (end - start) > max_length:
                end = start + max_length

            # Guard against a one-sample overshoot from rounding; pad_center rejects inputs longer than `size`
            s = slice_signal(start, end, signal, sr)[:required_length]

            # Note: librosa automatically converts audio to single channel (mono - 'sc') even if it is multi channel (stereo - 'mc')
            # `size` is passed by keyword: accepted by old librosa and required by newer releases.
            padded_slices.append(lr.util.pad_center(s, size=required_length))
            slice_srs.append(sr)

    # Resample first, then band-pass filter. The original author notes that filtering before
    # resampling "causes a bug" — presumably because highcut reaches the Nyquist frequency of
    # the 4000 Hz recordings (TODO confirm).
    # dataset contains more than one sr - 44100, 4000 and 10000; Pramono et al. recommend 8000Hz
    # as respiratory sound information is bandlimited up to 2000Hz..?
    n_out = int(max_length * new_sr)
    processed = np.zeros((len(padded_slices), n_out))
    for i, signal in enumerate(padded_slices):
        # orig_sr / target_sr by keyword: accepted by old librosa and required by newer releases
        resampled = lr.resample(signal, orig_sr=slice_srs[i], target_sr=new_sr)
        y = butter_bandpass_filter(resampled, lowcut, highcut, new_sr, order)
        n = min(len(y), n_out)
        processed[i, :n] = y[:n]

    # Amplitude scaling to [-1, 1]
    dB = 0
    amp_lin = 10**(dB/20)
    if processed.size:
        peaks = np.abs(processed).max(axis=1, keepdims=True)
        # A silent slice has peak 0; dividing by it would turn the whole row into NaN
        safe_peaks = np.where(peaks > 0, peaks, 1.0)
        processed *= amp_lin * (1.0 / safe_peaks)

    return processed

def butter_bandpass_filter(data, lowcut, highcut, sr, order):
    """Band-pass filter `data` between `lowcut` and `highcut` (same units as `sr`).

    The filter is designed by butter_bandpass as second-order sections and
    applied with scipy.signal.sosfilt; the filtered signal is returned.
    """
    return sosfilt(butter_bandpass(lowcut, highcut, sr, order=order), data)
    
def butter_bandpass(lowcut, highcut, sr, order):
    """Design a digital Butterworth band-pass filter and return it as second-order sections.

    Both cut-off frequencies are divided by the Nyquist frequency (0.5 * sr)
    before being handed to scipy.signal.butter.
    """
    nyquist = 0.5 * sr
    band = [edge / nyquist for edge in (lowcut, highcut)]
    return butter(order, band, btype='band', analog=False, output='sos')

In [16]:
# Reuse the cached preprocessed slices when the file can be read; otherwise run the
# full preprocessing and write a local cache.
try:
    prepro_slices = np.load("../input/icbhi-preprocessing-v4/prepro_slices.npy")
    print('loaded')
except (OSError, ValueError):
    # Only fall back for a missing/unreadable or unloadable cache file; a bare `except:`
    # would also swallow KeyboardInterrupt and hide genuine programming errors.
    prepro_slices = preprocessing(signals, srs, filenames, sub_anno_df)
    np.save("prepro_slices.npy", prepro_slices)

In [17]:
# Length of prepro_slices along its first dimension (one entry per preprocessed slice)
len(prepro_slices)

6898

In [18]:
# Row indices of slices in which EVERY value is NaN (rows with only some NaNs are not reported)
np.argwhere(np.all(np.isnan(prepro_slices), axis=1))

array([], shape=(0, 1), dtype=int64)