Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.cm as cm
import pandas as pd

# Not immediately usable with this dataset as files contain 24-bit data
from scipy.io import wavfile
from scipy.io.wavfile import read, write

import IPython.display as ipd
from IPython.display import Audio
from numpy.fft import fft, ifft
from glob import glob

import librosa as lr
import librosa.display

import os
import pickle

import seaborn as sns

from scipy import signal as sig
from scipy.signal import butter, lfilter
from tqdm import tqdm
import pywt
import random
import skimage.data
from skimage.restoration import denoise_wavelet
from pywt import swt
import time as tm

# if ('pydub' in globals()) == False:
#   !pip install pydub
# from pydub import AudioSegment
# from pydub.utils import make_chunks


# LT-06-24: import io to save intermediate outputs; use "as" to shorten module.function name as "spio"
import scipy.io as spio


# LT-06-24: install modules if not found 
# if ( 'umap' in globals() ) == False: 
#   !pip install umap-learn

import umap.umap_ as umap

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, minmax_scale
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.dummy import DummyClassifier
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_samples, silhouette_score, classification_report, confusion_matrix, accuracy_score, plot_confusion_matrix

2021-12-20 21:29:17.894905: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2021-12-20 21:29:17.895033: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Directory

In [2]:
# Directory holding the .wav recordings and .txt annotation files, and the
# writable output directory.
root = '../input/respiratory-sound-database/Respiratory_Sound_Database/Respiratory_Sound_Database/audio_and_txt_files/'
outdir = '/kaggle/working/'

# Select files by their actual extension (not by a substring anywhere in the
# name) and strip only that extension, so `name + '.txt'` / `name + '.wav'`
# always points back at the file that was listed.

# Extract text filenames (extension removed), sorted
textnames = sorted(os.path.splitext(s)[0] for s in os.listdir(path = root) if s.endswith('.txt'))

# Extract audio filenames (extension removed), sorted
filenames = sorted(os.path.splitext(s)[0] for s in os.listdir(path = root) if s.endswith('.wav'))

Information & Annotations

In [3]:
# Diagnosis table: the CSV is read without a header row, so the two column
# names are supplied explicitly.
diag_df = pd.read_csv(
    '../input/respiratory-sound-database/Respiratory_Sound_Database/Respiratory_Sound_Database/patient_diagnosis.csv',
    sep=',',
    header=None,
    names=['Subject ID', 'Diagnosis'],
)

# Read the annotation table of one recording (one row per annotated segment)
def extract_file(filename, root):
    """
    Load the tab-separated annotation file `<root>/<filename>.txt`.

    filename: recording name without extension
    root: directory containing the annotation file

    Returns a DataFrame with columns 'Start', 'End', 'Crackles', 'Wheezes'
    (the file itself is read as header-less).
    """
    annotation_path = os.path.join(root, filename + '.txt')
    recording_annotations = pd.read_csv(annotation_path, names = ['Start', 'End', 'Crackles', 'Wheezes'], delimiter= '\t')
    return recording_annotations

# Information about each file with annotations from each file
def combine_anno_info(names, root):
    """
    Load the annotations of every recording in `names` and tag each table with
    metadata parsed from the underscore-separated recording name.

    Tokens 0, 2, 3 and 4 of the name become the 'Subject ID', 'Chest Location',
    'Mode' and 'Equipment' columns; the full name is stored in 'Filename'.

    Returns a list with one DataFrame per recording.
    """
    information = []
    for file in names:
        anno = extract_file(file, root)
        parts = file.split('_')
        # Scalars are broadcast to every row of the recording's table.
        metadata = (
            ('Subject ID', parts[0]),
            ('Chest Location', parts[2]),
            ('Mode', parts[3]),
            ('Equipment', parts[4]),
            ('Filename', file),
        )
        for column, value in metadata:
            anno[column] = value
        information.append(anno)
    return information

# Stack the per-recording annotation tables into a single frame
anno_info = combine_anno_info(textnames, root)
anno_df = pd.concat(anno_info, ignore_index=True)

# Give the join key the same integer dtype on both sides, then attach the
# diagnosis of each subject to every annotation row
diag_df['Subject ID'] = diag_df['Subject ID'].astype('int32')
anno_df['Subject ID'] = anno_df['Subject ID'].astype('int32')
sub_anno_df = diag_df.merge(anno_df, on='Subject ID')
sub_anno_df.head()

Unnamed: 0,Subject ID,Diagnosis,Start,End,Crackles,Wheezes,Chest Location,Mode,Equipment,Filename
0,101,URTI,0.036,0.579,0,0,Al,sc,Meditron,101_1b1_Al_sc_Meditron
1,101,URTI,0.579,2.45,0,0,Al,sc,Meditron,101_1b1_Al_sc_Meditron
2,101,URTI,2.45,3.893,0,0,Al,sc,Meditron,101_1b1_Al_sc_Meditron
3,101,URTI,3.893,5.793,0,0,Al,sc,Meditron,101_1b1_Al_sc_Meditron
4,101,URTI,5.793,7.521,0,0,Al,sc,Meditron,101_1b1_Al_sc_Meditron


Signals & Sampling Rates

In [4]:
# Get waveforms
def signal_and_sr(filename, root):
    """
    Load `<root>/<filename>.wav` with librosa at its native sampling rate
    (sr=None disables librosa's resampling).

    Returns (signal, sr).
    """
    # os.path.join works whether or not `root` ends with a path separator
    name = os.path.join(root, filename + '.wav')
    signal, sr = lr.load(name, sr=None)
    return signal, sr

def waveforms(files, root):
    """
    Load every recording named in `files` from `root`.

    Returns two parallel lists: the waveforms and their sampling rates.
    """
    # Each entry is the (signal, sr) pair returned by signal_and_sr
    loaded = [signal_and_sr(name, root) for name in files]
    signals = [waveform for waveform, _ in loaded]
    srs = [rate for _, rate in loaded]
    return signals, srs

In [5]:
# if (os.path.exists(outdir + '/signals.mat') == False):
#   signals, srs = waveforms(filenames, root)
#   spio.savemat(outdir + 'signals.mat', {'signals': signals, 'srs': srs})
# else:
#   sig_mat = spio.loadmat(outdir + 'signals.mat', squeeze_me=True)
#   signals = sig_mat['signals']
#   srs = sig_mat['srs']

In [6]:
# Load the previously saved signals.mat. Only the sampling rates ('srs') are
# pulled out here; the 'signals' entry is left unloaded (line commented out).
# NOTE(review): presumably one rate per recording, in `filenames` order — confirm.
sig_mat = spio.loadmat('../input/google-colab/signals.mat', squeeze_me=True)
# signals = sig_mat['signals']
srs = sig_mat['srs']

In [7]:
# plt.figure(figsize=(8,5))
# lr.display.waveplot(signals[648], sr=srs[648])
# plt.xlabel('Time (s)')
# plt.ylabel('Amplitude')
# plt.title(filenames[648])
# plt.show()

In [8]:
# set(srs)

Audio Slicing

In [9]:
def audio_slice(signals, files, srs, timestamp_df):
    """
    Cut every signal into the segments listed for its file in `timestamp_df`.

    signals, files, srs: parallel sequences (waveform, file name, sampling rate)
    timestamp_df: frame with 'Filename', 'Start' and 'End' columns

    Returns a flat list of segments, file by file, in annotation row order.
    """
    slices = []
    for idx, waveform in enumerate(signals):
        # Annotation rows belonging to this recording
        rows = timestamp_df.loc[timestamp_df['Filename'] == files[idx]]
        slices.extend(
            slice_signal(start, end, waveform, srs[idx])
            for start, end in zip(rows['Start'], rows['End'])
        )
    return slices

def slice_signal(start, end, signal, sr):
    """
    Return the samples of `signal` between `start` and `end` (in seconds).

    Times are converted to sample indices with `int(t * sr)` and clamped to
    [0, len(signal)]. Without the lower clamp a negative time would become a
    negative Python index and silently select samples counted from the end.
    """
    max_index = len(signal)
    start_index = min(max(int(start * sr), 0), max_index)
    end_index = min(max(int(end * sr), 0), max_index)
    return signal[start_index:end_index]

In [10]:
# if (os.path.exists(outdir + '/slices.mat') == False):
#   slices = audio_slice(signals, filenames, sub_anno_df)
#   spio.savemat(outdir + 'slices.mat', {'slices': slices})
# else:
#   slice_mat = spio.loadmat(outdir + 'slices.mat', squeeze_me=True)
#   slices = slice_mat['slices']

In [11]:
# Load the saved audio segments from slices.mat instead of re-slicing the
# recordings; the 'slices' entry is the collection used by later cells.
slice_mat = spio.loadmat('../input/google-colab/slices.mat', squeeze_me=True)
slices = slice_mat['slices']

In [12]:
def slice_df(filenames, timestamp_df):
    """
    Build one table with a name and the crackle/wheeze labels of every slice.

    A slice name is the file name followed by '_' and the row's index label in
    `timestamp_df` (not a per-file counter).

    Returns a DataFrame with columns 'Slice Name', 'Crackles', 'Wheezes'.
    """
    names, crackles, wheezes = [], [], []
    for name in filenames:
        rows = timestamp_df.loc[timestamp_df['Filename'] == name]
        for label, crackle, wheeze in zip(rows.index, rows['Crackles'], rows['Wheezes']):
            names.append(name + "_" + str(label))
            crackles.append(crackle)
            wheezes.append(wheeze)
    return pd.DataFrame(data = {'Slice Name': names, 'Crackles': crackles, 'Wheezes': wheezes})

In [13]:
# Build the per-slice label table, keep the slice names as a plain list, and
# display the last rows of the table as the cell output.
slice_anno_df = slice_df(filenames, sub_anno_df)
slicenames = slice_anno_df['Slice Name'].tolist()
slice_anno_df.tail()

Unnamed: 0,Slice Name,Crackles,Wheezes
6893,226_1b1_Pl_sc_LittC2SE_6893,1,0
6894,226_1b1_Pl_sc_LittC2SE_6894,0,0
6895,226_1b1_Pl_sc_LittC2SE_6895,0,0
6896,226_1b1_Pl_sc_LittC2SE_6896,1,0
6897,226_1b1_Pl_sc_LittC2SE_6897,0,0


In [14]:
def slice_rates(timestamp_df, filenames, srs):
    """
    Expand the per-file sampling rates into one rate per slice.

    For each file, its rate `srs[i]` is repeated once for every row of
    `timestamp_df` whose 'Filename' equals that file's name.
    """
    # int() keeps the repeat count a plain Python integer for list repetition
    counts = [int((timestamp_df['Filename'] == name).sum()) for name in filenames]
    slice_srs = []
    for i, count in enumerate(counts):
        slice_srs += [srs[i]] * count
    return slice_srs

# One sampling rate per annotated slice, following the order of `filenames`.
slice_srs = slice_rates(sub_anno_df, filenames, srs)
# set(slice_srs)

Butterworth Bandpass Filter

In [15]:
def butter_bandpass(lowcut, highcut, sr, order=12):
    """
    Design a Butterworth band-pass filter.

    lowcut, highcut: band edges, in the same unit as `sr`
    sr: sampling rate; the edges are normalised by the Nyquist rate (sr / 2)
    order: order passed to scipy.signal.butter

    Returns the transfer-function coefficients (b, a).
    """
    nyquist = sr / 2
    band = [lowcut / nyquist, highcut / nyquist]
    numerator, denominator = butter(order, band, btype='band')
    return numerator, denominator

def butter_bandpass_filter(data, lowcut, highcut, sr, order=12):
    """
    Apply a causal Butterworth band-pass filter to `data`.

    lowcut, highcut: band edges, in the same unit as `sr`
    sr: sampling rate of `data`
    order: Butterworth order (default 12)

    The filter is designed and applied as second-order sections. A band-pass of
    order 12 expands to a 24th-order polynomial in (b, a) form, where rounding
    errors in the coefficients can distort or destabilise the filter; SciPy
    recommends the SOS representation for this reason.

    Returns the filtered signal, same length as `data`.
    """
    # Local import: sosfilt is not among the notebook's top-level imports
    from scipy.signal import sosfilt

    nyq = 0.5 * sr
    sos = butter(order, [lowcut / nyq, highcut / nyq], btype='band', output='sos')
    y = sosfilt(sos, data)
    return y

Preprocessing

In [16]:
# bbfs = []
# for i, s in enumerate(slices):
#     if slice_srs[i] != 4000:
#         resample = lr.resample(s, slice_srs[i], 4000)
#         z = butter_bandpass_filter(resample, 120, 1800, sr=4000)
#         bbfs.append(len(z))
#     else:
#         y = butter_bandpass_filter(s, 120, 1800, sr=4000)
#         bbfs.append(len(y))
#     if i == 1000:
#         print("done 1")
#     if i == 2000:
#         print("done 2")
#     if i == 3000:
#         print("done 3")
#     if i == 4000:
#         print("done 4")
#     if i == 5000:
#         print("done 5")
#     if i == 6000:
#         print("done 6")
# max(bbfs)

In [17]:
def preprocessing(signals, orig_srs, new_sr, max_len=64652):
    """
    Resample every signal to `new_sr`, band-pass filter it (120-1800) and store
    it zero-padded on the right in one row of a 2-D array.

    signals: sequence of 1-D signals
    orig_srs: sampling rate of each signal
    new_sr: target sampling rate
    max_len: row width of the output; the default is the width that was
        previously hard-coded

    Returns an array of shape (len(signals), max_len).
    Raises ValueError if a processed signal is longer than `max_len`.
    """
    processed = np.zeros((len(signals), max_len))
    for i, signal in enumerate(signals):
        if orig_srs[i] != new_sr:
            # Keyword arguments: accepted by older librosa releases and
            # required by librosa >= 0.10, where they are keyword-only.
            signal = lr.resample(signal, orig_sr=orig_srs[i], target_sr=new_sr)
        # Sampling rate at 4000Hz according to Roche et al., also standard for Eko Duo
        # (dataset contains more than one sr - 44100, 4000 and 10000)
        filtered = butter_bandpass_filter(signal, 120, 1800, new_sr)
        if len(filtered) > max_len:
            raise ValueError(
                f"signal {i} has {len(filtered)} samples after preprocessing, "
                f"which exceeds max_len={max_len}"
            )
        processed[i, :len(filtered)] = filtered
        if i % 1000 == 0:
            print('Running...')
    return processed

# for s in slices[0:10]:
#   print(len(butter_bandpass_filter(s, 120, 1800)))

In [18]:
# if (os.path.exists(outdir + '/prepro_slices.mat') == False):
#   prepro_slices = preprocessing(slices, slice_srs)
#   spio.savemat(outdir + 'prepro_slices.mat', {'prepro_slices': prepro_slices})
# else:
#   pps_mat = spio.loadmat(outdir + 'prepro_slices.mat', squeeze_me=True)
#   prepro_slices = pps_mat['prepro_slices']

# pps_mat = spio.loadmat('../input/google-colab/prepro_slices.mat', squeeze_me=True)
# prepro_slices = pps_mat['prepro_slices']

In [19]:
# prepro_slices = preprocessing(slices, slice_srs, 4000)

In [20]:
# np.save("prepro_slices.npy", prepro_slices)

In [21]:
# Use the cached preprocessed slices when the file can be read; otherwise
# recompute them and save a copy in the working directory.
try:
    prepro_slices = np.load("../input/icbhi-analysis/prepro_slices.npy")
except (OSError, ValueError):
    # OSError covers a missing/unreadable file, ValueError an unloadable one.
    # A bare `except:` would also swallow KeyboardInterrupt and real bugs.
    prepro_slices = preprocessing(slices, slice_srs, 4000)
    np.save("prepro_slices.npy", prepro_slices)

Running...
Running...
Running...
Running...
Running...
Running...
Running...


In [22]:
# print(len(prepro_slices))
# print(len(slices))
# print(len(slicenames))
# print(len(slice_srs))

In [23]:
# plt.figure(figsize=(8,5))
# lr.display.waveplot(slices[400])
# plt.xlabel('Time (s)')
# plt.ylabel('Amplitude')
# plt.title(slicenames[400])
# plt.show()

In [24]:
# plt.figure(figsize=(8,5))
# lr.display.waveplot(prepro_slices[400])
# plt.xlabel('Time (s)')
# plt.ylabel('Amplitude')
# plt.title(slicenames[400])
# plt.xlim(0, 0.400)
# plt.show()

Graph Sounds

In [25]:
def graph_sounds(index, signals, srs, filenames, timestamp_df):
    """
    Plot the waveform of recording `index` and shade its annotated segments:
    red where 'Crackles' >= 1, blue where 'Wheezes' >= 1.

    signals, srs, filenames: parallel sequences indexed by `index`
    timestamp_df: frame with 'Filename', 'Start', 'End', 'Crackles', 'Wheezes'

    Returns the result of plt.show().
    """
    df = timestamp_df.loc[timestamp_df['Filename'] == filenames[index]]
    signal = signals[index]

    crackle_times = df.loc[df['Crackles'] >= 1, ['Start', 'End']]
    wheeze_times = df.loc[df['Wheezes'] >= 1, ['Start', 'End']]

    plt.figure(figsize=(8,5))
    # NOTE(review): waveplot was removed in librosa 0.10 (waveshow replaces it);
    # confirm against the librosa version this notebook is pinned to.
    lr.display.waveplot(signal, sr=srs[index], color='g')
    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.title(filenames[index])

    # Read the columns by label: integer keys on a label-indexed row (row[0],
    # row[1]) rely on positional fallback that pandas has deprecated.
    for times, colour in ((crackle_times, 'red'), (wheeze_times, 'blue')):
        for start, end in zip(times['Start'], times['End']):
            plt.axvspan(start, end, alpha=0.2, color=colour)

    cr = mpatches.Patch(color='red', alpha=0.2, label='Crackle')
    wh = mpatches.Patch(color='blue', alpha=0.2, label='Wheeze')

    plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', handles=[cr, wh])

    return plt.show()

In [26]:
# for index, row in sub_anno_df.iterrows():
#   if index in range(0, 10):
#     print(row['End'] - row['Start'])

In [27]:
# graph_sounds(352, signals, srs, filenames, sub_anno_df)

Zero-Crossing Rate

In [28]:
def zcr(signals):
    """
    Zero-crossing count of each signal: the number of True entries returned by
    librosa.zero_crossings (called with pad=False).

    Returns a list with one count per signal.
    """
    return [sum(librosa.zero_crossings(s, pad=False)) for s in signals]

Spectrogram

In [29]:
def spec_stft(signals):
    """
    Spectrogram of each signal: the magnitude of its short-time Fourier
    transform (librosa.stft with default settings) converted to decibels with
    librosa.amplitude_to_db.

    Returns a list with one dB-scaled spectrogram per signal.
    """
    magnitudes = (abs(librosa.stft(s)) for s in signals)
    return [librosa.amplitude_to_db(magnitude) for magnitude in magnitudes]

Spectral Centroid

In [30]:
def spec_centroid(signals, srs):
    """
    Spectral centroid of each signal over time, computed with
    librosa.feature.spectral_centroid (n_fft=512, hop_length=256).

    signals, srs: parallel sequences of signals and their sampling rates

    Returns a list with one 1-D array per signal (the first row of librosa's
    result, i.e. the centroid value of each frame).
    """
    centroids = []
    for i, s in enumerate(signals):
        # Pass the signal as `y=`, like mel_freq_cc does: librosa >= 0.10
        # rejects positional audio arguments.
        centroid = librosa.feature.spectral_centroid(y=s, sr=srs[i], n_fft=512, hop_length=256)
        centroids.append(centroid[0])
    return centroids

Mel-Frequency Cepstral Coefficients

In [31]:
def mel_freq_cc(signals, srs):
    """
    Mel-frequency cepstral coefficients of each signal, computed with
    librosa.feature.mfcc at the signal's own sampling rate.

    Returns a list with one MFCC result per signal.
    """
    return [
        librosa.feature.mfcc(y=s, sr=srs[i])
        for i, s in enumerate(signals)
    ]

Maximal Overlap Discrete Wavelet Transform

In [32]:
def modwt(signals, level):
    """
    Stationary wavelet transform of each signal, used here as the maximal
    overlap discrete wavelet transform (MODWT).

    Every signal is decomposed with pywt.swt using the 'db5' wavelet at the
    requested `level`, with trim_approx=True and norm=True.

    Original author's notes: Fraiwan et al. use a soft MODWT of level 4; the
    denoised arrays were too short for that, and Butterworth band-pass
    filtering allowed the level to be set at 3.

    Returns a list with one pywt.swt result per signal.
    """
    swt_options = {'level': level, 'trim_approx': True, 'norm': True}
    return [pywt.swt(s, 'db5', **swt_options) for s in signals]

Classes

In [33]:
def slice_classes(preprocessed_slices, slices, slice_df):
    """
    Assign a class to every even-length preprocessed slice:
    0 = normal, 1 = crackles only, 2 = wheezes only, 3 = crackles & wheezes.

    preprocessed_slices: sequence of signals
    slices: slice names, parallel to `preprocessed_slices`; each must appear in
        the 'Slice Name' column of `slice_df`
    slice_df: frame with 'Slice Name', 'Crackles' and 'Wheezes' columns

    Signals of odd length are skipped, so the result can be shorter than the
    input and its positions then no longer line up with `preprocessed_slices`.

    Returns a 1-D numpy array of class labels.
    """
    # Build the name -> (crackles, wheezes) lookup once instead of scanning the
    # frame for every slice.
    labels = dict(zip(slice_df['Slice Name'],
                      zip(slice_df['Crackles'], slice_df['Wheezes'])))

    classes = []
    for i, signal in enumerate(preprocessed_slices):
        if (len(signal) % 2) != 0:
            continue
        # Use the `slices` argument; the function previously ignored it and
        # read the notebook-level `slicenames` variable instead.
        crackle, wheeze = labels[slices[i]]
        crackle, wheeze = int(crackle), int(wheeze)
        # normal
        if crackle == 0 and wheeze == 0:
            classes.append(0)
        # crackles only
        elif crackle > 0 and wheeze == 0:
            classes.append(1)
        # wheezes only
        elif crackle == 0 and wheeze > 0:
            classes.append(2)
        # crackles & wheezes
        else:
            classes.append(3)
    return np.array(classes)

In [34]:
# Class label (0-3) per preprocessed slice. The second argument is the list of
# slice names, i.e. the values of the 'Slice Name' column of slice_anno_df.
classes = slice_classes(prepro_slices, slicenames, slice_anno_df)

In [35]:
# Positions (within `classes`) of the members of each of the four classes
set0, set1, set2, set3 = (
    np.flatnonzero(classes == label).tolist() for label in range(4)
)
# Size of the smallest class; used to truncate the others to the same size
m = min(map(len, (set0, set1, set2, set3)))

In [36]:
# Keep the first m members of every class, then concatenate them class by
# class (all of class 0, then class 1, ...) and save the index array.
set0, set1, set2, set3 = (members[:m] for members in (set0, set1, set2, set3))
balanced_set = np.hstack((set0, set1, set2, set3))
np.save("balanced_set.npy", balanced_set)

In [37]:
# Labels matching balanced_set: a quarter of its length for each of the
# classes 0, 1, 2, 3, in that order.
class_list = [
    label
    for label in range(4)
    for _ in range(int(len(balanced_set)/4))
]
len(class_list)

2024

Extraction

In [38]:
# modwts = []
# for i, s in enumerate(prepro_slices):
#     if (len(s) % 2) == 0:
#         modwt = pywt.swt(s, 'db5', level=2, trim_approx=True, norm=True)
#         if np.array(modwt).shape not in modwts:
#             modwts.append(np.array(modwt).shape)
# modwts

In [39]:
# a = np.zeros((len(prepro_slices), 20, 127))
# mfcc = lr.feature.mfcc(prepro_slices[0], sr=4000)
# b,c = mfcc.shape
# a[0, :b, :c] = mfcc
# mfcc

In [40]:
def extract(preprocessed_slices, sr=4000):
    """
    Compute five feature sets for every preprocessed slice.

    preprocessed_slices: array of sliced signals that have already undergone preprocessing
    sr: sampling rate of slices

    Returns (zcrs, spectrograms, centroids, mfccs, modwts), zero-initialised
    arrays of fixed shape (n, 1), (n, 1025, 127), (n, 253), (n, 20, 127) and
    (n, 3, 64652); each feature is written into the leading part of its row.
    NOTE(review): these sizes presumably correspond to 64652-sample signals —
    a longer signal would not fit; confirm before changing the input length.

    Signals with an odd number of samples are skipped and their rows stay zero.
    """
    n_slices = len(preprocessed_slices)
    zcrs = np.zeros((n_slices, 1))
    spectrograms = np.zeros((n_slices, 1025, 127))
    centroids = np.zeros((n_slices, 253))
    mfccs = np.zeros((n_slices, 20, 127))
    modwts = np.zeros((n_slices, 3, 64652))

    for i, signal in enumerate(preprocessed_slices):
        if (len(signal) % 2) != 0: # signal array must be an even number of values for MODWT
            continue

        # Zero-crossing rate
        zcrs[i, 0] = sum(lr.zero_crossings(signal, pad=False))

        # Spectrograms
        spec = lr.amplitude_to_db(abs(lr.stft(signal)))
        rows, cols = spec.shape
        spectrograms[i, :rows, :cols] = spec

        # Spectral centroids (audio and rate passed by keyword: librosa >= 0.10
        # no longer accepts them positionally)
        cent = lr.feature.spectral_centroid(y=signal, sr=sr, n_fft=512, hop_length=256)[0]
        centroids[i, :len(cent)] = cent

        # Mel-frequency cepstral coefficients
        mfcc = lr.feature.mfcc(y=signal, sr=sr)
        rows, cols = mfcc.shape
        mfccs[i, :rows, :cols] = mfcc

        # Maximal overlap discrete wavelet transform
        coeffs = np.array(pywt.swt(signal, 'db5', level=2, trim_approx=True, norm=True))
        rows, cols = coeffs.shape
        modwts[i, :rows, :cols] = coeffs

    return zcrs, spectrograms, centroids, mfccs, modwts

In [41]:
# start = tm.time()

# zcrs, spectrograms, centroids, mfccs, modwts = extract(prepro_slices[:1000])

# if (os.path.exists(outdir + '/zerocross.mat') == False):
#     zcrs, spectrograms, centroids, mfccs, modwts = extract(prepro_slices, slice_srs)
#     features = np.hstack([zcrs, spectrograms, centroids, mfccs, modwts])
    
#     spio.savemat(outdir + 'zcrs.mat', {'zcrs': zcrs})
#     spio.savemat(outdir + 'spectrograms.mat', {'spectrograms': spectrograms})
#     spio.savemat(outdir + 'centroids.mat', {'centroids': centroids})
#     spio.savemat(outdir + 'mfccs.mat', {'mfccs': mfccs})
#     spio.savemat(outdir + 'modwts.mat', {'modwts': modwts})
    
# else:
#     zcr_mat = spio.loadmat(outdir + 'zcrs.mat', squeeze_me=True)
#     zcrs = zcr_mat['zerocross']
#     spec_mat = spio.loadmat(outdir + 'spectrograms.mat', squeeze_me=True)
#     spectrograms = spec_mat['spectrograms']
#     cent_mat = spio.loadmat(outdir + 'centroids.mat', squeeze_me=True)
#     centroids = cent_mat['centroids']
#     mfcc_mat = spio.loadmat(outdir + 'mfccs.mat', squeeze_me=True)
#     mfccs = mfcc_mat['mfccs']
#     modwt_mat = spio.loadmat(outdir + 'modwts.mat', squeeze_me=True)
#     modwts = modwt_mat['modwts']

# end = tm.time()
# print(f"Runtime to extract is {end-start} s")

In [42]:
# np.save("zcrs.npy", zcrs)
# np.save("spectrograms.npy", spectrograms)
# np.save("centroids.npy", centroids)
# np.save("mfccs.npy", mfccs)
# np.save("modwts.npy", modwts)

In [43]:
# try:
#     a,b,c = spectrograms.shape
#     d,e,f = mfccs.shape
#     g,h,i = modwts.shape
#     spectrograms = spectrograms.reshape((a, b*c))
#     mfccs = mfccs.reshape((d, e*f))
#     modwts = modwts.reshape((g, h*i))
# except NameError:
#     zcrs = np.load("../input/icbhi-analysis/zcrs.npy")
#     spectrograms = np.load("../input/icbhi-analysis/spectrograms.npy")
#     centroids = np.load("../input/icbhi-analysis/centroids.npy")
#     mfccs = np.load("../input/icbhi-analysis/mfccs.npy")
#     modwts = np.load("../input/icbhi-analysis/modwts.npy")
    
#     a,b,c = spectrograms.shape
#     d,e,f = mfccs.shape
#     g,h,i = modwts.shape
#     spectrograms = spectrograms.reshape((a, b*c))
#     mfccs = mfccs.reshape((d, e*f))
#     modwts = modwts.reshape((g, h*i))

In [44]:
# print(zcrs.shape)
# print(spectrograms.shape)
# print(centroids.shape)
# print(mfccs.shape)
# print(modwts.shape)

In [45]:
# f = {'zcrs' : [zcrs],
#      'spectrograms': [spectrograms],
#      'centroids': [centroids],
#      'mfccs': [mfccs],
#      'modwts': [modwts]}
# feature_df = pd.DataFrame(data=f)
# feature_df.to_csv('features.csv')
# feature_df.head()

In [46]:
# features = np.hstack((zcrs, spectrograms, centroids, mfccs, modwts))
# features.shape

Balanced Extraction

In [47]:
def balanced_extract(preprocessed_slices, slice_set, sr=4000):
    """
    Run the same feature extraction as `extract`, restricted to chosen slices.

    preprocessed_slices: array of sliced signals that have already undergone preprocessing
    slice_set: indices of slices in preprocessed_slices; if balanced, divided into an equal number of each class (i.e. normal, crackle, etc.)
    sr: sampling rate of slices

    Returns (zcrs, spectrograms, centroids, mfccs, modwts) with one row per
    entry of `slice_set`, in the order given.
    """
    # Select by the `slice_set` argument; the function previously ignored it
    # and read the notebook-level `balanced_set` variable instead.
    selected = [preprocessed_slices[index] for index in slice_set]
    # The per-slice computation is exactly that of extract(), so delegate to it
    # rather than keep a second copy of the loop.
    return extract(selected, sr)

In [48]:
# start = tm.time()

# zcrs, spectrograms, centroids, mfccs, modwts = balanced_extract(prepro_slices, balanced_set)

# end = tm.time()
# print(f"Runtime to extract is {end-start} s")

In [49]:
# a,b,c = spectrograms.shape
# d,e,f = mfccs.shape
# g,h,i = modwts.shape
# spectrograms = spectrograms.reshape((a, b*c))
# mfccs = mfccs.reshape((d, e*f))
# modwts = modwts.reshape((g, h*i))

In [50]:
# print(zcrs.shape)
# print(spectrograms.shape)
# print(centroids.shape)
# print(mfccs.shape)
# print(modwts.shape)

In [51]:
# features = np.hstack((zcrs, spectrograms, centroids, mfccs, modwts))
# features.shape

K-Nearest Neighbours

In [52]:
# X_train = features[:750]
# X_test = features[750:]
# y_train = classes[:750]
# y_test = classes[750:]

# 75% train, 25% test
# X_train, X_test, y_train, y_test = train_test_split(features, class_list, random_state=123)

In [53]:
# nn = NearestNeighbors(n_neighbors=10)
# nn.fit(X_train);

In [54]:
# distances, nearby_samples = nn.kneighbors(X_train[[0]])
# nearby_samples

In [55]:
# knn = KNeighborsClassifier(n_neighbors=4)
# knn.fit(X_train, y_train)

# y_pred = knn.predict(X_test)

In [56]:
# plot_confusion_matrix(knn, X_test, y_test)

In [57]:
# accuracy_score(y_test, y_pred)

In [58]:
# dc = DummyClassifier(strategy="most_frequent")
# dc.fit(X_train, y_train)

# dc_pred = dc.predict(X_test)

In [59]:
# accuracy_score(y_test, dc_pred)

In [60]:
# k_range = range(1, 16)

# scores = []

# for k in k_range:
#     knn = KNeighborsClassifier(n_neighbors=k)
#     knn.fit(X_train, y_train)
#     y_pred = knn.predict(X_test)
#     scores.append(accuracy_score(y_test, y_pred))

# print(scores)

In [61]:
# Pipeline
# pipe_knn = Pipeline([
#     ('preprocessor', preprocessing()),
#     ('extraction', extract()),
#     ('knn', KNeighborsClassifier())
# ])

In [62]:
# expert = '../input/dryrun/9002_expert/'
# expertnames = [s.split('.')[0] for s in os.listdir(path = expert) if '.wav' in s]

In [63]:
# exp_sigs, exp_srs = waveforms(expertnames, expert)

In [64]:
# prepro_exp = preprocessing(exp_sigs, exp_srs, 4000)

In [65]:
# exp_zcrs, exp_spectrograms, exp_centroids, exp_mfccs, exp_modwts = extract(prepro_exp)

In [66]:
# exp_spectrograms = exp_spectrograms.reshape((len(exp_spectrograms), 1025*127))
# exp_mfccs = exp_mfccs.reshape((len(exp_mfccs), 20*127))
# exp_modwts = exp_modwts.reshape((len(exp_modwts), 3*64652))

# exp_features = np.hstack((exp_zcrs, exp_spectrograms, exp_centroids, exp_mfccs, exp_modwts))
# exp_features.shape

In [67]:
# y_pred_exp = knn.predict(exp_features)
# print(y_pred_exp)

In [68]:
# nonexpert = '../input/dryrun/9002_nonexpert/'
# nonexpertnames = [s.split('.')[0] for s in os.listdir(path = nonexpert) if '.wav' in s]

In [69]:
# nonexp_sigs, nonexp_srs = waveforms(nonexpertnames, nonexpert)

In [70]:
# prepro_nonexp = preprocessing(nonexp_sigs, nonexp_srs, 4000)

In [71]:
# nonexp_zcrs, nonexp_spectrograms, nonexp_centroids, nonexp_mfccs, nonexp_modwts = extract(prepro_nonexp)

In [72]:
# nonexp_spectrograms = nonexp_spectrograms.reshape((len(nonexp_spectrograms), 1025*127))
# nonexp_mfccs = nonexp_mfccs.reshape((len(nonexp_mfccs), 20*127))
# nonexp_modwts = nonexp_modwts.reshape((len(nonexp_modwts), 3*64652))

# nonexp_features = np.hstack((nonexp_zcrs, nonexp_spectrograms, nonexp_centroids, nonexp_mfccs, nonexp_modwts))
# nonexp_features.shape

In [73]:
# y_pred_nonexp = knn.predict(nonexp_features)
# print(y_pred_nonexp)

In [74]:
# volunteer = '../input/dryrun/9002_volunteer/'
# volunteernames = [s.split('.')[0] for s in os.listdir(path = volunteer) if '.wav' in s]

In [75]:
# vol_sigs, vol_srs = waveforms(volunteernames, volunteer)

In [76]:
# prepro_vol = preprocessing(vol_sigs, vol_srs, 4000)

In [77]:
# vol_zcrs, vol_spectrograms, vol_centroids, vol_mfccs, vol_modwts = extract(prepro_vol)

In [78]:
# vol_spectrograms = vol_spectrograms.reshape((len(vol_spectrograms), 1025*127))
# vol_mfccs = vol_mfccs.reshape((len(vol_mfccs), 20*127))
# vol_modwts = vol_modwts.reshape((len(vol_modwts), 3*64652))

# vol_features = np.hstack((vol_zcrs, vol_spectrograms, vol_centroids, vol_mfccs, vol_modwts))
# vol_features.shape

In [79]:
# y_pred_vol = knn.predict(vol_features)
# print(y_pred_vol)

In [80]:
# pred_prob_vol = knn.predict_proba(vol_features)
# print(pred_prob_vol)

In [81]:
# sub_anno_df.loc[sub_anno_df["Diagnosis"] == 'Healthy'].head()

In [82]:
# filenames.index('102_1b1_Ar_sc_Meditron')

In [83]:
# plt.figure(figsize=(8,5))
# lr.display.waveplot(signals[2], sr=srs[2])
# plt.xlabel('Time (s)')
# plt.ylabel('Amplitude')
# plt.title(filenames[2])
# plt.show()

In [84]:
# plt.figure(figsize=(8,5))
# lr.display.waveplot(exp_sigs[0], sr=exp_srs[0])
# plt.xlabel('Time (s)')
# plt.ylabel('Amplitude')
# plt.title(expertnames[0])
# plt.show()

Zero Padding

In [85]:
# plt.scatter(x=(sub_anno_df['End']-sub_anno_df['Start']), y=sub_anno_df["Subject ID"])

In [86]:
# Ideal length is around 6
# plt.boxplot(x=(sub_anno_df['End']-sub_anno_df['Start']));

In [87]:
def zero_pad(slices, srs, annotation_df, max_length):
    """
    Truncate every slice to at most `max_length` seconds and centre-pad it with
    zeros to exactly `max_length` seconds.

    slices: audio segments, one per row of `annotation_df`
    srs: sampling rate of each slice
    annotation_df: frame with 'Start' and 'End' columns (seconds)
    max_length: target duration in seconds

    Returns a list of padded slices. Their lengths still differ between slices
    recorded at different sampling rates.
    """
    padded_slices = []
    spans = zip(annotation_df['Start'], annotation_df['End'])
    for pos, (start, end) in enumerate(spans):
        sr = srs[pos]

        # Each slice already starts at its annotated 'Start', so it is cut
        # relative to its own beginning. Applying the file-absolute start/end
        # to the slice selected the wrong (usually empty) range.
        duration = min(end - start, max_length)
        sample = slice_signal(0, duration, slices[pos], sr)

        # Note: librosa automatically converts audio to single channel (mono - 'sc') even if it is multi channel (stereo - 'mc')
        # `size` must be an integer; passed by keyword because librosa >= 0.10
        # made it keyword-only.
        required_length = int(round(max_length * sr))
        padded_slices.append(lr.util.pad_center(sample, size=required_length))
    return padded_slices

In [88]:
# Sample rates can differ so not all slices are the same length
# padded_slices = zero_pad(slices, slice_srs, sub_anno_df, 6)

In [89]:
# def preprocessing_v2(slices, srs):
    # Low pass butterworth bandpass
    
    # High pass butterworth bandpass
    
    # Wavelet filter
    