In [1]:
import os
import glob
import re
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score, precision_score, precision_recall_curve, plot_precision_recall_curve

sns.set()

try:
    os.environ['KAGGLE_DATA_PROXY_TOKEN']
except KeyError:
    dir_path = "Respiratory_Sound_Database/Respiratory_Sound_Database/"
else:
    dir_path = "/kaggle/input/Respiratory_Sound_Database/Respiratory_Sound_Database/"
    
fname_demo = dir_path + "demographic_info.txt"
fname_diag = dir_path + "patient_diagnosis.csv"
dir_audio = dir_path + "audio_and_txt_files/"

In [26]:
group_pat_num = "([0-9]{3})"
group_rec_index = "([0-9][a-z][0-9])"
group_chest_loc = "(Tc|Al|Ar|Pl|Pr|Ll|Lr)"
group_acc_modes = "(sc|mc)"
group_equipments = "(AKGC417L|LittC2SE|Litt3200|Meditron)"

regex = re.compile("_".join([group_pat_num, group_rec_index, group_chest_loc, group_acc_modes, group_equipments]))

top = os.getcwd()
os.chdir(dir_audio)
fnames = glob.glob("*.txt")
os.chdir(top)

### file name info & breath cycles to crackles / wheezes
l_rec_info = []
tups_num_cycles_sounds = []

for fname in fnames:
    match = regex.match(fname)
    pat_num = int(match.group(1))
    rec_index = match.group(2)
    chest_loc = match.group(3)
    acc_mode = match.group(4)
    equipment = match.group(5)
    
    l_rec_info.append([pat_num, rec_index, chest_loc, acc_mode, equipment])
    
    with open(dir_audio + fname) as f_annot:
        lines = [line.strip().split() for line in f_annot.readlines()]
        tups_num_cycles_sounds.append((pat_num, lines))

l_rec_info.sort(key=lambda subl: (subl[0], subl[1], subl[2], subl[3], subl[4]))
df_rec_info = pd.DataFrame(l_rec_info, columns=["Patient number", "Recording index", "Chest location", "Accuisition mode", "Recording Equipment"])


### diagnosis

diag = pd.read_csv(fname_diag, names=["Patient number", "Diagnosis"])
df_rec_info_diag = pd.merge(df_rec_info, diag, on="Patient number")


### demographic info

with open(fname_demo) as f_demo:
    # skip single empty line at the beginning
    f_demo.readline()

    lines = [line.strip().split() for line in f_demo.readlines()]


for split in lines:
    split[0] = int(split[0])
    if split[1] != "NA":
        split[1] = float(split[1])
    else:
        split[1] = np.nan        
    if split[2] != "NA":
        pass
    else:
        split[2] = np.nan        
    if split[3] != "NA":
        split[3] = float(split[3])
    else:
        split[3] = np.nan        
    if split[4] != "NA":
        split[4] = float(split[4])
    else:
        split[4] = np.nan
    if split[5] != "NA":
        split[5] = float(split[5])
    else:
        split[5] = np.nan

df_demo = pd.DataFrame(lines, columns=["Patient number", "Age", "Sex", "Adult BMI", "Child weight kg", "Child Height cm"])

df_full = pd.merge(df_rec_info_diag, df_demo, on="Patient number")

df_full

Unnamed: 0,Patient number,Recording index,Chest location,Accuisition mode,Recording Equipment,Diagnosis,Age,Sex,Adult BMI,Child weight kg,Child Height cm
0,101,1b1,Al,sc,Meditron,URTI,3.00,F,,19.0,99.0
1,101,1b1,Pr,sc,Meditron,URTI,3.00,F,,19.0,99.0
2,102,1b1,Ar,sc,Meditron,Healthy,0.75,F,,9.8,73.0
3,103,2b2,Ar,mc,LittC2SE,Asthma,70.00,F,33.00,,
4,104,1b1,Al,sc,Litt3200,COPD,70.00,F,28.47,,
...,...,...,...,...,...,...,...,...,...,...,...
915,224,1b2,Al,sc,Meditron,Healthy,10.00,F,,32.3,143.0
916,225,1b1,Pl,sc,Meditron,Healthy,0.83,M,,7.8,74.0
917,226,1b1,Al,sc,Meditron,Pneumonia,4.00,M,,16.7,103.0
918,226,1b1,Ll,sc,Meditron,Pneumonia,4.00,M,,16.7,103.0


In [28]:
# pd.DataFrame(l_cycles_sounds, columns=["Cycle Start", "Cycle End", "Crackles", "Wheezes"])
tups_num_cycles_sounds

[(178,
  [['0.042', '1.28', '0', '1'],
   ['1.28', '2.697', '1', '1'],
   ['2.697', '4.006', '1', '1'],
   ['4.006', '5.506', '0', '1'],
   ['5.506', '7.089', '0', '1'],
   ['7.089', '8.863', '0', '0'],
   ['8.863', '10.268', '1', '0'],
   ['10.268', '11.982', '1', '1'],
   ['11.982', '13.732', '1', '0'],
   ['13.732', '15.589', '1', '0'],
   ['15.589', '17.387', '1', '0'],
   ['17.387', '19.494', '1', '0']]),
 (130,
  [['0.069', '2.563', '1', '0'],
   ['2.563', '4.933', '1', '0'],
   ['4.933', '7.466', '1', '0'],
   ['7.466', '10.124', '1', '0'],
   ['10.124', '12.707', '1', '0'],
   ['12.707', '15.078', '1', '0'],
   ['15.078', '17.615', '1', '0'],
   ['17.615', '19.946', '1', '0']]),
 (207,
  [['1.261', '4.583', '0', '0'],
   ['4.583', '8.556', '0', '0'],
   ['8.556', '12.396', '0', '1'],
   ['12.396', '16.553', '0', '0'],
   ['16.553', '19.992', '0', '0']]),
 (177,
  [['1.995', '8.406', '1', '0'],
   ['8.406', '12.045', '0', '0'],
   ['12.045', '15.885', '0', '0'],
   ['15.885', '1