# Detection of respiratory sounds with annotated breath recordings

## Description
The dataset contains recordings of healthy patients and patients with lung issues, as well as a file with demographic information for each patient and a diagnosis file. Each recording is accompanied by a text file that annotates the start and end time of each breath cycle and the presence or absence of crackles / wheezes.

As a first attempt, don't use the audio files yet: predict the diagnosis from the annotations and the meta information only.
For a simpler first approach, index the breath cycles instead of using the interval times.

In [8]:
import os
import glob
import re
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score, precision_score, precision_recall_curve, plot_precision_recall_curve

# Apply seaborn's default theme so that all subsequent matplotlib plots use it.
sns.set()

# Pick the dataset root. The presence of the KAGGLE_DATA_PROXY_TOKEN environment
# variable is used as the signal that the absolute /kaggle/input location should
# be used; without it the dataset is expected relative to the working directory.
if "KAGGLE_DATA_PROXY_TOKEN" in os.environ:
    dir_path = "/kaggle/input/Respiratory_Sound_Database/Respiratory_Sound_Database/"
else:
    dir_path = "Respiratory_Sound_Database/Respiratory_Sound_Database/"

# Derive the individual input locations from the root (dir_path already ends
# with a slash, so plain concatenation yields valid paths).
fname_demo, fname_diag, dir_audio = (
    dir_path + leaf
    for leaf in (
        "demographic_info.txt",
        "patient_diagnosis.csv",
        "audio_and_txt_files/",
    )
)

## Parse / merge all the info into convenient DataFrames
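Uncomment the cell below if `full_info.csv` and `rec_annotation.csv` need to be (re)generated from the raw dataset files.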

In [9]:
# group_pat_num = "([0-9]{3})"
# group_rec_index = "([0-9][a-z][0-9])"
# group_chest_loc = "(Tc|Al|Ar|Pl|Pr|Ll|Lr)"
# group_acc_modes = "(sc|mc)"
# group_equipments = "(AKGC417L|LittC2SE|Litt3200|Meditron)"

# regex = re.compile("_".join([group_pat_num, group_rec_index, group_chest_loc, group_acc_modes, group_equipments]))

# top = os.getcwd()
# os.chdir(dir_audio)
# fnames = glob.glob("*.txt")

# ### file name info & breath cycles to crackles / wheezes

# l_rec_info = []
# num_cycles_sounds = []

# for fname in fnames:
#     match = regex.match(fname)
#     pat_num = int(match.group(1))
#     rec_index = match.group(2)
#     chest_loc = match.group(3)
#     acc_mode = match.group(4)
#     equipment = match.group(5)
    
#     l_rec_info.append([pat_num, rec_index, chest_loc, acc_mode, equipment])
    
#     with open(fname) as f_annot:
#         lines = [line.strip().split() for line in f_annot.readlines()]
#         lines = [ [ix_lines[0]] + ix_lines[1] for ix_lines in enumerate(lines)]
#         lines = [ [pat_num] + line for line in lines]

#         num_cycles_sounds.extend(lines)

# l_rec_info.sort(key=lambda subl: (subl[0], subl[1], subl[2], subl[3], subl[4]))
# df_rec_info = pd.DataFrame(l_rec_info, columns=["Patient number", "Recording index", "Chest location", "Accuisition mode", "Recording Equipment"])

# df_annotation = pd.DataFrame(num_cycles_sounds, columns=["Patient number", "Cycle number", "Cycle start", "Cycle End", "Crackles", "Wheezes"])

# os.chdir(top)

# ### diagnosis

# diag = pd.read_csv(fname_diag, names=["Patient number", "Diagnosis"])
# df_rec_info_diag = pd.merge(df_rec_info, diag, on="Patient number")

# ### demographic info

# with open(fname_demo) as f_demo:
#     # skip single empty line at the beginning
#     f_demo.readline()

#     lines = [line.strip().split() for line in f_demo.readlines()]


# for split in lines:
#     split[0] = int(split[0])
#     if split[1] != "NA":
#         split[1] = float(split[1])
#     else:
#         split[1] = np.nan        
#     if split[2] != "NA":
#         pass
#     else:
#         split[2] = np.nan        
#     if split[3] != "NA":
#         split[3] = float(split[3])
#     else:
#         split[3] = np.nan        
#     if split[4] != "NA":
#         split[4] = float(split[4])
#     else:
#         split[4] = np.nan
#     if split[5] != "NA":
#         split[5] = float(split[5])
#     else:
#         split[5] = np.nan

# df_demo = pd.DataFrame(lines, columns=["Patient number", "Age", "Sex", "Adult BMI", "Child weight kg", "Child Height cm"])


# df_full_info = pd.merge(df_rec_info_diag, df_demo, on="Patient number")
# df_full_info.to_csv("full_info.csv", index = False)
# df_annotation.to_csv("rec_annotation.csv", index = False)

In [16]:
# Load the two pre-built tables from the working directory:
# full_info.csv (recording / diagnosis / demographic info) and
# rec_annotation.csv (breath-cycle annotations).
df_full_info, df_annotation = map(
    pd.read_csv, ("full_info.csv", "rec_annotation.csv")
)

In [17]:
# Attach the breath-cycle annotations to the recording/patient info.
# NOTE(review): "Patient number" is the only join key. If a patient has several
# recordings and the annotation table carries no recording identifier, every
# recording row is paired with every cycle row of that patient - confirm that
# this is intended.
df = df_full_info.merge(df_annotation, on="Patient number")

# Display only: set_index returns a new frame and the result is not assigned,
# so df itself keeps its default integer index.
df.set_index(["Patient number", "Recording index", "Chest location"])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Accuisition mode,Recording Equipment,Diagnosis,Age,Sex,Adult BMI,Child weight kg,Child Height cm,Cycle number,Cycle start,Cycle End,Crackles,Wheezes
Patient number,Recording index,Chest location,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
101,1b1,Al,sc,Meditron,URTI,3.0,F,,19.0,99.0,0,0.036,1.264,0,0
101,1b1,Al,sc,Meditron,URTI,3.0,F,,19.0,99.0,1,1.264,3.422,0,0
101,1b1,Al,sc,Meditron,URTI,3.0,F,,19.0,99.0,2,3.422,5.550,0,0
101,1b1,Al,sc,Meditron,URTI,3.0,F,,19.0,99.0,3,5.550,7.436,0,0
101,1b1,Al,sc,Meditron,URTI,3.0,F,,19.0,99.0,4,7.436,9.221,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,1b1,Pl,sc,LittC2SE,Pneumonia,4.0,M,,16.7,103.0,6,11.721,13.693,1,0
226,1b1,Pl,sc,LittC2SE,Pneumonia,4.0,M,,16.7,103.0,7,13.693,15.536,0,0
226,1b1,Pl,sc,LittC2SE,Pneumonia,4.0,M,,16.7,103.0,8,15.536,17.493,0,0
226,1b1,Pl,sc,LittC2SE,Pneumonia,4.0,M,,16.7,103.0,9,17.493,19.436,1,0
