# Respiratory sounds: experiment, predict diagnosis with recording annotations and metainfo, if successful automatically annotate recordings

## Description
The dataset contains recordings of healthy patients and patients with lung issues, as well as a file with demographic information for the patient and a diagnosis file. Along with every recording there is a text file with annotation of start and end time of each breath cycle and the prescence or abscence of crackles / wheezes.

As a first try don't use the audio files yet. Predict diagnosis with annotation and the meta info.
For a simpler first approach index the breath cycles instead of using the intervall times. My idea here was that as for the diagnosis it might be more important in which breath cycle sounds occur rather than the exact length and location in ms, which might even be to noisy for

In [1]:
import os
import glob
import re
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score, precision_score, precision_recall_curve, plot_precision_recall_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

sns.set()

try:
    os.environ['KAGGLE_DATA_PROXY_TOKEN']
except KeyError:
    dir_path = "Respiratory_Sound_Database/Respiratory_Sound_Database/"
    fname_demo = dir_path + "demographic_info.txt"
else:
    dir_path = "/kaggle/input/respiratory-sound-database/Respiratory_Sound_Database/Respiratory_Sound_Database/"
    fname_demo = "/kaggle/input/respiratory-sound-database/" + "demographic_info.txt"
    
fname_diag = dir_path + "patient_diagnosis.csv"
dir_audio = dir_path + "audio_and_txt_files/"

## Parse / merge all the info into convenient DataFrames
Uncomment, if needed;

In [2]:
group_pat_num = "([0-9]{3})"
group_rec_index = "([0-9][a-z][0-9])"
group_chest_loc = "(Tc|Al|Ar|Pl|Pr|Ll|Lr)"
group_acc_modes = "(sc|mc)"
group_equipments = "(AKGC417L|LittC2SE|Litt3200|Meditron)"

regex = re.compile("_".join([group_pat_num, group_rec_index, group_chest_loc, group_acc_modes, group_equipments]))

top = os.getcwd()
os.chdir(dir_audio)
fnames = glob.glob("*.txt")

### file name info & breath cycles to crackles / wheezes

l_rec_info = []
num_cycles_sounds = []

max_cycles = 0

for fname in fnames:
    match = regex.match(fname)
    pat_num = int(match.group(1))
    rec_index = match.group(2)
    chest_loc = match.group(3)
    acc_mode = match.group(4)
    equipment = match.group(5)
    
    l_rec_info.append([pat_num, rec_index, chest_loc, acc_mode, equipment])
    
    with open(fname) as f_annot:
        lines = [line.strip().split() for line in f_annot.readlines()]
        lines = [ [ix_lines[0]] + ix_lines[1] for ix_lines in enumerate(lines)]
        lines = [ [pat_num] + [rec_index] + [chest_loc] + line for line in lines]
        
        num_cycles_sounds.extend(lines)
        
        if len(lines) > max_cycles:
            max_cycles = len(lines)

l_rec_info.sort(key=lambda subl: (subl[0], subl[1], subl[2], subl[3], subl[4]))
rec_info_cols = ["Patient number", "Recording index", "Chest location", "Acquisition mode", "Recording equipment"]
df_rec_info = pd.DataFrame(l_rec_info, columns=rec_info_cols)

annot_cols = ["Patient number", "Recording index", "Chest location", "Cycle number", "Cycle start", "Cycle end", "Crackles", "Wheezes"]
df_annotation = pd.DataFrame(num_cycles_sounds, columns=annot_cols)

os.chdir(top)

### create a simpler auxiliary DF / CSV for the annotations: one-hot-encoded crackles / wheezes per breath cycle:
### [cycle_0_crackles][cycle_0_wheezes][cycle_1_crackles][cycle_1_wheezes] etc.

df_tmp = df_annotation.set_index(["Patient number", "Recording index", "Chest location"])

aux = []

for ix in df_tmp.index.unique().sort_values():
    pat_num = ix[0]
    rec_index = ix[1]
    chest_loc = ix[2]
    subdf = df_tmp.loc[pat_num, rec_index, chest_loc]
    crackles_wheezes = [ yesno for c_w in zip( subdf["Crackles"], subdf["Wheezes"] ) for yesno in c_w ]
    len_cur = len(crackles_wheezes)
    row = [pat_num] + [rec_index] + [chest_loc] + crackles_wheezes + [0] * (max_cycles * 2 - len_cur)
    
    aux.append(row)

col_names = ["Patient number", "Recording index", "Chest location"]
col_names_c = ["Crackles_C{}".format(num_c) for num_c in range(max_cycles)]
col_names_w = ["Wheezes_C{}".format(num_w) for num_w in range(max_cycles)]

col_names_cw = [ name for tup in zip(col_names_c, col_names_w) for name in  tup]
col_names.extend(col_names_cw)

df_annot_aux = pd.DataFrame(aux, columns=col_names)

### diagnosis

diag = pd.read_csv(fname_diag, names=["Patient number", "Diagnosis"])
df_rec_info_diag = pd.merge(df_rec_info, diag)

### demographic info

with open(fname_demo) as f_demo:
    # skip single empty line at the beginning
    f_demo.readline()

    lines = [line.strip().split() for line in f_demo.readlines()]


for split in lines:
    split[0] = int(split[0])
    if split[1] != "NA":
        split[1] = float(split[1])
    else:
        split[1] = np.nan        
    if split[2] != "NA":
        pass
    else:
        split[2] = np.nan        
    if split[3] != "NA":
        split[3] = float(split[3])
    else:
        split[3] = np.nan        
    if split[4] != "NA":
        split[4] = float(split[4])
    else:
        split[4] = np.nan
    if split[5] != "NA":
        split[5] = float(split[5])
    else:
        split[5] = np.nan

df_demo = pd.DataFrame(lines, columns=["Patient number", "Age", "Sex", "Adult BMI", "Child weight kg", "Child height cm"])


df_full_info = pd.merge(df_rec_info_diag, df_demo, on="Patient number")
df_full_info.to_csv("full_info.csv", index = False)
df_annotation.to_csv("rec_annotation.csv", index = False)
df_annot_aux.to_csv("annot_aux.csv", index=False)



In [3]:
df_full_info = pd.read_csv("full_info.csv")
df_annotation = pd.read_csv("rec_annotation.csv")
df_annot_aux = pd.read_csv("annot_aux.csv")

In [4]:
df_full= pd.merge(df_full_info, df_annot_aux).set_index(["Patient number"])
df_full

Unnamed: 0_level_0,Recording index,Chest location,Acquisition mode,Recording equipment,Diagnosis,Age,Sex,Adult BMI,Child weight kg,Child height cm,...,Crackles_C28,Wheezes_C28,Crackles_C29,Wheezes_C29,Crackles_C30,Wheezes_C30,Crackles_C31,Wheezes_C31,Crackles_C32,Wheezes_C32
Patient number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101,1b1,Al,sc,Meditron,URTI,3.00,F,,19.0,99.0,...,0,0,0,0,0,0,0,0,0,0
101,1b1,Pr,sc,Meditron,URTI,3.00,F,,19.0,99.0,...,0,0,0,0,0,0,0,0,0,0
102,1b1,Ar,sc,Meditron,Healthy,0.75,F,,9.8,73.0,...,0,0,0,0,0,0,0,0,0,0
103,2b2,Ar,mc,LittC2SE,Asthma,70.00,F,33.00,,,...,0,0,0,0,0,0,0,0,0,0
104,1b1,Al,sc,Litt3200,COPD,70.00,F,28.47,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,1b2,Al,sc,Meditron,Healthy,10.00,F,,32.3,143.0,...,0,0,0,0,0,0,0,0,0,0
225,1b1,Pl,sc,Meditron,Healthy,0.83,M,,7.8,74.0,...,0,0,0,0,0,0,0,0,0,0
226,1b1,Al,sc,Meditron,Pneumonia,4.00,M,,16.7,103.0,...,0,0,0,0,0,0,0,0,0,0
226,1b1,Ll,sc,Meditron,Pneumonia,4.00,M,,16.7,103.0,...,0,0,0,0,0,0,0,0,0,0


## Try simple prediction
### Try Age, Sex, Chest location, Recording equipment to predict diagnosis

In [5]:
diag.groupby(["Diagnosis"]).count()

Unnamed: 0_level_0,Patient number
Diagnosis,Unnamed: 1_level_1
Asthma,1
Bronchiectasis,7
Bronchiolitis,6
COPD,64
Healthy,26
LRTI,2
Pneumonia,6
URTI,14


In [42]:
cat_attrs = ["Chest location", "Recording equipment"]
num_attrs = ["Age"]

# one NaN there
df_full["Sex"]  = df_full["Sex"].fillna("F")
df_full["Age"]  = df_full["Age"].fillna(df_full.Age.mean())

col_tr = ColumnTransformer([
    ("one_hot", OneHotEncoder(), cat_attrs),
    ("standard", StandardScaler(), num_attrs)
], remainder="drop")

label_enc = LabelEncoder()
label_enc.fit(diag["Diagnosis"])

# split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
# for train_ix, test_ix in split.split(df_full, df_full["Diagnosis"]):
#     df_train = df_full[train_ix]
#     df_test = df_full[test_ix]

df_train, df_test = train_test_split(df_full, test_size=0.1, random_state=42)

labels_train = label_enc.transform(df_train["Diagnosis"])
labels_test = label_enc.transform(df_test["Diagnosis"])

df_train.drop(["Diagnosis"], axis=1, inplace=True)
df_test.drop(["Diagnosis"], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [46]:
train_trans = col_tr.fit_transform(df_train)
test_trans = col_tr.fit_transform(df_test)

dectree_clf = DecisionTreeClassifier()
dectree_clf.fit(train_trans, labels_train)
pred = dectree_clf.predict(train_trans)
accuracy_score(labels_train, pred)

0.9698067632850241

In [47]:
confusion_matrix(labels_train, pred)

array([[  0,   0,   0,   0,   0,   0,   1,   0],
       [  0,  12,   0,   0,   0,   0,   0,   0],
       [  0,   0,  12,   0,   0,   0,   0,   1],
       [  0,   0,   0, 711,   0,   0,   4,   0],
       [  0,   0,   0,   0,  27,   0,   0,   3],
       [  0,   0,   0,   0,   0,   1,   0,   1],
       [  0,   0,   0,   4,   0,   0,  29,   0],
       [  0,   0,   5,   0,   4,   0,   2,  11]])

In [48]:
pred = dectree_clf.predict(test_trans)
accuracy_score(labels_test, pred)

0.9565217391304348

In [49]:
confusion_matrix(labels_test, pred)

array([[ 4,  0,  0,  0,  0],
       [ 0, 77,  0,  1,  0],
       [ 0,  0,  4,  0,  1],
       [ 0,  1,  0,  3,  0],
       [ 0,  0,  1,  0,  0]])

## Looks good. This might mean that it's viable to train a model to automatically annotate audio files and use the annotations for prediction of the diagnosis which is less complicated and resource intensive