# Import bibliotek

In [32]:
import os
import shutil
import librosa
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import IPython.display as ipd
from PIL import Image

Do zbudowania naszego zbioru danych wykorzystamy trzy zbiory:
*  KAUH
*  ICBHI
*  Covid-19

In [33]:
# ścieżki do katalogów z danymi
RAW_DATA_PATH = "./Raw_datasets/"
KAUH_DATA_PATH = f"./{RAW_DATA_PATH}/KAUH_Lung_files"
ICBHI_DATA_PATH = f"./{RAW_DATA_PATH}/Respiratory_Sound_Database"
COVID_19_DATA_PATH = f"./{RAW_DATA_PATH}/covid-19_dataset"

In [34]:
# Sprawdzanie poprawności ścieżek folderów
print("RAW DATA: ", os.path.exists(RAW_DATA_PATH))
print("KAUH: ", os.path.exists(KAUH_DATA_PATH))
print("ICBHI: ", os.path.exists(ICBHI_DATA_PATH))
print("COVID-19: ", os.path.exists(COVID_19_DATA_PATH))

RAW DATA:  True
KAUH:  True
ICBHI:  True
COVID-19:  True


# Przetwarzanie zbioru ICBHI

### Wczytanie danych o chorobach pacjentów

In [None]:
patient_diseases_info = pd.read_csv(f'./{ICBHI_DATA_PATH}/patient_diagnosis.csv')
patient_diseases_info.info()

In [None]:
unique_diseases_icbhi = patient_diseases_info['Diagnosis'].unique()
unique_diseases_icbhi = sorted(unique_diseases_icbhi)
print(f'Unikalne nazwy chorób płucnych: {unique_diseases_icbhi}')

In [None]:
number_of_diseases = patient_diseases_info.groupby(['Diagnosis']).count()
print(number_of_diseases.sort_values(by=['PatientID'],ascending=False).rename(columns = {'PatientID':'Ilość pacjentów:'}))
plt.figure(figsize=(10,8))
plt.pie(number_of_diseases['PatientID'],labels=number_of_diseases.index,autopct='%1.1f%%')
plt.title("Ilość % chorób w zbiorze ICBHI")
plt.show()

### Segregacja plików audio ze względu na choroby 

In [None]:
#Tworzenie głównego folderu z danymi
if not os.path.exists('./data'):
    print("TWORZENIE KATALOGU GŁÓWNEGO")
    os.mkdir("./data")

#Tworzenie folderu dla audio
if not os.path.exists('./data/audio'):
    print("TWORZENIE KATALOGU DLA AUDIO")
    os.mkdir("./data/audio")

#Tworzenie podfolderów audio
for diseas in unique_diseases_icbhi:
    if not os.path.exists(f"./data/audio/{diseas}"):
        print(f"TWORZENIE PODKATALOGU DLA AUDIO CHOROB : {diseas}")
        os.mkdir(f"./data/audio/{diseas}")


In [None]:
#Segregacja plików audio ze względu na choroby
for diseas in unique_diseases_icbhi:
    print(f'CHOROBA {diseas}')

    #Szukanie po chorobie
    patient_list = patient_diseases_info[patient_diseases_info['Diagnosis'] == diseas]
    print(patient_list)
    
    for patient_id in patient_list['PatientID']:
        patient_files = glob.glob(f'{ICBHI_DATA_PATH}/audio_and_txt_files/{patient_id}*.wav')

        print(f'Pliki pacjenta: {patient_files}')

        for patient_file in patient_files:
            print("KOPIOWANIE: ",patient_file)
            shutil.copy(patient_file, f'./data/audio/{diseas}')

print("Pliki zostały posegregowane...")


In [None]:
#Tworzenie folderu dla spectogramów
if not os.path.exists('./data/spectograms'):
    print("TWORZENIE KATALOGU DLA SPEKTOGRAMÓW")
    os.mkdir("./data/spectograms")

#Tworzenie podfolderów spectogramów
for diseas in unique_diseases_icbhi:
    if not os.path.exists(f"./data/spectograms/{diseas}"):
        print(f"TWORZENIE PODKATALOGU DLA SPEKTOGRAMÓW CHOROBY : {diseas}")
        os.mkdir(f"./data/spectograms/{diseas}")

# Etapy tworzenia spektogramów dla pliku audio

In [None]:
audio_files = glob.glob('../lung_diseases_detection/data/audio/*/*.wav')
audio_files[1]

In [None]:
# Plik audio
ipd.Audio(audio_files[1])

In [None]:
# Wczytanie pliku
wave, sr = librosa.load(audio_files[1])

print(f'Wave: {wave[:10]}')
print(f'shape Wave: {wave.shape}')
print(f'SAMPLE RATE: {sr}')

In [None]:
#Surowy przebieg pliku
pd.Series(wave).plot(figsize=(10, 5),
                  lw=1,
                  title='Przebieg dźwięku')
plt.ylabel('Amplituda')
plt.xlabel('Czas')
plt.show()


In [None]:
# Korekta ciszy dźwięku w audio
wave_trimmed, _ = librosa.effects.trim(wave, top_db=40)
pd.Series(wave_trimmed).plot(figsize=(10, 5),
                  lw=1,
                  title='Korekta ciszy w dźwięku')
plt.ylabel('Amplituda')
plt.xlabel('Czas')
plt.show()

In [None]:
# Przybliżony przebieg dźwięku
pd.Series(wave[30000:30500]).plot(figsize=(10, 5),
                  lw=1,
                  title='Przybliżony przebieg dźwięku')
plt.ylabel('Amplituda')
plt.xlabel('Czas')
plt.show()

In [None]:
# Tranformata Fouriera
D = librosa.stft(wave)
S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)
S_db.shape


In [None]:
# Spektogram z dźwięku
fig, ax = plt.subplots(figsize=(10, 5))
img = librosa.display.specshow(S_db,
                              x_axis='time',
                              y_axis='log',
                              ax=ax)
ax.set_title('Spectogram przykład', fontsize=20)
fig.colorbar(img, ax=ax, format=f'%0.2f db')
plt.show()

In [None]:
# Tworzenie spektogramu mel
mel_spec = librosa.feature.melspectrogram(y=wave,
                                   sr=sr,
                                   n_mels=128 * 2,)
S_db_mel = librosa.amplitude_to_db(mel_spec, ref=np.max)

In [None]:
fig, ax = plt.subplots(figsize=(10, 5))
# Spektogram mel
img = librosa.display.specshow(S_db_mel,
                              x_axis='time',
                              y_axis='log',
                              ax=ax)
ax.set_title('Mel Spectogram przykład', fontsize=20)
fig.colorbar(img, ax=ax, format=f'%0.2f')
plt.show()

## Tworzenie dataset'u

In [None]:
SAMPLE_RATE = 24000
CHUNK_LENGTH = 3

In [None]:
def save_spectogram(spectogram,class_name,file, index = 0):
    file_name = file[:-4]
    #print(file_name)

    #Zapis spektogramu
    saved_path = os.path.join(f'./data/spectograms/{class_name}/{file_name}_{index}.png')
    print(saved_path)

    image = Image.fromarray(spectogram * 255).convert("L")
    image.save(saved_path)

In [None]:
def get_mel_spectograms(filepath,sample_rate=22000,image_shape=(224,224)):
    #Wczytanie pliku audio
    wave, rate = librosa.load(path = filepath, sr = sample_rate)

    #Podział dźwięku na odcinki
    signal_splits = []

    for i in range(0,len(wave),int(CHUNK_LENGTH * SAMPLE_RATE)):
        chunk = wave[i:i + int(CHUNK_LENGTH * SAMPLE_RATE)]

        if len(chunk) < int(CHUNK_LENGTH * SAMPLE_RATE):
            break
        
        signal_splits.append(chunk)

    #Tworzenie mel spektogramu dla kazdego z odcinka
    ready_mel_spectograms = []

    for chunk in signal_splits:

        mel_spectogram = librosa.feature.melspectrogram(y = chunk,
                                                        sr = rate,
                                                        n_fft = 1024,
                                                        hop_length = 512,
                                                        n_mels = image_shape[0] * 1,
                                                        fmin = 100,
                                                        fmax = 24000)
        
        mel_spectogram = librosa.amplitude_to_db(mel_spectogram, ref = np.max)
        #Normalizacja
        mel_spectogram -= mel_spectogram.min()
        mel_spectogram /= mel_spectogram.max()

        #Dodanie melspektogramu do listy
        ready_mel_spectograms.append(mel_spectogram)

    return np.array(ready_mel_spectograms)

In [None]:
def prepare_spectograms():

    class_names = os.listdir('./data/audio')
    print(f"Nazwy klas: {class_names}")

    for class_name in class_names:
        files_disease = os.listdir(f'./data/audio/{class_name}')

        for file in files_disease:
            print(f"{class_name} - > {file}")

            mel_spectograms = get_mel_spectograms(f'./data/audio/{class_name}/{file}', sample_rate = SAMPLE_RATE)

            
            #Zapis spektogramów
            for idx, spec in enumerate(mel_spectograms):
                save_spectogram(spectogram = spec, class_name = class_name, file = file,index = idx)            
            

    print("SPEKTOGRAMY ZOSTAŁY PRZYGOTOWANE...")

In [None]:
test = get_mel_spectograms('./data/audio/Bronchiectasis/111_1b2_Tc_sc_Meditron.wav')

fig, ax = plt.subplots(figsize=(10, 5))
# Spektogram mel
img = librosa.display.specshow(test[0],
                              x_axis='time',
                              y_axis='log',
                              ax=ax)
ax.set_title('Mel Spectogram Example', fontsize=20)
fig.colorbar(img, ax=ax, format=f'%0.2f')
plt.show()

## Przygotowanie spektogramów z plików audio

In [None]:
prepare_spectograms()

 # Przygotowanie danych ze zbioru KAUH

In [None]:
kauh_dataset = pd.read_excel(f'{KAUH_DATA_PATH}/Data annotation.xlsx',usecols=range(5))
kauh_dataset.info()

In [None]:
unique_diseases_kauh = kauh_dataset['Diagnosis'].unique()

print(unique_diseases_kauh)

In [None]:
diseases_kauh_count = kauh_dataset.groupby('Diagnosis').count()
print(diseases_kauh_count['Age'].sort_values(ascending=False))

## Segregowanie danych

In [None]:
kauh_disease_dir = {'Asthma': ['Asthma','Asthma and lung fibrosis','asthma'],
                    'Bronchiectasis' : ['BRON'],
                    'COPD': ['COPD','copd'],
                    'Heart_Failure': ['heart failure','Heart Failure','Heart Failure + COPD'],
                    'Healthy': ['N'],
                    'Pneumonia' : ['pneumonia']}

kauh_disease_file_list = {'Asthma': [],
                        'Bronchiectasis' : [],
                        'COPD': [],
                        'Heart_Failure': [],
                        'Healthy': [],
                        'Pneumonia' : []}

In [None]:
import re

kauh_files = os.listdir(f'{KAUH_DATA_PATH}/Audio Files/')

#print(kauh_files)

for key_category, diseases_list in kauh_disease_dir.items():
    print(f'Category -> {key_category}')

    for disease in diseases_list:
        print(f'\t -> : {disease}')

        #Segregowanie plików audio ze względu na nazwy
        find_names = []
        
        for file in kauh_files:
            is_find = re.search(f"_{disease},",file)

            if is_find != None:
                print(f' --> {file}')
                kauh_disease_file_list[key_category].append(file)

#for key_category, diseases_list in kauh_disease_file_list.items():
#   print(diseases_list)

## Tworzenie dataset'u

In [None]:
# Tworzenie nowych podfolderów
for diseas in kauh_disease_dir.keys():
    if not os.path.exists(f"./data/audio/{diseas}"):
        print(f"TWORZENIE PODKATALOGU DLA SPEKTOGRAMÓW CHOROBY : {diseas}")
        os.mkdir(f"./data/audio/{diseas}")

In [None]:
# Kopiowanie plików do odpowiednich folderów

for key_category, diseases__file_list in kauh_disease_file_list.items():

    for file in diseases__file_list:

        print(f'COPY: {KAUH_DATA_PATH}/Audio Files/{file} -> ./data/audio/{key_category}')
        shutil.copy(src = f'{KAUH_DATA_PATH}/Audio Files/{file}', dst = f'./data/audio/{key_category}')

### Tworzenie spektogramów

In [None]:
prepare_spectograms()

# Przygotowanie danych dla zbioru Covid-19

In [48]:
covid_19_dataset = pd.read_csv(f'{COVID_19_DATA_PATH}/metadata_compiled.csv')
covid_19_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27550 entries, 0 to 27549
Data columns (total 51 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   uuid                   27550 non-null  object 
 1   datetime               27550 non-null  object 
 2   cough_detected         27550 non-null  float64
 3   SNR                    27550 non-null  float64
 4   latitude               16084 non-null  float64
 5   longitude              16084 non-null  float64
 6   age                    15218 non-null  float64
 7   gender                 16224 non-null  object 
 8   respiratory_condition  16224 non-null  object 
 9   fever_muscle_pain      16224 non-null  object 
 10  status                 16224 non-null  object 
 11  quality_1              820 non-null    object 
 12  cough_type_1           802 non-null    object 
 13  dyspnea_1              820 non-null    object 
 14  wheezing_1             820 non-null    object 
 15  st

In [49]:
# Pobranie kolumn z indetyfikatorem pliku oraz statusem
covid_19_dataset_prep = covid_19_dataset[['uuid','status']]

In [50]:
# Usuwanie pustych krotek
print("Wyświetlenie ilość poszczególnych przypadków:")
print(covid_19_dataset_prep.groupby('status').count())
print("\nIlość danych w których pole status jest puste:")
print(covid_19_dataset_prep.isnull().sum())

covid_19_dataset_prep = covid_19_dataset_prep.dropna()

print("\nPo usunięciu pustych pól:")
print(covid_19_dataset_prep.isnull().sum())

Wyświetlenie ilość poszczególnych przypadków:
              uuid
status            
COVID-19      1155
healthy      12479
symptomatic   2590

Ilość danych w których pole status jest puste:
uuid          0
status    11326
dtype: int64

Po usunięciu pustych pól:
uuid      0
status    0
dtype: int64


In [38]:
# Wyłuskanie unikalnych nazw statusu
covid_19_labels_diseases = covid_19_dataset_prep['status'].unique()

print(covid_19_labels_diseases)

['healthy' 'COVID-19' 'symptomatic']


In [39]:
# Tworzenie podfolderów
for diseas in covid_19_labels_diseases:
    diseas = diseas.capitalize()
    if not os.path.exists(f"./data/audio/{diseas}"):
        print(f"TWORZENIE PODKATALOGU DLA AUDIO CHOROBY : {diseas}")
        os.mkdir(f"./data/audio/{diseas}")

In [40]:
# Zmiany nazw wartości
covid_19_dataset_prep.loc[covid_19_dataset_prep["status"] == "healthy","status"] = "Healthy"
covid_19_dataset_prep.loc[covid_19_dataset_prep["status"] == "symptomatic","status"] = "Symptomatic"

In [47]:
# Ograniczenie liczby plików
liczba_graniczna = 400

covid_19_Healthy = covid_19_dataset_prep[covid_19_dataset_prep["status"] == "Healthy"][:liczba_graniczna]
covid_19_Symptomatic = covid_19_dataset_prep[covid_19_dataset_prep["status"] == "Symptomatic"][:liczba_graniczna]
covid_19_Covid_19 = covid_19_dataset_prep[covid_19_dataset_prep["status"] == "COVID-19"][:liczba_graniczna]
print(covid_19_Covid_19)
print(len(covid_19_Covid_19))

covid_19_dataset_prep = pd.DataFrame(covid_19_Healthy,covid_19_Symptomatic,covid_19_Covid_19)
print(len(covid_19_dataset_prep))

KeyError: 'status'

In [None]:
covid_19_pre_ready = covid_19_dataset_prep

# Kopiowanie plików
for index, row in covid_19_pre_ready.iterrows():
    print(row['uuid'], row['status'])

    try:
        file_name = f"{row['uuid']}.webm"
        
        print(f"COPY : {COVID_19_DATA_PATH}/{file_name} -> ./data/audio/{row['status']}/{file_name}")
        shutil.copy(src = f'{COVID_19_DATA_PATH}/{file_name}', dst = f'./data/audio/{row["status"]}/')
    except FileNotFoundError:

        try:
            file_name = f"{row['uuid']}.ogg"
        
            print(f"COPY : {COVID_19_DATA_PATH}/{file_name} -> ./data/audio/{row['status']}/{file_name}")
            shutil.copy(src = f'{COVID_19_DATA_PATH}/{file_name}', dst = f'./data/audio/{row["status"]}/')
        except Exception:
            pass