# Audio Preprocessing

# Importing libraries and .txt files

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import seaborn as sns
import librosa.display
import os

In [None]:
#mounting Google drive
import sys
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Upload data on drive and unzip it
!unzip "/content/drive/MyDrive/Mosaic23/Mosaic23_PS1_TrainData.zip" -d "/content/drive/MyDrive/Mosaic23/Unzip"

Archive:  /content/drive/MyDrive/Mosaic23/Mosaic23_PS1_TrainData.zip
replace /content/drive/MyDrive/Mosaic23/Unzip/ICBHI_final_database/145_2b2_Al_mc_AKGC417L.wav? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

# Preprocessing .txt files

In [None]:
import os
path='/content/drive/MyDrive/Mosaic23/Unzip/ICBHI_final_database/'
# Collect the base name (extension removed) of every .txt file in the database folder.
# endswith() only accepts names whose extension really is '.txt' (a substring test would
# also match e.g. 'x.txt.bak'), and splitext() strips just that final extension, so
# path + name + '.txt' always points back at the file that was listed.
# NOTE: the dataset's descriptive .txt files (e.g. 'filename_format') are picked up too.
files=[os.path.splitext(s)[0] for s in os.listdir(path) if s.endswith('.txt')]
files[:5]

['195_1b1_Ar_sc_Litt3200',
 '195_1b1_Ll_sc_Litt3200',
 '195_1b1_Lr_sc_Litt3200',
 '195_1b1_Pl_sc_Litt3200',
 '195_1b1_Pr_sc_Litt3200']

In [None]:
#splitting filename after every underscore
def getFilenameInfo(file):
    """Break a recording's base name into its underscore-separated fields.

    file: base file name without extension, e.g. '160_1b3_Al_mc_AKGC417L'.
    Returns the list of fields in their original order,
    e.g. ['160', '1b3', 'Al', 'mc', 'AKGC417L'].
    """
    fields = file.split('_')
    return fields

In [None]:
getFilenameInfo('160_1b3_Al_mc_AKGC417L')

['160', '1b3', 'Al', 'mc', 'AKGC417L']

#### Creating a dataframe with columns for the start time, end time, presence of crackles and wheezes (column `weezels`), patient ID, mode of collection and file name of the audio

In [None]:
# Build one table with a row per annotated breathing cycle. Each annotation file is a
# tab-separated table (start, end, crackles, weezels); every row is tagged with metadata
# parsed from the recording's file name.
files_data=[]
for file in files:
    data=pd.read_csv(path + file + '.txt',sep='\t',names=['start','end','crackles','weezels'])
    name_data=getFilenameInfo(file)
    data['pid']=name_data[0]       # first field of the file name: patient id
    data['mode']=name_data[-2]     # second-to-last field: acquisition mode
    data['filename']=file
    files_data.append(data)
# pd.concat keeps each file's own row labels, so labels restart at 0 for every file and
# are NOT unique row positions. The former bare `files_df.reset_index()` call returned a
# new frame that was discarded (a no-op), so it has been removed.
files_df=pd.concat(files_data)
files_df.head()

Unnamed: 0,start,end,crackles,weezels,pid,mode,filename
0,1.5629,3.4694,0.0,1.0,195,sc,195_1b1_Ar_sc_Litt3200
1,3.4694,5.4677,0.0,0.0,195,sc,195_1b1_Ar_sc_Litt3200
2,5.4677,7.5242,0.0,1.0,195,sc,195_1b1_Ar_sc_Litt3200
3,7.5242,9.8468,0.0,0.0,195,sc,195_1b1_Ar_sc_Litt3200
4,9.8468,11.134,0.0,0.0,195,sc,195_1b1_Ar_sc_Litt3200


#### Processing the files_df dataframe

In [None]:
files_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6207 entries, 0 to 11
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   start     6207 non-null   object 
 1   end       6110 non-null   float64
 2   crackles  6110 non-null   float64
 3   weezels   6110 non-null   float64
 4   pid       6207 non-null   object 
 5   mode      6207 non-null   object 
 6   filename  6207 non-null   object 
dtypes: float64(3), object(4)
memory usage: 387.9+ KB


In [None]:
files_df[pd.isnull(files_df).any(axis=1)]

Unnamed: 0,start,end,crackles,weezels,pid,mode,filename
0,Elements contained in the filenames:,,,,filename,filename,filename_format
1,"Patient number (101,102,...,226)",,,,filename,filename,filename_format
2,Recording index,,,,filename,filename,filename_format
3,"Chest location (Trachea (Tc), {Anterior (A), P...",,,,filename,filename,filename_format
4,Acquisition mode (sequential/single channel (s...,,,,filename,filename,filename_format
...,...,...,...,...,...,...,...
86,'224_1b1_Tc_sc_AKGC417L',,,,filename,filename,filename_differences
87,'224_1b2_Al_sc_AKGC417L',,,,filename,filename,filename_differences
88,'225_1b1_Pl_sc_AKGC417L',,,,filename,filename,filename_differences
89,'226_1b1_Al_sc_LittC2SE',,,,filename,filename,filename_differences


In [None]:
#Remove all rows containing NaN values and renumber the remaining rows 0..N-1.
#pd.concat kept each annotation file's own row labels (0, 1, 2, ... repeated per file),
#so without this reset the labels yielded by iterrows() cannot be used as row positions
#(e.g. with .iloc). Reassigning instead of inplace=True also keeps the cell re-runnable.
files_df=files_df.dropna().reset_index(drop=True)

In [None]:
# 'start' was parsed as object (text) because of the non-numeric rows; cast it to float
files_df = files_df.astype({'start': float})

In [None]:
files_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6110 entries, 0 to 11
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   start     6110 non-null   float64
 1   end       6110 non-null   float64
 2   crackles  6110 non-null   float64
 3   weezels   6110 non-null   float64
 4   pid       6110 non-null   object 
 5   mode      6110 non-null   object 
 6   filename  6110 non-null   object 
dtypes: float64(4), object(3)
memory usage: 381.9+ KB


In [None]:
files_df.head()

Unnamed: 0,start,end,crackles,weezels,pid,mode,filename
0,1.5629,3.4694,0.0,1.0,195,sc,195_1b1_Ar_sc_Litt3200
1,3.4694,5.4677,0.0,0.0,195,sc,195_1b1_Ar_sc_Litt3200
2,5.4677,7.5242,0.0,1.0,195,sc,195_1b1_Ar_sc_Litt3200
3,7.5242,9.8468,0.0,0.0,195,sc,195_1b1_Ar_sc_Litt3200
4,9.8468,11.134,0.0,0.0,195,sc,195_1b1_Ar_sc_Litt3200


In [None]:
#saving the dataframe as csv for future use
files_df.to_csv('files_df.csv')
!cp files_df.csv "/content/drive/MyDrive/Mosaic23/"

# Audio Preprocessing

In [None]:
#make new directory if not already present
#exist_ok=True makes re-running this cell a no-op (os.mkdir raised FileExistsError),
#and makedirs also creates any missing parent directories
os.makedirs('/content/drive/MyDrive/Mosaic23/processed_audio_files', exist_ok=True)

In [None]:
#Cuts one time window out of an audio signal.
#raw_data=sequence of audio samples, start/end=window bounds in seconds, sr=sampling rate

def getPureSample(raw_data,start,end,sr=22050):
    """Return the samples of raw_data lying between `start` and `end` seconds.

    Both times are converted to sample indices (time * sr, truncated to int)
    and capped at len(raw_data), so a window reaching past the end of the
    recording is simply shortened.
    """
    total = len(raw_data)
    first, last = (min(int(t * sr), total) for t in (start, end))
    return raw_data[first:last]

In [None]:
# Peek at what iterrows() yields: an (index label, row Series) pair. Only the first pair
# is shown; nothing is printed when the frame is empty.
first_pair = next(files_df.iterrows(), None)
if first_pair is not None:
    index, row = first_pair
    print("Index ->",index)
    print("Data->\n",row)

Index -> 0
Data->
 start                       1.5629
end                         3.4694
crackles                       0.0
weezels                        1.0
pid                            195
mode                            sc
filename    195_1b1_Ar_sc_Litt3200
Name: 0, dtype: object


#### Preprocess all the audios in the dataset using a for loop.
First we read the start and end times of each breathing cycle and then load the corresponding audio using its filename.
<p>Then we bring every clip to a fixed length (6 seconds): clips shorter than that are padded and longer clips are clipped.</p>
<p>The clips are then saved in a new folder called processed_audio_files, where they will be used for training a neural network.</p>

In [None]:
import librosa as lb
import soundfile as sf

# Cut every annotated breathing cycle out of its recording, force it to a fixed length
# and save it as <recording>_<cycle number>.wav in the processed-audio folder.
maxLen=6   # target clip length in seconds
out_dir='/content/drive/MyDrive/Mosaic23/processed_audio_files/'
i,c=0,0    # i = cycle number within the current recording, c = clips written so far
prev_filename=None
audioArr,sampleRate=None,None
for index,row in files_df.iterrows():
    start=row['start']
    end=row['end']
    filename=row['filename']

    #If len > maxLen , change it to maxLen
    if end-start>maxLen:
        end=start+maxLen

    # Rows of one recording are contiguous, so compare with the previous row's filename
    # directly. The iterrows() label must not be used as a position (files_df.iloc[index-1]):
    # labels can restart for every annotation file, which produced wrong cycle numbers and
    # made clips overwrite each other.
    if filename==prev_filename:
        i+=1
    else:
        i=0
        # Load each recording once instead of once per cycle
        audioArr,sampleRate=lb.load(path + filename + '.wav')
    prev_filename=filename

    save_path=out_dir + filename + '_' + str(i) + '.wav'
    c+=1

    pureSample=getPureSample(audioArr,start,end,sampleRate)

    #pad audio if pureSample len < max_len
    reqLen=maxLen*sampleRate
    # pad_center raises if the input is longer than the target, so trim any extra sample
    # caused by float rounding; `size` must be passed by keyword in current librosa.
    padded_data = lb.util.pad_center(pureSample[:reqLen], size=reqLen)

    sf.write(file=save_path,data=padded_data,samplerate=sampleRate)
print('Total Files Processed: ',c)

Total Files Processed:  6110


In total we processed 6110 files.