# In [0]:
import os
import glob
import re
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.train import FloatList, Int64List, BytesList, Feature, Features, FeatureList, FeatureLists, SequenceExample

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import soundfile as sf

# Choose the output and input directories. The presence of the
# KAGGLE_DATA_PROXY_TOKEN environment variable selects the Kaggle paths;
# without it, paths relative to the current working directory are used.
if 'KAGGLE_DATA_PROXY_TOKEN' in os.environ:
    dir_out = "/kaggle/working/"
    dir_files = "/kaggle/input/respiratory-sound-database/Respiratory_Sound_Database/Respiratory_Sound_Database/"
else:
    dir_out = "./"
    dir_files = "Respiratory_Sound_Database/Respiratory_Sound_Database/"

# Directory that is searched for the *.wav recordings further down.
dir_audio = dir_files + "audio_and_txt_files/"

### Read WAVs and pad ###
    
# A recording file name starts with five fields separated by underscores;
# each field is captured by its own group, in this order:
#   1. three digits                       (used below as the patient number)
#   2. digit, lowercase letter, digit     (used below as the recording index)
#   3. one of the listed location codes   (used below as the chest location)
#   4. "sc" or "mc"
#   5. one of the listed equipment names
# For example, a name beginning with "101_1b1_Al_sc_Meditron" matches.
group_pat_num = r"([0-9]{3})"
group_rec_index = r"([0-9][a-z][0-9])"
group_chest_loc = r"(Tc|Al|Ar|Pl|Pr|Ll|Lr)"
group_acc_modes = r"(sc|mc)"
group_equipments = r"(AKGC417L|LittC2SE|Litt3200|Meditron)"

regex_info = re.compile(
    "_".join((
        group_pat_num,
        group_rec_index,
        group_chest_loc,
        group_acc_modes,
        group_equipments,
    ))
)

# Read the recordings found in dir_audio.
#
# Results:
#   fnames            sorted names of the *.wav files in dir_audio
#   l_wav_rec         one [pat_num, rec_index, chest_loc, samples] list per
#                     file that was read
#   dict_wav_rec      the same sample arrays, keyed by
#                     (pat_num, rec_index, chest_loc)
#   min_len, max_len  length of the shortest / longest recording that was read
#   min_* / max_*     identifying fields of those two recordings
#
# Raises FileNotFoundError when dir_audio contains no *.wav file and
# ValueError when a file name does not match regex_info.
top = os.getcwd()
os.chdir(dir_audio)

l_wav_rec = []
dict_wav_rec = {}
min_len = np.inf
max_len = 0

try:
    # glob gives no ordering guarantee; sorting makes the debug subset below
    # the same on every run and on every file system.
    fnames = sorted(glob.glob("*.wav"))
    if not fnames:
        raise FileNotFoundError("no *.wav files found in {!r}".format(dir_audio))

    # TODO: remove the [:3] slice to process every recording; it restricts
    # the run to the first three files while this cell is being developed.
    for fname in fnames[:3]:
        match_info = regex_info.match(fname)
        if match_info is None:
            # Fail with the offending name instead of an AttributeError on None.
            raise ValueError(
                "file name {!r} does not match the expected pattern {!r}".format(
                    fname, regex_info.pattern
                )
            )

        pat_num, rec_index, chest_loc, acc_mode, equipment = match_info.groups()
        pat_num = int(pat_num)

        # sf.read returns (data, samplerate); only the sample data is kept.
        wav_content = sf.read(fname)[0]
        l_wav_rec.append([pat_num, rec_index, chest_loc, wav_content])
        dict_wav_rec[(pat_num, rec_index, chest_loc)] = wav_content

        if len(wav_content) > max_len:
            max_len = len(wav_content)
            # for getting the corresponding annotation below
            max_patnum = pat_num
            max_recindex = rec_index
            max_chestloc = chest_loc

        if len(wav_content) < min_len:
            min_len = len(wav_content)
            # for getting the corresponding annotation below
            min_patnum = pat_num
            min_recindex = rec_index
            min_chestloc = chest_loc
finally:
    # Always return to the original working directory, even when reading
    # fails; otherwise a relative dir_audio no longer resolves on a re-run.
    os.chdir(top)

# Zero-pad the end of every recording so that all of them are max_len long.
# np.append returns a new array, so only the entries of l_wav_rec are
# replaced; the arrays stored in dict_wav_rec stay unpadded.
for wav_rec in l_wav_rec:
    n_missing = max_len - len(wav_rec[3])
    if n_missing > 0:
        wav_rec[3] = np.append(wav_rec[3], [0] * n_missing)

# Disabled alternative: pad every recording to a multiple of the length of
# the shortest recording instead.
# for i in range(len(l_wav_rec)):
#     if len(l_wav_rec[i][3]) % min_len != 0:
#         padding = [0] * ( min_len - len(l_wav_rec[i][3]) % min_len)
#         l_wav_rec[i][3] = np.append(l_wav_rec[i][3], padding)

# Order the recordings by patient number, then recording index, then chest
# location (the first three fields of each entry).
l_wav_rec.sort(key=lambda rec: tuple(rec[:3]))

# wav_cols = ["Patient number", "Recording index", "Chest location", "WAV"]
# df_wav_rec = pd.DataFrame(l_wav_rec, columns=wav_cols)

######

# for wav_rec in l_wav_rec:
#     wav_transposed = tf.reshape(tf.constant(l_wav_rec[0][3], dtype=tf.float32), shape=[-1, 1])

# Context features describing the first entry of l_wav_rec: its patient
# number as an int64 value, its recording index and chest location as
# ASCII-encoded byte strings.
first_rec = l_wav_rec[0]

feat_patnum = Feature(int64_list=Int64List(value=[first_rec[0]]))
feat_recix = Feature(bytes_list=BytesList(value=[first_rec[1].encode("ascii")]))
feat_chestloc = Feature(bytes_list=BytesList(value=[first_rec[2].encode("ascii")]))

features_context = Features(
    feature={
        "patient number": feat_patnum,
        "recording index": feat_recix,
        "chest location": feat_chestloc,
    }
)

# wav_transposed = tf.reshape(tf.constant(l_wav_rec[0][3], dtype=tf.float32), shape=[-1, 1])

# Wrap every sample of the first recording in l_wav_rec in its own
# single-value float Feature and collect them into one FeatureList.
fl = [
    Feature(float_list=FloatList(value=[sample]))
    for sample in l_wav_rec[0][3]
]
feat_list = FeatureList(feature=fl)

# The feature list is stored under the key
# "<patient number> <recording index> <chest location>".
fls = {"{} {} {}".format(*l_wav_rec[0][:3]): feat_list}
feat_lists = FeatureLists(feature_list=fls)

# Combine the context features and the sample sequence into one example.
se = SequenceExample(context=features_context, feature_lists=feat_lists)

print(se)