### Check all the wav files in the local data directory

In [1]:
import os
import glob
from tqdm import tqdm
from os import path
import pandas as pd
from datasets import load_dataset, Audio
from datasets import Dataset

from audio_utils import (check_channels,
                        check_max_length,
                        check_sampling_rate)

In [2]:
# --- where the data lives ---
DATA_DIR = "atco2_orig"  # directory scanned for audio files; also holds the csv
ATCO2_CSV = "atco2.csv"  # csv file name, resolved relative to DATA_DIR
EXT = "wav"  # extension of the audio files to check

# --- properties every audio file is expected to have ---
SAMPLING_RATE = 16000  # expected sampling rate
MAX_LENGTH = 30  # maximum duration, in seconds
N_CHANNELS = 1  # expected number of channels: 1 = mono

In [3]:
def remove_path(f_name):
    """Return the part of `f_name` that follows its last "/".

    A string containing no "/" is returned unchanged; a string ending
    with "/" yields an empty string.
    """
    # rpartition splits on the last "/" only; the third element is the tail
    _, _, base_name = f_name.rpartition("/")

    return base_name

In [4]:
# load the csv stored inside DATA_DIR
csv_file = path.join(DATA_DIR, ATCO2_CSV)
df_atco2 = pd.read_csv(csv_file)

# keep only the bare file name in the 'path' column (drop the old directory part)
df_atco2['path'] = df_atco2['path'].apply(remove_path)

# every file in DATA_DIR that has the expected extension
wav_pattern = path.join(DATA_DIR, "*." + EXT)
list_wav = glob.glob(wav_pattern)

In [5]:
# Run the three checks (sampling rate, max length, channels) on every wav
# file and count how many files fail each of them.
n_ko_srate = n_ko_mlength = n_ko_channels = 0

for f_name in tqdm(list_wav):
    # sampling rate: a falsy result from the helper counts as a failure
    srate_ok = check_sampling_rate(f_name, SAMPLING_RATE)
    if not srate_ok:
        print(f"WARN: {f_name} has not {SAMPLING_RATE} as sampling rate...")
        n_ko_srate += 1

    # duration against MAX_LENGTH
    length_ok = check_max_length(f_name, MAX_LENGTH)
    if not length_ok:
        print(f"WARN: {f_name} exceed {MAX_LENGTH} sec. as duration...")
        n_ko_mlength += 1

    # number of channels against N_CHANNELS (mono expected)
    channels_ok = check_channels(f_name, N_CHANNELS)
    if not channels_ok:
        print(f"WARN: {f_name} is NOT mono...")
        n_ko_channels += 1

# blank line between the per-file warnings and the summary
print()

# summary: one line per check
if n_ko_srate:
    print(f"There are {n_ko_srate} files with unexpected sampling rate...")
else:
    print(f"OK: All files have {SAMPLING_RATE} as sampling rate...")

if n_ko_mlength:
    print(f"There are {n_ko_mlength} file/s exceeding {MAX_LENGTH} s....")
else:
    print(f"OK: All files have length less than {MAX_LENGTH}...")

if n_ko_channels:
    print(f"There are {n_ko_channels} not mono....")
else:
    print("OK: All files are mono...")

100%|███████████████████████████████████████████████████████████████████████████████████████| 560/560 [00:00<00:00, 1143.23it/s]

Duration : 32.8 s.
WARN: atco2_orig/LSZB_BERN_Approach_127_3MHz_20210416_090122.wav exceed 30 sec. as duration...

OK: All files have 16000 as sampling rate...
There are 1 file/s exceeding 30 s....
OK: All files are mono...





In [6]:
# check that the two lists (csv rows, wav files in dir) have the same length
assert df_atco2.shape[0] == len(list_wav), (
    f"csv has {df_atco2.shape[0]} rows but {len(list_wav)} wav files were found"
)

# check that for every row of the csv there is a wav file on disk
df_paths = list(df_atco2['path'].values)

# set membership is O(1) per row, instead of scanning the whole list each time
wav_files = set(list_wav)

not_found = 0
for my_path in tqdm(df_paths):
    # re-add the directory (old paths have been removed); path.join builds the
    # name the same way as the glob pattern that produced list_wav
    new_path = path.join(DATA_DIR, my_path)
    if new_path not in wav_files:
        # print the missing file path (not `path`, which is the os.path module)
        print(f"{new_path} not found")
        not_found += 1

print()
if not_found == 0:
    print("All files match with csv...")
else:
    # the loop walks the csv rows, so misses are csv entries lacking a wav file
    print(f"{not_found} csv entries have no matching wav file...")

100%|█████████████████████████████████████████████████████████████████████████████████████| 560/560 [00:00<00:00, 101785.85it/s]


All files match with csv...



