<a href="https://colab.research.google.com/github/pethodoma/BME-DeepLearning-BirdCLEF_2023/blob/main/preprocess_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install pandas numpy librosa matplotlib tqdm soundfile gdown



In [2]:
# import packages
import pandas as pd
import numpy as np
import os
import librosa.display
import librosa
import matplotlib.pyplot as plt
import random
import math
import json
import time
from tqdm import tqdm
import soundfile as sf
import gdown
import zipfile

There around 17000 audiofiles in the provided training dataset. About 2000 of them have multiple labels. We decided to exclude them from the training dataset for a simpler input for the neural network and because there is plenty of data at hand. Also, in the metadata of the files there are ratings describing the quality of the sound files. We decided to throw away the files with low ratings.

In [3]:
def read_file_paths(main_directory):
    main_directory = main_directory
    file_paths = []

    # go through all folders and get the paths of all .ogg audio files
    for root, directories, files in os.walk(main_directory):
        for file in files:
            if file.endswith('.ogg'):
                file_path = os.path.join(root, file)
                file_paths.append(file_path)

    # os.walk may not go in alphabetical order thus it needs to be sorted
    file_paths.sort()
    return file_paths

# Function to create the variables and files that log the data preprocessing, and the file for the final samples
def create_logs(filelog_csv_name, sample_counts_json_name, output_folder, child_folders):
    # Creating or reading the variable logging the preprocessing
    if os.path.exists(filelog_csv_name):
        filelog = pd.read_csv(filelog_csv_name).values.tolist()
        print(len(filelog))
    else:
        filelog = []
    # Creating or reading the variable that keeps track of the sample counts for each bird species
    if os.path.exists(sample_counts_json_name):
        with open(sample_counts_json_name, 'r') as json_file:
            sample_counts = json.load(json_file)
    else:
        sample_counts = {}
    # Creating the folder for the final samples
    if not os.path.exists(output_folder):
        os.mkdir(output_folder)
    for child_folder in child_folders:
        if not os.path.exists(os.path.join(output_folder, child_folder)):
            os.mkdir(os.path.join(output_folder, child_folder))

    return filelog, sample_counts

def cleandata(df):

    df = pd.read_csv('train_metadata.csv')
    print("Cleaning data...")

    # delete data with low rating (rating in interval [0.0, 0.5]) from dataframe
    # get the names of the files with a rating of 0.0 or 0.5 - these are the files we want to get rid of
    poorRatingFilenames = df.loc[df['rating'].isin([0.0, 0.5])]['filename'].values.tolist()

    rows_to_delete = df.loc[df['rating'].isin([0.0, 0.5])].index
    df.drop(rows_to_delete, inplace=True)

    # the folder which contains the audio files
    trainpath='train_audio'

    # creating an array of the files to delete with the full path
    FilesToDelete = [os.path.join(subdir, file) for subdir, dirs, files in os.walk(trainpath) for file in files if os.path.basename(subdir)+'/'+file in poorRatingFilenames]

    # delete files with more than 1 labels
    rows_with_multiple_labels = df[df['secondary_labels'].apply(lambda x: len(x) > 2)]
    #print(rows_with_multiple_labels)
    df.drop(rows_with_multiple_labels.index, axis=0, inplace=True)
    files_with_multiple_labels = rows_with_multiple_labels['filename']
    file_array = np.array(files_with_multiple_labels)

    for i in range(len(file_array)):
        file_path = (os.path.join("train_audio/" , file_array[i]))
        FilesToDelete.append(file_path)

    # also delete files that are shorter than 1 sec
    # it takes a lot of time (about 45 mins) to search these files so we built it in the code
    short_files = ["categr/XC368933.ogg","categr/XC368934.ogg","eubeat1/XC647701.ogg","gargan/XC310912.ogg","gobbun1/XC200993.ogg","greegr/XC338469.ogg","litegr/XC147857.ogg","piekin1/XC601791.ogg","rerswa1/XC191112.ogg","strher/XC255388.ogg"]
    for i in range(len(short_files)):
        df.drop(df[df['filename'] == short_files[i]].index, axis=0, inplace=True)
        file_path = os.path.join("train_audio/" , short_files[i])
        if os.path.exists(file_path):
            FilesToDelete.append(file_path)
    FilesToDelete = np.unique(FilesToDelete)

    ## if there aren't any files to delete, then we dont need to do anything - assuming the data path is right
    if len(FilesToDelete) == 0:
        print("Data has already been cleaned")
    else:
        count = sum(1 for file in FilesToDelete if os.remove(file) is None)
        print(f"Deleted {count} files")

    df.to_csv('train_metadata.csv', index=False)
    return df



To make all the inputs to the same size we cut the bird sounds from the original audio files by thresholding the energy. We create the melspectrogram from the audio file and then for every timestep we calculate the energy. If this energy is greater than the threshold value we consider it as a bird sound assuming that the bird sounds are much louder than the background thus they have more energy. By modifying the threshold value we found that the optimum was at 10 times of the median of the energies of all the time steps.

Sometimes birds can make sounds with concatenated chirps that result in longer sequences. We don't want to separate these sequences thus we defined a 0.5 sec time window as a no sound period. If another chirp is present in this no sound period the algorithm handles it as one sequence and concatenates the incoming chirp's time to the sequence.

After that is done, the intervals are passed on to the cut_windows function, which cuts out the equally sized time windows from the original file around the chirp. We have chosen these time windows to be 5 seconds long, as even the longest chirp sequences can fit into this time frame, while it still does not include too much unneccesary data for shorter sequences. The windows are cut out in a way that the individual chirp is always placed into a different part of the window to introduce some variety to the data. The data for the created samples are logged in a pandas dataframe and then saved to a csv file. The number of samples for each secies is logged in a dictionary, which is than saved to a JSON file.

Once that is done, it is investigated whether there is a sufficient number of samples for every species. If not, than a combination of two common audio augmentation methods is used to augment the data. The random gain method changes the volume of the segment randomly, while time shifting shifts the position of the features within the time window. These methods are done by extracting the raw chirps from the corresponding files, randomly modifying their volume and then placing them to a random position within a sample of a 5 seconds long white noise segment.

The functions for these operations are defined below.

In [4]:
# Function for retrieving the positions of the individual chirps within the audiofile as spectrogram indices
def get_intervals(spectrogram, duration, length):
    # get energies of all time steps
    frame_energies = []
    for i, frame in enumerate(spectrogram.T):
        frame_energy = sum(frame)
        frame_energies.append(frame_energy)
    # set threshold to optimal value
    threshold = 10 * np.median(np.array(frame_energies))
    segments = []
    intervals = []
    segment_start = 0
    # Search for chirps with thresholding method and make a list of their start and end times
    # It saves the start adn end times as melspectrogram time indices
    for i, frame in enumerate(spectrogram.T):
        frame_energy = sum(frame)
        if frame_energy > threshold:
            if not segments:
                segment_start = i
            segments.append(i)  # We keep this from an example code
                                # We don't use it but somehow it optimizes the runtime
        elif segments:
            segment_end = i
            intervals.append([segment_start,segment_end])
            segments = []
    # Concatenating chirps to sequences within no sound time
    min_length = 0.5/duration * length
    for i in range(len(intervals)-1,0,-1):
        if intervals[i][0]-intervals[i-1][1] < min_length:
            intervals[i-1][1] = intervals[i][1]
            del intervals[i]
    return intervals

# Adjusting the start and end ndices in a way that th window 'shifts' around the chirp within a file, makes the data more diverse
def adjust_interval(interval_index, num_intervals, window_len, start, end):
    position_factor = interval_index / (num_intervals - 1) if num_intervals > 1 else 0
    adjusted_start = int(start + position_factor * (end - start) - position_factor * window_len)
    adjusted_end = adjusted_start + window_len
    return adjusted_start, adjusted_end

# Cutting the segments with the adjusted start and end indices
def cut_segments(y, sr, interval, window_len, noise_level, interval_index, num_intervals):
    # If no chirps werel located in the file (because it was too noisy)
    if not interval:
        return None
    # Converting time indices to indices in the waveform
    start = interval[0] * sr
    end = interval[1] * sr
    window_len *= sr
    chirp_len = end - start
    # If chirp indices were out of range return none (bad interval)
    if chirp_len > window_len:
        return None
    # If file is shorter than window size we pad it with artificial noise
    elif len(y) < window_len:
        noise = np.random.normal(scale=noise_level, size=(window_len - len(y)))
        final_sample = np.concatenate((y, noise))
    # Cutting out the windows
    else:
        adjusted_start, adjusted_end = adjust_interval(interval_index, num_intervals, window_len, start, end)
        final_sample = y[adjusted_start:adjusted_end]
        # If because the index conversions the final waveform is a few frames shoter than the window we pad it with artificial noise
        if len(final_sample) < window_len:
            noise = np.random.normal(scale=noise_level, size=(window_len - len(final_sample)))
            final_sample = np.concatenate((final_sample, noise))
    return final_sample


def create_samples(input_file, output_folder, sample_counts, filelog, window_len):
    # Basic informations about the file
    bird = input_file.split(os.path.sep)[-2]
    filename = input_file.split(os.path.sep)[-1]
    overall_bird_num = sample_counts.get(bird)
    # Reading the file
    y, sr = librosa.load(input_file, sr=None)
    # Creating the melspectrogram
    spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
    # Duration in seconds
    duration = librosa.get_duration(y=y, sr=sr)
    # Spectrogram length
    length = spectrogram.shape[1]
    # Noise level for padding shorter segments with noise
    noise_level = np.sqrt(np.mean(y**2))
    # Retrieving the indices of the chirp intervals (in spectragram indices)
    intervals = get_intervals(spectrogram, duration, length)
    for i, interval in enumerate(intervals):
        # Converting the spectragam indices to time indices for cutting the samples and logging
        interval[0] = interval[0] / length * duration
        interval[1] = interval[1] / length * duration
        # Cutting the windows around the chirps
        final_sample = cut_segments(y, sr, interval, window_len, noise_level, i, len(intervals))  # Cutting the segments
        if final_sample is not None:
            # Increasing birdcount
            if overall_bird_num is not None:
                overall_bird_num += 1
            else:
                overall_bird_num = 1
            # Writing the file
            output_path = os.path.join(output_folder, bird, f'{bird}_{overall_bird_num}.wav')
            sf.write(output_path, final_sample, sr, subtype='PCM_16')
            # Logging the changes
            sample_counts[bird] = overall_bird_num
            current_log = [bird, filename, output_path, overall_bird_num, interval, 'original']
            filelog.append(current_log)


    return filelog, sample_counts

Functions for data augmentation

In [21]:
# Function for cutting the individual chirp
def cut_chirp(y, sr, interval):
    start = int(interval[0] *sr)
    end = int(interval[1] * sr)
    chirp = y[start:end]
    return chirp

# Function random gain
def random_gain(chirp):
    gain = random.uniform(0.05, 2)
    sample_len = len(chirp)
    random_gains = np.random.normal(scale=gain, size=sample_len)
    chirp_with_gain = chirp * random_gains
    return chirp_with_gain

# Function for timeshifting
def timeshift(chirp, noise_level, window_len, sr):
    noise_level *= random.uniform(0.05, 1.2)
    window_len *= sr
    noise1_len = random.randint(0, (window_len -len(chirp)))
    noise2_len = window_len - noise1_len
    noise1 = np.random.normal(scale=noise_level, size=noise1_len)
    noise2 = np.random.normal(scale=noise_level, size=noise2_len)
    timeshifted_chirp = np.concatenate((noise1, chirp, noise2))
    return timeshifted_chirp

# Function for augmenting data in cases when there are not enough samples for a species. It applies two audio augmentation techniques: time shifting and random gain
def augment(input_folder, output_folder, min_sample_num, window_size, sample_counts, filelog):
    filelog_list = filelog.values.tolist()
    birds_to_augment = [[bird, sample_num] for bird, sample_num in sample_counts.items() if sample_num < min_sample_num]
    print(birds_to_augment)

    for bird_data in birds_to_augment:
        bird = bird_data[0]
        # Calculating how many files we have to create from one chirp and rounding it to the largest adjacent integer.
        # If it is less than 1, it will be rounded up to 1, hence the number of samples will be doubled
        sample_num = bird_data[1]
        files_from_1_chirp = math.ceil((min_sample_num - sample_num) / sample_num)
        # Retrieving the data for the individual chirps for the bird from the filelog
        chirps = filelog[filelog['bird'] == bird]

         # Progress bar to show the progress
        progress_bar = tqdm(total=(sample_num*files_from_1_chirp), desc=bird, leave=False)
        for index, row in chirps.iterrows():
            filepath = os.path.join(input_folder, bird, row['origin file'])
            # When saving to csv, lists are converted to strings, this is a way to convert them back
            if type(row['interval']) == str:
                interval = json.loads(row['interval'])
            else:
                interval = row['interval']

            y, sr = librosa.load(filepath)
            noise_level = np.sqrt(np.mean(y**2))

            for i in range(files_from_1_chirp):
                chirp = cut_chirp(y, sr, interval)
                chirp_with_gain = random_gain(chirp)
                timeshifted_chirp = timeshift(chirp_with_gain, noise_level, window_size, sr)
                # Save the augmented sample to a new file
                output_path = os.path.join(output_folder, bird, f'{bird}_{sample_num}.wav')
                sf.write(output_path, timeshifted_chirp, sr, subtype='PCM_16')
                # Logging tha data
                sample_num += 1
                sample_counts[bird] = sample_num
                current_log = [bird, row['origin file'], sample_num, row['interval'], 'augmented']
                filelog_list.append(current_log)

            progress_bar.update(1)

        progress_bar.close()

    filelog = pd.DataFrame(filelog_list, columns=['bird', 'origin file', 'sample file', 'chirp no.', 'interval', 'origin'])
    return sample_counts, filelog



In [6]:
# download data
url = 'https://drive.google.com/u/0/uc?id=1y3XTDabEW5vhhh2Seh3FYsdMA3yNodMz&export=download'
output = 'database.zip'

gdown.download(url,output)

Downloading...
From: https://drive.google.com/u/0/uc?id=1y3XTDabEW5vhhh2Seh3FYsdMA3yNodMz&export=download
To: /content/database.zip
100%|██████████| 5.27G/5.27G [00:17<00:00, 299MB/s]


'database.zip'

In [7]:
# unzip data
with zipfile.ZipFile('database.zip', 'r') as zip_ref:
    zip_ref.extractall()

In [8]:
filepaths = read_file_paths('train_audio')
len(filepaths)

16941

In [9]:
df = pd.read_csv('train_metadata.csv')
df = cleandata(df)

Cleaning data...
Deleted 2735 files


In [10]:
filepaths = read_file_paths('train_audio')
len(filepaths)

14206

In [11]:
output_folder = 'final_samples'
filelog_filename = 'filelog.csv'
sample_counts_filename = 'sample_counter.json'
birds_list = list(pd.read_csv('sample_submission.csv').transpose().reset_index().tail(-1).drop(columns=[0,1,2])['index'])

# Create logs
filelog, sample_counts = create_logs(filelog_filename, sample_counts_filename, output_folder, birds_list)

# Set the start and end of the preprocessing (normally zero and and the end of the file, only customisable for debugging)
if len(filelog) > 0:
    start_index = filelog[-1][-2] + 1
else: start_index = 0
end_index = len(filepaths)

# Calculate runtime for optimization
start_time = time.time()

# Progress bar to show the progress
progress_bar = tqdm(total=(end_index-start_index), desc='Progress', leave=False)

# Go through all audio files and get the start and end times of the individual chirps
for i in range(start_index, end_index):
    filelog, sample_counts = create_samples(filepaths[i], 'final_samples', sample_counts, filelog, 5)
    progress_bar.update(1)

# Save the logs
filelog = pd.DataFrame(filelog, columns=['bird', 'origin file', 'sample file', 'chirp no.', 'interval', 'origin'])
filelog.to_csv(filelog_filename, index=False)
with open(sample_counts_filename, 'w') as json_file:
    json.dump(sample_counts, json_file)


# Calculate runtime for optimization
end_time = time.time()
execution_time = end_time - start_time
print(f"Runtime: {execution_time} seconds")


0


Progress: 100%|██████████| 14206/14206 [1:14:11<00:00,  3.05it/s]

Runtime: 4452.106737136841 seconds


In [12]:
sample_counts

{'abethr1': 91,
 'abhori1': 1257,
 'abythr1': 192,
 'afbfly1': 110,
 'afdfly1': 283,
 'afecuc1': 775,
 'affeag1': 117,
 'afgfly1': 44,
 'afghor1': 418,
 'afmdov1': 257,
 'afpfly1': 786,
 'afpkin1': 2,
 'afpwag1': 634,
 'afrgos1': 615,
 'afrgrp1': 82,
 'afrjac1': 68,
 'afrthr1': 281,
 'amesun2': 434,
 'augbuz1': 40,
 'bagwea1': 160,
 'barswa': 4772,
 'bawhor2': 174,
 'bawman1': 26,
 'bcbeat1': 558,
 'beasun2': 283,
 'bkctch1': 586,
 'bkfruw1': 433,
 'blacra1': 364,
 'blacuc1': 585,
 'blakit1': 1843,
 'blaplo1': 152,
 'blbpuf2': 966,
 'blcapa2': 121,
 'blfbus1': 257,
 'blhgon1': 169,
 'blhher1': 72,
 'blksaw1': 22,
 'blnmou1': 127,
 'blnwea1': 70,
 'bltapa1': 25,
 'bltbar1': 64,
 'bltori1': 138,
 'blwlap1': 68,
 'brcale1': 54,
 'brcsta1': 32,
 'brctch1': 385,
 'brcwea1': 11,
 'brican1': 228,
 'brobab1': 159,
 'broman1': 222,
 'brosun1': 198,
 'brrwhe3': 75,
 'brtcha1': 14,
 'brubru1': 395,
 'brwwar1': 320,
 'bswdov1': 165,
 'btweye2': 530,
 'bubwar2': 38,
 'butapa1': 225,
 'cabgre1': 185

In [13]:
filelog

Unnamed: 0,bird,origin file,sample file,chirp no.,interval,origin
0,abethr1,XC128013.ogg,final_samples/abethr1/abethr1_1.wav,1,"[1.7917556997544721, 3.0235877433356717]",original
1,abethr1,XC128013.ogg,final_samples/abethr1/abethr1_2.wav,2,"[11.294460035952296, 12.478298623290073]",original
2,abethr1,XC128013.ogg,final_samples/abethr1/abethr1_3.wav,3,"[19.35736068484742, 20.589192728428618]",original
3,abethr1,XC128013.ogg,final_samples/abethr1/abethr1_4.wav,4,"[27.30827660250789, 28.508113008593476]",original
4,abethr1,XC128013.ogg,final_samples/abethr1/abethr1_5.wav,5,"[35.48316198263767, 36.682998388723256]",original
...,...,...,...,...,...,...
143192,yewgre1,XC753190.ogg,final_samples/yewgre1/yewgre1_642.wav,642,"[8.070880182160804, 8.150789886934673]",original
143193,yewgre1,XC753190.ogg,final_samples/yewgre1/yewgre1_643.wav,643,"[9.04577858040201, 9.141670226130653]",original
143194,yewgre1,XC753190.ogg,final_samples/yewgre1/yewgre1_644.wav,644,"[9.812911746231155, 10.484153266331658]",original
143195,yewgre1,XC753190.ogg,final_samples/yewgre1/yewgre1_645.wav,645,"[11.171376727386935, 11.267268373115577]",original


Augmentation

In [18]:
if os.path.exists('filelog.csv'):
        filelog = pd.read_csv('filelog.csv')

if os.path.exists('sample_counts.json'):
        with open('sample_counts.json', 'r') as json_file:
                sample_counts = json.load(json_file)

In [19]:
filelog

Unnamed: 0,bird,origin file,sample file,chirp no.,interval,origin
0,abethr1,XC128013.ogg,final_samples/abethr1/abethr1_1.wav,1,"[1.7917556997544721, 3.0235877433356717]",original
1,abethr1,XC128013.ogg,final_samples/abethr1/abethr1_2.wav,2,"[11.294460035952296, 12.478298623290073]",original
2,abethr1,XC128013.ogg,final_samples/abethr1/abethr1_3.wav,3,"[19.35736068484742, 20.589192728428618]",original
3,abethr1,XC128013.ogg,final_samples/abethr1/abethr1_4.wav,4,"[27.30827660250789, 28.508113008593476]",original
4,abethr1,XC128013.ogg,final_samples/abethr1/abethr1_5.wav,5,"[35.48316198263767, 36.682998388723256]",original
...,...,...,...,...,...,...
143192,yewgre1,XC753190.ogg,final_samples/yewgre1/yewgre1_642.wav,642,"[8.070880182160804, 8.150789886934673]",original
143193,yewgre1,XC753190.ogg,final_samples/yewgre1/yewgre1_643.wav,643,"[9.04577858040201, 9.141670226130653]",original
143194,yewgre1,XC753190.ogg,final_samples/yewgre1/yewgre1_644.wav,644,"[9.812911746231155, 10.484153266331658]",original
143195,yewgre1,XC753190.ogg,final_samples/yewgre1/yewgre1_645.wav,645,"[11.171376727386935, 11.267268373115577]",original


In [22]:
sample_counts, filelog = augment('train_audio', 'final_samples', 50, 5, sample_counts, filelog)

[['afgfly1', 44], ['afpkin1', 2], ['augbuz1', 40], ['bawman1', 26], ['blksaw1', 22], ['bltapa1', 25], ['brcsta1', 32], ['brcwea1', 11], ['brtcha1', 14], ['bubwar2', 38], ['chewea1', 17], ['crefra2', 21], ['darter3', 12], ['dotbar1', 10], ['easmog1', 49], ['equaka1', 35], ['fatwid1', 9], ['gobsta5', 30], ['gyhneg1', 43], ['gytbar1', 32], ['hunsun2', 49], ['joygre1', 16], ['lotlap1', 4], ['macshr1', 46], ['malkin1', 46], ['marsto1', 2], ['nubwoo1', 42], ['palpri1', 32], ['purgre2', 43], ['refbar2', 47], ['rehblu1', 2], ['rehwea1', 13], ['rostur1', 37], ['rufcha2', 47], ['sacibi2', 31], ['spfwea1', 15], ['stusta1', 48], ['vibsta2', 48], ['whbcro2', 34], ['whctur2', 1], ['whhsaw1', 3], ['whrshr1', 44], ['witswa1', 10], ['yebduc1', 45], ['yebsto1', 3], ['yenspu1', 49]]




afgfly1:   0%|          | 0/44 [00:00<?, ?it/s][A[A

afgfly1:   9%|▉         | 4/44 [00:00<00:01, 33.14it/s][A[A

afgfly1:  18%|█▊        | 8/44 [00:00<00:01, 32.11it/s][A[A

afgfly1:  27%|██▋       | 12/44 [00:00<00:01, 30.45it/s][A[A

afgfly1:  36%|███▋      | 16/44 [00:00<00:00, 29.53it/s][A[A

afgfly1:  43%|████▎     | 19/44 [00:00<00:00, 28.45it/s][A[A

afgfly1:  52%|█████▏    | 23/44 [00:00<00:00, 29.31it/s][A[A

afgfly1:  59%|█████▉    | 26/44 [00:00<00:00, 29.23it/s][A[A

afgfly1:  66%|██████▌   | 29/44 [00:00<00:00, 28.87it/s][A[A

afgfly1:  73%|███████▎  | 32/44 [00:01<00:00, 28.29it/s][A[A

afgfly1:  82%|████████▏ | 36/44 [00:01<00:00, 31.31it/s][A[A

afgfly1:  91%|█████████ | 40/44 [00:01<00:00, 29.05it/s][A[A

afgfly1:  98%|█████████▊| 43/44 [00:01<00:00, 29.22it/s][A[A

                                                        [A[A

afpkin1:   0%|          | 0/48 [00:00<?, ?it/s][A[A

afpkin1:   2%|▏         | 1/48 [00:00<00:13,  3.58it/s][A

In [23]:
# Saving the logs
filelog.to_csv('filelog.csv', index=False)

with open('sample_counts.json', 'w') as json_file:
    json.dump(sample_counts, json_file)

Printing out files data after augmentation

In [24]:
filelog

Unnamed: 0,bird,origin file,sample file,chirp no.,interval,origin
0,abethr1,XC128013.ogg,final_samples/abethr1/abethr1_1.wav,1,"[1.7917556997544721, 3.0235877433356717]",original
1,abethr1,XC128013.ogg,final_samples/abethr1/abethr1_2.wav,2,"[11.294460035952296, 12.478298623290073]",original
2,abethr1,XC128013.ogg,final_samples/abethr1/abethr1_3.wav,3,"[19.35736068484742, 20.589192728428618]",original
3,abethr1,XC128013.ogg,final_samples/abethr1/abethr1_4.wav,4,"[27.30827660250789, 28.508113008593476]",original
4,abethr1,XC128013.ogg,final_samples/abethr1/abethr1_5.wav,5,"[35.48316198263767, 36.682998388723256]",original
...,...,...,...,...,...,...
145122,yenspu1,XC608955.ogg,94,"[1.6474699828473411, 3.6308319039451113]",augmented,
145123,yenspu1,XC608955.ogg,95,"[4.238636363636363, 4.702487135506003]",augmented,
145124,yenspu1,XC608955.ogg,96,"[5.278301886792453, 6.8457975986277875]",augmented,
145125,yenspu1,XC756890.ogg,97,"[13.835258205307264, 13.851252723463688]",augmented,


In [25]:
sample_counts

{'abethr1': 91,
 'abhori1': 1257,
 'abythr1': 192,
 'afbfly1': 110,
 'afdfly1': 283,
 'afecuc1': 775,
 'affeag1': 117,
 'afgfly1': 88,
 'afghor1': 418,
 'afmdov1': 257,
 'afpfly1': 786,
 'afpkin1': 50,
 'afpwag1': 634,
 'afrgos1': 615,
 'afrgrp1': 82,
 'afrjac1': 68,
 'afrthr1': 281,
 'amesun2': 434,
 'augbuz1': 80,
 'bagwea1': 160,
 'barswa': 4772,
 'bawhor2': 174,
 'bawman1': 52,
 'bcbeat1': 558,
 'beasun2': 283,
 'bkctch1': 586,
 'bkfruw1': 433,
 'blacra1': 364,
 'blacuc1': 585,
 'blakit1': 1843,
 'blaplo1': 152,
 'blbpuf2': 966,
 'blcapa2': 121,
 'blfbus1': 257,
 'blhgon1': 169,
 'blhher1': 72,
 'blksaw1': 66,
 'blnmou1': 127,
 'blnwea1': 70,
 'bltapa1': 50,
 'bltbar1': 64,
 'bltori1': 138,
 'blwlap1': 68,
 'brcale1': 54,
 'brcsta1': 64,
 'brctch1': 385,
 'brcwea1': 55,
 'brican1': 228,
 'brobab1': 159,
 'broman1': 222,
 'brosun1': 198,
 'brrwhe3': 75,
 'brtcha1': 56,
 'brubru1': 395,
 'brwwar1': 320,
 'bswdov1': 165,
 'btweye2': 530,
 'bubwar2': 76,
 'butapa1': 225,
 'cabgre1': 18

Removing augmented files if incorrect - Do not run this cell only if neccesary

In [None]:
# # Removing augmented files Only run if neccesary
# if os.path.exists('filelog.csv'):
#         filelog = pd.read_csv('filelog.csv')

# if os.path.exists('sample_counts.json'):
#         with open('sample_counts.json', 'r') as json_file:
#                 sample_counts = json.load(json_file)



# orig_num_files = sum([value for key, value in sample_counts.items()])
# augmented_files = filelog[filelog['origin'] == 'augmented']
# filestodelete = [os.path.join('final_samples', f'{row["bird"]}_{row["chirp no."]}.ogg') for index, row in augmented_files.iterrows()]
# count = sum(1 for file in filestodelete if os.remove(file) is None)
# filelog.drop(augmented_files.index, axis=0, inplace=True)
# for bird in list(augmented_files['bird'].unique()):
#     bird_num = max(filelog[filelog['bird'] == bird]['chirp no.'])
#     sample_counts[bird] = bird_num

# new_num_files = sum([value for key, value in sample_counts.items()])

# # Saving the logs
# filelog.to_csv('filelog.csv', index=False)

# with open('sample_counts.json', 'w') as json_file:
#     json.dump(sample_counts, json_file)

# print(f'{count} files deleted', {orig_num_files - new_num_files})