# Boiler Plate

## Importing libraries

In [3]:
#from utils import *
import os
import numpy as np
import pandas as pd
import gc
from tqdm.notebook import tqdm_notebook
from pydub import AudioSegment
import wave
import librosa
import matplotlib.pyplot as plt
%matplotlib inline

## Helper Functions

In [4]:
def create_possible_timeframe(max_time, space):
    '''
    Return a list of [startTime, endTime] proposals for negative sampling.

    Start times run from 0 up to (but excluding) int(max_time) - 2 in steps
    of `space`. Each proposal is between 2 and 5 time units long; the extra
    length is drawn from NumPy's global random generator, so results are only
    reproducible when that generator has been seeded by the caller.

    Parameters
    ----------
    max_time : number
        Upper time bound. May be a NumPy scalar or a plain Python int/float;
        it is truncated to an integer.
    space : number
        Step between consecutive start times.

    Returns
    -------
    list
        One [start, end] pair per start time, in increasing start order.
        Empty when int(max_time) - 2 is not positive.
    '''
    # int() truncates like the former `.astype(int)` but also accepts plain
    # Python numbers, which have no `.astype` method.
    last_start = int(max_time) - 2
    return [[start, start + 2 + 3 * np.random.random()]
            for start in np.arange(0, last_start, space)]


def overlap(x, y):
    '''
    Tell whether the time span `x` overlaps the time span `y`.

    Both arguments are indexable [start, end] pairs. Spans with start < end
    that merely touch at an endpoint are not reported as overlapping, while
    two equal spans always are.
    '''
    x_start, x_end = x[0], x[1]
    y_start, y_end = y[0], y[1]

    # Either endpoint of x lies strictly inside y.
    if y_start < x_end < y_end or y_start < x_start < y_end:
        return True
    # The end of y lies strictly inside x, or y starts within [x_start, x_end).
    if x_start < y_end < x_end or x_start <= y_start < x_end:
        return True
    # Remaining case: identical spans (e.g. zero-length ones).
    return bool(x == y)


def not_overlap_list(x, y_list):
    '''
    Check that a time proposal is free of overlaps.

    Returns True when `x` overlaps none of the spans in `y_list`
    (including when `y_list` is empty) and False otherwise.
    '''
    for y in y_list:
        if overlap(x, y):
            return False
    return True


def not_overlap_list_x_list(x_list, y_list):
    '''
    Return the proposals from `x_list` that overlap nothing in `y_list`.

    Parameters
    ----------
    x_list : iterable
        Candidate [start, end] proposals.
    y_list : iterable
        Spans that the kept proposals must not overlap.

    Returns
    -------
    list
        The elements of `x_list` for which `not_overlap_list` is True,
        in their original order.
    '''
    # A plain comprehension does the filtering directly; the previous
    # pandas Series boolean-mask round trip added nothing to the result.
    return [x for x in x_list if not_overlap_list(x, y_list)]


def get_wave_file(wav_file):
    '''
    Load a wav file through pydub's AudioSegment.from_wav and return the result.
    '''
    segment = AudioSegment.from_wav(wav_file)
    return segment


def export_wave_file(audio, begin, end, dest):
    '''
    Cut the span from `begin` to `end` out of `audio` and export it to `dest` as wav.

    `begin` and `end` are multiplied by 1000 before slicing
    (presumably seconds converted to the milliseconds pydub slices by).
    '''
    start_ms = begin * 1000
    stop_ms = end * 1000
    clip = audio[start_ms:stop_ms]
    clip.export(dest, format="wav")


def extract_segments(audioPath, sampleDict, destnPath, suffix):
    '''
    Extract segments given an audio folder and proposal segments.

    Parameters
    ----------
    audioPath : str
        Folder that holds the source wav files.
    sampleDict : dict
        Maps a wav file name to an iterable of (begin, end) pairs.
    destnPath : str
        Folder the extracted clips are written to; created if missing.
    suffix : str
        Text appended to each clip name just before the '.wav' extension.

    Each clip is written as '<lowercased name without .wav>_<begin>_<end><suffix>.wav'.
    '''
    # Make sure the output folder exists before the first export.
    if destnPath:
        os.makedirs(destnPath, exist_ok=True)

    for wav_file in tqdm_notebook(sampleDict.keys(), desc='outerloop'):
        # os.path.join copes with folders given with or without a trailing
        # separator, unlike plain string concatenation.
        audio_file = get_wave_file(os.path.join(audioPath, wav_file))
        clip_stem = wav_file.lower().replace('.wav', '')
        for begin_time, end_time in sampleDict[wav_file]:
            output_file_name = '_'.join(
                [clip_stem, str(begin_time), str(end_time)]) + suffix + '.wav'
            output_file_path = os.path.join(destnPath, output_file_name)
            export_wave_file(audio_file, begin_time, end_time,
                             output_file_path)

# Training

##  Reading Data from WHOIs

**Reading the metadata from WHOIS dataset and creating positive time proposals for audio segmentation**

In [5]:
# Keep annotations whose duration_s exceeds 1 and widen each one to whole-number
# bounds (floor of the start, ceil of the end); those bounds are the positive time frames.
df_ts = (
    pd.read_csv('./data/train/train_data_09222019/train.tsv', sep='\t')
    .loc[lambda frame: frame['duration_s'] > 1, :]
    .reset_index(drop=True)
    .assign(
        begin_time=lambda frame: np.floor(frame['start_time_s']),
        end_time=lambda frame: np.ceil(
            frame['start_time_s'] + frame['duration_s']),
        time_frame=lambda frame: frame.apply(
            lambda row: [row['begin_time'], row['end_time']], axis=1),
    )
)
# Per wav file: the list of positive time frames and the latest annotated end time.
df_timeframe = df_ts.groupby(by=['wav_filename'])['time_frame'].apply(list)
df_timeframe_max = df_ts.groupby(by=['wav_filename'])['end_time'].max()
df_ts.head()

Unnamed: 0,wav_filename,start_time_s,duration_s,location,date,master_wav,begin_time,end_time,time_frame
0,60026.wav,1.0,1.7,"80 mi. south of Martha's Vineyard, Massachusetts",21-Dec-1960,60026,1.0,3.0,"[1.0, 3.0]"
1,60026.wav,8.183,4.141,"80 mi. south of Martha's Vineyard, Massachusetts",21-Dec-1960,60026,8.0,13.0,"[8.0, 13.0]"
2,60026.wav,12.004,5.981,"80 mi. south of Martha's Vineyard, Massachusetts",21-Dec-1960,60026,12.0,18.0,"[12.0, 18.0]"
3,60026.wav,12.883,1.404,"80 mi. south of Martha's Vineyard, Massachusetts",21-Dec-1960,60026,12.0,15.0,"[12.0, 15.0]"
4,60026.wav,18.251,2.789,"80 mi. south of Martha's Vineyard, Massachusetts",21-Dec-1960,60026,18.0,22.0,"[18.0, 22.0]"


**Creating a dictionary of negative sample time proposals for noise file creation**

In [8]:
# For every wav file, generate candidate proposals with a start step of 1 up to
# the file's last annotated end time and keep those that overlap no annotation.
# Proposal end times come from NumPy's global RNG, so results vary between
# runs unless it is seeded beforehand.
negative_sample_dict = {
    wav_name: not_overlap_list_x_list(
        create_possible_timeframe(df_timeframe_max[wav_name], 1),
        df_timeframe[wav_name],
    )
    for wav_name in df_timeframe.index
}

In [9]:
# Total number of negative proposals across all files.
a = sum(len(proposals) for proposals in negative_sample_dict.values())
a

3992

**Extracting the negative sample wav files and putting it in the input negative sample data folder for ML modeling**

In [None]:
# Write one wav clip per negative proposal into the negative-sample folder.
extract_segments(
    audioPath='./data/train/train_data_09222019/wav/',
    sampleDict=negative_sample_dict,
    destnPath='./data/train/mldata/all/negative/',
    suffix="_Noise",
)

**Creating a dictionary of positive sample time proposals for call file creation**

In [10]:
# Map each wav file name to its list of positive time frames.
# Frames are looked up by label: df_timeframe is indexed by file name, and
# integer-position lookups on such a Series rely on a deprecated pandas fallback.
positive_sample_dict = {
    wav_name: df_timeframe[wav_name] for wav_name in df_timeframe.index
}

In [11]:
# Total number of positive time frames across all files.
a = sum(len(frames) for frames in positive_sample_dict.values())
a

2635

**Extracting the positive sample wav files and putting it in the input positive sample data folder for ML modeling**

In [None]:
# Write one wav clip per positive time frame into the positive-sample folder.
extract_segments(
    audioPath='./data/train/train_data_09222019/wav/',
    sampleDict=positive_sample_dict,
    destnPath='./data/train/mldata/all/positive/',
    suffix="_call",
)

## Reading Podcast Round 2

**Reading the metadata from Round 2 dataset and creating positive time proposals for audio segmentation**

In [15]:
# Keep annotations whose duration_s exceeds 1 and widen each one to whole-number
# bounds (floor of the start, ceil of the end); those bounds are the positive time frames.
df_ts = (
    pd.read_csv('./data/train/Round2_OS_07_05/train.tsv', sep='\t')
    .loc[lambda frame: frame['duration_s'] > 1, :]
    .reset_index(drop=True)
    .assign(
        begin_time=lambda frame: np.floor(frame['start_time_s']),
        end_time=lambda frame: np.ceil(
            frame['start_time_s'] + frame['duration_s']),
        time_frame=lambda frame: frame.apply(
            lambda row: [row['begin_time'], row['end_time']], axis=1),
    )
)
# Per wav file: the list of positive time frames and the latest annotated end time.
df_timeframe = df_ts.groupby(by=['wav_filename'])['time_frame'].apply(list)
df_timeframe_max = df_ts.groupby(by=['wav_filename'])['end_time'].max()

**Creating a dictionary of positive sample time proposals for call file creation**

In [16]:
# Map each wav file name to its list of positive time frames.
# Frames are looked up by label: df_timeframe is indexed by file name, and
# integer-position lookups on such a Series rely on a deprecated pandas fallback.
positive_sample_dict = {
    wav_name: df_timeframe[wav_name] for wav_name in df_timeframe.index
}

In [17]:
# Total number of positive time frames across all files.
a = sum(len(frames) for frames in positive_sample_dict.values())
a

341

**Extracting the positive sample wav files and putting it in the input positive sample data folder for ML modeling**

In [None]:
# Write one wav clip per positive time frame into the positive-sample folder.
extract_segments(
    audioPath='./data/train/Round2_OS_07_05/wav/',
    sampleDict=positive_sample_dict,
    destnPath='./data/train/mldata/all/positive/',
    suffix="_call",
)

**Creating a dictionary of negative sample time proposals for noise file creation**

In [19]:
# For every wav file, generate candidate proposals with a start step of 1 up to
# the file's last annotated end time and keep those that overlap no annotation.
# Proposal end times come from NumPy's global RNG, so results vary between
# runs unless it is seeded beforehand.
negative_sample_dict = {
    wav_name: not_overlap_list_x_list(
        create_possible_timeframe(df_timeframe_max[wav_name], 1),
        df_timeframe[wav_name],
    )
    for wav_name in df_timeframe.index
}

In [20]:
# Total number of negative proposals across all files.
a = sum(len(proposals) for proposals in negative_sample_dict.values())
a

1195

**Extracting the negative sample wav files and putting it in the input negative sample data folder for ML modeling**

In [None]:
# Write one wav clip per negative proposal into the negative-sample folder.
# These are noise clips, so they carry the "_Noise" suffix rather than "_call".
extract_segments(
    './data/train/Round2_OS_07_05/wav/',
    negative_sample_dict,
    './data/train/mldata/all/negative/',
    "_Noise"
)

## Reading Podcast Round 3

**Reading the metadata from Round 3 dataset and creating positive time proposals for audio segmentation**

In [21]:
# Keep annotations whose duration_s exceeds 1 and widen each one to whole-number
# bounds (floor of the start, ceil of the end); those bounds are the positive time frames.
df_ts = (
    pd.read_csv('./data/train/Round3_OS_09_27_2017/train.tsv', sep='\t')
    .loc[lambda frame: frame['duration_s'] > 1, :]
    .reset_index(drop=True)
    .assign(
        begin_time=lambda frame: np.floor(frame['start_time_s']),
        end_time=lambda frame: np.ceil(
            frame['start_time_s'] + frame['duration_s']),
        time_frame=lambda frame: frame.apply(
            lambda row: [row['begin_time'], row['end_time']], axis=1),
    )
)
# Per wav file: the list of positive time frames and the latest annotated end time.
df_timeframe = df_ts.groupby(by=['wav_filename'])['time_frame'].apply(list)
df_timeframe_max = df_ts.groupby(by=['wav_filename'])['end_time'].max()

In [20]:
# Preview the first five annotation rows with the derived columns.
df_ts.head(n=5)

Unnamed: 0,wav_filename,start_time_s,duration_s,location,date,data_source,data_source_id,begin_time,end_time,time_frame
0,OS_9_27_2017_08_09_00__0002.wav,10.378472,1.446181,orcasound_lab,2017-09-27,Orcasound_PodCast_Round3,OS_9_27_2017_08_09,10.0,12.0,"[10.0, 12.0]"
1,OS_9_27_2017_08_09_00__0002.wav,18.204861,1.063368,orcasound_lab,2017-09-27,Orcasound_PodCast_Round3,OS_9_27_2017_08_09,18.0,20.0,"[18.0, 20.0]"
2,OS_9_27_2017_08_09_00__0002.wav,15.993056,1.190972,orcasound_lab,2017-09-27,Orcasound_PodCast_Round3,OS_9_27_2017_08_09,15.0,18.0,"[15.0, 18.0]"
3,OS_9_27_2017_08_09_00__0002.wav,35.686632,1.318576,orcasound_lab,2017-09-27,Orcasound_PodCast_Round3,OS_9_27_2017_08_09,35.0,38.0,"[35.0, 38.0]"
4,OS_9_27_2017_08_09_00__0002.wav,56.741319,2.169271,orcasound_lab,2017-09-27,Orcasound_PodCast_Round3,OS_9_27_2017_08_09,56.0,59.0,"[56.0, 59.0]"


**Creating a dictionary of positive sample time proposals for call file creation**

In [21]:
# Map each wav file name to its list of positive time frames.
# Frames are looked up by label: df_timeframe is indexed by file name, and
# integer-position lookups on such a Series rely on a deprecated pandas fallback.
positive_sample_dict = {
    wav_name: df_timeframe[wav_name] for wav_name in df_timeframe.index
}

In [22]:
# Total number of positive time frames across all files.
a = sum(len(frames) for frames in positive_sample_dict.values())
a

440

**Extracting the positive sample wav files and putting it in the input positive sample data folder for ML modeling**

In [None]:
# Write one wav clip per positive time frame into the positive-sample folder.
extract_segments(
    audioPath='./data/train/Round3_OS_09_27_2017/wav/',
    sampleDict=positive_sample_dict,
    destnPath='./data/train/mldata/all/positive/',
    suffix="_call",
)

**Creating a dictionary of negative sample time proposals for noise file creation**

In [23]:
# For every wav file, generate candidate proposals with a start step of 1 up to
# the file's last annotated end time and keep those that overlap no annotation.
# Proposal end times come from NumPy's global RNG, so results vary between
# runs unless it is seeded beforehand.
negative_sample_dict = {
    wav_name: not_overlap_list_x_list(
        create_possible_timeframe(df_timeframe_max[wav_name], 1),
        df_timeframe[wav_name],
    )
    for wav_name in df_timeframe.index
}

In [24]:
# Total number of negative proposals across all files.
a = sum(len(proposals) for proposals in negative_sample_dict.values())
a

700

**Extracting the negative sample wav files and putting it in the input negative sample data folder for ML modeling**

In [None]:
# Write one wav clip per negative proposal into the negative-sample folder.
# These are noise clips, so they carry the "_Noise" suffix rather than "_call".
extract_segments(
    './data/train/Round3_OS_09_27_2017/wav/',
    negative_sample_dict,
    './data/train/mldata/all/negative/',
    "_Noise"
)

# Test

**This part of the code only generates test clips for an initial evaluation, not for the official evaluation**

## OrcasoundLab07052019_Test

**Reading the metadata from Round 2 testing dataset and creating positive time proposals for audio segmentation**

In [23]:
# Keep annotations whose duration_s exceeds 1 and widen each one to whole-number
# bounds (floor of the start, ceil of the end); those bounds are the positive time frames.
df_ts = (
    pd.read_csv('./data/test/OrcasoundLab07052019_Test/test.tsv', sep='\t')
    .loc[lambda frame: frame['duration_s'] > 1, :]
    .reset_index(drop=True)
    .assign(
        begin_time=lambda frame: np.floor(frame['start_time_s']),
        end_time=lambda frame: np.ceil(
            frame['start_time_s'] + frame['duration_s']),
        time_frame=lambda frame: frame.apply(
            lambda row: [row['begin_time'], row['end_time']], axis=1),
    )
)
# Per wav file: the list of positive time frames and the latest annotated end time.
df_timeframe = df_ts.groupby(by=['wav_filename'])['time_frame'].apply(list)
df_timeframe_max = df_ts.groupby(by=['wav_filename'])['end_time'].max()

In [24]:
# Preview the first five annotation rows with the derived columns.
df_ts.head(n=5)

Unnamed: 0,wav_filename,start_time_s,duration_s,location,date,master_wav,tag,begin_time,end_time,time_frame
0,OS_7_05_2019_08_24_00_.wav,52.172,1.118,Orcasound Lab Hydrophone,1562340736,OS_7_05_2019_08_24_00_.wav,call,52.0,54.0,"[52.0, 54.0]"
1,OS_7_05_2019_08_24_00_.wav,54.877,1.104,Orcasound Lab Hydrophone,1562340736,OS_7_05_2019_08_24_00_.wav,call,54.0,56.0,"[54.0, 56.0]"
2,OS_7_05_2019_08_24_00_.wav,69.701,2.691,Orcasound Lab Hydrophone,1562340736,OS_7_05_2019_08_24_00_.wav,?,69.0,73.0,"[69.0, 73.0]"
3,OS_7_05_2019_08_24_00_.wav,75.111,1.367,Orcasound Lab Hydrophone,1562340736,OS_7_05_2019_08_24_00_.wav,?,75.0,77.0,"[75.0, 77.0]"
4,OS_7_05_2019_08_24_00_.wav,78.934,1.81,Orcasound Lab Hydrophone,1562340736,OS_7_05_2019_08_24_00_.wav,ca ll,78.0,81.0,"[78.0, 81.0]"


**Creating a dictionary of positive sample time proposals for call file creation**

In [25]:
# Map each wav file name to its list of positive time frames.
# Frames are looked up by label: df_timeframe is indexed by file name, and
# integer-position lookups on such a Series rely on a deprecated pandas fallback.
positive_sample_dict = {
    wav_name: df_timeframe[wav_name] for wav_name in df_timeframe.index
}

In [26]:
# Total number of positive time frames across all files.
a = sum(len(frames) for frames in positive_sample_dict.values())
a

390

**Extracting the positive sample wav files and putting it in the input positive sample data folder for ML modeling**

In [None]:
# Write one wav clip per positive time frame into the test positive folder.
extract_segments(
    audioPath='./data/test/OrcasoundLab07052019_Test/wav/',
    sampleDict=positive_sample_dict,
    destnPath='./data/test/all/positive/',
    suffix="_call",
)

**Creating a dictionary of negative sample time proposals for noise file creation**

In [33]:
# For every wav file, generate candidate proposals with a start step of 1 up to
# the file's last annotated end time and keep those that overlap no annotation.
# Proposal end times come from NumPy's global RNG, so results vary between
# runs unless it is seeded beforehand.
negative_sample_dict = {
    wav_name: not_overlap_list_x_list(
        create_possible_timeframe(df_timeframe_max[wav_name], 1),
        df_timeframe[wav_name],
    )
    for wav_name in df_timeframe.index
}

In [34]:
# Total number of negative proposals across all files.
a = sum(len(proposals) for proposals in negative_sample_dict.values())
a

532

**Extracting the negative sample wav files and putting it in the input negative sample data folder for ML modeling**

In [None]:
# Write one wav clip per negative proposal into the test negative folder.
# These are noise clips, so they carry the "_Noise" suffix rather than "_call".
extract_segments(
    './data/test/OrcasoundLab07052019_Test/wav/',
    negative_sample_dict,
    './data/test/all/negative/',
    "_Noise"
)

## OrcasoundLab09272017_Test

**Reading the metadata from Round 3 testing dataset and creating positive time proposals for audio segmentation**

In [27]:
# Keep annotations whose duration_s exceeds 1 and widen each one to whole-number
# bounds (floor of the start, ceil of the end); those bounds are the positive time frames.
df_ts = (
    pd.read_csv('./data/test/OrcasoundLab09272017_Test/test.tsv', sep='\t')
    .loc[lambda frame: frame['duration_s'] > 1, :]
    .reset_index(drop=True)
    .assign(
        begin_time=lambda frame: np.floor(frame['start_time_s']),
        end_time=lambda frame: np.ceil(
            frame['start_time_s'] + frame['duration_s']),
        time_frame=lambda frame: frame.apply(
            lambda row: [row['begin_time'], row['end_time']], axis=1),
    )
)
# Per wav file: the list of positive time frames and the latest annotated end time.
df_timeframe = df_ts.groupby(by=['wav_filename'])['time_frame'].apply(list)
df_timeframe_max = df_ts.groupby(by=['wav_filename'])['end_time'].max()

In [40]:
# Preview the first five annotation rows with the derived columns.
df_ts.head(n=5)

Unnamed: 0,wav_filename,start_time_s,duration_s,location,date,data_source,data_source_id,begin_time,end_time,time_frame
0,OS_9_27_2017_08_14_00__0001.wav,11.643564,2.45,orcasound_lab,9/27/2017,Orcasound_PodCast_Round3,OS_9_27_2017_08_14,11.0,15.0,"[11.0, 15.0]"
1,OS_9_27_2017_08_14_00__0001.wav,15.594059,2.165842,orcasound_lab,9/27/2017,Orcasound_PodCast_Round3,OS_9_27_2017_08_14,15.0,18.0,"[15.0, 18.0]"
2,OS_9_27_2017_08_14_00__0001.wav,53.9,2.45,orcasound_lab,9/27/2017,Orcasound_PodCast_Round3,OS_9_27_2017_08_14,53.0,57.0,"[53.0, 57.0]"
3,OS_9_27_2017_08_14_00__0001.wav,59.781486,1.468514,orcasound_lab,9/27/2017,Orcasound_PodCast_Round3,OS_9_27_2017_08_14,59.0,62.0,"[59.0, 62.0]"
4,OS_9_27_2017_08_19_00__0002.wav,6.592882,1.233507,orcasound_lab,9/27/2017,Orcasound_PodCast_Round3,OS_9_27_2017_08_19,6.0,8.0,"[6.0, 8.0]"


**Creating a dictionary of positive sample time proposals for call file creation**

In [41]:
# Map each wav file name to its list of positive time frames.
# Frames are looked up by label: df_timeframe is indexed by file name, and
# integer-position lookups on such a Series rely on a deprecated pandas fallback.
positive_sample_dict = {
    wav_name: df_timeframe[wav_name] for wav_name in df_timeframe.index
}

In [42]:
# Total number of positive time frames across all files.
a = sum(len(frames) for frames in positive_sample_dict.values())
a

97

**Extracting the positive sample wav files and putting it in the input positive sample data folder for ML modeling**

In [None]:
# Write one wav clip per positive time frame into the test positive folder.
extract_segments(
    audioPath='./data/test/OrcasoundLab09272017_Test/wav/',
    sampleDict=positive_sample_dict,
    destnPath='./data/test/all/positive/',
    suffix="_call",
)

**Creating a dictionary of negative sample time proposals for noise file creation**

In [43]:
# For every wav file, generate candidate proposals with a start step of 1 up to
# the file's last annotated end time and keep those that overlap no annotation.
# Proposal end times come from NumPy's global RNG, so results vary between
# runs unless it is seeded beforehand.
negative_sample_dict = {
    wav_name: not_overlap_list_x_list(
        create_possible_timeframe(df_timeframe_max[wav_name], 1),
        df_timeframe[wav_name],
    )
    for wav_name in df_timeframe.index
}

In [44]:
# Total number of negative proposals across all files.
a = sum(len(proposals) for proposals in negative_sample_dict.values())
a

201

**Extracting the negative sample wav files and putting it in the input negative sample data folder for ML modeling**

In [None]:
# Write one wav clip per negative proposal into the test negative folder.
# These are noise clips, so they carry the "_Noise" suffix rather than "_call".
extract_segments(
    './data/test/OrcasoundLab09272017_Test/wav/',
    negative_sample_dict,
    './data/test/all/negative/',
    "_Noise"
)

# Generating wav for scoring

**This part of the code generates 2-second clips from our test file for final scoring in the official evaluation**

## OrcasoundLab07052019_Test

In [29]:
# Total duration of the test wav file, as reported by librosa.
# NOTE(review): newer librosa releases appear to deprecate the `filename`
# keyword in favour of `path` - confirm against the installed version.
max_length = librosa.get_duration(
    filename="./data/test/OrcasoundLab07052019_Test/wav/OS_7_05_2019_08_24_00_.wav"
)

In [30]:
# Two-unit-long windows starting at every whole number, so consecutive
# windows overlap by one unit; the last window ends at floor(max_length).
twoSecList = [
    [start, start + 2] for start in range(int(np.floor(max_length) - 1))
]

In [31]:
# Proposal dictionary in the {wav file name: list of [begin, end]} shape
# that extract_segments iterates over.
two_sec_dict = {'OS_7_05_2019_08_24_00_.wav': twoSecList}

**Extracting 2-second samples from the test file for official evaluation**

In [None]:
# Write every window as its own wav clip; no suffix is added to the clip names.
extract_segments(
    audioPath='./data/test/OrcasoundLab07052019_Test/wav/',
    sampleDict=two_sec_dict,
    destnPath='./data/test/OrcasoundLab07052019_Test/test2Sec/',
    suffix="",
)