# Boiler Plate

## Installing Dependencies

In [None]:
import sys
# Invoke pip through sys.executable (the interpreter running this kernel)
# and install everything listed in requirements.txt.
!{sys.executable} -m pip install -r requirements.txt

## Importing libraries

In [None]:
import gc
import librosa
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pathlib
from pydub import AudioSegment
from tqdm.notebook import tqdm_notebook
import wave

# Ensure that matplotlib-generated charts are shown in notebook
%matplotlib inline

## Helper Functions

In [None]:
def create_possible_timeframe(max_time, space, rng=None):
    '''
    Return a list of [startTime, endTime] proposals for negative sampling.

    Start times run from 0 up to (but excluding) ``int(max_time) - 2`` in
    steps of ``space``. Each proposal lasts 2 seconds plus a random extra
    drawn from [0, 3), so an end time can land past ``max_time``.

    Parameters
    ----------
    max_time : number
        Upper bound used to place the start times. Plain Python numbers and
        NumPy scalars are both accepted.
    space : number
        Step between consecutive start times.
    rng : numpy.random.Generator, optional
        Source of randomness for the proposal lengths. When omitted, the
        global ``np.random`` state is used.

    Returns
    -------
    list
        One ``[start, end]`` list per proposal.
    '''
    draw = np.random.random if rng is None else rng.random
    # int() rather than .astype(int): the latter exists only on NumPy values.
    last_start = int(max_time) - 2
    return [[start, start + 2 + 3 * draw()]
            for start in np.arange(0, last_start, space)]


def overlap(x, y):
    '''
    Tell whether the time span ``x`` overlaps the time span ``y``.

    Both arguments are ``[start, end]`` pairs. For spans of positive length,
    two spans that only touch at an endpoint are not treated as overlapping.
    Identical spans always count as overlapping.
    '''
    x_start, x_end = x[0], x[1]
    y_start, y_end = y[0], y[1]

    if y_start < x_end < y_end:        # x ends strictly inside y
        return True
    if y_start < x_start < y_end:      # x starts strictly inside y
        return True
    if x_start < y_end < x_end:        # y ends strictly inside x
        return True
    if x_start <= y_start < x_end:     # y starts inside x, or both start together
        return True
    # Last resort: the two spans are exactly the same pair.
    return bool(x == y)


def not_overlap_list(x, y_list):
    '''
    Return True when the time proposal ``x`` overlaps none of the spans in
    ``y_list`` (as judged by ``overlap``), and False otherwise.

    An empty ``y_list`` yields True.
    '''
    # any() stops at the first overlapping span instead of checking them all.
    return not any(overlap(x, y) for y in y_list)


def not_overlap_list_x_list(x_list, y_list):
    '''
    Return the entries of ``x_list`` that overlap nothing in ``y_list``.

    The original order of ``x_list`` is kept, and the returned list holds
    the same entry objects that were passed in.
    '''
    # A plain comprehension does the filtering; no pandas round trip needed.
    return [x for x in x_list if not_overlap_list(x, y_list)]


def get_wave_file(wav_file):
    '''
    Load the wav file at ``wav_file`` and return it as a pydub AudioSegment.
    '''
    segment = AudioSegment.from_wav(wav_file)
    return segment


def export_wave_file(audio, begin, end, dest):
    '''
    Write the part of ``audio`` between ``begin`` and ``end`` to ``dest``
    as a wav file.

    ``begin`` and ``end`` are multiplied by 1000 before being used as the
    slice bounds on ``audio``.
    '''
    start_ms = begin * 1000
    stop_ms = end * 1000
    clip = audio[start_ms:stop_ms]
    clip.export(dest, format="wav")


def extract_segments(audioPath, sampleDict, destnPath, suffix):
    '''
    Cut every proposed segment out of its source recording and save it as
    a wav file.

    Parameters
    ----------
    audioPath : str
        Folder holding the source wav files.
    sampleDict : dict
        Maps a wav file name inside ``audioPath`` to a list of
        ``[begin_time, end_time]`` pairs.
    destnPath : str
        Folder the clips are written to; it is created when missing.
    suffix : str
        Text appended to each clip name just before the ``.wav`` extension.

    Each clip is named
    ``<lower-cased source name without .wav>_<begin>_<end><suffix>.wav``.
    '''
    # Exporting into a folder that does not exist yet would fail, so create
    # it up front. An empty destination means the current directory.
    if destnPath:
        os.makedirs(destnPath, exist_ok=True)

    for wav_file, time_frames in tqdm_notebook(sampleDict.items(),
                                               desc='outerloop'):
        # os.path.join works whether or not the folder ends with a separator.
        audio_file = get_wave_file(os.path.join(audioPath, wav_file))
        clip_stem = wav_file.lower().replace('.wav', '')
        for begin_time, end_time in time_frames:
            output_file_name = (clip_stem + '_' + str(begin_time) + '_' +
                                str(end_time) + suffix + '.wav')
            output_file_path = os.path.join(destnPath, output_file_name)
            export_wave_file(audio_file, begin_time, end_time,
                             output_file_path)

# Training

## Reading Data from WHOIS

**Reading the metadata from WHOIS dataset and creating positive time proposals for audio segmentation**

In [None]:
# Load the annotation table and keep only rows lasting longer than one second.
df_ts = (
    pd.read_csv('./data/train/train_data_09222019/train.tsv', sep='\t')
    .loc[lambda frame: frame['duration_s'] > 1]
    .reset_index(drop=True)
    .assign(
        # Widen each annotation outwards to whole seconds.
        begin_time=lambda frame: np.floor(frame['start_time_s']),
        end_time=lambda frame: np.ceil(
            frame['start_time_s'] + frame['duration_s']),
    )
)
df_ts['time_frame'] = df_ts.apply(
    lambda row: [row['begin_time'], row['end_time']], axis=1)

# Per wav file: the list of [begin, end] pairs and the latest end time.
df_timeframe = df_ts.groupby('wav_filename')['time_frame'].apply(list)
df_timeframe_max = df_ts.groupby('wav_filename')['end_time'].max()
df_ts.head()

**Creating a dictionary of negative sample time proposals for noise file creation**

In [None]:
# For each wav file, propose candidate windows at 1-second steps and keep
# only those that overlap none of its annotated time frames.
negative_sample_dict = {
    wav_name: not_overlap_list_x_list(
        create_possible_timeframe(df_timeframe_max[wav_name], 1),
        df_timeframe[wav_name],
    )
    for wav_name in df_timeframe.index
}

In [None]:
# Total number of negative time proposals across all wav files.
a = sum(len(time_frames) for time_frames in negative_sample_dict.values())
a

**Extracting the negative sample wav files and putting it in the input negative sample data folder for ML modeling**

In [None]:
# Export the negative proposals as "_Noise" clips into the negative folder.
extract_segments(
    audioPath='./data/train/train_data_09222019/wav/',
    sampleDict=negative_sample_dict,
    destnPath='./data/train/mldata/all/negative/',
    suffix="_Noise",
)

**Creating a dictionary of positive sample time proposals for call file creation**

In [None]:
# df_timeframe is indexed by wav file name, so look each entry up by that
# label. Using an integer position with [] on a labelled Series is
# deprecated in pandas.
positive_sample_dict = {
    wav_name: df_timeframe[wav_name] for wav_name in df_timeframe.index
}

In [None]:
# Total number of positive time frames across all wav files.
a = sum(len(time_frames) for time_frames in positive_sample_dict.values())
a

**Extracting the positive sample wav files and putting it in the input positive sample data folder for ML modeling**

In [None]:
# Export the positive time frames as "_call" clips into the positive folder.
extract_segments(
    audioPath='./data/train/train_data_09222019/wav/',
    sampleDict=positive_sample_dict,
    destnPath='./data/train/mldata/all/positive/',
    suffix="_call",
)

## Reading Podcast Round 2

**Reading the metadata from Round 2 dataset and creating positive time proposals for audio segmentation**

In [None]:
# Load the annotation table and keep only rows lasting longer than one second.
df_ts = (
    pd.read_csv('./data/train/Round2_OS_07_05/train.tsv', sep='\t')
    .loc[lambda frame: frame['duration_s'] > 1]
    .reset_index(drop=True)
    .assign(
        # Widen each annotation outwards to whole seconds.
        begin_time=lambda frame: np.floor(frame['start_time_s']),
        end_time=lambda frame: np.ceil(
            frame['start_time_s'] + frame['duration_s']),
    )
)
df_ts['time_frame'] = df_ts.apply(
    lambda row: [row['begin_time'], row['end_time']], axis=1)

# Per wav file: the list of [begin, end] pairs and the latest end time.
df_timeframe = df_ts.groupby('wav_filename')['time_frame'].apply(list)
df_timeframe_max = df_ts.groupby('wav_filename')['end_time'].max()

**Creating a dictionary of positive sample time proposals for call file creation**

In [None]:
# df_timeframe is indexed by wav file name, so look each entry up by that
# label. Using an integer position with [] on a labelled Series is
# deprecated in pandas.
positive_sample_dict = {
    wav_name: df_timeframe[wav_name] for wav_name in df_timeframe.index
}

In [None]:
# Total number of positive time frames across all wav files.
a = sum(len(time_frames) for time_frames in positive_sample_dict.values())
a

**Extracting the positive sample wav files and putting it in the input positive sample data folder for ML modeling**

In [None]:
# Export the positive time frames as "_call" clips into the positive folder.
extract_segments(
    audioPath='./data/train/Round2_OS_07_05/wav/',
    sampleDict=positive_sample_dict,
    destnPath='./data/train/mldata/all/positive/',
    suffix="_call",
)

**Creating a dictionary of negative sample time proposals for noise file creation**

In [None]:
# For each wav file, propose candidate windows at 1-second steps and keep
# only those that overlap none of its annotated time frames.
negative_sample_dict = {
    wav_name: not_overlap_list_x_list(
        create_possible_timeframe(df_timeframe_max[wav_name], 1),
        df_timeframe[wav_name],
    )
    for wav_name in df_timeframe.index
}

In [None]:
# Total number of negative time proposals across all wav files.
a = sum(len(time_frames) for time_frames in negative_sample_dict.values())
a

**Extracting the negative sample wav files and putting it in the input negative sample data folder for ML modeling**

In [None]:
# Negative clips carry the "_Noise" suffix so their file names are not
# labelled as calls.
extract_segments(
    './data/train/Round2_OS_07_05/wav/',
    negative_sample_dict,
    './data/train/mldata/all/negative/',
    "_Noise"
)

## Reading Podcast Round 3

**Reading the metadata from Round 3 dataset and creating positive time proposals for audio segmentation**

In [None]:
# Load the annotation table and keep only rows lasting longer than one second.
df_ts = (
    pd.read_csv('./data/train/Round3_OS_09_27_2017/train.tsv', sep='\t')
    .loc[lambda frame: frame['duration_s'] > 1]
    .reset_index(drop=True)
    .assign(
        # Widen each annotation outwards to whole seconds.
        begin_time=lambda frame: np.floor(frame['start_time_s']),
        end_time=lambda frame: np.ceil(
            frame['start_time_s'] + frame['duration_s']),
    )
)
df_ts['time_frame'] = df_ts.apply(
    lambda row: [row['begin_time'], row['end_time']], axis=1)

# Per wav file: the list of [begin, end] pairs and the latest end time.
df_timeframe = df_ts.groupby('wav_filename')['time_frame'].apply(list)
df_timeframe_max = df_ts.groupby('wav_filename')['end_time'].max()

In [None]:
# Preview the first five rows of the filtered annotation table.
df_ts.head(n=5)

**Creating a dictionary of positive sample time proposals for call file creation**

In [None]:
# df_timeframe is indexed by wav file name, so look each entry up by that
# label. Using an integer position with [] on a labelled Series is
# deprecated in pandas.
positive_sample_dict = {
    wav_name: df_timeframe[wav_name] for wav_name in df_timeframe.index
}

In [None]:
# Total number of positive time frames across all wav files.
a = sum(len(time_frames) for time_frames in positive_sample_dict.values())
a

**Extracting the positive sample wav files and putting it in the input positive sample data folder for ML modeling**

In [None]:
# Export the positive time frames as "_call" clips into the positive folder.
extract_segments(
    audioPath='./data/train/Round3_OS_09_27_2017/wav/',
    sampleDict=positive_sample_dict,
    destnPath='./data/train/mldata/all/positive/',
    suffix="_call",
)

**Creating a dictionary of negative sample time proposals for noise file creation**

In [None]:
# For each wav file, propose candidate windows at 1-second steps and keep
# only those that overlap none of its annotated time frames.
negative_sample_dict = {
    wav_name: not_overlap_list_x_list(
        create_possible_timeframe(df_timeframe_max[wav_name], 1),
        df_timeframe[wav_name],
    )
    for wav_name in df_timeframe.index
}

In [None]:
# Total number of negative time proposals across all wav files.
a = sum(len(time_frames) for time_frames in negative_sample_dict.values())
a

**Extracting the negative sample wav files and putting it in the input negative sample data folder for ML modeling**

In [None]:
# Negative clips carry the "_Noise" suffix so their file names are not
# labelled as calls.
extract_segments(
    './data/train/Round3_OS_09_27_2017/wav/',
    negative_sample_dict,
    './data/train/mldata/all/negative/',
    "_Noise"
)

# Test

**This part of the code only generates test samples for an initial evaluation, not for the official evaluation**

## OrcasoundLab07052019_Test

**Reading the metadata from Round 2 testing dataset and creating positive time proposals for audio segmentation**

In [None]:
# Load the annotation table and keep only rows lasting longer than one second.
df_ts = (
    pd.read_csv('./data/test/OrcasoundLab07052019_Test/test.tsv', sep='\t')
    .loc[lambda frame: frame['duration_s'] > 1]
    .reset_index(drop=True)
    .assign(
        # Widen each annotation outwards to whole seconds.
        begin_time=lambda frame: np.floor(frame['start_time_s']),
        end_time=lambda frame: np.ceil(
            frame['start_time_s'] + frame['duration_s']),
    )
)
df_ts['time_frame'] = df_ts.apply(
    lambda row: [row['begin_time'], row['end_time']], axis=1)

# Per wav file: the list of [begin, end] pairs and the latest end time.
df_timeframe = df_ts.groupby('wav_filename')['time_frame'].apply(list)
df_timeframe_max = df_ts.groupby('wav_filename')['end_time'].max()

In [None]:
# Preview the first five rows of the filtered annotation table.
df_ts.head(n=5)

**Creating a dictionary of positive sample time proposals for call file creation**

In [None]:
# df_timeframe is indexed by wav file name, so look each entry up by that
# label. Using an integer position with [] on a labelled Series is
# deprecated in pandas.
positive_sample_dict = {
    wav_name: df_timeframe[wav_name] for wav_name in df_timeframe.index
}

In [None]:
# Total number of positive time frames across all wav files.
a = sum(len(time_frames) for time_frames in positive_sample_dict.values())
a

**Extracting the positive sample wav files and putting it in the input positive sample data folder for ML modeling**

In [None]:
# Export the positive time frames as "_call" clips into the test positive folder.
extract_segments(
    audioPath='./data/test/OrcasoundLab07052019_Test/wav/',
    sampleDict=positive_sample_dict,
    destnPath='./data/test/all/positive/',
    suffix="_call",
)

**Creating a dictionary of negative sample time proposals for noise file creation**

In [None]:
# For each wav file, propose candidate windows at 1-second steps and keep
# only those that overlap none of its annotated time frames.
negative_sample_dict = {
    wav_name: not_overlap_list_x_list(
        create_possible_timeframe(df_timeframe_max[wav_name], 1),
        df_timeframe[wav_name],
    )
    for wav_name in df_timeframe.index
}

In [None]:
# Total number of negative time proposals across all wav files.
a = sum(len(time_frames) for time_frames in negative_sample_dict.values())
a

**Extracting the negative sample wav files and putting it in the input negative sample data folder for ML modeling**

In [None]:
# Negative clips carry the "_Noise" suffix so their file names are not
# labelled as calls.
extract_segments(
    './data/test/OrcasoundLab07052019_Test/wav/',
    negative_sample_dict,
    './data/test/all/negative/',
    "_Noise"
)

## OrcasoundLab09272017_Test

**Reading the metadata from Round 3 testing dataset and creating positive time proposals for audio segmentation**

In [None]:
# Load the annotation table and keep only rows lasting longer than one second.
df_ts = (
    pd.read_csv('./data/test/OrcasoundLab09272017_Test/test.tsv', sep='\t')
    .loc[lambda frame: frame['duration_s'] > 1]
    .reset_index(drop=True)
    .assign(
        # Widen each annotation outwards to whole seconds.
        begin_time=lambda frame: np.floor(frame['start_time_s']),
        end_time=lambda frame: np.ceil(
            frame['start_time_s'] + frame['duration_s']),
    )
)
df_ts['time_frame'] = df_ts.apply(
    lambda row: [row['begin_time'], row['end_time']], axis=1)

# Per wav file: the list of [begin, end] pairs and the latest end time.
df_timeframe = df_ts.groupby('wav_filename')['time_frame'].apply(list)
df_timeframe_max = df_ts.groupby('wav_filename')['end_time'].max()

In [None]:
# Preview the first five rows of the filtered annotation table.
df_ts.head(n=5)

**Creating a dictionary of positive sample time proposals for call file creation**

In [None]:
# df_timeframe is indexed by wav file name, so look each entry up by that
# label. Using an integer position with [] on a labelled Series is
# deprecated in pandas.
positive_sample_dict = {
    wav_name: df_timeframe[wav_name] for wav_name in df_timeframe.index
}

In [None]:
# Total number of positive time frames across all wav files.
a = sum(len(time_frames) for time_frames in positive_sample_dict.values())
a

**Extracting the positive sample wav files and putting it in the input positive sample data folder for ML modeling**

In [None]:
# Export the positive time frames as "_call" clips into the test positive folder.
extract_segments(
    audioPath='./data/test/OrcasoundLab09272017_Test/wav/',
    sampleDict=positive_sample_dict,
    destnPath='./data/test/all/positive/',
    suffix="_call",
)

**Creating a dictionary of negative sample time proposals for noise file creation**

In [None]:
# For each wav file, propose candidate windows at 1-second steps and keep
# only those that overlap none of its annotated time frames.
negative_sample_dict = {
    wav_name: not_overlap_list_x_list(
        create_possible_timeframe(df_timeframe_max[wav_name], 1),
        df_timeframe[wav_name],
    )
    for wav_name in df_timeframe.index
}

In [None]:
# Total number of negative time proposals across all wav files.
a = sum(len(time_frames) for time_frames in negative_sample_dict.values())
a

**Extracting the negative sample wav files and putting it in the input negative sample data folder for ML modeling**

In [None]:
# Negative clips carry the "_Noise" suffix so their file names are not
# labelled as calls.
extract_segments(
    './data/test/OrcasoundLab09272017_Test/wav/',
    negative_sample_dict,
    './data/test/all/negative/',
    "_Noise"
)

In [None]:
# Generating wav for scoring

**This part of the code generates 2-second clips from our test file for final scoring in the official evaluation**

## OrcasoundLab07052019_Test

In [None]:
## Finding the maximum duration of the test wav file
# Pass the file through `path=`, the same keyword delete_empty_wav_files
# uses; `filename=` is the deprecated spelling of this argument.
max_length = librosa.get_duration(path="./data/test/OrcasoundLab07052019_Test/wav/OS_7_05_2019_08_24_00_.wav")

In [None]:
## Generating 2 sec proposal with 1 sec hop length
# Window starts run over 0 .. floor(max_length) - 2, one per second.
twoSecList = [
    [start, start + 2]
    for start in range(int(np.floor(max_length) - 1))
]

In [None]:
# Creating a proposal dictionary: one wav file name mapped to its 2 sec windows
two_sec_dict = {'OS_7_05_2019_08_24_00_.wav': twoSecList}

**Extracting 2 sec samples from test file for official evaluation**

In [None]:
# Export every 2 sec window as its own clip; no suffix is added to the names.
extract_segments(
    audioPath='./data/test/OrcasoundLab07052019_Test/wav/',
    sampleDict=two_sec_dict,
    destnPath='./data/test/OrcasoundLab07052019_Test/test2Sec/',
    suffix="",
)

# Deleting Empty .wav Files

In [None]:
import os
import librosa
import pathlib

# Delete all 0-length wav files in generated sample set
def delete_empty_wav_files(directory_path):
    """
    Delete every zero-length ``.wav`` file found directly inside
    ``directory_path``.

    Only regular files whose suffix is exactly ``.wav`` are checked, and
    sub-directories are not descended into. A file counts as empty when
    ``librosa.get_duration`` reports 0.0 for it.

    An ``OSError`` (for example a missing directory) is reported on stdout
    rather than propagated, and stops the scan of this directory.
    """
    try:
        with os.scandir(directory_path) as file_iterator:
            # Collect the entries first; presumably so the progress bar can
            # show a total.
            file_list = list(file_iterator)
            for entry in tqdm_notebook(
                file_list,
                desc=f'Check .wav files in {directory_path} for zero-length'
            ):
                is_wav_file = (entry.is_file()
                               and pathlib.Path(entry.path).suffix == '.wav')
                if is_wav_file and librosa.get_duration(path=entry.path) == 0.0:
                    print(f"{entry.path} is 0s long. Deleting...")
                    os.unlink(entry.path)
    except OSError as err:
        # Say which directory failed and why; the bare message hid both.
        print(f"Error occurred while deleting files in {directory_path}: {err}")

# Delete 0-length wav files in each sample directory
for sample_directory in ('data/train/mldata/all/positive',
                         'data/train/mldata/all/negative'):
    delete_empty_wav_files(sample_directory)