### Table of Contents

This notebook contains the code to load, preprocess, and explore the labeled whale call detection dataset.

* [1. Load data](#loaddata)
* [2. Stats of raw data](#stats)
    * [Type of files](#typefiles)
    * [Number of files per station](#stationfiles)
* [3. Visually explore data](#vizexp)
    * [Obspy](#obspy)
    * [Pysmo](#pysmo)


# Load data <a class="anchor" id="loaddata"></a>

In [None]:
import glob
import pandas as pd
import random
from collections import Counter

# Root folder of the whale-call data; the pattern below matches every .SAC
# file located exactly one sub-directory beneath it.
file_directory = '/network/projects/aia/whale_call'
sac_pattern = file_directory + '/*/*.SAC'
list_files = glob.glob(sac_pattern)

# Visual exploration <a class="anchor" id="vizexp"></a>

#### Load labeled data

In [None]:
# Species key, either 'fw' or 'bw'. It selects the labeled CSV that is loaded
# and the section of the config that is read further down.
WHALE_TYPE = 'bw'

In [None]:
# Labeled detections for the selected species, read from
# data/<WHALE_TYPE>c_preprocessed.csv.
labels_csv_path = 'data/' + WHALE_TYPE + 'c_preprocessed.csv'
label_d = pd.read_csv(labels_csv_path)

# Count distinct detections (several rows can share one detection_id).
n_detections = label_d['detection_id'].nunique()
print("Total number of Whale detection:", n_detections)

##### Select random detection

In [None]:
# Draw one detection id at random, then keep only the rows that belong to it.
picked_id = random.choice(label_d['detection_id'].unique())
detection = label_d.loc[label_d['detection_id'] == picked_id]

# Reduce each column of interest to a single value: its maximum over the
# rows of the selected detection.
(detection_id,
 date,
 datetime_start,
 station_name,
 num_calls_in_detection) = (
    detection[column].max()
    for column in (
        'detection_id',
        'date',
        'datetime_start',
        'station_name',
        'num_calls_in_detection',
    )
)

In [None]:
# Three-line summary of the randomly selected detection.
summary_lines = [
    "Station name: {}".format(station_name),
    "Date detection: {} | Time detection: {}".format(date, datetime_start),
    "Number of calls on this detection: {}".format(num_calls_in_detection),
]
print("\n".join(summary_lines))

##### Match label data with raw data

In [None]:
import datetime
from obspy import UTCDateTime
from datetime import date
from datetime import timezone

# Work on a copy so the derived columns below are not written into `detection`.
df_time_changed = detection.copy()

# Naive datetime parsed from the 'datetime' string after dropping its last
# three characters.
# NOTE(review): presumably the strings carry more fractional digits than %f
# accepts (at most six) -- confirm against the CSV.
df_time_changed['datetime_datetime'] = df_time_changed['datetime'].apply(
    lambda stamp: datetime.datetime.strptime(stamp[:-3], '%Y-%m-%d %H:%M:%S.%f'))

# ObsPy timestamp built from the full, untrimmed string.
df_time_changed['datetime_UTCDateTime'] = df_time_changed['datetime'].apply(UTCDateTime)

# Day ordinal of each timestamp. The method is called on the value itself
# instead of going through the global name `date`, because other cells of
# this notebook rebind `date` to non-class values; that made this line
# depend on cell execution order.
df_time_changed['datetime_ordinal'] = df_time_changed['datetime_datetime'].apply(
    lambda moment: moment.toordinal())

# Same timestamps with tzinfo set to UTC (the clock time itself is unchanged).
df_time_changed['datetime_tz'] = df_time_changed['datetime_datetime'].apply(
    lambda moment: moment.replace(tzinfo=timezone.utc))

In [None]:
# One row per detection: earliest/latest call time in each time
# representation, plus the full per-call lists.
grouped_df = (
    df_time_changed
    .groupby('detection_id')
    .agg(
        min_time_utc=('datetime_UTCDateTime', 'min'),
        max_time_utc=('datetime_UTCDateTime', 'max'),
        min_time=('datetime_datetime', 'min'),
        max_time=('datetime_datetime', 'max'),
        min_time_ordinal=('datetime_ordinal', 'min'),
        max_time_ordinal=('datetime_ordinal', 'max'),
        datetz_list=('datetime_tz', list),
        datenum_list=('Datenum', list),
        date_list=('datetime', list),
    )
    .reset_index()
)

# Time spanned by the calls of each detection. It is computed as latest minus
# earliest so the value is never negative (the operands used to be swapped).
grouped_df['length'] = grouped_df['max_time'] - grouped_df['min_time']

### Obspy <a class="anchor" id="obspy"></a>

In [None]:
from obspy import read
import matplotlib.pyplot as plt
import glob

# Day of the selected detection, as stored in the label table.
date = detection['date'].max()
# The raw-data folder name is the date with its dashes removed.
date_str = date.replace('-', '')
directory_path = '/network/projects/aia/whale_call/RAW/' + date_str + '/'

# glob returns matches in arbitrary order. Sorting makes the order of the
# files (and therefore of the traces read from them, the plot rows and any
# index-based access further down) identical on every run.
files = sorted(glob.glob(directory_path + '*' + station_name + '*.SAC'))

In [None]:
# Fail with an explicit message when the glob above matched nothing; indexing
# files[0] on an empty list would only report "list index out of range".
if not files:
    raise FileNotFoundError(
        "No SAC file found for station {!r} in {}".format(station_name, directory_path))

# Read every matching file and concatenate the results into one Stream.
threechannels = read(files[0])
for sac_path in files[1:]:
    threechannels += read(sac_path)

In [None]:
# Overview plot of everything that was read for this station and day.
overview_size = (1200, 400)
threechannels.plot(size=overview_size)

### Plot whale calls for one detection (species set by `WHALE_TYPE`)

In [None]:
import yaml

# Load the project configuration and keep only its 'whale_constant' section,
# which is indexed by species key in the cells below.
with open('../config/config.yml', 'r') as config_file:
    full_config = yaml.safe_load(config_file)
param_data = full_config['whale_constant']

In [None]:
# Time window shared by every channel: it starts at the earliest call of the
# detection and extends by the "window" value configured for this species.
# Both names are reused by later cells.
starttime = df_time_changed['datetime_UTCDateTime'].min()
endtime = starttime + param_data[WHALE_TYPE]["window"]

# squeeze=False keeps `ax` two-dimensional, so row indexing also works when
# the stream holds a single trace (subplots would otherwise return a bare
# Axes object that cannot be indexed).
fig, ax = plt.subplots(len(threechannels), 1, figsize=(15, 10), squeeze=False)
for index, trace in enumerate(threechannels):
    panel = ax[index, 0]

    # Waveform of this channel, padded by 50 on each side of the window.
    sliced = trace.slice(starttime - 50, endtime + 50)
    panel.plot(sliced.times("matplotlib"), sliced.data, "b-")

    # One vertical red line per labeled call. The loop variable is not named
    # `date`, so it does not overwrite the `date` set in an earlier cell.
    for call_time in df_time_changed['datetime_tz'].values:
        panel.axvline(
            x=call_time,
            color='r',
            label="whale Call")

    panel.xaxis_date()
    panel.set_xlabel('Time of day', fontweight='bold')
    panel.set_ylabel('Amplitude', fontweight='bold')
    panel.set_title(
        "{} calls | {} coordinates |Starting time: {} | Date: {}".format(
            param_data[WHALE_TYPE]["name"],
            sliced.stats.channel,
            df_time_changed.datetime_start.min(),
            df_time_changed.date.min(),
            ),
        fontweight='bold')
fig.tight_layout()
plt.show()

# Apply signal processing methods

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Band-pass corner frequencies configured for the selected species.
low_cut = param_data[WHALE_TYPE]["low_cut_bandpass"]
high_cut = param_data[WHALE_TYPE]["high_cut_bandpass"]

# One row per channel: raw signal on the left, band-passed signal on the right.
# squeeze=False keeps `ax` two-dimensional even with a single channel, so
# ax[index, column] is always valid.
fig, ax = plt.subplots(len(threechannels), 2, figsize=(15, 10), squeeze=False)
for index, trace in enumerate(threechannels):

    # Detection window padded by 20 on each side.
    sliced = trace.slice(
        starttime - 20,
        endtime + 20)

    # Band-pass filter applied to a copy, so the sliced raw data stays intact.
    tr_filt = sliced.copy()
    tr_filt.filter(
        'bandpass',
        freqmin=low_cut,
        freqmax=high_cut,
        corners=2,
        zerophase=True)

    # Relative time axis with exactly one value per sample. np.arange with a
    # float step can return one element too many, which makes plot(t, data)
    # fail on a length mismatch.
    t = sliced.times()

    ax[index, 0].plot(t, sliced.data)
    ax[index, 0].set_ylabel('Raw Data')
    ax[index, 0].set_xlabel('Time [s]')
    ax[index, 0].set_title('Raw signal | {} coordinates'.format(sliced.stats.channel))

    ax[index, 1].plot(t, tr_filt.data)
    ax[index, 1].set_ylabel('Bandpassed Data')
    ax[index, 1].set_xlabel('Time [s]')
    ax[index, 1].set_title('Bandpassed signal ({} Hz <-> {} Hz) | {} coordinates'.format(
        low_cut,
        high_cut,
        sliced.stats.channel))

fig.tight_layout()
plt.show()

In [None]:
# Spectrogram of the last band-passed trace produced by the loop above.
# The sampling rate is taken from the trace itself rather than hard-coded,
# so the time and frequency axes stay correct whatever rate the data has.
tr_filt.spectrogram(
    samp_rate=tr_filt.stats.sampling_rate,
    wlen=1,
    per_lap=0.5,
    dbscale=False)

### Add up all signals

Build a single spectrogram from the sum of the three band-passed channels.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Band-passed copy of every channel, restricted to the padded detection window.
dataa = []

for trace in threechannels:

    # Detection window padded by 20 on each side.
    sliced = trace.slice(
        starttime - 20,
        endtime + 20)

    # Band-pass filter applied to a copy of the slice.
    tr_filt = sliced.copy()
    tr_filt.filter(
        'bandpass',
        freqmin=param_data[WHALE_TYPE]["low_cut_bandpass"],
        freqmax=param_data[WHALE_TYPE]["high_cut_bandpass"],
        corners=2,
        zerophase=True)
    dataa.append(tr_filt)

# Sample-wise sum over all channels. np.add(a, b, c) must not be used here:
# its third positional argument is the output buffer, so the third channel
# would never be added and its filtered data would be overwritten.
# Channels are trimmed to the shortest one so the arrays line up.
common_npts = min(len(filtered.data) for filtered in dataa)
list_of_list = np.sum(
    [filtered.data[:common_npts] for filtered in dataa],
    axis=0)

# Carry the summed samples on a copy of the first filtered trace. Its header
# matches the sliced window, and the traces inside `threechannels` are left
# untouched so this cell can be re-run safely.
trace = dataa[0].copy()
trace.data = list_of_list

In [None]:
# Spectrogram of the summed trace built in the previous cell.
spectrogram_title = 'SPECTROGRAM'
trace.spectrogram(title=spectrogram_title)