### Table of Contents

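This notebook contains the code to load and preprocess the fin whale and blue whale call detections stored in `WhaleDetectionsLSZ.mat`, and to save them as CSV files.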

* [1. Load data](#loaddata)
* [2. Clean data](#cleandata)
* [3. Save data](#savedata)

# Load data <a class="anchor" id="loaddata"></a>

In [None]:
from datetime import date
from datetime import timedelta

import numpy as np
import pandas as pd
import scipy.io

In [None]:
def read_stadir(mat_file, name):
    """Build a station-number -> station-name lookup from a loaded .mat file.

    Parameters
    ----------
    mat_file : mapping
        Loaded .mat content, indexable by variable name.
    name : str
        Key of the station directory to read (e.g. 'stadir').

    Returns
    -------
    dict
        Maps 1-based row position in the directory to the station name.

    Raises
    ------
    KeyError
        If `name` is not a variable in `mat_file`.
    """
    # Column 0 holds one entry per station; the name is the first element of
    # each entry (assumes the usual scipy layout for a MATLAB cell array of
    # strings -- TODO confirm against the file).
    station_names = [entry[0] for entry in mat_file[name][:, 0]]
    # Number stations from 1 so the keys line up with the 'station_number' column.
    return dict(enumerate(station_names, start=1))

In [None]:
# Column labels applied, in order, to the raw 'FWC' and 'BWC' detection matrices.
colnames_WC = [
    # when / where
    'Datenum', 'station_number',
    # detection measurements
    'R', 'SNR',
    # grouping and timing
    'group_index', 'time_start',
    # detection bookkeeping
    'num_calls_in_detection', 'detection_id',
]

In [None]:
# Absolute location of the detections .mat file; adjust this single constant
# when running the notebook on another machine.
WHALE_DETECTIONS_MAT = '/network/projects/aia/whale_call/calls_data/WhaleDetectionsLSZ.mat'

# loadmat returns a dict keyed by MATLAB variable name.
mat = scipy.io.loadmat(WHALE_DETECTIONS_MAT)

In [None]:
# One detections table per species: 'FWC' feeds fwc and 'BWC' feeds bwc,
# both labelled with the shared column names.
fwc, bwc = (
    pd.DataFrame(mat[matrix_key], columns=colnames_WC)
    for matrix_key in ('FWC', 'BWC')
)

# Station-number -> station-name lookups requested for each species.
stadir_fw, stadir_bw = (
    read_stadir(mat, directory_key)
    for directory_key in ('stadir', 'stadir_bw')
)

# Clean data <a class="anchor" id="cleandata"></a>

In [None]:
def preprocess_(dataset, stadir):
    """Add time columns, integer dtypes and station names to a detections table.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns 'Datenum', 'time_start', 'station_number',
        'group_index', 'num_calls_in_detection' and 'detection_id'.
    stadir : dict
        Maps a station number to its station name.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) with the added columns
        'datetime', 'time', 'date', 'datetime_start' and 'station_name'.
    """
    # Offset that turns 'Datenum' into days since 1970-01-01
    # (719529 is the MATLAB serial date number of the Unix epoch).
    datenum_unix_epoch = 719529

    # Work on a copy so the caller's frame is not modified as a side effect.
    dataset = dataset.copy()

    ## Time
    dataset['datetime'] = pd.to_datetime(dataset['Datenum'] - datenum_unix_epoch, unit='D')
    dataset['time'] = dataset['datetime'].dt.time
    dataset['date'] = dataset['datetime'].dt.date
    # 'time_start' is rounded to whole seconds and kept as a time of day only.
    dataset['datetime_start'] = pd.to_datetime(dataset['time_start'].round(), unit='s').dt.time

    ## Column types
    # astype(int) raises if any of these columns holds NaN or inf.
    integer_columns = ['num_calls_in_detection', 'group_index', 'station_number', 'detection_id']
    dataset[integer_columns] = dataset[integer_columns].astype(int)

    ## Station number
    # Station numbers missing from `stadir` become NaN in 'station_name'.
    dataset['station_name'] = dataset['station_number'].map(stadir)

    return dataset

In [None]:
# Each species is resolved against its own station directory.
fwc = preprocess_(fwc,stadir_fw)
bwc = preprocess_(bwc,stadir_bw)

In [None]:
# Count distinct detection ids per species.
# NOTE(review): the messages say "calls" but the count is of unique
# 'detection_id' values -- confirm that is the intended figure.
n_fin_detections = fwc['detection_id'].nunique()
n_blue_detections = bwc['detection_id'].nunique()

print("Number of Fin Whale calls detected: {}".format(n_fin_detections))
print("Number of Blue Whale calls detected: {}".format(n_blue_detections))

# Save data <a class="anchor" id="savedata"></a>

In [None]:
# Write one CSV per species under the relative 'data/' folder.
# The DataFrame index is written too, because to_csv keeps it by default.
fwc.to_csv(path_or_buf='data/fwc_preprocessed.csv')
bwc.to_csv(path_or_buf='data/bwc_preprocessed.csv')