### Table of Contents

This notebook contains the code to load, preprocess, and explore the labeled whale call detection dataset.

* [1. Load data](#loaddata)
* [2. Stats of raw data](#stats)
    * [Type of files](#typefiles)
    * [Number of files per station](#stationfiles)
* [3. Visually explore data](#vizexp)
    * [Obspy](#obspy)
    * [Pysmo](#pysmo)


# Load data <a class="anchor" id="loaddata"></a>

In [None]:
import glob
import pandas as pd
import random
from collections import Counter

# Root folder of the dataset; the .SAC files sit exactly one directory
# level below it, which is what the "/*/*.SAC" pattern matches.
file_directory = '/network/projects/aia/whale_call'
sac_pattern = f"{file_directory}/*/*.SAC"
list_files = glob.glob(sac_pattern)

In [None]:
# Report how many paths were collected into list_files.
total_files = len(list_files)
print("Total number of files:", total_files)

# Stats of raw data <a class="anchor" id="stats"></a>

### Type of files <a class="anchor" id="typefiles"></a>

In [None]:
# Tally the files by the three characters that immediately precede the
# ".SAC" suffix (the slice file[-7:-4]).
extension_counts = Counter(file[-7:-4] for file in list_files)

# Kept under its original name and type (a dict keys view) because later
# cells use it as the DataFrame columns and iterate over it.
file_extensions = extension_counts.keys()

# Print the exact tallies from the Counter instead of re-scanning every path
# with a substring test, which would also count a path where the code merely
# appears somewhere else (e.g. in a directory or station name).
for extension, n_files in extension_counts.items():
    print("{}: {} files".format(extension, n_files))

### Number of files per station <a class="anchor" id="stationfiles"></a>

In [None]:
sta_names = ["PMAQ","ICQ","SNFQ","RISQ","SMQ","CNQ"]

for sta in sta_names:
    # Match the station code against the file name only (last path
    # component), not against the directories leading to it.
    sta_files = [s for s in list_files if sta in s.split('/')[-1]]
    # Every path kept above contains the station code, so the list length is
    # the count; no need to filter and test the paths a second time.
    print("{}: {} files".format(sta, len(sta_files)))

### Stations and files

In [None]:
# Empty summary table: one row per station, one column per three-character
# code found in file_extensions. Cells are filled in by a later cell.
station_rows = ["PMAQ","ICQ","SNFQ","RISQ","SMQ","CNQ"]
df = pd.DataFrame(index=station_rows, columns=file_extensions)

In [None]:
sta_names = ["PMAQ","ICQ","SNFQ","RISQ","SMQ","CNQ"]

for sta in sta_names:
    # Files whose name (last path component) contains the station code.
    sta_files = [s for s in list_files if sta in s.split('/')[-1]]
    for extension in file_extensions:
        # Compare against the same slice that produced the column labels
        # (s[-7:-4]) so a code appearing elsewhere in the path is not counted.
        n_files = sum(s[-7:-4] == extension for s in sta_files)
        # Write through .loc: assigning into df.xs(sta)[...] is chained
        # assignment on a cross-section, which is not guaranteed to modify df
        # (and never does under pandas Copy-on-Write).
        df.loc[sta, extension] = n_files

In [None]:
# Work from the station rows / code columns only, so that re-running this
# cell does not fold an existing 'Total' row or column into the new sums.
station_counts = df.drop(index='Total', columns='Total', errors='ignore')

# Column totals across stations (appended as, or overwriting, the 'Total' row).
df.loc['Total'] = station_counts.sum()
# Row totals across codes, including the grand total on the 'Total' row.
df['Total'] = df[station_counts.columns].sum(axis=1).astype(int)

In [None]:
# Display the summary table (bare last expression so the notebook renders it).
df