In [48]:
import os
os.environ['AWS_PROFILE'] = 'admin'
os.environ['HAVEN_DATABASE'] = 'haven'

import numpy as np
import pandas as pd
from scipy.stats import norm
from random import sample
import h3

from mirrorverse.utils import read_data_w_cache
from haven.db import write_data

In [None]:
# Downsample the depth records onto a 15 minute (900 second) grid: for every
# tag and 900 s bucket we keep the earliest reading in the bucket and stamp it
# with the epoch at which the bucket starts.
depth_query = '''
    select distinct
        tag_key,
        first_value(depth) over (partition by tag_key, epoch - epoch % 900 order by epoch asc) as depth,
        epoch - epoch % 900 as epoch
    from 
        mgietzmann_tag_depths 
    '''
data = read_data_w_cache(depth_query)

# discard the rows that carry no depth reading
has_depth = ~np.isnan(data['depth'])
data = data[has_depth]
print(data.shape)
data.head()

In [15]:
def get_depth_class(depth_classes, depth, rng=None):
    """
    Inputs:
    - depth_classes: array-like of numbers, the depth classes to choose from
        (any order, they are sorted internally)
    - depth: float, the depth of the fish as recorded
    - rng: optional np.random.Generator (or RandomState) used for the draw.
        When omitted numpy's global random state is used, so np.random.seed
        still controls the outcome.

    Outputs:
    - float, the selected depth class

    Raises:
    - ValueError if depth_classes is empty or depth is not a finite number

    Selects a depth class based on the depth of the fish.

    It turns out that PSAT summary data bins the depth into
    intervals so the actual depth is not known. However
    given the recorded depth we can estimate the depth classes
    it could belong to and the likelihoods of each.

    The true depth is modelled as normal around the recorded depth.
    Each class gets the probability mass lying between the previous
    class and itself (the first class takes everything below it), and
    one class is then drawn at random with those probabilities.
    """
    # sorting makes the cdf differences below non-negative for any input order
    depth_classes = np.sort(np.asarray(depth_classes, dtype=float))
    if depth_classes.size == 0:
        raise ValueError("depth_classes must contain at least one class")
    if not np.isfinite(depth):
        raise ValueError(f"depth must be a finite number, got {depth!r}")

    # 8% of the recorded depth is treated as the half width of the 95%
    # confidence interval, i.e. ~1.96 standard deviations. abs() keeps the
    # spread positive for slightly negative readings.
    sd = abs(depth) * 0.08 / 1.96
    if sd == 0:
        # no spread at all: everything goes to the shallowest class
        division = np.zeros(len(depth_classes))
        division[0] = 1
    else:
        z = (depth_classes - depth) / sd
        division = norm.cdf(z)
        # turn the cumulative probabilities into per class probabilities
        division[1:] = division[1:] - division[:-1]

    # if there aren't quite enough depth classes the
    # probabilities may not sum to 1, so we'll normalize
    total = division.sum()
    if total > 0:
        division = division / total
    else:
        # every class underflowed to zero mass; fall back to the closest class
        division = np.zeros(len(depth_classes))
        division[np.argmin(np.abs(depth_classes - depth))] = 1

    chooser = np.random if rng is None else rng
    return float(chooser.choice(depth_classes, p=division))

In [None]:
# candidate depth classes handed to get_depth_class
depth_bins = [25, 50, 75, 100, 150, 200, 250, 300, 400, 500]

# get_depth_class draws from numpy's global random state; seeding it here means
# that re-running this cell on the same rows in the same order reproduces the
# same depth_bin assignment.
DEPTH_BIN_SEED = 42
np.random.seed(DEPTH_BIN_SEED)

data['depth_bin'] = data['depth'].apply(lambda depth: get_depth_class(depth_bins, depth))
data.head()

In [None]:
# Split by individual (tag), not by row, so that all records of one tag end up
# on the same side of the split.
TEST_FRACTION = 0.35
SPLIT_SEED = 42

# sorted so the split depends only on the set of tags, not on row order
tag_keys = sorted(data['tag_key'].unique())
n_test = int(len(tag_keys) * TEST_FRACTION)

# a dedicated, seeded generator keeps the split reproducible across runs
split_rng = np.random.default_rng(SPLIT_SEED)
test_positions = split_rng.choice(len(tag_keys), size=n_test, replace=False)
test_keys = [tag_keys[position] for position in test_positions]
print(len(test_keys))

# set membership avoids rescanning the test list for every key
test_key_set = set(test_keys)
train_keys = [key for key in tag_keys if key not in test_key_set]
print(len(train_keys))

# one row per tag: train tags first, then test tags; the running row number
# becomes the `_individual` id
keys_df = (
    pd.concat([
        pd.DataFrame({'tag_key': train_keys, '_train': [True] * len(train_keys)}),
        pd.DataFrame({'tag_key': test_keys, '_train': [False] * len(test_keys)}),
    ])
    .reset_index(drop=True)
    .reset_index()
    .rename({'index': '_individual'}, axis=1)
)
keys_df.head()

In [None]:
# Attach `_train` and `_individual` to every depth row. No `on=` is given, so
# pandas joins on whatever columns the two frames have in common.
data = pd.merge(data, keys_df)
data.head()

In [None]:
# Give every row a `_decision` id equal to its position after sorting.
# `epoch` is used as a tie-breaker inside each individual: sorting on
# `_individual` alone leaves the order within an individual up to the
# (unstable) default sort, which makes the ids depend on the incoming row order.
data = (
    data
    .sort_values(['_individual', 'epoch'])
    .reset_index(drop=True)
    .reset_index()
    .rename({'index': '_decision'}, axis=1)
)

In [None]:
# Load the positions (longitude / latitude per tag and epoch) of every tag track.
tracks_query = 'select tag_key, epoch, longitude, latitude from mgietzmann_tag_tracks'
tracks = read_data_w_cache(tracks_query)
print(tracks.shape)
tracks.head()

In [None]:
# Tag every track position with its h3 cell at resolution 4.
# Iterating the two coordinate columns directly yields the same values as a
# row-wise DataFrame.apply without building a Series per row, and it also
# copes with an empty `tracks` frame.
# NOTE(review): geo_to_h3 is the h3-py v3 name (v4 calls it latlng_to_cell) —
# confirm the pinned h3 version.
tracks['h3_index'] = [
    h3.geo_to_h3(latitude, longitude, resolution=4)
    for latitude, longitude in zip(tracks['latitude'], tracks['longitude'])
]
tracks.head()

In [None]:
# Key both frames by calendar date (derived from the epoch seconds) so that a
# depth record can pick up the h3 cell recorded for its tag on that date.
for frame in (tracks, data):
    frame['time'] = pd.to_datetime(frame['epoch'], unit='s').dt.date

# Inner join: depth rows without a track position on the same date are dropped.
# NOTE(review): if tracks holds more than one row per (tag_key, time) this join
# repeats depth rows (and their `_decision` ids) — confirm tracks is daily.
track_cells = tracks[['tag_key', 'time', 'h3_index']]
data = data.merge(track_cells, how='inner', on=['tag_key', 'time'])

In [59]:
# Persist the assembled frame as the `chinook_depth_decisions` table.
# NOTE(review): the trailing ['_train'] list is presumably the partition
# columns expected by haven's write_data — confirm against its signature.
write_data(data, 'chinook_depth_decisions', ['_train'])

In [58]:
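# Cleanup helper, intentionally left commented out. Presumably it removes the
# table written by this notebook so that it can be rebuilt from scratch —
# confirm before uncommenting and running it.
#from haven.db import drop_table
#
#drop_table('chinook_depth_decisions')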