In [1]:
import os
import plotly.express as px
import numpy as np
import pandas as pd

# Environment configuration, set before the mirrorverse / haven imports that
# follow. Presumably those packages read these variables to select the
# database and the AWS credentials profile -- TODO confirm.
os.environ['HAVEN_DATABASE'] = 'haven'
os.environ['AWS_PROFILE'] = 'admin'

from mirrorverse.utils import read_data_w_cache
from haven.db import write_data, drop_table

In [None]:
# Query that attaches environmental covariates to every candidate choice in
# movement_model_features_m3_a4:
#   * elevation   - mean_elevation_by_h3 at h3_resolution 4
#   * physics     - temperature and salinity from copernicus_physics at
#                   depth_bin 25, h3_resolution 4, region 'chinook_study'
# Both are matched on the choice's next_h3_index; physics is additionally
# matched on the calendar date derived from the feature row's `time`.
# The joins are inner joins, so feature rows without a matching elevation or
# physics record are dropped from the result.
sql = """
with elevation as (
    select 
        h3_index,
        elevation
    from 
        mean_elevation_by_h3
    where 
        h3_resolution = 4
), physics as (
    select 
        date,
        h3_index,
        temperature,
        salinity
    from 
        copernicus_physics
    where 
        depth_bin = 25
        and h3_resolution = 4
        and region = 'chinook_study'
), features as (
    select 
        _individual, _decision, _choice, _selected, _train, tag_key,
        mixed_layer_thickness, net_primary_production, 
        water_heading, movement_heading, 
        distance, origin_h3_index, next_h3_index, h3_index, time,
        region, home_lat, home_lon, fork_length_cm,
        cos_time, sin_time, date_format(time, '%Y-%m-%d') as date
    from 
        movement_model_features_m3_a4
)

select 
    f.*,
    e.elevation,
    p.salinity,
    p.temperature
from 
    features f
    inner join elevation e 
        on e.h3_index = f.next_h3_index
    inner join physics p 
        on p.h3_index = f.next_h3_index 
        and p.date = f.date 
"""
# Run the query (through the project's caching reader) and report the raw size.
data = read_data_w_cache(sql)
print('Shape Before:', data.shape)

# Upper bound (exclusive) on `distance`, applied to decisions and to choices.
max_distance = 50

# Step 1: keep only the decisions whose *selected* choice lies within the
# distance bound. `_filter` holds one row per surviving decision.
selected_in_range = data['_selected'] & (data['distance'] < max_distance)
_filter = data.loc[selected_in_range, ['_individual', '_decision']].drop_duplicates()
data = data.merge(_filter, how='inner', on=['_individual', '_decision'])

# Flag choices with zero distance as 1.0, everything else as 0.0.
data['stay_put'] = (data['distance'] == 0).astype(int).astype(float)

# Step 2: within the surviving decisions, discard the individual choices
# that fall outside the distance bound.
data = data[data['distance'] < max_distance]
print('Shape After:', data.shape)
data.head()

In [None]:
# Data augmentation: build jittered copies of every training individual.
#
# For each id offset (1000, 2000, 3000) all rows belonging to individuals that
# appear in the training split are copied, the copy's `_individual` is shifted
# by the offset, and uniform noise is added to four features:
#   elevation              multiplied by a factor in [0.9, 1.1)
#   salinity               shifted by a value in [-0.25, 0.25)
#   mixed_layer_thickness  shifted by a value in [-5, 5), floored at 0
#   movement_heading       shifted by a value in [-pi/8, pi/8), wrapped to [0, 2*pi)
#
# Result: `additions`, the concatenation of the jittered copies. Within each
# copy the rows keep the order they have in `data`.

# Seeded generator so that Restart & Run All reproduces the same augmentation.
rng = np.random.default_rng(42)
id_offsets = range(1000, 4000, 1000)

train_ids = data.loc[data['_train'], '_individual'].unique()

# The shifted ids must be new and mutually distinct; otherwise separate
# individuals would be merged silently. This also fails fast when the cell is
# re-run after `additions` has already been appended to `data`.
existing_ids = set(data['_individual'].unique())
augmented_ids = {offset + _id for offset in id_offsets for _id in train_ids}
if (
    len(augmented_ids) != len(id_offsets) * len(train_ids)
    or augmented_ids & existing_ids
):
    raise ValueError(
        'augmented _individual ids collide with existing ids; '
        'reload `data` or choose larger id offsets'
    )

# Take the source rows once instead of re-filtering `data` per individual.
train_rows = data[data['_individual'].isin(train_ids)]

dfs = []
for offset in id_offsets:
    # Explicit copy: the assignments below must never touch (or warn about) `data`.
    df = train_rows.copy()
    df['_individual'] = df['_individual'] + offset

    size = len(df)

    # elevation: multiplicative jitter of up to +/-10%
    df['elevation'] = df['elevation'] * (1.0 + rng.uniform(-0.1, 0.1, size=size))

    # salinity: additive jitter
    df['salinity'] = df['salinity'] + rng.uniform(-0.25, 0.25, size=size)

    # mixed layer thickness: additive jitter, negative results are set to 0
    df['mixed_layer_thickness'] = (
        df['mixed_layer_thickness'] + rng.uniform(-5, 5, size=size)
    ).clip(lower=0)

    # movement heading: additive jitter, wrapped back into [0, 2*pi)
    df['movement_heading'] = (
        df['movement_heading'] + rng.uniform(-np.pi / 8, np.pi / 8, size=size)
    ) % (2 * np.pi)

    dfs.append(df)

additions = pd.concat(dfs)

In [None]:
# Compare the size of the training rows in `data` with the size of the
# augmented rows.
for frame in (data[data['_train']], additions):
    print(frame.shape)

In [None]:
# Append the augmented rows to the dataset.
# Guard: this cell overwrites `data`, so running it a second time would append
# the same augmented rows again. Detect that (or any other id collision) by
# checking whether the augmented ids are already present, and stop instead of
# silently duplicating rows.
if data['_individual'].isin(additions['_individual']).any():
    raise RuntimeError(
        '`data` already contains the augmented _individual ids; '
        're-run the notebook from the data loading cell'
    )
data = pd.concat([data, additions])
print(data.shape)

In [None]:
def catch_region_map(tag_key):
    """Map a tag key to an integer code based on its leading characters.

    The key is compared against a fixed, ordered list of three-character
    prefixes; the position of the first matching prefix is returned.
    (Presumably each prefix identifies a catch region -- TODO confirm.)

    Parameters
    ----------
    tag_key : str
        Tag identifier. Non-string values (e.g. None or NaN from a missing
        key) are treated as "no match" instead of raising.

    Returns
    -------
    int
        Index of the matching prefix (0-6), or -1 when nothing matches.
    """
    prefixes = ('172', '202', '159', '205', '210', '229', '142')
    # Missing keys have no `startswith`; report them as unmatched.
    if not isinstance(tag_key, str):
        return -1
    return next(
        (code for code, prefix in enumerate(prefixes) if tag_key.startswith(prefix)),
        -1,
    )

# Derive the integer catch-region code for every row from its tag key and
# list the distinct codes that occur.
catch_region = data['tag_key'].map(catch_region_map)
data['catch_region'] = catch_region
print(catch_region.unique())

In [None]:
# Mixed layer thickness on a log scale, divided by the log of the column
# maximum. The 0.01 offset keeps the logarithm finite for a thickness of 0.
mlt = data['mixed_layer_thickness']
mlt_max = mlt.max()
print(mlt_max)
data['normed_log_mlt'] = np.log(mlt + 0.01) / np.log(mlt_max + 0.01)
px.histogram(data['normed_log_mlt'])

In [None]:
# Net primary production on a log scale, divided by the log of the column
# maximum (same 0.01 offset as used for the mixed layer thickness).
npp = data['net_primary_production']
log_npp_max = np.log(npp.max() + 0.01)
data['normed_log_npp'] = np.log(npp + 0.01) / log_npp_max
px.histogram(data['normed_log_npp'])

In [46]:
def round_angle(d):
    """Round an angle in radians to the nearest multiple of pi/4."""
    step = np.pi / 4
    n_steps = round(d / step)
    return n_steps * step
# Encode both headings (radians) as sine / cosine components so that the
# wrap-around at 2*pi does not create a discontinuity in the features.
movement_heading = data['movement_heading']
water_heading = data['water_heading']
data['sin_mh'] = np.sin(movement_heading)
data['cos_mh'] = np.cos(movement_heading)
data['sin_wh'] = np.sin(water_heading)
data['cos_wh'] = np.cos(water_heading)

In [None]:
# Standardise distance: subtract the column mean and divide by the column
# standard deviation. Despite the column name, no binning takes place.
distance = data['distance']
distance_mean = distance.mean()
distance_std = distance.std()
data['binned_distance'] = (distance - distance_mean) / distance_std
px.histogram(data['binned_distance'])

In [None]:
# Rows whose region is not one of SEAK / WA/OR / BC get the mean home latitude
# of the rows that are in those regions.
known_region = data['region'].isin(['SEAK', 'WA/OR', 'BC'])
mean_home_lat = data.loc[known_region, 'home_lat'].mean()
data.loc[~known_region, 'home_lat'] = mean_home_lat

# Min-max scale the (imputed) home latitude to [0, 1].
lat_min = data['home_lat'].min()
lat_max = data['home_lat'].max()
data['normed_home_lat'] = (data['home_lat'] - lat_min) / (lat_max - lat_min)
px.histogram(data['normed_home_lat'])

In [None]:
# Rows whose region is not one of SEAK / WA/OR / BC get the mean home longitude
# of the rows that are in those regions.
known_region = data['region'].isin(['SEAK', 'WA/OR', 'BC'])
mean_home_lon = data.loc[known_region, 'home_lon'].mean()
data.loc[~known_region, 'home_lon'] = mean_home_lon

# Min-max scale the (imputed) home longitude to [0, 1].
lon_min = data['home_lon'].min()
lon_max = data['home_lon'].max()
data['normed_home_lon'] = (data['home_lon'] - lon_min) / (lon_max - lon_min)
px.histogram(data['normed_home_lon'])

In [50]:
# Indicator (1.0 / 0.0) for rows whose region is not one of SEAK / WA/OR / BC,
# i.e. the rows whose home coordinates were replaced by a mean value.
in_known_region = data['region'].isin(['SEAK', 'WA/OR', 'BC'])
data['region_unknown'] = (~in_known_region).astype(float)

In [None]:
# Min-max scale fork length (cm) to [0, 1].
fork_length = data['fork_length_cm']
fl_min = fork_length.min()
fl_max = fork_length.max()
data['normed_fl'] = (fork_length - fl_min) / (fl_max - fl_min)
px.histogram(data['normed_fl'])

In [None]:
# Standardise salinity (zero mean, unit standard deviation); the statistics
# are printed so they can be inspected.
salinity = data['salinity']
salinity_mean = salinity.mean()
salinity_std = salinity.std()
print(salinity_mean)
print(salinity_std)
data['normed_salinity'] = (salinity - salinity_mean) / salinity_std
px.histogram(data['normed_salinity'])

In [None]:
# Cap elevation at -e (this overwrites the `elevation` column itself), so that
# log(-elevation) is at least 1 for every capped or deeper value.
above_cap = data['elevation'] > -np.e
data.loc[above_cap, 'elevation'] = -np.e

# Log of the (positive) depth, scaled by its maximum.
log_depth = np.log(-data['elevation'])
data['normed_elevation'] = log_depth / log_depth.max()
px.histogram(data['normed_elevation'])

In [None]:
# Standardise temperature (zero mean, unit standard deviation).
temperature = data['temperature']
temperature_mean = temperature.mean()
temperature_std = temperature.std()
data['normed_temp'] = (temperature - temperature_mean) / temperature_std
px.histogram(data['normed_temp'])

In [55]:
# Replace the output table with the engineered feature set.
# NOTE(review): the table is dropped before the new data is written, so a
# failing write presumably leaves no table behind -- confirm against haven.db.
table_name = 'movement_model_features_m9_a2'
drop_table(table_name)
# The third argument is presumably the list of partition columns -- TODO confirm.
write_data(data, table_name, ['tag_key'])

In [None]:
# Display the final column set of `data`.
data.columns

In [None]:
# Summary statistics of the number of distinct decisions per individual.
decisions_per_individual = data.groupby('_individual')['_decision'].nunique()
decisions_per_individual.describe()

In [None]:
# 95th percentile of the number of distinct decisions per individual.
decisions_per_individual = data.groupby('_individual')['_decision'].nunique()
decisions_per_individual.quantile(0.95)

In [None]:
# Summary statistics of the number of distinct choices offered per decision.
choices_per_decision = data.groupby(['_individual', '_decision'])['_choice'].nunique()
choices_per_decision.describe()