In [1]:
import os
import random

import h3
import numpy as np
import pandas as pd
import haven.db as db 
import geopy.distance
import plotly.express as px

# Point the haven helpers at the right database and AWS profile.
# NOTE(review): these are set after `import haven.db`; if that module reads the
# environment at import time they take effect too late — confirm.
os.environ.update({
    'HAVEN_DATABASE': 'haven',
    'AWS_PROFILE': 'admin',
})

## Load the Raw Data

In [2]:
# Query that builds the raw modelling table:
#   * the `neighbors` CTE keeps the rows of movement_model_neighbors with
#     max_km = 100 and numbers them with row_number() to form `_choice`;
#   * every neighbor row is inner-joined to copernicus_physics and
#     copernicus_biochemistry on the neighbor H3 index and date, restricted to
#     depth_bin = 25 and region = 'chinook_study', so neighbors without a
#     matching row in either table are dropped.
# NOTE(review): row_number() over () has no ORDER BY, so the `_choice` numbering
# is presumably not guaranteed to be stable between runs — confirm whether
# downstream consumers rely on it.
sql = '''
with neighbors as (
    select 
        *,
        row_number() over () as _choice
    from 
        movement_model_neighbors
    where
        max_km = 100
)
select 
    n.current_h3_index,
    n.selected_h3_index,
    n.neighbor_h3_index,
    n.date,
    n._individual,
    n._decision,
    n._choice,
    n.tag_key,
    p.mixed_layer_thickness,
    p.velocity_east,
    p.velocity_north,
    c.net_primary_production
from 
    neighbors n 
    inner join copernicus_physics p 
        on p.depth_bin = 25 
        and p.region = 'chinook_study'
        and n.neighbor_h3_index = p.h3_index 
        and n.date = p.date 
    inner join copernicus_biochemistry c
        on c.depth_bin = 25 
        and c.region = 'chinook_study'
        and n.neighbor_h3_index = c.h3_index 
        and n.date = c.date 
'''

# Load the query result, preferring the local parquet cache when it exists.
# On a cache miss the database is queried and the result is cached for the
# next run.
cached_file_path = 'cached_model.snappy.parquet'
if os.path.exists(cached_file_path):
    data = pd.read_parquet(cached_file_path)
else:
    data = db.read_data(sql)
    # Write the cache only on a miss; rewriting it after a cache hit just
    # re-serializes the frame that was read from that same file.
    data.to_parquet(cached_file_path)

## Get Spatial Features of Each Choice

In [None]:
# Decode each H3 index into its (lat, lon) pair once, then split the pair into
# the `<prefix>_lat` / `<prefix>_lon` columns. Decoding once per index avoids
# calling h3.h3_to_geo twice for every row and column pair.
for prefix in ('current', 'neighbor'):
    lat_lon_pairs = data[f'{prefix}_h3_index'].apply(h3.h3_to_geo)
    data[f'{prefix}_lat'] = lat_lon_pairs.apply(lambda pair: pair[0])
    data[f'{prefix}_lon'] = lat_lon_pairs.apply(lambda pair: pair[1])
data.head()

In [None]:
def get_distance(row):
    """Return the geodesic distance in kilometres between a row's two cells.

    Args:
        row: Record supporting ``row[key]`` lookups for ``current_lat``,
            ``current_lon``, ``neighbor_lat`` and ``neighbor_lon``.

    Returns:
        The ``.km`` attribute of ``geopy.distance.geodesic`` evaluated from the
        current (lat, lon) point to the neighbor (lat, lon) point.
    """
    origin = (row['current_lat'], row['current_lon'])
    destination = (row['neighbor_lat'], row['neighbor_lon'])
    return geopy.distance.geodesic(origin, destination).km

# Geodesic distance (km) from the current cell to each candidate neighbor,
# evaluated one row at a time.
distance_km = data.apply(get_distance, axis=1)
data['distance'] = distance_km
data.head()

In [None]:
# Angle of the water velocity vector: arctan2(north component, east component),
# i.e. radians measured from east towards north. np.arctan2 is applied to the
# whole columns at once instead of through a row-wise DataFrame.apply.
data['water_heading'] = np.arctan2(data['velocity_north'], data['velocity_east'])

# Angle of the candidate move, from the raw latitude / longitude differences
# between the neighbor cell and the current cell. Rows with zero distance
# (staying in place) get a heading of 0.
# NOTE(review): the longitude difference is not scaled by cos(latitude), so this
# angle lives in degree space while water_heading is built from east/north
# velocity components — confirm that mixing the two is intended.
delta_lat = (data['neighbor_lat'] - data['current_lat']).to_numpy()
delta_lon = (data['neighbor_lon'] - data['current_lon']).to_numpy()
data['movement_heading'] = np.where(
    data['distance'].to_numpy() != 0,
    np.arctan2(delta_lat, delta_lon),
    0,
)
data.head()

## Filter to Decisions that are Legal

In [None]:
# Drop every row that has a missing value, printing the shape before and after.
print(data.shape)
data = data.dropna()
print(data.shape)

# A candidate is "selected" when it is the cell that was actually chosen.
data['_selected'] = data['neighbor_h3_index'] == data['selected_h3_index']

# Per decision: does at least one surviving candidate carry the selection?
gdf = (
    data
    .groupby('_decision')[['_selected']]
    .max()
    .reset_index()
)
print(gdf.shape)
gdf = gdf.loc[gdf['_selected']]
print(gdf.shape)

# Keep only the rows belonging to decisions whose selected candidate survived.
data = data.merge(gdf[['_decision']], on='_decision', how='inner')
data.head()

## Normalize Features

In [None]:
# Scale distance by 100.
# NOTE(review): 100 matches the `max_km = 100` filter in the query, which would
# put the result in [0, 1] — confirm that is the intent.
data['normed_distance'] = data['distance'].div(100)
px.histogram(data['normed_distance'])

In [None]:
# Log-transform net primary production (with a 0.001 offset) and centre it on
# its mean over all rows. The mean is also stored as a constant column.
log_npp = np.log(data['net_primary_production'] + 0.001)
log_npp_mean = log_npp.mean()

data['log_npp'] = log_npp
data['log_npp_mean'] = log_npp_mean
print(log_npp_mean)
data['normed_log_npp'] = log_npp - log_npp_mean
px.histogram(data['normed_log_npp'])

In [None]:
# Log-transform mixed layer thickness (with a 0.001 offset) and centre it on
# its mean over all rows. The mean is also stored as a constant column.
log_mlt = np.log(data['mixed_layer_thickness'] + 0.001)
log_mlt_mean = log_mlt.mean()

data['log_mlt'] = log_mlt
data['log_mlt_mean'] = log_mlt_mean
print(log_mlt_mean)
data['normed_log_mlt'] = log_mlt - log_mlt_mean
px.histogram(data['normed_log_mlt'])

## Add Training/Testing Split

In [None]:
# Rows whose `_individual` is not a multiple of 3 are flagged as training;
# the remaining rows are the test set.
data['_train'] = (data['_individual'] % 3) > 0

# Ratio of distinct training individuals to distinct testing individuals.
n_train_individuals = data.loc[data['_train'], '_individual'].nunique(dropna=False)
n_test_individuals = data.loc[~data['_train'], '_individual'].nunique(dropna=False)
n_train_individuals / n_test_individuals

## Filter Columns

In [11]:
# Columns kept for upload, grouped by purpose.
required_columns = ['_individual', '_decision', '_choice', '_selected', '_train']  # required by mimic
feature_columns = ['normed_log_mlt', 'normed_log_npp', 'normed_distance', 'water_heading', 'movement_heading']
normalization_columns = ['log_mlt_mean', 'log_npp_mean']  # for building new features
unnormalized_columns = ['mixed_layer_thickness', 'net_primary_production', 'distance']
plotting_columns = ['neighbor_h3_index', 'tag_key', 'date']  # for plotting later

upload_columns = (
    required_columns
    + feature_columns
    + normalization_columns
    + unnormalized_columns
    + plotting_columns
)
to_upload = data[upload_columns].rename(columns={'neighbor_h3_index': 'h3_index'})

In [12]:
# Persist the feature table as 'movement_model_features_v3' through haven.db.
# NOTE(review): the third argument (['_train']) looks like a list of partition
# columns — confirm against haven.db.write_data.
db.write_data(
    to_upload, 'movement_model_features_v3', ['_train']
)

In [None]:
# Summary statistics of the number of distinct choices per (individual, decision).
choices_per_decision = to_upload.groupby(['_individual', '_decision'])[['_choice']].nunique()
choices_per_decision.describe()

In [None]:
# Summary statistics of the number of distinct decisions per individual.
decisions_per_individual = to_upload.groupby('_individual')[['_decision']].nunique()
decisions_per_individual.describe()