In [1]:
import os
# Point the session at the AWS profile and Haven database used by this notebook.
# This runs before the mirrorverse import below it in the cell.
os.environ.update({
    'AWS_PROFILE': 'admin',
    'HAVEN_DATABASE': 'haven',
})

import numpy as np
import pandas as pd
import plotly.express as px

from mirrorverse.utils import read_data_w_cache

In [None]:
# Pull every column of chinook_depth_raw_choices for rows where `_train` is set.
train_query = 'select * from chinook_depth_raw_choices where _train'
data = read_data_w_cache(train_query)

# Report the frame's dimensions, then preview the first rows.
print(data.shape)
data.head()

In [None]:
# Alphabetical list of the available columns (sorted() already returns a list).
sorted(data.columns)

In [None]:
# TOL is the offset added before each log transform in this notebook
# (presumably so zero-valued inputs stay finite -- confirm).
# NORMS collects every constant needed to reproduce the scaling later.
TOL = 0.001
NORMS = {'TOL': TOL}

# Compute the log-transformed series once and reuse it for the bounds and the scaling.
log_chlorophyll = np.log(data['chlorophyll'] + TOL)
NORMS['log_chlorophyll_min'] = log_chlorophyll.min()
NORMS['log_chlorophyll_max'] = log_chlorophyll.max()

# Min-max scale the log values using the bounds observed in this data.
data['n_chlorophyll'] = (log_chlorophyll - NORMS['log_chlorophyll_min']) / (
    NORMS['log_chlorophyll_max'] - NORMS['log_chlorophyll_min']
)

# Plot at most 10000 rows. Capping at len(data) avoids the ValueError that
# Series.sample raises when asked for more rows than exist, and the fixed
# random_state makes the histogram identical across re-runs.
px.histogram(data['n_chlorophyll'].sample(n=min(len(data), 10000), random_state=0))

In [None]:
# Log-transform net primary production (offset by TOL) once, then reuse the
# series for both the recorded bounds and the scaling.
log_npp = np.log(data['net_primary_production'] + TOL)
NORMS['log_npp_min'] = log_npp.min()
NORMS['log_npp_max'] = log_npp.max()

# Min-max scale the log values using the bounds observed in this data.
data['n_net_primary_production'] = (log_npp - NORMS['log_npp_min']) / (
    NORMS['log_npp_max'] - NORMS['log_npp_min']
)

# Plot at most 10000 rows. Capping at len(data) avoids the ValueError that
# Series.sample raises when asked for more rows than exist, and the fixed
# random_state makes the histogram identical across re-runs.
px.histogram(data['n_net_primary_production'].sample(n=min(len(data), 10000), random_state=0))

In [None]:
# Record the observed nitrate range so the same scaling can be reapplied later.
nitrate = data['nitrate']
NORMS['nitrate_min'] = nitrate.min()
NORMS['nitrate_max'] = nitrate.max()

# Min-max scale nitrate using the bounds observed in this data.
data['n_nitrate'] = (nitrate - NORMS['nitrate_min']) / (NORMS['nitrate_max'] - NORMS['nitrate_min'])

# Plot at most 10000 rows. Capping at len(data) avoids the ValueError that
# Series.sample raises when asked for more rows than exist, and the fixed
# random_state makes the histogram identical across re-runs.
px.histogram(data['n_nitrate'].sample(n=min(len(data), 10000), random_state=0))

In [None]:
# Record the observed oxygen range so the same scaling can be reapplied later.
oxygen = data['oxygen']
NORMS['oxygen_min'] = oxygen.min()
NORMS['oxygen_max'] = oxygen.max()

# Min-max scale oxygen using the bounds observed in this data.
data['n_oxygen'] = (oxygen - NORMS['oxygen_min']) / (NORMS['oxygen_max'] - NORMS['oxygen_min'])

# Plot at most 10000 rows. Capping at len(data) avoids the ValueError that
# Series.sample raises when asked for more rows than exist, and the fixed
# random_state makes the histogram identical across re-runs.
px.histogram(data['n_oxygen'].sample(n=min(len(data), 10000), random_state=0))

In [None]:
# Record the observed phosphate range so the same scaling can be reapplied later.
phosphate = data['phosphate']
NORMS['phosphate_min'] = phosphate.min()
NORMS['phosphate_max'] = phosphate.max()

# Min-max scale phosphate using the bounds observed in this data.
data['n_phosphate'] = (phosphate - NORMS['phosphate_min']) / (NORMS['phosphate_max'] - NORMS['phosphate_min'])

# Plot at most 10000 rows. Capping at len(data) avoids the ValueError that
# Series.sample raises when asked for more rows than exist, and the fixed
# random_state makes the histogram identical across re-runs.
px.histogram(data['n_phosphate'].sample(n=min(len(data), 10000), random_state=0))

In [None]:
# Record the observed silicate range so the same scaling can be reapplied later.
silicate = data['silicate']
NORMS['silicate_min'] = silicate.min()
NORMS['silicate_max'] = silicate.max()

# Min-max scale silicate using the bounds observed in this data.
data['n_silicate'] = (silicate - NORMS['silicate_min']) / (NORMS['silicate_max'] - NORMS['silicate_min'])

# Plot at most 10000 rows. Capping at len(data) avoids the ValueError that
# Series.sample raises when asked for more rows than exist, and the fixed
# random_state makes the histogram identical across re-runs.
px.histogram(data['n_silicate'].sample(n=min(len(data), 10000), random_state=0))

In [None]:
# Flip the sign of elevation and floor the result at zero, so any row whose
# negated elevation would be below zero is stored as 0.
negative_elevation = (-data['elevation']).clip(lower=0)
data['negative_elevation'] = negative_elevation

# Record the observed range so the same scaling can be reapplied later.
NORMS['elevation_min'] = negative_elevation.min()
NORMS['elevation_max'] = negative_elevation.max()

# Min-max scale the floored values using the bounds observed in this data.
data['n_elevation'] = (negative_elevation - NORMS['elevation_min']) / (
    NORMS['elevation_max'] - NORMS['elevation_min']
)

# Plot at most 10000 rows. Capping at len(data) avoids the ValueError that
# Series.sample raises when asked for more rows than exist, and the fixed
# random_state makes the histogram identical across re-runs.
px.histogram(data['n_elevation'].sample(n=min(len(data), 10000), random_state=0))

In [None]:
# Log-transform mixed layer thickness (offset by TOL) once, then reuse the
# series for both the recorded bounds and the scaling.
log_mixed_layer_thickness = np.log(data['mixed_layer_thickness'] + TOL)
NORMS['log_mixed_layer_thickness_min'] = log_mixed_layer_thickness.min()
NORMS['log_mixed_layer_thickness_max'] = log_mixed_layer_thickness.max()

# Min-max scale the log values using the bounds observed in this data.
data['n_mixed_layer_thickness'] = (
    log_mixed_layer_thickness - NORMS['log_mixed_layer_thickness_min']
) / (
    NORMS['log_mixed_layer_thickness_max'] - NORMS['log_mixed_layer_thickness_min']
)

# Plot at most 10000 rows. Capping at len(data) avoids the ValueError that
# Series.sample raises when asked for more rows than exist, and the fixed
# random_state makes the histogram identical across re-runs.
px.histogram(data['n_mixed_layer_thickness'].sample(n=min(len(data), 10000), random_state=0))

In [None]:
# Record the observed salinity range so the same scaling can be reapplied later.
salinity = data['salinity']
NORMS['salinity_min'] = salinity.min()
NORMS['salinity_max'] = salinity.max()

# Min-max scale salinity using the bounds observed in this data.
data['n_salinity'] = (salinity - NORMS['salinity_min']) / (NORMS['salinity_max'] - NORMS['salinity_min'])

# Plot at most 10000 rows. Capping at len(data) avoids the ValueError that
# Series.sample raises when asked for more rows than exist, and the fixed
# random_state makes the histogram identical across re-runs.
px.histogram(data['n_salinity'].sample(n=min(len(data), 10000), random_state=0))

In [None]:
# Record the observed temperature range so the same scaling can be reapplied later.
temperature = data['temperature']
NORMS['temperature_min'] = temperature.min()
NORMS['temperature_max'] = temperature.max()

# Min-max scale temperature using the bounds observed in this data.
data['n_temperature'] = (temperature - NORMS['temperature_min']) / (
    NORMS['temperature_max'] - NORMS['temperature_min']
)

# Plot at most 10000 rows. Capping at len(data) avoids the ValueError that
# Series.sample raises when asked for more rows than exist, and the fixed
# random_state makes the histogram identical across re-runs.
px.histogram(data['n_temperature'].sample(n=min(len(data), 10000), random_state=0))

In [None]:
# Display the normalization constants (TOL plus each recorded min/max) held in NORMS.
NORMS

In [None]:
# Count the distinct `_decision` values for each `_individual`, then summarise
# the distribution of those counts.
decisions_per_individual = data.groupby('_individual')['_decision'].nunique()
decisions_per_individual.describe()