In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import os
import h3
import itertools
from tqdm import tqdm
from multiprocessing import Pool
from scipy.stats import norm
from collections import defaultdict
from sklearn.cluster import AgglomerativeClustering

from mirrorverse.warehouse.utils import get_engine
from mirrorverse.chinook.states import spatial_key_to_index

# Silence pandas' SettingWithCopyWarning for the whole notebook.
pd.options.mode.chained_assignment = None

# Fall back to the local development database, but respect a DATABASE_URL
# that is already set in the environment instead of overwriting it.
os.environ.setdefault("DATABASE_URL", "sqlite:////workspaces/mirrorverse/mirrorverse.db")

# Load the Data

In [2]:
sql = '''
select 
    tag_key,
    date_key,
    depth,
    epoch
from 
    tag_depths
'''
# Read every depth record and keep only the rows that carry a depth value.
depth = (
    pd.read_sql(sql, get_engine())
    .loc[lambda frame: frame['depth'].notna()]
)
print(depth.shape)
depth.head()

2024-05-19 10:39:52,902 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-05-19 10:39:52,902 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("
select 
    tag_key,
    date_key,
    depth,
    epoch
from 
    tag_depths
")
2024-05-19 10:39:52,903 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-05-19 10:39:52,904 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("
select 
    tag_key,
    date_key,
    depth,
    epoch
from 
    tag_depths
")
2024-05-19 10:39:52,905 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-05-19 10:39:52,906 INFO sqlalchemy.engine.Engine 
select 
    tag_key,
    date_key,
    depth,
    epoch
from 
    tag_depths

2024-05-19 10:39:52,907 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-05-19 10:39:55,785 INFO sqlalchemy.engine.Engine ROLLBACK
(1033662, 4)


Unnamed: 0,tag_key,date_key,depth,epoch
0,129843,1387411200,83.4,1387411200
1,129843,1387411200,80.7,1387412100
2,129843,1387411200,91.5,1387413000
3,129843,1387411200,91.5,1387413900
4,129843,1387411200,88.8,1387414800


In [3]:
def select_a_class(depth, depth_classes):
    """
    Inputs:
    - depth (float): a single depth reading
    - depth_classes (array-like): the depth classes, sorted ascending; each
        class receives the probability mass between the previous class and
        itself, and the first class receives everything below it

    The reading is treated as normally distributed around `depth`, with 8% of
    the reading taken as the half-width of the 95% confidence interval.

    Returns one element of depth_classes, drawn at random according to those
    probabilities
    """
    depth_classes = np.asarray(depth_classes)
    # ~two standard deviations gives our 95% confidence interval; abs() keeps
    # the standard deviation positive for negative readings, which otherwise
    # flip the z scores and produce an all-zero (then NaN) distribution
    sd = abs(depth) * 0.08 / 1.96
    if sd == 0:
        # no uncertainty at all (depth of exactly 0): always the first class
        division = np.zeros(len(depth_classes))
        division[0] = 1
    else:
        # we're going to assume the depth classes are sorted
        z = (depth_classes - depth) / sd
        division = norm.cdf(z)
        # turn the cumulative probabilities into per-class probabilities
        division[1:] = division[1:] - division[:-1]
    # if there aren't quite enough depth classes the
    # probabilities may not sum to 1, so we'll normalize
    division = division / division.sum()
    return np.random.choice(depth_classes, p=division)

min_size = 0.01
depth_classes = np.array([25, 50, 75, 100, 150, 200, 250, 300, 400, 500])

# Draw one depth class per reading.
depth['depth_class'] = depth['depth'].map(
    lambda reading: select_a_class(reading, depth_classes)
)
depth.head()

Unnamed: 0,tag_key,date_key,depth,epoch,depth_class
0,129843,1387411200,83.4,1387411200,100
1,129843,1387411200,80.7,1387412100,100
2,129843,1387411200,91.5,1387413000,100
3,129843,1387411200,91.5,1387413900,100
4,129843,1387411200,88.8,1387414800,100


In [4]:
print(depth.shape)
# Shrink the frame by a factor of len(depth_classes). The draw uses
# replace=True, so the same reading can appear more than once, and it is not
# seeded, so the sample changes from run to run.
depth = depth.sample(int(depth.shape[0] / len(depth_classes)), replace=True)
print(depth.shape)

(1033662, 5)
(103366, 5)


# Add Context

In [5]:
sql = '''
select 
    tt.*,
    h.home_region,
    e.elevation
from 
    tag_tracks tt 
    left join home_regions h
        on tt.tag_key = h.tag_key
    left join elevation e 
        on tt.h3_level_4_key = e.h3_level_4_key
'''
# Track records, left-joined to the tag's home region and to the elevation
# of the track's H3 cell (both can be missing).
tt = pd.read_sql_query(
    sql,
    get_engine()
)
# the rich display of the last expression is enough; no need to print it too
tt.head()

2024-05-19 10:40:49,308 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-05-19 10:40:49,309 INFO sqlalchemy.engine.Engine 
select 
    tt.*,
    h.home_region,
    e.elevation
from 
    tag_tracks tt 
    left join home_regions h
        on tt.tag_key = h.tag_key
    left join elevation e 
        on tt.h3_level_4_key = e.h3_level_4_key

2024-05-19 10:40:49,309 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-05-19 10:40:49,339 INFO sqlalchemy.engine.Engine ROLLBACK
  tag_key    date_key   longitude   latitude      h3_level_4_key home_region  \
0  129843  1387411200 -166.922615  54.131760  595087630329184255        None   
1  129843  1387497600 -166.884086  54.258072  595087595969445887        None   
2  129843  1387584000 -166.910525  54.312433  595087595969445887        None   
3  129843  1387670400 -166.817057  54.358280  595087595969445887        None   
4  129843  1387756800 -166.676901  54.389694  595087595969445887        None   

    elevation  
0 -184.870688  
1 -790.973118 

Unnamed: 0,tag_key,date_key,longitude,latitude,h3_level_4_key,home_region,elevation
0,129843,1387411200,-166.922615,54.13176,595087630329184255,,-184.870688
1,129843,1387497600,-166.884086,54.258072,595087595969445887,,-790.973118
2,129843,1387584000,-166.910525,54.312433,595087595969445887,,-790.973118
3,129843,1387670400,-166.817057,54.35828,595087595969445887,,-790.973118
4,129843,1387756800,-166.676901,54.389694,595087595969445887,,-790.973118


In [6]:
# Attach the track context to every depth reading; the frames share exactly
# the tag_key and date_key columns, so those are the join keys.
depth = depth.merge(
    tt[['tag_key', 'date_key', 'longitude', 'latitude', 'home_region', 'elevation']],
    how='inner',
    on=['tag_key', 'date_key'],
)
depth.head()

Unnamed: 0,tag_key,date_key,depth,epoch,depth_class,longitude,latitude,home_region,elevation
0,202588,1598486400,28.5,1598542800,50,-157.875,56.1,,-108.727546
1,202588,1598486400,10.0,1598533200,25,-157.875,56.1,,-108.727546
2,202588,1598486400,12.5,1598531400,25,-157.875,56.1,,-108.727546
3,202588,1598486400,28.5,1598542200,50,-157.875,56.1,,-108.727546
4,202588,1598486400,25.5,1598496000,50,-157.875,56.1,,-108.727546


In [7]:
from suntimes import SunTimes

def get_sunrise(lat, lon, date):
    """
    Inputs:
    - lat (float): latitude handed to SunTimes
    - lon (float): longitude handed to SunTimes
    - date: the date handed to SunTimes.risewhere

    Returns the `hour` attribute of what SunTimes.risewhere reports for the
    'UTC' zone at zero altitude
    """
    observer = SunTimes(longitude=lon, latitude=lat, altitude=0)
    rise = observer.risewhere(date, 'UTC')
    return rise.hour

def get_sunset(lat, lon, date):
    """
    Inputs:
    - lat (float): latitude handed to SunTimes
    - lon (float): longitude handed to SunTimes
    - date: the date handed to SunTimes.setwhere

    Returns the `hour` attribute of what SunTimes.setwhere reports for the
    'UTC' zone at zero altitude
    """
    observer = SunTimes(longitude=lon, latitude=lat, altitude=0)
    sundown = observer.setwhere(date, 'UTC')
    return sundown.hour

depth['datetime'] = pd.to_datetime(depth['epoch'], utc=True, unit='s')
depth['date'] = depth['datetime'].dt.date
# drop rows whose longitude is outside [-180, 180]
depth = depth[np.abs(depth['longitude']) <= 180]
depth['sunrise'] = depth.apply(
    lambda r: get_sunrise(r['latitude'], r['longitude'], r['date']), axis=1
)
depth['sunset'] = depth.apply(
    lambda r: get_sunset(r['latitude'], r['longitude'], r['date']), axis=1
)


depth['hour'] = depth['datetime'].dt.hour

# All hours are UTC. When the sunset hour is not after the sunrise hour the
# daylight window wraps past midnight, so an hour is daytime if it is after
# sunrise OR before sunset. Otherwise the window sits inside one UTC day and
# the hour has to be after sunrise AND before sunset.
after_sunrise = depth['hour'] > depth['sunrise']
before_sunset = depth['hour'] < depth['sunset']
wraps_midnight = depth['sunset'] <= depth['sunrise']
depth['daytime'] = (
    (wraps_midnight & (after_sunrise | before_sunset))
    | (~wraps_midnight & (after_sunrise & before_sunset))
)
depth.head()

Unnamed: 0,tag_key,date_key,depth,epoch,depth_class,longitude,latitude,home_region,elevation,datetime,date,sunrise,sunset,hour,daytime
0,202588,1598486400,28.5,1598542800,50,-157.875,56.1,,-108.727546,2020-08-27 15:40:00+00:00,2020-08-27,15,5,15,False
1,202588,1598486400,10.0,1598533200,25,-157.875,56.1,,-108.727546,2020-08-27 13:00:00+00:00,2020-08-27,15,5,13,False
2,202588,1598486400,12.5,1598531400,25,-157.875,56.1,,-108.727546,2020-08-27 12:30:00+00:00,2020-08-27,15,5,12,False
3,202588,1598486400,28.5,1598542200,50,-157.875,56.1,,-108.727546,2020-08-27 15:30:00+00:00,2020-08-27,15,5,15,False
4,202588,1598486400,25.5,1598496000,50,-157.875,56.1,,-108.727546,2020-08-27 02:40:00+00:00,2020-08-27,15,5,2,True


# Build Into Choices

In [8]:
# Sample the decisions to model and give each one a sequential `_decision`
# id; the observed class and the tag become `selected_class` / `_identifier`.
choices = (
    depth.sample(10000 * 2)
    .reset_index(drop=True)
    .reset_index()
    .rename(
        columns={
            'index': '_decision',
            'depth_class': 'selected_class',
            'tag_key': '_identifier',
        }
    )
)
choices.head()

Unnamed: 0,_decision,_identifier,date_key,depth,epoch,selected_class,longitude,latitude,home_region,elevation,datetime,date,sunrise,sunset,hour,daytime
0,0,142189,1445126400,52.4,1445202000,75,-178.483268,62.070841,,-118.744569,2015-10-18 21:00:00+00:00,2015-10-18,18,4,21,True
1,1,202595,1609200000,5.0,1609249800,25,-161.3,55.175,,8.765975,2020-12-29 13:50:00+00:00,2020-12-29,19,2,13,False
2,2,172907,1511568000,39.0,1511586900,50,-165.95,54.975,,-174.075908,2017-11-25 05:15:00+00:00,2017-11-25,18,2,5,False
3,3,205403,1603670400,130.5,1603736400,150,-149.3,59.05,SEAK,-162.064552,2020-10-26 18:20:00+00:00,2020-10-26,17,2,18,True
4,4,142189,1449273600,5.4,1449297000,25,-178.597373,58.857363,,-3308.116451,2015-12-05 06:30:00+00:00,2015-12-05,20,3,6,False


In [9]:
# Every decision paired with every depth class it could have picked.
all_choices = pd.merge(
    choices[['_decision']],
    pd.DataFrame({'depth_class': depth_classes}),
    how='cross',
)
print(all_choices.shape)
all_choices.head()

(200000, 2)


Unnamed: 0,_decision,depth_class
0,0,25
1,0,50
2,0,75
3,0,100
4,0,150


In [10]:
# One row per (decision, candidate class); `selected` marks the candidate
# that matches the class observed for that decision.
choices = (
    choices.merge(all_choices, how='outer', on='_decision')
    .assign(selected=lambda frame: frame['depth_class'] == frame['selected_class'])
    .drop(columns='selected_class')
)
print(choices.shape)
choices.head()

(200000, 17)


Unnamed: 0,_decision,_identifier,date_key,depth,epoch,longitude,latitude,home_region,elevation,datetime,date,sunrise,sunset,hour,daytime,depth_class,selected
0,0,142189,1445126400,52.4,1445202000,-178.483268,62.070841,,-118.744569,2015-10-18 21:00:00+00:00,2015-10-18,18,4,21,True,25,False
1,0,142189,1445126400,52.4,1445202000,-178.483268,62.070841,,-118.744569,2015-10-18 21:00:00+00:00,2015-10-18,18,4,21,True,50,False
2,0,142189,1445126400,52.4,1445202000,-178.483268,62.070841,,-118.744569,2015-10-18 21:00:00+00:00,2015-10-18,18,4,21,True,75,True
3,0,142189,1445126400,52.4,1445202000,-178.483268,62.070841,,-118.744569,2015-10-18 21:00:00+00:00,2015-10-18,18,4,21,True,100,False
4,0,142189,1445126400,52.4,1445202000,-178.483268,62.070841,,-118.744569,2015-10-18 21:00:00+00:00,2015-10-18,18,4,21,True,150,False


# Build the Features

In [93]:
model_data = choices.copy()
model_data['month'] = model_data['datetime'].dt.month
# Hours between sunrise and sunset. The sun hours are UTC, so add 24 when the
# sunset hour is not after the sunrise hour (the window wraps past midnight);
# when sunset falls later on the same UTC day the plain difference is used.
model_data['interval'] = (
    model_data['sunset'] - model_data['sunrise']
    + 24 * (model_data['sunset'] <= model_data['sunrise'])
)
model_data['daytime'] = model_data['daytime'].astype(float)
# other columns that exist but are not used as features:
# 'sunset', 'elevation', 'hour', 'month'
features = ['depth_class', 'latitude', 'sunrise', 'interval', 'daytime']
model_data = model_data[['selected', '_decision', '_identifier'] + features]
model_data.head()

Unnamed: 0,selected,_decision,_identifier,depth_class,latitude,sunrise,interval,daytime
0,False,0,142189,25,62.070841,18,10,1.0
1,False,0,142189,50,62.070841,18,10,1.0
2,True,0,142189,75,62.070841,18,10,1.0
3,False,0,142189,100,62.070841,18,10,1.0
4,False,0,142189,150,62.070841,18,10,1.0


In [94]:
# Split by identifier rather than by row, so every decision of a given
# identifier lands on the same side of the train/test split.
ids = model_data['_identifier'].unique()
# 80% of the identifiers go to training; the draw is not seeded, so the split
# differs between runs
train_ids = np.random.choice(ids, int(0.8 * len(ids)), replace=False)
# everything that was not drawn for training is held out for testing
test_ids = np.array(list(set(ids) - set(train_ids)))
print(
    len(train_ids),
    len(test_ids)
)

88 23


In [95]:
folds = 4
# Shuffle the training identifiers in place, then deal them out to the folds
# round-robin so the folds differ in size by at most one identifier.
np.random.shuffle(train_ids)
rows = [
    {'_identifier': _identifier, 'fold': position % folds}
    for position, _identifier in enumerate(train_ids)
]
folds_assignment = pd.DataFrame(rows)
folds_assignment.head()

Unnamed: 0,_identifier,fold
0,172905,0
1,205417,1
2,202585,2
3,229229,3
4,172913,0


In [96]:
# Training rows, each tagged with the fold of its identifier.
train = (
    model_data[model_data['_identifier'].isin(train_ids)]
    .merge(folds_assignment, on='_identifier')
)
fold = np.array(train['fold'])

In [107]:
from collections import defaultdict

import numpy as np
from tqdm import tqdm


def get_proposed_utility(dataframe, learning_rate=None):
    """
    Inputs:
    - dataframe (pd.DataFrame): a dataframe with columns "utility",
        "selected", and "_decision"
    - learning_rate (float): when given, the scores are rescaled so that the
        most negative score equals -learning_rate; positive scores are
        scaled by the same factor and may end up larger than learning_rate

    The columns "sum_utility", "probability", "score" and "proposed" are
    written onto the dataframe that was passed in.

    Returns the same pd.DataFrame with proposed utility values
    """
    # probability of an alternative = its share of the utility in its decision
    dataframe["sum_utility"] = dataframe.groupby("_decision")["utility"].transform(
        "sum"
    )
    dataframe["probability"] = dataframe["utility"] / dataframe["sum_utility"]
    # positive for a selected alternative, negative (or zero) for the others
    dataframe["score"] = dataframe["selected"] - dataframe["probability"]
    if learning_rate:
        lowest_score = dataframe["score"].min()
        # only rescale when there is a negative score to anchor on; a minimum
        # of 0 (or NaN) would make the factor infinite (or NaN) and turn the
        # proposed utilities into NaN/inf
        if lowest_score < 0:
            factor = np.abs(learning_rate / lowest_score)
            dataframe["score"] = dataframe["score"] * factor
    dataframe["proposed"] = dataframe["utility"] * (1 + dataframe["score"])
    return dataframe

def get_central_likelihood(dataframe):
    """
    Inputs:
    - dataframe (pd.DataFrame): a dataframe with columns "selected"
        and "probability"

    Returns the geometric mean of the probabilities on the selected rows
    """
    chosen_probability = dataframe.loc[dataframe['selected'], 'probability']
    log_probability = np.log(chosen_probability)
    return np.exp(log_probability.mean())

def train_utility_model(
    model, dataframe, dataframe_test, feature_columns, N=1, learning_rate=None
):
    """
    Inputs:
    - model: a model object with a "fit" method and a "predict" method
    - dataframe (pd.DataFrame): a dataframe with the feature columns
        and "selected", "_decision" and "fold" columns
    - dataframe_test (pd.DataFrame): a held out dataframe with the feature
        columns and "selected" and "_decision" columns; it is scored after
        every fit but never fit on
    - feature_columns (list): a list of column names to use as features
    - N (int): the number of passes over the folds
    - learning_rate (float): passed on to get_proposed_utility

    Both dataframes are modified in place: the "utility", "sum_utility",
    "probability", "score" and "proposed" columns are overwritten.

    Returns a tuple of the trained model and a pd.DataFrame of diagnostics
    with one row per iteration, fold and case ("validate", "train", "test")
    """
    feature_columns = list(feature_columns)
    assert "selected" not in feature_columns
    assert "_decision" not in feature_columns

    # pair each fold with all the other folds: the model is fit on the
    # others and the fold itself is reported as the "validate" case
    folds = set(dataframe['fold'].unique())
    fold_sets = [(fold, folds - {fold}) for fold in folds]

    diagnostics = []
    # every alternative starts from the same utility target
    dataframe["proposed"] = 1.0
    for i in tqdm(range(N)):
        for fold, fold_set in fold_sets:
            fit_rows = dataframe[dataframe['fold'].isin(fold_set)]

            # use the columns the caller asked for, not a notebook global
            model.fit(fit_rows[feature_columns], fit_rows["proposed"])

            dataframe['utility'] = model.predict(dataframe[feature_columns])
            dataframe_test['utility'] = model.predict(dataframe_test[feature_columns])
            dataframe = get_proposed_utility(dataframe, learning_rate=learning_rate)
            dataframe_test = get_proposed_utility(dataframe_test, learning_rate=learning_rate)

            cases = (
                ("validate", dataframe[dataframe['fold'] == fold]),
                ("train", dataframe[dataframe['fold'].isin(fold_set)]),
                ("test", dataframe_test),
            )
            for case, case_rows in cases:
                diagnostics.append({
                    "iteration": i,
                    "fold": fold,
                    "case": case,
                    "central_likelihood": get_central_likelihood(case_rows)
                })

    # final fit on every fold using the last proposed utilities
    model.fit(dataframe[feature_columns], dataframe['proposed'])

    return model, pd.DataFrame(diagnostics)
   

In [110]:
from sklearn.ensemble import RandomForestRegressor

def grid_search(
        param_grids, M, model_class, dataframe, dataframe_test, feature_columns, N=1, learning_rate=None, max_attempts = 100
):
    """
    Inputs:
    - param_grids (dict): maps a parameter name to the list of values to
        choose from
    - M (int): the number of distinct parameter sets to draw and train
    - model_class: called as model_class(**param_set) to build each model
    - dataframe, dataframe_test, feature_columns, N, learning_rate: passed
        on to train_utility_model unchanged
    - max_attempts (int): how many duplicate draws in a row are tolerated
        before giving up

    Returns a tuple of the list of parameter sets and a pd.DataFrame with the
    diagnostics of every run; each row also carries the parameter values and
    a "_param_set" column indexing into the list
    """
    param_sets = []
    attempts = 0
    while len(param_sets) < M:
        assert attempts < max_attempts, (
            f"could not draw {M} distinct parameter sets; "
            f"{max_attempts} draws in a row were duplicates"
        )

        # draw an index and look the value up, rather than drawing from the
        # grid itself: np.random.choice(grid) converts the grid to an array
        # first, which coerces mixed-type grids (e.g. [1, "sqrt"]) to a
        # single dtype and hands numpy scalars to the model
        param_set = {
            param: grid[np.random.choice(len(grid))]
            for param, grid in param_grids.items()
        }
        if param_set in param_sets:
            attempts += 1
        else:
            attempts = 0
            param_sets.append(param_set)

    diagnostics_dfs = []
    for i, param_set in enumerate(param_sets):
        model = model_class(**param_set)
        model, diagnostics_df = train_utility_model(
            model, dataframe, dataframe_test, feature_columns, N, learning_rate
        )
        # label the diagnostics with the parameters that produced them
        for param, val in param_set.items():
            diagnostics_df[param] = val
        diagnostics_df['_param_set'] = i
        diagnostics_dfs.append(diagnostics_df)

    return param_sets, pd.concat(diagnostics_dfs)

# copy so the columns added during training land on an independent frame
test = model_data[model_data['_identifier'].isin(test_ids)].copy()

param_sets, diagnostics = grid_search(
    {
        # basic settings
        "bootstrap": [True],
        # leave two cores free, but never ask for fewer than one worker:
        # os.cpu_count() can return None, or 2 on a small machine
        "n_jobs": [max(1, (os.cpu_count() or 1) - 2)],
        "max_samples": [int(train.shape[0] / 2)],
        # hyper parameters
        "n_estimators": [25, 50],
        "min_weight_fraction_leaf": [1e-04, 2e-04, 1e-05, 2e-05, 1e-06],
        "max_features": [1, 2, 3]
    },
    5,
    RandomForestRegressor,
    train,
    test,
    features,
    10,
    learning_rate=31 / 32,
)

100%|██████████| 10/10 [00:32<00:00,  3.29s/it]
100%|██████████| 10/10 [00:14<00:00,  1.48s/it]
100%|██████████| 10/10 [00:19<00:00,  1.96s/it]
100%|██████████| 10/10 [00:27<00:00,  2.78s/it]
100%|██████████| 10/10 [00:17<00:00,  1.70s/it]


In [115]:
# Average over folds at the last iteration that was actually run, instead of
# hard-coding its number.
final_iteration = diagnostics['iteration'].max()
summary = (
    diagnostics[diagnostics['iteration'] == final_iteration]
    .groupby(['case', '_param_set'])[['central_likelihood']]
    .mean()
)
summary

Unnamed: 0_level_0,Unnamed: 1_level_0,central_likelihood
case,_param_set,Unnamed: 2_level_1
test,0,0.182514
test,1,0.212031
test,2,0.206254
test,3,0.212564
test,4,0.184795
train,0,0.368649
train,1,0.311175
train,2,0.289529
train,3,0.311671
train,4,0.369065


In [100]:
df = summary.reset_index()
# Take the parameter set with the highest central likelihood on the
# "validate" case.
params = param_sets[
    df[df['case'] == 'validate']
    .sort_values('central_likelihood', ascending=False)['_param_set']
    .iloc[0]
]
params

{'bootstrap': True,
 'n_jobs': 6,
 'max_samples': 72860,
 'n_estimators': 25,
 'min_weight_fraction_leaf': 1e-05,
 'max_features': 2}

In [108]:
# Retrain with the chosen parameters, keeping the per-iteration diagnostics.
model, diagnostics_results = train_utility_model(
    RandomForestRegressor(**params),
    train,
    test,
    features,
    N=10,
    learning_rate=31 / 32,
)

100%|██████████| 10/10 [00:16<00:00,  1.63s/it]


In [109]:
# Central likelihood per iteration, averaged over folds, one line per case.
px.line(
    data_frame=diagnostics_results.groupby(['iteration', 'case']).mean().reset_index(),
    x='iteration',
    y='central_likelihood',
    color='case',
)

In [103]:
# Score the training rows with the final model. No learning_rate is passed,
# so get_proposed_utility leaves the scores unscaled.
train["utility"] = model.predict(train[features])
train = get_proposed_utility(train)
# geometric mean of the probabilities given to the selected classes
print(get_central_likelihood(train))
train.head()

0.3695769923149888


Unnamed: 0,selected,_decision,_identifier,depth_class,latitude,sunrise,interval,daytime,fold,proposed,utility,sum_utility,probability,score
0,False,0,142189,25,62.070841,18,10,1.0,3,3.588816,4.501899,22.196341,0.202822,-0.202822
1,False,0,142189,50,62.070841,18,10,1.0,3,2.815283,3.308407,22.196341,0.149052,-0.149052
2,True,0,142189,75,62.070841,18,10,1.0,3,7.180502,3.939926,22.196341,0.177503,0.822497
3,False,0,142189,100,62.070841,18,10,1.0,3,2.86468,3.379105,22.196341,0.152237,-0.152237
4,False,0,142189,150,62.070841,18,10,1.0,3,3.190884,3.863294,22.196341,0.174051,-0.174051


In [104]:
# Score the held out rows with the final model. No learning_rate is passed,
# so get_proposed_utility leaves the scores unscaled.
test["utility"] = model.predict(test[features])
test = get_proposed_utility(test)
# geometric mean of the probabilities given to the selected classes
print(get_central_likelihood(test))
test.head()

0.18477711835589997


Unnamed: 0,selected,_decision,_identifier,depth_class,latitude,sunrise,interval,daytime,utility,sum_utility,probability,score,proposed
10,True,1,202595,25,55.175,19,7,0.0,6.241106,32.329635,0.193046,0.806954,11.277392
11,False,1,202595,50,55.175,19,7,0.0,2.170235,32.329635,0.067128,-0.067128,2.02455
12,False,1,202595,75,55.175,19,7,0.0,6.246415,32.329635,0.19321,-0.19321,5.039544
13,False,1,202595,100,55.175,19,7,0.0,8.671135,32.329635,0.26821,-0.26821,6.345449
14,False,1,202595,150,55.175,19,7,0.0,7.430874,32.329635,0.229847,-0.229847,5.722909


In [105]:
# Per depth class: mean predicted probability next to the share of training
# rows in which that class was the selected one.
train.groupby('depth_class')[['probability', 'selected']].mean()

Unnamed: 0_level_0,probability,selected
depth_class,Unnamed: 1_level_1,Unnamed: 2_level_1
25,0.49859,0.508784
50,0.162544,0.164837
75,0.110992,0.108976
100,0.089766,0.086399
150,0.092691,0.094153
200,0.029328,0.029577
250,0.006126,0.00398
300,0.00388,0.002539
400,0.003088,0.000755
500,0.002995,0.0


In [106]:
# Per depth class: mean predicted probability next to the share of held out
# rows in which that class was the selected one.
test.groupby('depth_class')[['probability', 'selected']].mean()

Unnamed: 0_level_0,probability,selected
depth_class,Unnamed: 1_level_1,Unnamed: 2_level_1
25,0.464721,0.484709
50,0.188315,0.162122
75,0.115874,0.120302
100,0.087272,0.087878
150,0.098372,0.089352
200,0.030569,0.041452
250,0.005485,0.012896
300,0.003486,0.00129
400,0.002981,0.0
500,0.002923,0.0
