In [1]:
import os
import random

import h3
import numpy as np
import pandas as pd
import geopy.distance
import plotly.express as px
from mirrorverse.plotting import plot_h3_slider, plot_h3_animation
from mirrorverse.utils import read_data_w_cache

# Session-wide settings exported as environment variables.
# NOTE(review): presumably consumed by the data-access layer behind
# read_data_w_cache — confirm which component reads them.
os.environ.update({
    'HAVEN_DATABASE': 'haven',
    'AWS_PROFILE': 'admin',
})

In [17]:
# Pair every candidate choice's inference output (log-odds, odds, probability)
# with the feature row sharing the same (_individual, _decision, _choice) key.
sql = '''
select  
    i._individual,
    i._decision,
    i._choice,
    f._selected,
    i._train,
    i.log_odds,
    i.odds,
    i.probability,
    f.h3_index,
    f.time,
    f.net_primary_production,
    f.mixed_layer_thickness,
    f.distance,
    f.water_heading,
    f.movement_heading
from 
    movement_model_inference_m3_a18 i 
    inner join movement_model_features_m3_a1 f 
        on i._individual = f._individual
        and i._decision = f._decision
        and i._choice = f._choice
'''
data = read_data_w_cache(sql)

# Day-resolution string label (YYYY-MM-DD) derived from the timestamp column.
timestamps = data['time']
data['date'] = timestamps.dt.strftime("%Y-%m-%d")

print(data.shape)
data.head()

(107979, 16)


Unnamed: 0,_individual,_decision,_choice,_selected,_train,log_odds,odds,probability,h3_index,time,net_primary_production,mixed_layer_thickness,distance,water_heading,movement_heading,date
0,9,3,17,False,True,-3.547141,0.028807,0.002647,840c5e9ffffffff,2016-03-16,8.865529,61.364006,70.411743,-2.682873,-0.249076,2016-03-16
1,9,1,4,False,True,-5.871097,0.00282,0.000277,840c513ffffffff,2016-03-14,9.65154,92.691001,86.289017,3.062147,0.029534,2016-03-14
2,9,18,15,False,True,-4.253474,0.014215,0.000995,840c5e1ffffffff,2016-03-31,12.275617,59.56032,76.360962,-3.137232,-2.84983,2016-03-31
3,9,8,14,False,True,0.049067,1.050291,0.081337,840c5edffffffff,2016-03-21,3.670062,110.400713,43.181267,2.842356,0.036477,2016-03-21
4,9,25,1,False,True,-6.649404,0.001295,7e-05,840c421ffffffff,2016-04-07,22.758621,35.17952,86.323982,-2.950686,0.642132,2016-04-07


In [18]:
# One score per individual: the mean probability the model assigned to the
# choices flagged `_selected`.
sql = '''
select  
    _individual,
    avg(probability) as score
from 
    movement_model_inference_m3_a18
where 
    _selected
group by 
    1
'''
rankings = read_data_w_cache(sql)
# Highest-scoring individuals first.
rankings = rankings.sort_values('score', ascending=False)
print(rankings.shape)
rankings

(91, 2)


Unnamed: 0,_individual,score
53,31,0.780472
22,26,0.767747
86,49,0.696936
13,60,0.679356
58,2,0.678087
...,...,...
71,1,0.273151
49,73,0.261222
68,24,0.255349
32,90,0.255274


In [19]:
def set_line_color(row):
    """Pick the outline colour for one candidate-choice row.

    Parameters
    ----------
    row : mapping-like
        Must expose ``'distance'``; ``'_selected'`` is only read when the
        distance is non-zero.

    Returns
    -------
    str
        ``"orange"`` for a zero-distance row, ``"purple"`` for a non-zero
        distance row that is flagged selected, ``"black"`` otherwise.
    """
    # Zero distance wins regardless of the selection flag.
    if row['distance'] == 0:
        return "orange"
    return "purple" if row['_selected'] else "black"
    
# Evaluate the colour rule once per row, then store it as a new column.
row_colors = data.apply(set_line_color, axis=1)
data['color'] = row_colors

In [None]:
def add_common_time(data, year=2020):
    """Project every row's date onto one shared calendar year.

    This lets observations from different years be aligned on a single
    day-of-year axis.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a string column ``'date'`` formatted ``YYYY-MM-DD``.
    year : int, default 2020
        Calendar year substituted for each row's own year. 2020 is a leap
        year, so Feb 29 rows still map to a valid date; a non-leap year would
        make ``pd.to_datetime`` fail on such rows.

    Returns
    -------
    pandas.DataFrame
        A new frame with two extra columns: ``'mod_date'`` (the re-based
        ``YYYY-MM-DD`` string) and ``'_time'`` (the same value as a datetime).
        The input frame is left untouched, so passing a filtered slice does
        not raise ``SettingWithCopyWarning``.
    """
    # Everything after the first '-' is the month-day part of the label.
    month_day = data['date'].str.split('-', n=1).str[1]
    mod_date = f'{year}-' + month_day
    return data.assign(mod_date=mod_date, _time=pd.to_datetime(mod_date))

def _stay_cells_by_day(score_mask, line_color):
    """Aggregate zero-distance rows for a subset of ranked individuals.

    Keeps the rows of ``data`` coloured ``'orange'`` (distance == 0), restricts
    them to the individuals in ``rankings[score_mask]``, re-bases their dates
    with ``add_common_time`` and averages ``_individual`` per
    (``h3_index``, ``_time``) pair. The result is tagged with ``line_color``.
    """
    stays = data[data['color'] == 'orange'].merge(rankings[score_mask])
    cells = (
        add_common_time(stays)
        .groupby(['h3_index', '_time'])[['_individual']]
        .mean()
        .reset_index()
    )
    cells['color'] = line_color
    return cells


# NOTE(review): `score` is the mean probability given to the selected choice,
# so a higher score looks like a *better* fit. `poor_fits` is built from the
# top score quartile and `good_fits` from the bottom one — the names appear
# inverted. Behaviour is kept as-is here; confirm the intent before relying on
# these labels.
poor_fits = _stay_cells_by_day(
    rankings['score'] >= rankings['score'].quantile(0.75), 'orange'
)
good_fits = _stay_cells_by_day(
    rankings['score'] <= rankings['score'].quantile(0.25), 'purple'
)
overall = pd.concat([poor_fits, good_fits])

animation_options = dict(
    value_col='_individual',
    h3_col='h3_index',
    slider_col='_time',
    line_color_col='color',
    bold_colors=['orange', 'purple'],
    zoom=3,
    center={"lat": 55, "lon": -165},
    duration=200,
)
plot_h3_animation(overall, **animation_options)

In [6]:
# Keep only the choices flagged as selected, ordered per individual through time.
is_selected = data['_selected']
selected = data[is_selected].sort_values(['_individual', 'time'])
# A selected choice counts as a move when its distance is strictly positive.
selected['moved'] = selected['distance'].gt(0)
selected.head()

Unnamed: 0,_individual,_decision,_choice,_selected,_train,log_odds,odds,probability,h3_index,time,net_primary_production,mixed_layer_thickness,distance,water_heading,movement_heading,date,color,moved
95427,0,1,17,True,True,6.393696,598.063171,0.700043,840cce9ffffffff,2020-08-04,4.212408,9.668819,0.0,-2.467472,0.0,2020-08-04,orange,False
82248,0,2,17,True,True,6.393696,598.063171,0.700043,840cce9ffffffff,2020-08-05,3.94457,9.668819,0.0,-2.817833,0.0,2020-08-05,orange,False
82048,0,3,18,True,True,4.39471,81.021103,0.094837,840ccebffffffff,2020-08-06,5.164183,10.518363,38.880829,-1.138572,-0.606672,2020-08-06,purple,True
82200,0,4,18,True,True,6.393696,598.063171,0.632331,840ccebffffffff,2020-08-07,7.052177,10.518363,0.0,-2.099981,0.0,2020-08-07,orange,False
95739,0,5,18,True,True,6.393696,598.063171,0.632331,840ccebffffffff,2020-08-08,6.84375,10.518363,0.0,-2.678993,0.0,2020-08-08,orange,False


In [20]:
# Share of selected choices that were moves, per individual, next to that
# individual's score.
moved_share = selected.groupby('_individual')['moved'].mean().reset_index()
df = moved_share.merge(rankings)
px.scatter(df, x='score', y='moved')

In [21]:
# Observed share of selected choices that were moves, per individual.
moved = selected.groupby('_individual')['moved'].mean().reset_index()

# Zero-distance rows only. The explicit .copy() makes `prob` an independent
# frame, so adding a column below no longer raises SettingWithCopyWarning.
prob = data[data['distance'] == 0].copy()
# Complement of the probability given to the zero-distance choice.
# NOTE(review): reading this as "probability of moving" assumes one
# zero-distance choice per decision — confirm against the feature build.
prob['prob_moved'] = 1 - prob['probability']
prob_moved = prob.groupby('_individual')['prob_moved'].mean().reset_index()

df = moved.merge(prob_moved)
px.scatter(df, x='prob_moved', y='moved')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [22]:
# Pairwise Pearson correlations. `_individual` is an identifier column, so its
# row/column in the matrix is not a meaningful relationship.
df.corr(method='pearson')

Unnamed: 0,_individual,moved,prob_moved
_individual,1.0,0.210721,0.504829
moved,0.210721,1.0,0.420872
prob_moved,0.504829,0.420872,1.0


Why the movement? What explains why some individuals move more often than others?

- Size?
- Location of Origin?
- Time of Year?

Some combination of the above?

In [None]:
# Lookup from model individual id to tag key.
# NOTE(review): this reads the m2_a1 feature table while the main query joins
# against m3_a1 — confirm that `_individual` ids agree between the two.
key = read_data_w_cache('select distinct _individual, tag_key from movement_model_features_m2_a1')
tag_sizes = read_data_w_cache('select tag_key, fork_length_cm from mgietzmann_tags')
# Attach fork length to each individual via the shared tag key.
sizes = key.merge(tag_sizes).loc[:, ['_individual', 'fork_length_cm']]
sizes.head()

In [None]:
# Per-individual move share joined with score and fork length.
moved_share = selected.groupby('_individual')['moved'].mean().reset_index()
df = moved_share.merge(rankings).merge(sizes)
px.scatter(df, x='fork_length_cm', y='moved')

In [None]:
tag_regions = read_data_w_cache('select tag_key, region from mgietzmann_tag_regions')
# Attach a region to each individual; missing values become the literal
# string 'NaN' so they show up as their own category.
regions = (
    key.merge(tag_regions)
    .loc[:, ['_individual', 'region']]
    .fillna('NaN')
)
regions.head()

In [None]:
# Per-individual move share joined with score and region.
moved_share = selected.groupby('_individual')['moved'].mean().reset_index()
df = moved_share.merge(rankings).merge(regions)
px.violin(df, x='region', y='moved')

In [None]:
# Calendar month (1-12) of each selected choice.
selected['month'] = selected['time'].dt.month
# Move share per individual within each month.
monthly_groups = selected.groupby(['_individual', 'month'])
df = monthly_groups['moved'].mean().reset_index()
px.violin(df, x='month', y='moved')

I think the next iteration of the model should include:

- Fork Length
- Temperature
- Time of Year
- Region

And see what this gets us. 