In [80]:
import os
import random

import h3
import numpy as np
import pandas as pd
import geopy.distance
import plotly.express as px
from mirrorverse.plotting import plot_h3_slider, plot_h3_animation
from mirrorverse.utils import read_data_w_cache

os.environ['HAVEN_DATABASE'] = 'haven'
os.environ['AWS_PROFILE'] = 'admin'

In [72]:
sql = '''
select  
    i._individual,
    i._decision,
    i._choice,
    f._selected,
    i._train,
    i.log_odds,
    i.odds,
    i.probability,
    f.h3_index,
    f.time,
    f.net_primary_production,
    f.mixed_layer_thickness,
    f.distance,
    f.water_heading,
    f.movement_heading
from 
    movement_model_inference_m2_a1 i 
    inner join movement_model_features_m2_a1 f 
        on i._individual = f._individual
        and i._decision = f._decision
        and i._choice = f._choice
'''
data = read_data_w_cache(sql)
data['date'] = data['time'].dt.strftime("%Y-%m-%d")
print(data.shape)
data.head()

(107979, 16)


Unnamed: 0,_individual,_decision,_choice,_selected,_train,log_odds,odds,probability,h3_index,time,net_primary_production,mixed_layer_thickness,distance,water_heading,movement_heading,date
0,8,19,19,False,False,-0.581495,0.559062,0.039425,840ccebffffffff,2020-08-20,2.250086,10.518363,38.630669,-2.393005,2.526236,2020-08-20
1,8,42,15,False,False,-4.155947,0.015671,0.00093,840ccebffffffff,2020-09-12,7.104885,11.165566,75.752777,-2.456823,-2.760679,2020-09-12
2,8,64,14,False,False,-3.720925,0.024212,0.002149,840ccb5ffffffff,2020-10-04,8.854354,24.921634,66.451942,-2.433206,-1.418148,2020-10-04
3,8,62,17,False,False,1.143745,3.1385,0.170827,840ccb9ffffffff,2020-10-02,4.610088,15.827531,37.990437,-2.680057,-0.629075,2020-10-02
4,8,12,9,False,False,-3.708452,0.024515,0.002008,840cc9dffffffff,2020-08-13,31.275713,10.528886,85.632736,-2.878665,0.08774,2020-08-13


In [73]:
sql = '''
select  
    _individual,
    avg(probability) as score
from 
    movement_model_inference_m2_a1
where 
    _selected
group by 
    1
'''
rankings = read_data_w_cache(sql).sort_values('score', ascending=False)
print(rankings.shape)
rankings

(91, 2)


Unnamed: 0,_individual,score
16,37,0.708440
32,43,0.692393
4,31,0.669473
56,16,0.663704
67,41,0.660710
...,...,...
70,61,0.273085
57,51,0.251936
37,79,0.250740
34,3,0.234841


In [74]:
def set_line_color(row):
    if row['distance'] == 0:
        return "orange"
    elif row['_selected']:
        return "purple"
    else:
        return "black"
    
data['color'] = data.apply(set_line_color, axis=1)

In [None]:
def add_common_time(data):
    data['mod_date'] = data['date'].apply(lambda d: '-'.join(['2020'] + list(d.split('-')[1:])))
    data['_time'] = pd.to_datetime(data['mod_date'])
    return data

poor_fits = add_common_time(
    data[data['color'] == 'orange'].merge(
        rankings[rankings['score'] >= rankings['score'].quantile(0.75)]
    )
).groupby(['h3_index', '_time'])[['_individual']].mean().reset_index()
poor_fits['color'] = 'orange'
good_fits = add_common_time(
    data[data['color'] == 'orange'].merge(
        rankings[rankings['score'] <= rankings['score'].quantile(0.25)]
    )
).groupby(['h3_index', '_time'])[['_individual']].mean().reset_index()
good_fits['color'] = 'purple'
overall = pd.concat([poor_fits, good_fits])
plot_h3_animation(
    overall, 
    value_col='_individual',
    h3_col='h3_index',
    slider_col='_time',
    line_color_col='color',
    bold_colors=['orange', 'purple'],
    zoom=3, 
    center={"lat": 55, "lon": -165},
    duration=200
)

In [76]:
selected = data[data['_selected']]
selected = selected.sort_values(['_individual', 'time'])
selected['moved'] = selected['distance'] > 0
selected.head()

Unnamed: 0,_individual,_decision,_choice,_selected,_train,log_odds,odds,probability,h3_index,time,net_primary_production,mixed_layer_thickness,distance,water_heading,movement_heading,date,color,moved
10408,0,1,18,True,False,2.348935,10.474409,0.659026,840c9ebffffffff,2015-08-04,8.80383,10.528886,0.0,-3.080385,0.0,2015-08-04,orange,False
8874,0,2,10,True,False,-0.80616,0.44657,0.025452,840c9c5ffffffff,2015-08-05,8.799863,11.557796,43.663868,-2.63067,1.277605,2015-08-05,purple,True
7662,0,3,17,True,False,-1.939586,0.143763,0.008418,840c9e9ffffffff,2015-08-06,8.713154,11.351961,46.165955,2.186119,-2.916775,2015-08-06,purple,True
19326,0,4,2,True,False,-0.908004,0.403329,0.033606,840c913ffffffff,2015-08-07,8.542751,11.158331,43.814377,2.088821,1.29568,2015-08-07,purple,True
20461,0,5,4,True,False,2.032951,7.63659,0.657129,840c913ffffffff,2015-08-08,7.773859,12.947478,0.0,2.217131,0.0,2015-08-08,orange,False


In [77]:
df = selected.groupby('_individual')['moved'].mean().reset_index().merge(rankings)
px.scatter(
    df, x='score', y='moved'
)

In [78]:
moved = selected.groupby('_individual')['moved'].mean().reset_index()
prob = data[data['distance'] == 0]
prob['prob_moved'] = 1 - prob['probability']
prob_moved = prob.groupby('_individual')['prob_moved'].mean().reset_index()
df = moved.merge(prob_moved)
px.scatter(df, x='prob_moved', y='moved')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [79]:
df.corr()

Unnamed: 0,_individual,moved,prob_moved
_individual,1.0,0.18964,0.704737
moved,0.18964,1.0,0.426889
prob_moved,0.704737,0.426889,1.0


Why the movement? 

- Size?
- Location of Origin?
- Time of Year?

Some combination of the above?

In [None]:
sizes = read_data_w_cache('select tag_key, fork_length_cm from mgietzmann_tags')
key = read_data_w_cache('select distinct _individual, tag_key from movement_model_features_m2_a1')
sizes = key.merge(sizes)[['_individual', 'fork_length_cm']]
sizes.head()

In [None]:
df = selected.groupby('_individual')['moved'].mean().reset_index().merge(rankings).merge(sizes)
px.scatter(df, x='fork_length_cm', y='moved')

In [None]:
regions = read_data_w_cache('select tag_key, region from mgietzmann_tag_regions')
regions = key.merge(regions)[['_individual', 'region']].fillna('NaN')
regions.head()

In [None]:
df = selected.groupby('_individual')['moved'].mean().reset_index().merge(rankings).merge(regions)
px.violin(df, x='region', y='moved')

In [None]:
selected['month'] = selected['time'].dt.month
df = selected.groupby(['_individual', 'month'])['moved'].mean().reset_index()
px.violin(df, x='month', y='moved')

I think we should include:

- Fork Length
- Temperature
- Time of Year
- Region

And see what this gets us. 