In [1]:
import os
import random

import h3
import numpy as np
import pandas as pd
import geopy.distance
import plotly.express as px
from mirrorverse.plotting import plot_h3_slider, plot_h3_animation
from mirrorverse.utils import read_data_w_cache

# Process-wide environment settings, applied before any query is issued.
# NOTE(review): these are presumably read by read_data_w_cache / the haven
# client — confirm. The AWS profile name is hard-coded, so the notebook only
# runs on machines that have a profile called 'admin' configured.
os.environ['HAVEN_DATABASE'] = 'haven'
os.environ['AWS_PROFILE'] = 'admin'

In [2]:
# Load one row per (individual, decision, choice): the model's inference
# outputs (log_odds, odds, probability, _train) joined to the feature table
# on all three keys. Note that `_selected` is taken from the feature table.
# NOTE(review): the inference table is suffixed m3_a3_v4 while the feature
# table is m3_a2 — confirm these versions are meant to be paired.
sql = '''
select  
    i._individual,
    i._decision,
    i._choice,
    f._selected,
    i._train,
    i.log_odds,
    i.odds,
    i.probability,
    f.h3_index,
    f.time,
    f.net_primary_production,
    f.mixed_layer_thickness,
    f.distance,
    f.water_heading,
    f.movement_heading
from 
    movement_model_inference_m3_a3_v4 i 
    inner join movement_model_features_m3_a2 f 
        on i._individual = f._individual
        and i._decision = f._decision
        and i._choice = f._choice
'''
data = read_data_w_cache(sql)
# String form of the timestamp (YYYY-MM-DD); later cells rewrite its year.
data['date'] = data['time'].dt.strftime("%Y-%m-%d")
print(data.shape)
data.head()

(107979, 16)


Unnamed: 0,_individual,_decision,_choice,_selected,_train,log_odds,odds,probability,h3_index,time,net_primary_production,mixed_layer_thickness,distance,water_heading,movement_heading,date
0,11,12,18,False,True,-0.324417,0.722949,0.000331,8422dcbffffffff,2015-12-01,2.22278,64.143019,90.31546,-1.524217,2.730203,2015-12-01
1,11,24,7,False,True,3.539392,34.445984,0.011292,8422d13ffffffff,2015-12-13,0.866957,71.311587,42.56041,1.012816,1.448339,2015-12-13
2,11,27,12,False,True,1.713832,5.550191,0.002206,8422d1dffffffff,2015-12-16,1.611191,73.773732,74.917305,0.898518,-0.103599,2015-12-16
3,11,12,14,False,True,0.454968,1.576123,0.000723,8422d53ffffffff,2015-12-01,2.405352,71.828177,90.118355,-2.382584,-0.427833,2015-12-01
4,11,12,12,False,True,1.474331,4.368113,0.002002,8422d19ffffffff,2015-12-01,2.035083,77.259518,75.250938,-2.341862,-0.098357,2015-12-01


In [3]:
# Per-individual score: the mean probability the model assigned to the
# choices that were actually selected. A higher score means the model put
# more probability on what the individual really did.
sql = '''
select  
    _individual,
    avg(probability) as score
from 
    movement_model_inference_m3_a3_v4
where 
    _selected
group by 
    1
'''
# Sorted from highest to lowest score.
rankings = read_data_w_cache(sql).sort_values('score', ascending=False)
print(rankings.shape)
rankings

(91, 2)


Unnamed: 0,_individual,score
74,14,0.674147
12,12,0.659856
34,9,0.634890
58,11,0.625680
15,40,0.612057
...,...,...
33,84,0.293279
89,19,0.260843
4,68,0.255869
2,29,0.234262


In [4]:
# Per-individual mean negative log-likelihood of the selected choices for two
# inference tables, and their difference (v8 loss minus v4 loss). A positive
# `diff` means the _v8 table has the higher loss for that individual.
# NOTE(review): the CTE names (v3_con / v1_con) and output columns
# (v3_loss / v1_loss) do not match the table suffixes they read (_v8 / _v4) —
# confirm which model versions are being compared.
# The left join keeps every individual from the _v4 table; an individual
# missing from the _v8 table gets a null v3_loss and a null diff.
sql = '''
with v3_con as (
    select
        _individual,
        avg(-ln(probability)) as loss
    from 
        movement_model_inference_m3_a3_v8
    where 
        _selected 
    group by 
        1
), v1_con as (
    select
        _individual,
        avg(-ln(probability)) as loss
    from 
        movement_model_inference_m3_a3_v4
    where 
        _selected 
    group by 
        1
)
select 
    v1_con._individual,
    v1_con.loss as v1_loss,
    v3_con.loss as v3_loss,
    v3_con.loss - v1_con.loss as diff
from 
    v1_con
    left join v3_con
        on v1_con._individual = v3_con._individual
order by 
    4 desc
'''
# Rows arrive ordered by `diff` descending (column 4 of the select list).
diff = read_data_w_cache(sql)
diff.head()

Unnamed: 0,_individual,v1_loss,v3_loss,diff
0,14,0.656533,0.802583,0.14605
1,19,2.051716,2.180444,0.128727
2,30,0.898793,1.000154,0.101361
3,10,1.258626,1.357608,0.098982
4,15,1.671027,1.769452,0.098425


In [5]:
def set_line_color(row):
    """Choose the outline color for a single candidate-choice row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing the
            ``'distance'`` and ``'_selected'`` entries.

    Returns:
        ``"orange"`` when ``distance`` equals 0 (this check wins even if the
        row is also selected), ``"purple"`` when the row is selected, and
        ``"black"`` for everything else.
    """
    # The zero-distance check takes precedence over the selected flag.
    if row['distance'] == 0:
        return "orange"
    return "purple" if row['_selected'] else "black"
    
# Tag every row with its outline color. Later cells filter on
# color == 'orange' to pick out the zero-distance rows.
# This adds a column to `data` in place and evaluates the function row by row.
data['color'] = data.apply(set_line_color, axis=1)

In [None]:
def add_common_time(data, year=2020):
    """Return a copy of ``data`` with every date moved onto one calendar year.

    The part of each ``date`` string before the first ``-`` (the original
    year) is replaced by ``year`` so that observations from different years
    can share a single time axis.

    Args:
        data: DataFrame with a string ``date`` column formatted like
            ``YYYY-MM-DD``.
        year: Calendar year every date is mapped onto. The default, 2020, is
            a leap year, so a ``02-29`` date still parses; a non-leap year
            makes ``pd.to_datetime`` raise for such a date.

    Returns:
        A new DataFrame with two added columns: ``mod_date`` (the rewritten
        date string) and ``_time`` (``mod_date`` parsed as a datetime). The
        input frame is left untouched.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    out = data.copy()
    # Strip everything before the first '-' and prepend the common year.
    out['mod_date'] = str(year) + out['date'].str.replace(r'^[^-]*', '', regex=True)
    out['_time'] = pd.to_datetime(out['mod_date'])
    return out

# Zero-distance rows (colored 'orange' by set_line_color) joined to each
# individual's loss difference; the merge uses the shared `_individual` column.
stationary_with_diff = data[data['color'] == 'orange'].merge(
    diff[['_individual', 'diff']]
)

# Average per (cell, common-year date) so all years share one time axis.
fits = (
    add_common_time(stationary_with_diff)
    .groupby(['h3_index', '_time'])[['_individual', 'diff']]
    .mean()
    .reset_index()
)
fits['color'] = 'black'

# NOTE(review): every row is outlined 'black' while bold_colors lists
# 'orange' and 'purple'; how plot_h3_animation uses bold_colors is not
# visible here — confirm this is intended.
animation_options = dict(
    value_col='diff',
    h3_col='h3_index',
    slider_col='_time',
    line_color_col='color',
    bold_colors=['orange', 'purple'],
    zoom=3,
    center={"lat": 55, "lon": -165},
    duration=200,
    zmin=-0.1,
    zmax=0.1,
    colorscale="rdbu",
)
plot_h3_animation(fits, **animation_options)

In [None]:
# NOTE(review): this redefines add_common_time, which an earlier cell already
# defines; consider keeping a single definition.
def add_common_time(data, year=2020):
    """Return a copy of ``data`` with every date moved onto one calendar year.

    The part of each ``date`` string before the first ``-`` (the original
    year) is replaced by ``year`` so that observations from different years
    can share a single time axis.

    Args:
        data: DataFrame with a string ``date`` column formatted like
            ``YYYY-MM-DD``.
        year: Calendar year every date is mapped onto. The default, 2020, is
            a leap year, so a ``02-29`` date still parses; a non-leap year
            makes ``pd.to_datetime`` raise for such a date.

    Returns:
        A new DataFrame with two added columns: ``mod_date`` (the rewritten
        date string) and ``_time`` (``mod_date`` parsed as a datetime). The
        input frame is left untouched.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    out = data.copy()
    # Strip everything before the first '-' and prepend the common year.
    out['mod_date'] = str(year) + out['date'].str.replace(r'^[^-]*', '', regex=True)
    out['_time'] = pd.to_datetime(out['mod_date'])
    return out

def _stationary_cells_for(individuals, line_color):
    """Zero-distance rows of the given individuals, one row per (cell, date).

    Args:
        individuals: Subset of ``rankings``; merged with ``data`` on the
            shared ``_individual`` column.
        line_color: Value written to the ``color`` column of the result.

    Returns:
        DataFrame with ``h3_index``, ``_time``, the mean ``_individual`` per
        group, and a constant ``color`` column.
    """
    cells = add_common_time(
        data[data['color'] == 'orange'].merge(individuals)
    ).groupby(['h3_index', '_time'])[['_individual']].mean().reset_index()
    cells['color'] = line_color
    return cells


# `score` is the mean probability assigned to the selected choice, so the TOP
# quartile holds the best-fit individuals and the BOTTOM quartile the worst.
# The names below now follow that meaning; they were previously swapped.
# The colors and row order are unchanged, so the plot itself is identical:
# orange = top-quartile score, purple = bottom-quartile score.
# NOTE(review): if orange was meant to mark the poorly fit individuals, swap
# the two color arguments instead.
top_quartile_score = rankings['score'].quantile(0.75)
bottom_quartile_score = rankings['score'].quantile(0.25)

good_fits = _stationary_cells_for(
    rankings[rankings['score'] >= top_quartile_score], 'orange'
)
poor_fits = _stationary_cells_for(
    rankings[rankings['score'] <= bottom_quartile_score], 'purple'
)
overall = pd.concat([good_fits, poor_fits])
plot_h3_animation(
    overall,
    value_col='_individual',
    h3_col='h3_index',
    slider_col='_time',
    line_color_col='color',
    bold_colors=['orange', 'purple'],
    zoom=3,
    center={"lat": 55, "lon": -165},
    duration=200
)

In [6]:
# Only the choices that were actually taken, ordered per individual through
# time, with a boolean flag for whether that choice covered any distance.
selected = (
    data[data['_selected']]
    .sort_values(['_individual', 'time'])
    .assign(moved=lambda frame: frame['distance'] > 0)
)
selected.head()

Unnamed: 0,_individual,_decision,_choice,_selected,_train,log_odds,odds,probability,h3_index,time,net_primary_production,mixed_layer_thickness,distance,water_heading,movement_heading,date,color,moved
95427,0,1,17,True,True,6.393696,598.063171,0.700043,840cce9ffffffff,2020-08-04,4.212408,9.668819,0.0,-2.467472,0.0,2020-08-04,orange,False
82248,0,2,17,True,True,6.393696,598.063171,0.700043,840cce9ffffffff,2020-08-05,3.94457,9.668819,0.0,-2.817833,0.0,2020-08-05,orange,False
82048,0,3,18,True,True,4.39471,81.021103,0.094837,840ccebffffffff,2020-08-06,5.164183,10.518363,38.880829,-1.138572,-0.606672,2020-08-06,purple,True
82200,0,4,18,True,True,6.393696,598.063171,0.632331,840ccebffffffff,2020-08-07,7.052177,10.518363,0.0,-2.099981,0.0,2020-08-07,orange,False
95739,0,5,18,True,True,6.393696,598.063171,0.632331,840ccebffffffff,2020-08-08,6.84375,10.518363,0.0,-2.678993,0.0,2020-08-08,orange,False


In [20]:
# Fraction of each individual's selected choices that involved movement,
# plotted against that individual's score.
moved_rate = selected.groupby('_individual')['moved'].mean().reset_index()
df = moved_rate.merge(rankings)
px.scatter(df, x='score', y='moved')

In [21]:
# Observed movement rate per individual.
moved = selected.groupby('_individual')['moved'].mean().reset_index()

# Predicted movement: for every zero-distance (stay) candidate, one minus its
# probability. `.assign` builds a new frame, so no column is written onto a
# filtered slice of `data` (which raised SettingWithCopyWarning before).
# NOTE(review): presumably there is one zero-distance candidate per
# decision — confirm.
prob = data[data['distance'] == 0].assign(
    prob_moved=lambda frame: 1 - frame['probability']
)
prob_moved = prob.groupby('_individual')['prob_moved'].mean().reset_index()

# Predicted vs. observed movement rate, one point per individual.
df = moved.merge(prob_moved)
px.scatter(df, x='prob_moved', y='moved')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [22]:
# Pairwise correlations of every column in `df`.
# NOTE(review): `_individual` looks like an identifier; confirm whether its
# correlations are meaningful. The moved / prob_moved entry is the
# observed-vs-predicted comparison.
df.corr()

Unnamed: 0,_individual,moved,prob_moved
_individual,1.0,0.210721,0.504829
moved,0.210721,1.0,0.420872
prob_moved,0.504829,0.420872,1.0


What drives the differences in movement?

- Size?
- Location of Origin?
- Time of Year?

Some combination of the above?

In [None]:
# Map each model individual to its tag, then to the tag's fork length.
# NOTE(review): the individual-to-tag key is read from the m2_a1 feature table
# while the rest of the notebook uses m3_a2 — confirm `_individual` ids agree.
key = read_data_w_cache('select distinct _individual, tag_key from movement_model_features_m2_a1')
fork_lengths = read_data_w_cache('select tag_key, fork_length_cm from mgietzmann_tags')
sizes = key.merge(fork_lengths)[['_individual', 'fork_length_cm']]
sizes.head()

In [None]:
# Movement rate per individual against fork length. The default inner merges
# keep only individuals present in both `rankings` and `sizes`.
moved_rate = selected.groupby('_individual')['moved'].mean().reset_index()
df = moved_rate.merge(rankings).merge(sizes)
px.scatter(df, x='fork_length_cm', y='moved')

In [None]:
# Map each individual to its tag's region (uses `key` from the sizes cell).
# Missing region values become the literal string 'NaN' so they show up as
# their own category.
tag_regions = read_data_w_cache('select tag_key, region from mgietzmann_tag_regions')
regions = key.merge(tag_regions)[['_individual', 'region']].fillna('NaN')
regions.head()

In [None]:
# Distribution of per-individual movement rate within each region.
moved_rate = selected.groupby('_individual')['moved'].mean().reset_index()
df = moved_rate.merge(rankings).merge(regions)
px.violin(df, x='region', y='moved')

In [None]:
# Calendar month of each selected choice; this adds a column to `selected`.
selected['month'] = selected['time'].dt.month
# Movement rate per individual within each month, shown as one violin per month.
by_individual_month = selected.groupby(['_individual', 'month'])
df = by_individual_month['moved'].mean().reset_index()
px.violin(df, x='month', y='moved')

I think we should include:

- Fork Length
- Temperature
- Time of Year
- Region

And see what this gets us. 