In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import os
import h3
import itertools
from tqdm import tqdm
from multiprocessing import Pool
from scipy.stats import norm
from collections import defaultdict
from sklearn.cluster import AgglomerativeClustering

from mirrorverse.warehouse.utils import get_engine
from mirrorverse.chinook.states import spatial_key_to_index

pd.options.mode.chained_assignment = None

os.environ["DATABASE_URL"] = "sqlite:////workspaces/mirrorverse/mirrorverse.db"

# Load the Data

In [2]:
# Read the tag depth observations from the warehouse.
sql = '''
select 
    tag_key,
    date_key,
    depth,
    epoch
from 
    tag_depths
'''
depth = pd.read_sql(sql, get_engine())
# Drop readings whose depth is missing (NaN).
has_depth = ~np.isnan(depth['depth'])
depth = depth[has_depth]
print(depth.shape)
depth.head()

2024-05-19 10:39:52,902 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-05-19 10:39:52,902 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("
select 
    tag_key,
    date_key,
    depth,
    epoch
from 
    tag_depths
")
2024-05-19 10:39:52,903 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-05-19 10:39:52,904 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("
select 
    tag_key,
    date_key,
    depth,
    epoch
from 
    tag_depths
")
2024-05-19 10:39:52,905 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-05-19 10:39:52,906 INFO sqlalchemy.engine.Engine 
select 
    tag_key,
    date_key,
    depth,
    epoch
from 
    tag_depths

2024-05-19 10:39:52,907 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-05-19 10:39:55,785 INFO sqlalchemy.engine.Engine ROLLBACK
(1033662, 4)


Unnamed: 0,tag_key,date_key,depth,epoch
0,129843,1387411200,83.4,1387411200
1,129843,1387411200,80.7,1387412100
2,129843,1387411200,91.5,1387413000
3,129843,1387411200,91.5,1387413900
4,129843,1387411200,88.8,1387414800


In [3]:
def select_a_class(depth, depth_classes):
    """
    Randomly assign a measured depth to one of the discrete depth classes.

    The measurement is treated as normally distributed around `depth` with a
    standard deviation of `abs(depth) * 0.08 / 1.96`, i.e. +/-8% of the reading
    is taken as the ~95% confidence interval. Each class receives the normal
    probability mass that falls between the previous class value and its own
    value (the first class receives everything below its value). The masses
    are renormalized to sum to 1 and one class is drawn with `np.random.choice`.

    Inputs:
    - depth (float): the measured depth
    - depth_classes (array-like): the class values, sorted ascending

    Returns the selected element of depth_classes
    """
    depth_classes = np.asarray(depth_classes)
    # abs() keeps the standard deviation non-negative: with a negative reading
    # a signed sd flips every z-score, all class masses underflow to 0 and the
    # normalization below turns into 0 / 0.
    sd = abs(depth) * 0.08 / 1.96 # ~two standard deviations gives our 95% confidence interval
    if sd == 0:
        # no uncertainty at a depth of exactly 0: always the first class
        division = np.zeros(len(depth_classes))
        division[0] = 1
    else:
        # we're going to assume the depth classes are sorted
        z = (depth_classes - depth) / sd
        division = norm.cdf(z)
        # turn the cumulative masses into per-class masses
        division[1:] = division[1:] - division[:-1]
    # mass beyond the last class is not covered by any class, so the
    # probabilities may not sum to 1; normalize them
    division = division / division.sum()
    return np.random.choice(depth_classes, p=division)

min_size = 0.01
# Class values handed to select_a_class, which assumes they are sorted ascending.
depth_classes = np.array([25, 50, 75, 100, 150, 200, 250, 300, 400, 500])

# One random draw per observation, so re-running this cell can change the classes.
depth['depth_class'] = depth['depth'].apply(select_a_class, args=(depth_classes,))
depth.head()

Unnamed: 0,tag_key,date_key,depth,epoch,depth_class
0,129843,1387411200,83.4,1387411200,100
1,129843,1387411200,80.7,1387412100,100
2,129843,1387411200,91.5,1387413000,100
3,129843,1387411200,91.5,1387413900,100
4,129843,1387411200,88.8,1387414800,100


In [4]:
print(depth.shape)
# Keep one row (drawn with replacement) per len(depth_classes) original rows.
# NOTE(review): presumably this offsets the later expansion to one row per
# depth class -- confirm.
depth = depth.sample(n=len(depth) // len(depth_classes), replace=True)
print(depth.shape)

(1033662, 5)
(103366, 5)


# Add Context

In [5]:
# Read the tag tracks together with each tag's home region and the elevation
# of the H3 cell the position falls in. Both are left joins, so tracks
# without a match keep their row with nulls in the joined columns.
sql = '''
select 
    tt.*,
    h.home_region,
    e.elevation
from 
    tag_tracks tt 
    left join home_regions h
        on tt.tag_key = h.tag_key
    left join elevation e 
        on tt.h3_level_4_key = e.h3_level_4_key
'''
tt = pd.read_sql_query(
    sql,
    get_engine()
)
# The bare last expression already renders the frame; printing it as well only
# duplicated the same rows as plain text.
tt.head()

2024-05-19 10:40:49,308 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-05-19 10:40:49,309 INFO sqlalchemy.engine.Engine 
select 
    tt.*,
    h.home_region,
    e.elevation
from 
    tag_tracks tt 
    left join home_regions h
        on tt.tag_key = h.tag_key
    left join elevation e 
        on tt.h3_level_4_key = e.h3_level_4_key

2024-05-19 10:40:49,309 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-05-19 10:40:49,339 INFO sqlalchemy.engine.Engine ROLLBACK
  tag_key    date_key   longitude   latitude      h3_level_4_key home_region  \
0  129843  1387411200 -166.922615  54.131760  595087630329184255        None   
1  129843  1387497600 -166.884086  54.258072  595087595969445887        None   
2  129843  1387584000 -166.910525  54.312433  595087595969445887        None   
3  129843  1387670400 -166.817057  54.358280  595087595969445887        None   
4  129843  1387756800 -166.676901  54.389694  595087595969445887        None   

    elevation  
0 -184.870688  
1 -790.973118 

Unnamed: 0,tag_key,date_key,longitude,latitude,h3_level_4_key,home_region,elevation
0,129843,1387411200,-166.922615,54.13176,595087630329184255,,-184.870688
1,129843,1387497600,-166.884086,54.258072,595087595969445887,,-790.973118
2,129843,1387584000,-166.910525,54.312433,595087595969445887,,-790.973118
3,129843,1387670400,-166.817057,54.35828,595087595969445887,,-790.973118
4,129843,1387756800,-166.676901,54.389694,595087595969445887,,-790.973118


In [6]:
# No `on=` is given, so pandas joins on the columns both frames share.
context_columns = ['tag_key', 'date_key', 'longitude', 'latitude', 'home_region', 'elevation']
track_context = tt[context_columns]
depth = depth.merge(track_context)
depth.head()

Unnamed: 0,tag_key,date_key,depth,epoch,depth_class,longitude,latitude,home_region,elevation
0,202588,1598486400,28.5,1598542800,50,-157.875,56.1,,-108.727546
1,202588,1598486400,10.0,1598533200,25,-157.875,56.1,,-108.727546
2,202588,1598486400,12.5,1598531400,25,-157.875,56.1,,-108.727546
3,202588,1598486400,28.5,1598542200,50,-157.875,56.1,,-108.727546
4,202588,1598486400,25.5,1598496000,50,-157.875,56.1,,-108.727546


In [7]:
from suntimes import SunTimes

def get_sunrise(lat, lon, date):
    """
    Return the `.hour` of the sunrise reported by SunTimes (at altitude 0)
    for the given latitude, longitude and date, expressed in UTC.
    """
    location = SunTimes(longitude=lon, latitude=lat, altitude=0)
    sunrise_utc = location.risewhere(date, 'UTC')
    return sunrise_utc.hour

def get_sunset(lat, lon, date):
    """
    Return the `.hour` of the sunset reported by SunTimes (at altitude 0)
    for the given latitude, longitude and date, expressed in UTC.
    """
    location = SunTimes(longitude=lon, latitude=lat, altitude=0)
    sunset_utc = location.setwhere(date, 'UTC')
    return sunset_utc.hour

# Observation times are epoch seconds; keep them in UTC so they are comparable
# with the UTC sunrise / sunset hours computed below.
depth['datetime'] = pd.to_datetime(depth['epoch'], utc=True, unit='s')
depth['date'] = depth['datetime'].dt.date
# Keep only longitudes within [-180, 180]. Copy so the column assignments
# below act on an independent frame rather than on a filtered slice.
depth = depth[np.abs(depth['longitude']) <= 180].copy()
depth['sunrise'] = depth.apply(
    lambda r: get_sunrise(r['latitude'], r['longitude'], r['date']), axis=1
)
depth['sunset'] = depth.apply(
    lambda r: get_sunset(r['latitude'], r['longitude'], r['date']), axis=1
)


depth['hour'] = depth['datetime'].dt.hour

# In UTC the daylight window may or may not span midnight:
# - sunset hour <= sunrise hour (window wraps past midnight, as in the rows
#   displayed below): daytime is "after sunrise OR before sunset";
# - sunrise hour < sunset hour: daytime is "after sunrise AND before sunset".
# Using the OR form for the second case would flag almost every hour as day.
after_sunrise = depth['hour'] > depth['sunrise']
before_sunset = depth['hour'] < depth['sunset']
wraps_midnight = depth['sunset'] <= depth['sunrise']
depth['daytime'] = (after_sunrise | before_sunset).where(
    wraps_midnight, after_sunrise & before_sunset
)
depth.head()

Unnamed: 0,tag_key,date_key,depth,epoch,depth_class,longitude,latitude,home_region,elevation,datetime,date,sunrise,sunset,hour,daytime
0,202588,1598486400,28.5,1598542800,50,-157.875,56.1,,-108.727546,2020-08-27 15:40:00+00:00,2020-08-27,15,5,15,False
1,202588,1598486400,10.0,1598533200,25,-157.875,56.1,,-108.727546,2020-08-27 13:00:00+00:00,2020-08-27,15,5,13,False
2,202588,1598486400,12.5,1598531400,25,-157.875,56.1,,-108.727546,2020-08-27 12:30:00+00:00,2020-08-27,15,5,12,False
3,202588,1598486400,28.5,1598542200,50,-157.875,56.1,,-108.727546,2020-08-27 15:30:00+00:00,2020-08-27,15,5,15,False
4,202588,1598486400,25.5,1598496000,50,-157.875,56.1,,-108.727546,2020-08-27 02:40:00+00:00,2020-08-27,15,5,2,True


# Build Into Choices

In [198]:
print(depth.shape)
# Sampling without replacement raises ValueError when more rows are requested
# than the frame holds, so cap the request at the number of available rows.
# `sample` already returns a new frame, so no explicit copy is needed.
n_decisions = min(10000 * 10, len(depth))
choices = depth.sample(n_decisions)
print(choices.shape)
# Number the sampled observations: the fresh positional index becomes the
# decision id, the tag becomes the identifier and the drawn depth class
# becomes the selected class.
choices = (
    choices.reset_index(drop=True).reset_index().rename(
        {'index': '_decision', 'depth_class': 'selected_class', 'tag_key': '_identifier'}, 
        axis=1
    )
)
choices.head()

(103338, 15)
(100000, 15)


Unnamed: 0,_decision,_identifier,date_key,depth,epoch,selected_class,longitude,latitude,home_region,elevation,datetime,date,sunrise,sunset,hour,daytime
0,0,205404,1603152000,83.0,1603199400,100,-153.725,58.075,SEAK,-78.920882,2020-10-20 13:10:00+00:00,2020-10-20,17,2,13,False
1,1,202597,1597190400,6.5,1597240800,25,-159.375,55.2,SEAK,-80.210727,2020-08-12 14:00:00+00:00,2020-08-12,15,6,14,False
2,2,159014b,1493337600,8.5,1493382300,25,-152.3,59.6,WA/OR,-45.030789,2017-04-28 12:25:00+00:00,2017-04-28,14,5,12,False
3,3,205400,1604188800,140.5,1604245200,150,-150.05,58.725,,-119.637932,2020-11-01 15:40:00+00:00,2020-11-01,17,2,15,False
4,4,210760,1622764800,28.5,1622834100,50,-133.975,54.4,BC,-627.996741,2021-06-04 19:15:00+00:00,2021-06-04,12,5,19,True


In [199]:
# Pair every decision id with every depth class (cartesian product).
class_options = pd.DataFrame({'depth_class': depth_classes})
all_choices = pd.merge(choices[['_decision']], class_options, how='cross')
print(all_choices.shape)
all_choices.head()

(1000000, 2)


Unnamed: 0,_decision,depth_class
0,0,25
1,0,50
2,0,75
3,0,100
4,0,150


In [200]:
# Attach the per-decision context to every candidate class, then flag the
# candidate that matches the class selected for that decision.
choices = pd.merge(choices, all_choices, on='_decision', how='outer')
choices['selected'] = choices['selected_class'].eq(choices['depth_class'])
choices = choices.drop(columns=['selected_class'])
print(choices.shape)
choices.head()

(1000000, 17)


Unnamed: 0,_decision,_identifier,date_key,depth,epoch,longitude,latitude,home_region,elevation,datetime,date,sunrise,sunset,hour,daytime,depth_class,selected
0,0,205404,1603152000,83.0,1603199400,-153.725,58.075,SEAK,-78.920882,2020-10-20 13:10:00+00:00,2020-10-20,17,2,13,False,25,False
1,0,205404,1603152000,83.0,1603199400,-153.725,58.075,SEAK,-78.920882,2020-10-20 13:10:00+00:00,2020-10-20,17,2,13,False,50,False
2,0,205404,1603152000,83.0,1603199400,-153.725,58.075,SEAK,-78.920882,2020-10-20 13:10:00+00:00,2020-10-20,17,2,13,False,75,False
3,0,205404,1603152000,83.0,1603199400,-153.725,58.075,SEAK,-78.920882,2020-10-20 13:10:00+00:00,2020-10-20,17,2,13,False,100,True
4,0,205404,1603152000,83.0,1603199400,-153.725,58.075,SEAK,-78.920882,2020-10-20 13:10:00+00:00,2020-10-20,17,2,13,False,150,False


# Build the Features

In [310]:
# Columns fed to the model; candidates currently disabled:
# 'latitude', 'sunrise', 'interval'
features = ['depth_class', 'month', 'daytime']
# Columns carried along for later inspection but not used as features.
meta = ['interval']
key_columns = ['selected', '_decision', '_identifier']

model_data = choices.assign(
    month=choices['datetime'].dt.month,
    # NOTE(review): equals the sunrise-to-sunset span in hours only when the
    # sunset hour is numerically earlier than the sunrise hour -- confirm.
    interval=24 - choices['sunrise'] + choices['sunset'],
    daytime=choices['daytime'].astype(float),
)[key_columns + features + meta]
model_data.head()

Unnamed: 0,selected,_decision,_identifier,depth_class,month,daytime,interval
0,False,0,205404,25,10,0.0,9
1,False,0,205404,50,10,0.0,9
2,False,0,205404,75,10,0.0,9
3,True,0,205404,100,10,0.0,9
4,False,0,205404,150,10,0.0,9


In [311]:
# Split by identifier rather than by row, so no identifier contributes rows to
# both sets. A dedicated seeded generator makes the split (and therefore every
# metric computed from it) reproducible across kernel restarts.
SPLIT_SEED = 0
split_rng = np.random.default_rng(SPLIT_SEED)

ids = model_data['_identifier'].unique()
train_ids = split_rng.choice(ids, int(0.6 * len(ids)), replace=False)
# Everything not drawn for training, in the order the identifiers first appear.
test_ids = np.asarray(ids[~np.isin(ids, train_ids)])
print(
    len(train_ids),
    len(test_ids)
)

66 45


In [312]:
folds = 4
# Shuffle in place, then deal the identifiers round-robin into the folds.
np.random.shuffle(train_ids)
rows = [
    {'_identifier': tag, 'fold': position % folds}
    for position, tag in enumerate(train_ids)
]
folds_assignment = pd.DataFrame(rows)
folds_assignment.head()

Unnamed: 0,_identifier,fold
0,229239,0
1,229235,1
2,205415,2
3,202591,3
4,142189,0


In [313]:
# Rows belonging to training identifiers, with each identifier's fold attached
# (the merge joins on the columns the two frames share).
in_train = model_data['_identifier'].isin(train_ids)
train = model_data[in_train].merge(folds_assignment)

fold = train['fold'].to_numpy(copy=True)

In [314]:
def shape(base, to_shape, cols):
    """
    Standardize columns of one dataframe using statistics from another.

    Inputs:
    - base (pd.DataFrame): dataframe the mean and standard deviation
        of each column are taken from
    - to_shape (pd.DataFrame): dataframe to standardize (left unmodified)
    - cols (list): names of the columns to standardize

    Returns a copy of to_shape in which every listed column has had base's
    mean subtracted and has been divided by base's standard deviation. A
    column whose standard deviation in base is 0 (or undefined) is only
    centered, instead of being turned into NaN / inf by the division.
    """
    shaped = to_shape.copy()
    for col in cols:
        center = base[col].mean()
        scale = base[col].std()
        # `not scale > 0` also catches NaN (e.g. a single-row base)
        if not scale > 0:
            scale = 1.0
        shaped[col] = (shaped[col] - center) / scale
    return shaped

# Every feature except the depth class itself gets standardized.
non_depth_class_cols = [name for name in features if name != 'depth_class']
shaped_train = shape(train, train, non_depth_class_cols)

# The test rows are standardized with the *training* statistics.
in_test = model_data['_identifier'].isin(test_ids)
test = model_data[in_test]
shaped_test = shape(train, test, non_depth_class_cols)
shaped_test_features = shaped_test[non_depth_class_cols]
print(shaped_test_features.std())
print(shaped_test_features.mean())

month      1.154581
daytime    1.001078
dtype: float64
month      0.100441
daytime   -0.074173
dtype: float64


In [315]:
from sklearn.decomposition import PCA

# One component per feature other than 'depth_class'; the PCA is fitted on
# the standardized training rows only.
n_components = len(features) - 1
pca = PCA(n_components=n_components)
pca.fit(shaped_train[non_depth_class_cols])

In [316]:
pca.components_

array([[ 0.70710678, -0.70710678],
       [ 0.70710678,  0.70710678]])

In [317]:
pca.explained_variance_ratio_

array([0.54558422, 0.45441578])

In [318]:
to_keep = 2
component_cols = [f'_feature_{i}' for i in range(to_keep)]

X_train = pca.transform(shaped_train[non_depth_class_cols])
X_test = pca.transform(shaped_test[non_depth_class_cols])

# Store the first `to_keep` principal components on the unstandardized frames.
for position, column in enumerate(component_cols):
    train[column] = X_train[:, position]
    test[column] = X_test[:, position]

shaped_features = ['depth_class', *component_cols]

In [319]:
train

Unnamed: 0,selected,_decision,_identifier,depth_class,month,daytime,interval,fold,_feature_0,_feature_1
0,True,2,159014b,25,4,0.0,15,1,0.222337,-1.266764
1,False,2,159014b,50,4,0.0,15,1,0.222337,-1.266764
2,False,2,159014b,75,4,0.0,15,1,0.222337,-1.266764
3,False,2,159014b,100,4,0.0,15,1,0.222337,-1.266764
4,False,2,159014b,150,4,0.0,15,1,0.222337,-1.266764
...,...,...,...,...,...,...,...,...,...,...
564835,False,99847,205398,200,10,0.0,10,2,1.736296,0.247194
564836,False,99847,205398,250,10,0.0,10,2,1.736296,0.247194
564837,False,99847,205398,300,10,0.0,10,2,1.736296,0.247194
564838,False,99847,205398,400,10,0.0,10,2,1.736296,0.247194


In [320]:
from collections import defaultdict

import numpy as np
from tqdm import tqdm


def get_proposed_utility(dataframe, learning_rate=None):
    """
    Inputs:
    - dataframe (pd.DataFrame): a dataframe with columns "utility",
        "selected", and "_decision"
    - learning_rate (float): if given (and non-zero), the scores are
        rescaled so that the most negative score equals -learning_rate;
        a value below 1 therefore keeps every "1 + score" factor positive.
        Positive scores are scaled by the same factor and may exceed it.

    Returns the same pd.DataFrame, modified in place, with the columns
    "sum_utility", "probability", "score" and "proposed" added
    """
    # probability of an option = its share of the utility within its decision
    dataframe["sum_utility"] = dataframe.groupby("_decision")["utility"].transform(
        "sum"
    )
    dataframe["probability"] = dataframe["utility"] / dataframe["sum_utility"]
    # positive for the selected option, negative (or zero) for the others
    dataframe["score"] = dataframe["selected"] - dataframe["probability"]
    if learning_rate:
        most_negative = dataframe["score"].min()
        # With no negative score there is nothing to bound, and dividing by a
        # minimum of 0 would turn every score (and proposal) into inf / NaN.
        if most_negative < 0:
            factor = np.abs(learning_rate / most_negative)
            dataframe["score"] = dataframe["score"] * factor
    dataframe["proposed"] = dataframe["utility"] * (1 + dataframe["score"])
    return dataframe

def get_central_likelihood(dataframe):
    """
    Inputs:
    - dataframe (pd.DataFrame): a dataframe with columns "selected"
        and "probability"

    Returns the geometric mean of "probability" over the selected rows
    """
    chosen = dataframe[dataframe['selected']]
    log_probability = np.log(chosen['probability'])
    return np.exp(np.mean(log_probability))

def train_utility_model(
    model, dataframe, dataframe_test, feature_columns, N=1, learning_rate=None
):
    """
    Inputs:
    - model: a model object with a "fit" method and a "predict" method
    - dataframe (pd.DataFrame): a dataframe with the feature columns
        and "selected", "_decision" and "fold" columns
    - dataframe_test (pd.DataFrame): a dataframe with the feature columns
        and "selected" and "_decision" columns; it is only scored for the
        diagnostics, the model is never fitted on it
    - feature_columns (list): a list of column names to use as features
    - N (int): the number of iterations to train the model
    - learning_rate (float): passed to get_proposed_utility

    Both dataframes are modified in place: "utility" and the columns written
    by get_proposed_utility are added / overwritten, and "proposed" on
    dataframe is reset to 1.0 before training starts.

    Returns a tuple of the trained model and a pd.DataFrame of diagnostics
    with one row per iteration, held out fold and case ("validate",
    "train", "test") holding the central likelihood
    """
    assert "selected" not in feature_columns
    assert "_decision" not in feature_columns

    # sorted so the folds are visited in the same order on every run
    folds = sorted(dataframe['fold'].unique())
    # (held out fold, folds to fit on) pairs
    fold_sets = [
        (fold, [other for other in folds if other != fold]) for fold in folds
    ]

    diagnostics = []
    dataframe["proposed"] = 1.0
    for i in tqdm(range(N)):
        for fold, fold_set in fold_sets:
            fit_rows = dataframe[dataframe['fold'].isin(fold_set)]

            model.fit(fit_rows[feature_columns], fit_rows["proposed"])

            # score every row (including the held out fold) with this model
            # and update the proposals that the next fit will target
            dataframe['utility'] = model.predict(dataframe[feature_columns])
            dataframe_test['utility'] = model.predict(dataframe_test[feature_columns])
            dataframe = get_proposed_utility(dataframe, learning_rate=learning_rate)
            dataframe_test = get_proposed_utility(dataframe_test, learning_rate=learning_rate)

            cases = (
                ("validate", dataframe[dataframe['fold'] == fold]),
                ("train", dataframe[dataframe['fold'].isin(fold_set)]),
                ("test", dataframe_test),
            )
            for case, subset in cases:
                diagnostics.append({
                    "iteration": i,
                    "fold": fold,
                    "case": case,
                    "central_likelihood": get_central_likelihood(subset)
                })

    model.fit(dataframe[feature_columns], dataframe['proposed'])
    # Refresh the utilities from the final model before recomputing the
    # proposals. Without this the call below re-used the utilities of the last
    # fold model (changing nothing) and failed with a KeyError when N == 0,
    # because no "utility" column had been written yet.
    dataframe['utility'] = model.predict(dataframe[feature_columns])
    dataframe = get_proposed_utility(dataframe, learning_rate=learning_rate)

    return model, pd.DataFrame(diagnostics)
   

In [321]:
from sklearn.ensemble import RandomForestRegressor

def grid_search(
        param_grids, M, model_class, dataframe, dataframe_test, feature_columns, N=1, learning_rate=None, max_attempts = 100
):
    """
    Randomly sample M distinct parameter sets and train a utility model
    for each of them.

    Inputs:
    - param_grids (dict): parameter name -> sequence of candidate values
    - M (int): the number of distinct parameter sets to evaluate
    - model_class: called as model_class(**param_set) to build each model
    - dataframe, dataframe_test, feature_columns, N, learning_rate:
        passed through to train_utility_model
    - max_attempts (int): the number of consecutive duplicate draws
        tolerated before giving up

    Returns a tuple of the list of parameter sets and a pd.DataFrame with
    the diagnostics of every run, with the parameter values and a
    "_param_set" column (index into the list) added

    Raises an AssertionError if max_attempts consecutive draws all
    repeat an already selected parameter set
    """
    param_sets = []
    attempts = 0
    while len(param_sets) < M:
        assert attempts < max_attempts, (
            f"no new parameter set found in {max_attempts} consecutive draws"
        )

        # Pick by position rather than np.random.choice(grid): choice first
        # converts the grid to an array, which turns a mixed grid such as
        # [1, 2, "sqrt"] into strings.
        param_set = {
            param: grid[np.random.randint(len(grid))]
            for param, grid in param_grids.items()
        }
        if param_set in param_sets:
            attempts += 1
        else:
            attempts = 0
            param_sets.append(param_set)

    diagnostics_dfs = []
    for i, param_set in enumerate(param_sets):
        model = model_class(**param_set)
        model, diagnostics_df = train_utility_model(
            model, dataframe, dataframe_test, feature_columns, N, learning_rate
        )
        # label the diagnostics with the parameters that produced them
        for param, val in param_set.items():
            diagnostics_df[param] = val
        diagnostics_df['_param_set'] = i
        diagnostics_dfs.append(diagnostics_df)

    return param_sets, pd.concat(diagnostics_dfs)

# Use all but two cores, but never fewer than one worker: on a machine with
# two cores or fewer the plain subtraction yields 0 or a negative value, and
# os.cpu_count() may return None.
n_jobs = max(1, (os.cpu_count() or 1) - 2)

param_sets, diagnostics = grid_search(
    {
        # basic settings
        "bootstrap": [True],
        "n_jobs": [n_jobs],
        "max_samples": [int(train.shape[0] / 2)],
        # hyper parameters
        "n_estimators": [25, 50],
        "min_weight_fraction_leaf": [1e-04, 2e-04, 1e-05, 2e-05, 1e-06],
        "max_features": [1, 2, 3]
    },
    5,
    RandomForestRegressor,
    train,
    test,
    shaped_features,
    10,
    learning_rate=31 / 32,
)

100%|██████████| 10/10 [00:26<00:00,  2.63s/it]
100%|██████████| 10/10 [00:26<00:00,  2.70s/it]
100%|██████████| 10/10 [00:27<00:00,  2.80s/it]
100%|██████████| 10/10 [00:36<00:00,  3.61s/it]
100%|██████████| 10/10 [00:36<00:00,  3.66s/it]


In [322]:
# Compare the parameter sets on their last training iteration. Deriving it
# from the diagnostics keeps this cell correct if the iteration count passed
# to grid_search changes.
final_iteration = diagnostics['iteration'].max()
summary = (
    diagnostics[diagnostics['iteration'] == final_iteration]
    .groupby(['case', '_param_set'])[['central_likelihood']]
    .mean()
)
summary

Unnamed: 0_level_0,Unnamed: 1_level_0,central_likelihood
case,_param_set,Unnamed: 2_level_1
test,0,0.236293
test,1,0.236276
test,2,0.236313
test,3,0.236267
test,4,0.236291
train,0,0.245611
train,1,0.245617
train,2,0.24561
train,3,0.24561
train,4,0.245598


In [323]:
df = summary.reset_index()
# Rank the parameter sets by their mean central likelihood on the held out
# ("validate") folds and keep the best one.
validation_scores = df[df['case'] == 'validate']
ranked = validation_scores.sort_values('central_likelihood', ascending=False)
best_param_set = ranked['_param_set'].iloc[0]
params = param_sets[best_param_set]
params

{'bootstrap': True,
 'n_jobs': 6,
 'max_samples': 282420,
 'n_estimators': 25,
 'min_weight_fraction_leaf': 1e-06,
 'max_features': 2}

In [324]:
px.scatter(
    diagnostics[diagnostics['_param_set'] == best_param_set], x='iteration', y='central_likelihood', color='case'
)

In [325]:
# Retrain from scratch with the selected parameter set.
model = RandomForestRegressor(**params)

model, diagnostics_results = train_utility_model(
    model, train, test, shaped_features, N=10, learning_rate=31 / 32
)

100%|██████████| 10/10 [00:39<00:00,  3.93s/it]


In [326]:
px.line(
    diagnostics_results.groupby(['iteration', 'case']).mean().reset_index(),
    x='iteration', y='central_likelihood', color='case'
)

In [327]:
train["utility"] = model.predict(train[shaped_features])
train = get_proposed_utility(train)
print(get_central_likelihood(train))
train.head()

0.2448876848450096


Unnamed: 0,selected,_decision,_identifier,depth_class,month,daytime,interval,fold,_feature_0,_feature_1,proposed,utility,sum_utility,probability,score
0,True,2,159014b,25,4,0.0,15,1,0.222337,-1.266764,8.638713,5.856207,11.157614,0.524862,0.475138
1,False,2,159014b,50,4,0.0,15,1,0.222337,-1.266764,1.258596,1.445993,11.157614,0.129597,-0.129597
2,False,2,159014b,75,4,0.0,15,1,0.222337,-1.266764,0.806946,0.875671,11.157614,0.078482,-0.078482
3,False,2,159014b,100,4,0.0,15,1,0.222337,-1.266764,0.647509,0.690204,11.157614,0.06186,-0.06186
4,False,2,159014b,150,4,0.0,15,1,0.222337,-1.266764,1.239529,1.420333,11.157614,0.127297,-0.127297


In [328]:
test["utility"] = model.predict(test[shaped_features])
test = get_proposed_utility(test)
print(get_central_likelihood(test))
test.head()

0.23678563150350643


Unnamed: 0,selected,_decision,_identifier,depth_class,month,daytime,interval,_feature_0,_feature_1,utility,sum_utility,probability,score,proposed
0,False,0,205404,25,10,0.0,9,1.736296,0.247194,3.217444,11.595412,0.277476,-0.277476,2.324682
1,False,0,205404,50,10,0.0,9,1.736296,0.247194,1.935972,11.595412,0.16696,-0.16696,1.612742
2,False,0,205404,75,10,0.0,9,1.736296,0.247194,2.816669,11.595412,0.242912,-0.242912,2.132465
3,True,0,205404,100,10,0.0,9,1.736296,0.247194,1.807913,11.595412,0.155916,0.844084,3.333943
4,False,0,205404,150,10,0.0,9,1.736296,0.247194,1.442126,11.595412,0.12437,-0.12437,1.262768


In [329]:
model.feature_importances_

array([0.87242811, 0.07060301, 0.05696887])

In [330]:
train.groupby('depth_class')[['probability', 'selected']].mean()

Unnamed: 0_level_0,probability,selected
depth_class,Unnamed: 1_level_1,Unnamed: 2_level_1
25,0.508335,0.509365
50,0.148172,0.149458
75,0.10856,0.109323
100,0.07587,0.077509
150,0.099398,0.102064
200,0.039222,0.039976
250,0.009857,0.008799
300,0.004761,0.002638
400,0.003323,0.000868
500,0.002504,0.0


In [331]:
test.groupby('depth_class')[['probability', 'selected']].mean()

Unnamed: 0_level_0,probability,selected
depth_class,Unnamed: 1_level_1,Unnamed: 2_level_1
25,0.463534,0.490854
50,0.146508,0.17389
75,0.117795,0.121289
100,0.088769,0.098561
150,0.117216,0.087669
200,0.043866,0.02236
250,0.010481,0.004389
300,0.005651,0.000873
400,0.003671,0.000115
500,0.002509,0.0


In [332]:
train.groupby(['depth_class', pd.qcut(train['interval'], q=3, labels=False)])[['probability', 'selected']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,probability,selected
depth_class,interval,Unnamed: 2_level_1,Unnamed: 3_level_1
25,0,0.365887,0.36644
25,1,0.563637,0.566082
25,2,0.642809,0.642198
50,0,0.143734,0.140902
50,1,0.148324,0.151504
50,2,0.155037,0.15971
75,0,0.144976,0.149408
75,1,0.093302,0.089721
75,2,0.076089,0.078367
100,0,0.116897,0.122178


In [333]:
test.groupby(['depth_class', pd.qcut(test['interval'], q=3, labels=False)])[['probability', 'selected']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,probability,selected
depth_class,interval,Unnamed: 2_level_1,Unnamed: 3_level_1
25,0,0.324975,0.334206
25,1,0.525545,0.628953
25,2,0.615808,0.557277
50,0,0.139205,0.186177
50,1,0.146627,0.158974
50,2,0.159431,0.17503
75,0,0.152718,0.167576
75,1,0.099222,0.089162
75,2,0.083994,0.088165
100,0,0.13547,0.145077


In [334]:
train.groupby(['depth_class', 'daytime'])[['probability', 'selected']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,probability,selected
depth_class,daytime,Unnamed: 2_level_1,Unnamed: 3_level_1
25,0.0,0.48196,0.484432
25,1.0,0.532123,0.531854
50,0.0,0.153443,0.154148
50,1.0,0.143417,0.145229
75,0.0,0.114609,0.114948
75,1.0,0.103103,0.104249
100,0.0,0.083658,0.085492
100,1.0,0.068845,0.070308
150,0.0,0.112371,0.115695
150,1.0,0.087697,0.08977
