In [370]:
import pandas as pd
import numpy as np
import geopy.distance
from tqdm import tqdm
from functools import partial
import plotly.express as px
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, explained_variance_score
import h3
from IPython.display import clear_output

# Hypothesis #1: It's all a Gas

We identified two main kinds of behavior:
- What we were calling a random walk
- Migrations

I wanted to start with the former. The idea was simple, we start with the notion that all is random. That is to say that given a series of options (a disc of movement) our dear Chinook would just pick a direction at random and go for it. 

However if you go look at a map of the paths these fish take you'll see it's clearly not a purely random mass of points - there are general patterns to where the fish go. So the next idea was to provide updated probabilities - perhaps once you get near deep water you stop random walking in that direction and end up "turning around" because the "utility" of that water is no good. 

So to model this the idea was to predict the utility of each patch of water we come across. The algorithm would be like the following:

1. Given your starting point sample a series of points around you (within the distance you can swim of course)
2. Compute the utility of each of those points and weight their overall probability of selection 
3. Pick at random with the probabilities assigned.
4. Repeat.

Now of course we'd want this utility to be based off of things fish understand but for starters I thought, why not cheat a little just to see if this idea works out in this easiest case. So I decided to model utility on lat lon. 

You may at this point be wondering, how can I predict utility when all I have are the points fish *did* choose? Well in fact I have all the points they didn't choose to swim to as well. For every pair of start end points I can also point out end points they didn't swim to. 

So let's start by building a dataset.

## Building a DataSet

In [None]:
data = pd.read_csv("data/tag_tracks.csv").rename({
    "Ptt": "ptt",
    "Latitude": "lat",
    "Longitude": "lon",
    "Dates - Date Key → Date": "date",
    "Dates - Date Key → Year": "year",
    "Dates - Date Key → Month": "month",
    "Dates - Date Key → Day": "day",
}, axis=1)
print(data.shape)
data.head()

We'll need to build pairs for what we are doing:

In [None]:
pairs = []
for ptt in tqdm(data["ptt"].unique()):
    rows = [row for _, row in data[data["ptt"] == ptt].sort_values("date", ascending=True).iterrows()]
    for start, end in zip(rows[:-1], rows[1:]):
        pairs.append({
            "ptt": ptt,
            "start_date": start["date"],
            "end_date": end["date"],
            "start_lat": start["lat"],
            "start_lon": start["lon"],
            "end_lat": end["lat"],
            "end_lon": end["lon"],
            "distance": geopy.distance.geodesic(
                (start["lat"], start["lon"]), (end["lat"], end["lon"])
            ).km
        })
pairs = pd.DataFrame(pairs)
print(pairs.shape)
pairs.head()

Now let's start by sampling distances

In [None]:
def distance_sampler_factory(distances, N):
    return np.random.choice(distances, N)

distance_sampler = partial(distance_sampler_factory, pairs['distance'])
px.histogram(
    distance_sampler(1000)
)

Next we'll sample directions:

In [None]:
def radial_sampler(N):
    lat = np.random.uniform(-1, 1, N)
    lon = np.random.uniform(-1, 1, N)
    sizes = np.sqrt(lat ** 2 + lon ** 2)
    lat = lat / sizes
    lon = lon / sizes
    return lat, lon

lat, lon = radial_sampler(1000)
px.scatter(x=lon, y=lat)

And then because lat lon is a tricky space we'll need to determine how far along our angle we need to go to reach our distance.

In [None]:
def find_point(start, vector, distance, tolerance=1, step=1):
    start = np.array(start)
    vector = np.array(vector)
    end = start + vector * step
    current_distance = geopy.distance.geodesic(start, end).km
    extended = True
    while abs(current_distance - distance) > tolerance:
        if current_distance < distance:
            if not extended:
                step /= 2
            end = end + vector * step
            extended = True
        else:
            if extended:
                step /= 2
            end = end - vector * step
            extended = False
        current_distance = geopy.distance.geodesic(start, end).km
    return end

In [None]:
from datetime import datetime
   
def sample_from_point(date, lat, lon, N, tolerance=1):
    start = (lat, lon)
    lat_vectors, lon_vectors = radial_sampler(N)
    distances = distance_sampler(N)
    for lat_vector, lon_vector, distance in zip(lat_vectors, lon_vectors, distances):
        end = find_point(start, (lat_vector, lon_vector), distance, tolerance)
        yield end[0], end[1], distance, date

sampled_points = list(sample_from_point(datetime(2024, 4, 6), 30, 10, 1000))
df = pd.DataFrame(sampled_points, columns=['lat', 'lon', 'distance', 'date'])
px.scatter_geo(df, lat='lat', lon='lon', color='distance')

Cool it works!

Let's go ahead and sample new points.

In [None]:
fake_points = []
samples_per_row = 2
for _, row in tqdm(pairs.iterrows()):
    fake_points.extend(sample_from_point(row["end_date"], row["start_lat"], row["start_lon"], samples_per_row))
fake_points = pd.DataFrame(fake_points, columns=['lat', 'lon', 'distance', 'date'])
px.scatter_geo(fake_points, lat='lat', lon='lon', color='distance')

Now we join with our real data

In [None]:
zeros = fake_points[['lat', 'lon', 'date']]
zeros['chosen'] = 0
ones = pairs[['end_lat', 'end_lon', 'end_date']].rename({'end_lat': 'lat', 'end_lon': 'lon', 'end_date': 'date'}, axis=1)
ones = ones.sample(zeros.shape[0], replace=True)
ones['chosen'] = 1
augmented_data = pd.concat([zeros, ones])
augmented_data.groupby('chosen').count()

In [None]:
px.scatter_geo(augmented_data.sample(5000), lat='lat', lon='lon', color='chosen')

## Building the Model

Time to build our utility model. 

In [None]:
train, test = train_test_split(augmented_data, test_size=0.2, random_state=42)

X = train[['lat', 'lon']]
y = train['chosen']

models = {
    "Random Forest": RandomForestRegressor(
        random_state=42, n_jobs=3
    )
}
param_grids = {
    "Random Forest": {"n_estimators": [10, 20, 50, 100], "min_samples_leaf": [5, 10, 20, 50]},
}
cv = KFold(n_splits=4, shuffle=True, random_state=42)

results = []
for name, model in models.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        return_train_score=True,
        cv=cv,
        refit=True,
    ).fit(X, y)
    result = {"model": name, "cv_results": pd.DataFrame(grid_search.cv_results_), "best_estimator": grid_search.best_estimator_}
    results.append(result)

In [None]:
train['pred'] = results[0]['best_estimator'].predict(train[['lat', 'lon']])
print(accuracy_score(train['chosen'], train['pred'] > 0.5))

In [None]:
test['pred'] = results[0]['best_estimator'].predict(test[['lat', 'lon']])
print(accuracy_score(test['chosen'], test['pred'] > 0.5))

Terrifyingly high... but we'll get to why in a moment. Let's first use this to predict a series of paths for our fish.

## Creating Paths

In [None]:
def create_random_track(lat, lon, model, steps, N=100):
    path = [(lat, lon)]
    for _ in range(steps):
        possible_steps = list(sample_from_point('', lat, lon, N))
        X = pd.DataFrame(
            np.array([[lat, lon] for lat, lon, _, _ in possible_steps]),
            columns=['lat', 'lon']
        )
        if model is not None:
            probs = model.predict(X)
        else:
            probs = np.ones(X.shape[0])
        probs = probs / probs.sum()
        selection = np.random.choice(range(len(possible_steps)), p=probs)
        lat, lon = possible_steps[selection][:2]
        path.append((lat, lon))
    return pd.DataFrame(path, columns=['lat', 'lon'])
        

### Using Our Model

In [None]:
paths = []
for ptt in tqdm(data['ptt'].unique()):
    df = data[data['ptt'] == ptt].sort_values("date", ascending=True).iloc[[0]]
    lat = df['lat'].values[0]
    lon = df['lon'].values[0]
    path = create_random_track(
        lat, lon,
        results[0]['best_estimator'],
        data[data['ptt'] == ptt].shape[0]
    )
    path['case'] = 'fake'
    path['ptt'] = ptt
    old_path = data[data['ptt'] == ptt][['lat', 'lon']]
    old_path['case'] = 'real'
    old_path['ptt'] = ptt
    paths.append(pd.concat([path, old_path]))

In [None]:
model_data = pd.concat(paths)
px.scatter_geo(model_data, lat='lat', lon='lon', color='case')

### Purely Random

In [None]:
paths = []
for ptt in tqdm(data['ptt'].unique()):
    df = data[data['ptt'] == ptt].sort_values("date", ascending=True).iloc[[0]]
    lat = df['lat'].values[0]
    lon = df['lon'].values[0]
    path = create_random_track(
        lat, lon,
        None,
        data[data['ptt'] == ptt].shape[0]
    )
    path['case'] = 'fake'
    path['ptt'] = ptt
    old_path = data[data['ptt'] == ptt][['lat', 'lon']]
    old_path['case'] = 'real'
    old_path['ptt'] = ptt
    paths.append(pd.concat([path, old_path]))

In [None]:
random_data = pd.concat(paths)
px.scatter_geo(random_data, lat='lat', lon='lon', color='case')

## What's Going On?

We had this super well fitted model of our "utility" and yet here we are looking at a pile of garbage! It's very nearly indistuinguishable from a totally random walk! 

In [None]:
training_ptt = set(np.random.choice(data['ptt'].unique(), round(data['ptt'].unique().shape[0] * 0.8)))
testing_ptt = set(data['ptt'].unique()) - training_ptt

fake_points = []
samples_per_row = 2
training_pairs = pairs[pairs['ptt'].isin(training_ptt)]
for _, row in tqdm(training_pairs.iterrows()):
    fake_points.extend(sample_from_point(row["end_date"], row["start_lat"], row["start_lon"], samples_per_row))
fake_points = pd.DataFrame(fake_points, columns=['lat', 'lon', 'distance', 'date'])

zeros = fake_points[['lat', 'lon', 'date']]
zeros['chosen'] = 0
ones = training_pairs[['end_lat', 'end_lon', 'end_date']].rename({'end_lat': 'lat', 'end_lon': 'lon', 'end_date': 'date'}, axis=1)
ones = ones.sample(zeros.shape[0], replace=True)
ones['chosen'] = 1
augmented_data = pd.concat([zeros, ones])
augmented_data.groupby('chosen').count()

In [None]:
fake_points = []
samples_per_row = 2
testing_pairs = pairs[pairs['ptt'].isin(testing_ptt)]
for _, row in tqdm(testing_pairs.iterrows()):
    fake_points.extend(sample_from_point(row["end_date"], row["start_lat"], row["start_lon"], samples_per_row))
fake_points = pd.DataFrame(fake_points, columns=['lat', 'lon', 'distance', 'date'])

zeros = fake_points[['lat', 'lon', 'date']]
zeros['chosen'] = 0
ones = testing_pairs[['end_lat', 'end_lon', 'end_date']].rename({'end_lat': 'lat', 'end_lon': 'lon', 'end_date': 'date'}, axis=1)
ones = ones.sample(zeros.shape[0], replace=True)
ones['chosen'] = 1
test = pd.concat([zeros, ones])
test.groupby('chosen').count()

In [None]:
train = augmented_data

X = train[['lat', 'lon']]
y = train['chosen']

models = {
    "Random Forest": RandomForestRegressor(
        random_state=42, n_jobs=3
    )
}
param_grids = {
    "Random Forest": {"n_estimators": [10, 20, 50, 100], "min_samples_leaf": [5, 10, 20, 50]},
}
cv = KFold(n_splits=4, shuffle=True, random_state=42)

results = []
for name, model in models.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        return_train_score=True,
        cv=cv,
        refit=True,
    ).fit(X, y)
    result = {"model": name, "cv_results": pd.DataFrame(grid_search.cv_results_), "best_estimator": grid_search.best_estimator_}
    results.append(result)

In [None]:
train['pred'] = results[0]['best_estimator'].predict(train[['lat', 'lon']]) > 0.5
print(accuracy_score(train['chosen'], train['pred']))

In [None]:
test['pred'] = results[0]['best_estimator'].predict(test[['lat', 'lon']]) > 0.5
print(accuracy_score(test['chosen'], test['pred']))

In [None]:
test.groupby(['chosen', 'pred']).count()

In [None]:
def create_random_track(lat, lon, model, steps, N=1000):
    path = [(lat, lon)]
    for _ in range(steps):
        while True:
            possible_steps = list(sample_from_point('', lat, lon, N))
            X = pd.DataFrame(
                np.array([[lat, lon] for lat, lon, _, _ in possible_steps]),
                columns=['lat', 'lon']
            )
            if model is not None:
                probs = model.predict(X)
                probs[probs < 0.5] = 0
            else:
                probs = np.ones(X.shape[0])
            if probs.sum() > 0:
                break

        probs = probs / probs.sum()
        selection = np.random.choice(range(len(possible_steps)), p=probs)
        lat, lon = possible_steps[selection][:2]
        path.append((lat, lon))
    return pd.DataFrame(path, columns=['lat', 'lon'])

paths = []
for ptt in tqdm(np.random.choice(data['ptt'].unique(), 10)):
    df = data[data['ptt'] == ptt].sort_values("date", ascending=True).iloc[[0]]
    lat = df['lat'].values[0]
    lon = df['lon'].values[0]
    path = create_random_track(
        lat, lon,
        results[0]['best_estimator'],
        data[data['ptt'] == ptt].shape[0]
    )
    path['case'] = 'fake'
    path['ptt'] = ptt
    old_path = data[data['ptt'] == ptt][['lat', 'lon']]
    old_path['case'] = 'real'
    old_path['ptt'] = ptt
    paths.append(pd.concat([path, old_path]))

In [None]:
more_selective_data = pd.concat(paths)
px.scatter_geo(more_selective_data, lat='lat', lon='lon', color='case')

## Problem #1

That worked a lot better. While this is definitely a smaller sample we don't have loads of fish taking a stroll through the alaska landmass. So what changed? Well we got pickier. What we've trained is a model that's really good at asking - is this a real fish or not. Given totally new fish, and potential decisions those fish could've made, 84% of the time it's correct! However what we can clearly see from the above is that while it ends up picking points that definitely *could* belong to a fish, they don't, together, look like a fish's path. 

This is because looking at the fish's independent paths we see that there's a lot of autocorrelation through time. Once a fish is moving in a specific direction it's reasonably likely that it will continue moving in that same direction. The notion that our fish are just random walkers is clearly debunked. 

However we also now know that it's surprisingly easy to predict (from a series of points) which ones are real, and which ones aren't... 

There's definitely more to be said with regards to that point. We balanced our dataset but what happens when we actually search the space fully - a lot of those won't be real points the fish would belong to and our false positive rate might start looking a lot like our true positive rate. 

## Problem 2

Assuming we have an accuracy of 84% we actually have a false positive / true positive rates depending on how common points that fish wouldn't go to are.

In [None]:
accuracy = 0.84
degree_not_fishy = np.arange(0.05, 1., 0.05)
rates = degree_not_fishy * (1 - accuracy) / ((1 - degree_not_fishy) * accuracy)
df = pd.DataFrame({
    "degree_not_fishy": degree_not_fishy,
    "fp/tp rate": rates
})
px.line(df, x='degree_not_fishy', y='fp/tp rate')

## Problem 3

What's also weird about this is that we're saying that there *is* a prediction at each point. That the model is predicting if this is likely to be a fishy point or not. But we wanted to know something more probabilistic. 

That's because other fish will likely come near these points and *not* choose them. In fact as we're talking about infinitesimal points in space the fact is that almost every single point will
only ever be seen once. So this honestly may not even be phrased the right way. 

This gets at another question I had - what if only one fish *ever* sees a series of points? For example one of our fish goes wandering off into the deep end as it were and therefore creates a track that according to our model is "definitely fishy". Yet no other fish ever went there. 

So:

1. We're not capturing the fact that chosen points by one fish were not chosen by another fish leading to an idea of "fishy" that isn't real. 
2. Fish are unlikely to *ever* choose the exact same point again. Which means that if we did account for non-choices all of our points (in the limit) become 0's 
3. There is also no real weighting for points that other fish simply didn't have the opportunity to explore because they went a completely different way. Macro scale movement is not being captured.

# Next Steps

Given the predictability we got from such a dumb feature this whole strategy obviously holds some weight. We're going to need some sense of autocorrelation for sure but I think there's still something here in terms of at least learning range. And perhaps even the value of each part of the range. However I think I need to be able to deal with the 3 observations listed above (in Problem 3). The whole idea of starting with a uniformity hypothesis and going from there I think still makes a great deal of sense. And we obviously need to relate this to biological variables. But trying this in lat lon space is a good first start just because it is so obvious in lat lon space alone. 

I think I can deal with (2) just by doing some spatial smoothing. These are not pinpoint accuracy data points anyways so smudging stuff out I think makes good sense. With it smoothed it should be pretty easy to evaulate all the "cells" (if that's the route I go) that other fish did not choose. Because now I'll be building a dataset of decision non decisions based on cells not samples which should be much more straightforward. 

I'm still going to have a balancing problem but we'll come to that later. 

In terms of weighting the fish that just swim off into oblivion I could make it as simple as disregarding hexes that have too few samples in them. But there may be nicer ways of dealing with that. We'll start by just discretizing the space in order to better represent my whole idea of did make a decision vs didn't without having every sample plunge to zero in the limit. Besides it'll make computation later so much easier.

## Some Other Stuff Cause All of this Weirds Me Out

In [None]:
lons = np.linspace(-170, -124, 1000) 
lats = np.linspace(46, 61, 100)
df = pd.DataFrame([
    {
        'lon': lon,
        'lat': lat,
    } for lon in lons 
    for lat in lats
])
df['pred'] = results[0]['best_estimator'].predict(df[['lat', 'lon']])

In [None]:
pdf = df.groupby([pd.qcut(df['lat'], 50), pd.qcut(df['lon'], 50)]).mean().reset_index(drop=True)
px.scatter_geo(pdf, lat='lat', lon='lon', color='pred', opacity=0.5)

In [None]:
test['correct'] = test['chosen'] == test['pred']
px.scatter_geo(test[test['chosen'] == 1], lat='lat', lon='lon', color='correct', opacity=0.5)

In [None]:
test['correct'] = test['chosen'] == test['pred']
px.scatter_geo(test[test['chosen'] == 0], lat='lat', lon='lon', color='correct', opacity=0.5)

# Discretizing the Problem

In [267]:
def find_neighbors(h3_index, threshold_km, neighbors_index):
    h3_coords = h3.h3_to_geo(h3_index)
    checked = set()
    neighbors = set()
    distance = 1
    found_neighbors = True
    while found_neighbors:
        found_neighbors = False
        candidates = h3.k_ring(h3_index, distance)
        new_candidates = set(candidates) - checked
        for candidate in new_candidates:
            if geopy.distance.geodesic(h3_coords, h3.h3_to_geo(candidate)).km <= threshold_km:
                neighbors.add(candidate)
                found_neighbors = True
            checked.add(candidate)
        distance += 1
    neighbors_index[h3_index] = neighbors



RESOLUTION = 4
MAX_KM = 100

selected_h3 = set([
    h3.geo_to_h3(lat, lon, RESOLUTION)
    for lat, lon in zip(data['lat'], data['lon'])
])

neighbors_index = {}
for h3_index in tqdm(selected_h3):
    find_neighbors(h3_index, MAX_KM, neighbors_index)

100%|██████████| 738/738 [00:03<00:00, 227.50it/s]


In [173]:
rows = []
for _, row in tqdm(training_pairs.iterrows()):
    start = h3.geo_to_h3(row['start_lat'], row['start_lon'], RESOLUTION)
    end = h3.geo_to_h3(row['end_lat'], row['end_lon'], RESOLUTION)
    date = datetime.strptime(row['end_date'], '%Y-%m-%d')
    rows.append({
        'h3_index': end, 
        'chosen': 1,
        'date': date,
        'year': date.year,
        'month': date.month,
        'day': date.day
    })
    for neighbor in neighbors_index[start]:
        if neighbor != end:
            rows.append({
                'h3_index': neighbor, 
                'chosen': 0,
                'date': date,
                'year': date.year,
                'month': date.month,
                'day': date.day
            })

train = pd.DataFrame(rows)
print(train.shape)
train.head()

3841it [00:00, 23019.97it/s]


(72996, 6)


Unnamed: 0,h3_index,chosen,date,year,month,day
0,840c9a9ffffffff,1,2014-08-05,2014,8,5
1,840c9e3ffffffff,0,2014-08-05,2014,8,5
2,840c981ffffffff,0,2014-08-05,2014,8,5
3,840c9a7ffffffff,0,2014-08-05,2014,8,5
4,840c9b1ffffffff,0,2014-08-05,2014,8,5


In [201]:
rows = []
for _, row in tqdm(testing_pairs.iterrows()):
    start = h3.geo_to_h3(row['start_lat'], row['start_lon'], RESOLUTION)
    end = h3.geo_to_h3(row['end_lat'], row['end_lon'], RESOLUTION)
    date = datetime.strptime(row['end_date'], '%Y-%m-%d')
    rows.append({
        'h3_index': end, 
        'chosen': 1,
        'date': date,
        'year': date.year,
        'month': date.month,
        'day': date.day
    })
    for neighbor in neighbors_index[start]:
        if neighbor != end:
            rows.append({
                'h3_index': neighbor, 
                'chosen': 0,
                'date': date,
                'year': date.year,
                'month': date.month,
                'day': date.day
            })

test = pd.DataFrame(rows)
print(test.shape)
test.head()

3580it [00:00, 20811.57it/s]


(68623, 6)


Unnamed: 0,h3_index,chosen,date,year,month,day
0,8422d03ffffffff,1,2013-12-20,2013,12,20
1,8422d55ffffffff,0,2013-12-20,2013,12,20
2,8422d1dffffffff,0,2013-12-20,2013,12,20
3,8422d01ffffffff,0,2013-12-20,2013,12,20
4,8422d51ffffffff,0,2013-12-20,2013,12,20


In [174]:
df = train.groupby('h3_index').mean().reset_index()
df['lat'] = df.apply(lambda row: h3.h3_to_geo(row['h3_index'])[0], axis=1)
df['lon'] = df.apply(lambda row: h3.h3_to_geo(row['h3_index'])[1], axis=1)
px.scatter_geo(df, lat='lat', lon='lon', color='chosen')

That was just some indexing to make the next part much quicker than it would otherwise be. Now we need to go and actually accumulate our training data. 

Let's load some bathymetry data in.

In [175]:
def spatial_key_to_index(spatial_key):
    return hex(spatial_key)[2:]

elevation = pd.read_csv("data/bathymetry.csv")
elevation['h3_index'] = elevation['h3_index'].astype(np.int64).astype(str)
elevation['h3_index'] = elevation.apply(lambda row: spatial_key_to_index(np.int64(row['h3_index'])), axis=1)
elevation.head()

Unnamed: 0,h3_index,elevation
0,8402105ffffffff,-450.0104
1,840210dffffffff,-1184.517832
2,8402121ffffffff,-499.541915
3,8402123ffffffff,-270.425039
4,8402125ffffffff,-1022.894455


In [176]:
surface_temps = pd.read_csv("data/surface_temps.csv").rename({
    "H3 Key 4": "h3_index",
    "Dates - Date Key → Month": "month",
    "Dates - Date Key → Day": "day",
    "Temperature C": "temp"
}, axis=1)[['h3_index', 'month', 'day', 'temp']]
surface_temps['h3_index'] = surface_temps['h3_index'].astype(np.int64).astype(str)
surface_temps['h3_index'] = surface_temps.apply(lambda row: spatial_key_to_index(np.int64(row['h3_index'])), axis=1)
surface_temps.head()

Unnamed: 0,h3_index,month,day,temp
0,8402101ffffffff,1,15,-0.2685
1,8402103ffffffff,1,15,-0.2685
2,8402105ffffffff,1,15,-0.2685
3,8402107ffffffff,1,15,-0.2685
4,8402109ffffffff,1,15,-0.2685


In [177]:
surface_temps.day.unique(), surface_temps.month.unique()

(array([15]), array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12]))

In [193]:
primary_productivity = pd.read_csv("data/primary_productivity.csv").rename({
    "H3 Key 4": "h3_index",
    "Dates - Date Key → Month": "month",
    "Dates - Date Key → Day": "day",
    "Log Chla Ave": "log_chla"
}, axis=1)[['h3_index', 'month', 'day', 'log_chla']]
primary_productivity['h3_index'] = primary_productivity['h3_index'].astype(np.int64).astype(str)
primary_productivity['h3_index'] = primary_productivity.apply(lambda row: spatial_key_to_index(np.int64(row['h3_index'])), axis=1)
primary_productivity.loc[primary_productivity['log_chla'] < 3, 'log_chla'] = 3
primary_productivity.head()

Unnamed: 0,h3_index,month,day,log_chla
0,840c401ffffffff,1,15,6.683674
1,840c407ffffffff,1,15,6.726833
2,840c409ffffffff,1,15,6.403988
3,840c40dffffffff,1,15,6.067268
4,840c411ffffffff,1,15,5.836758


In [194]:
primary_productivity.day.unique(), primary_productivity.month.unique()

(array([15]), array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12]))

In [195]:
train = train[['h3_index', 'chosen', 'date', 'year', 'month', 'day']]

train = train.merge(elevation, on='h3_index')
train = train.merge(surface_temps[['h3_index', 'month', 'temp']], on=['h3_index', 'month'])
train = train.merge(primary_productivity[['h3_index', 'month', 'log_chla']], on=['h3_index', 'month'])

train['lat'] = train.apply(lambda row: h3.h3_to_geo(row['h3_index'])[0], axis=1)
train['lon'] = train.apply(lambda row: h3.h3_to_geo(row['h3_index'])[1], axis=1)
train.head()

Unnamed: 0,h3_index,chosen,date,year,month,day,elevation,temp,log_chla,lat,lon
0,840c9b1ffffffff,0,2014-09-01,2014,9,1,-2902.48537,5.596667,5.588496,56.475441,-173.56374
1,840c9b1ffffffff,0,2014-09-02,2014,9,2,-2902.48537,5.596667,5.588496,56.475441,-173.56374
2,8422ca3ffffffff,0,2014-09-01,2014,9,1,-3470.9437,5.898167,5.521461,55.717197,-173.700322
3,8422ca3ffffffff,0,2014-09-02,2014,9,2,-3470.9437,5.898167,5.521461,55.717197,-173.700322
4,840c9b7ffffffff,0,2014-09-01,2014,9,1,-3326.792843,5.753333,5.59136,56.099546,-173.690187


In [208]:
test = test[['h3_index', 'chosen', 'date', 'year', 'month', 'day']]

test = test.merge(elevation, on='h3_index')
test = test.merge(surface_temps[['h3_index', 'month', 'temp']], on=['h3_index', 'month'])
test = test.merge(primary_productivity[['h3_index', 'month', 'log_chla']], on=['h3_index', 'month'])

test['lat'] = test.apply(lambda row: h3.h3_to_geo(row['h3_index'])[0], axis=1)
test['lon'] = test.apply(lambda row: h3.h3_to_geo(row['h3_index'])[1], axis=1)
test.head()

Unnamed: 0,h3_index,chosen,date,year,month,day,elevation,temp,log_chla,lat,lon
0,8422d03ffffffff,1,2013-12-20,2013,12,20,-771.670639,1.215667,8.805272,54.360925,-166.742418
1,8422d03ffffffff,1,2013-12-21,2013,12,21,-771.670639,1.215667,8.805272,54.360925,-166.742418
2,8422d03ffffffff,1,2013-12-22,2013,12,22,-771.670639,1.215667,8.805272,54.360925,-166.742418
3,8422d03ffffffff,1,2013-12-23,2013,12,23,-771.670639,1.215667,8.805272,54.360925,-166.742418
4,8422d03ffffffff,1,2013-12-24,2013,12,24,-771.670639,1.215667,8.805272,54.360925,-166.742418


In [221]:
features = ['elevation', 'temp', 'log_chla']

X = train[features]
y = train['chosen']

models = {
    "Random Forest": RandomForestRegressor(
        random_state=42, n_jobs=3
    )
}
param_grids = {
    "Random Forest": {"n_estimators": [10, 20, 50, 100], "min_samples_leaf": [200]}#{"n_estimators": [10, 20, 50, 100], "min_samples_leaf": [5, 10, 20, 50]},
}
cv = KFold(n_splits=5, shuffle=True, random_state=42)

results = []
for name, model in models.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        return_train_score=True,
        cv=cv,
        refit=True,
    ).fit(X, y)
    result = {"model": name, "cv_results": pd.DataFrame(grid_search.cv_results_), "best_estimator": grid_search.best_estimator_}
    results.append(result)

In [222]:
results[0]["best_estimator"].n_estimators, results[0]["best_estimator"].min_samples_leaf

(100, 200)

In [223]:
train["pred"] = results[0]["best_estimator"].predict(train[features])
train["error"] = train["chosen"] - train["pred"]
print(mean_squared_error(train["chosen"], train["pred"]))
print(explained_variance_score(train["chosen"], train["pred"]))

0.04692958421652515
0.07971100870452341


In [247]:
test["pred"] = results[0]["best_estimator"].predict(test[features])
test["error"] = test["chosen"] - test["pred"]
print(mean_squared_error(test["chosen"], test["pred"]))
print(explained_variance_score(test["chosen"], test["pred"]))

0.04975565157557358
0.012403722215059143


In [248]:
df = test.groupby('h3_index').mean().reset_index()
px.scatter_geo(df, lat='lat', lon='lon', color='error')

In [245]:
feature = 'elevation'
dataset = train
y = 'pred'
df1 = test.groupby(pd.qcut(test[feature], 30))[['chosen', feature]].mean().reset_index(drop=True)
df2 = test.groupby(pd.qcut(test[feature], 30))[['pred', feature]].mean().reset_index(drop=True).rename({'pred': 'chosen'}, axis=1)
df3 = train.groupby(pd.qcut(train[feature], 30))[['chosen', feature]].mean().reset_index(drop=True)
df4 = train.groupby(pd.qcut(train[feature], 30))[['pred', feature]].mean().reset_index(drop=True).rename({'pred': 'chosen'}, axis=1)
df1['case'] = 'test-real'
df2['case'] = 'test-pred'
df3['case'] = 'train-real'
df4['case'] = 'train-pred'
df = pd.concat([df1, df2, df3, df4])
px.line(df, x=feature, y='chosen', color='case')











In [250]:
full_features = elevation
full_features = full_features.merge(surface_temps[['h3_index', 'month', 'temp']], on=['h3_index'])
full_features = full_features.merge(primary_productivity[['h3_index', 'month', 'log_chla']], on=['h3_index', 'month'])
full_features

Unnamed: 0,h3_index,elevation,month,temp,log_chla
0,8402123ffffffff,-270.425039,5,-0.268500,3.000000
1,8402123ffffffff,-270.425039,6,0.531628,3.000000
2,8402801ffffffff,-2262.651136,6,1.136333,3.018073
3,8402803ffffffff,-916.098408,6,0.785190,3.000000
4,8402805ffffffff,-2577.574399,6,0.903583,3.061052
...,...,...,...,...,...
83235,8447669ffffffff,-5741.416461,8,14.585944,3.419547
83236,8447669ffffffff,-5741.416461,9,24.043722,3.613007
83237,8447669ffffffff,-5741.416461,10,15.206778,6.108284
83238,8447669ffffffff,-5741.416461,11,5.355389,3.973432


In [260]:
def create_random_track(lat, lon, date, model, steps):
    path = [(lat, lon)]
    for _ in range(steps):
        h3_index = h3.geo_to_h3(lat, lon, RESOLUTION)
        if h3_index not in neighbors_index:
            find_neighbors(h3_index, MAX_KM, neighbors_index)
        raw_possibilities = list(neighbors_index[h3_index])
        possibilities = pd.DataFrame(raw_possibilities, columns=['h3_index'])
        possibilities['month'] = date.month
        possibilities = possibilities.merge(full_features, on=['h3_index', 'month'])
        if possibilities.shape[0] == 0:
            break
        X = possibilities[features]
        probs = model.predict(X)
        probs = probs / probs.sum()
        raw_possibilities = list(possibilities['h3_index'])
        selection = np.random.choice(range(len(raw_possibilities)), p=probs)
        lat, lon = h3.h3_to_geo(raw_possibilities[selection])
        date = date + pd.Timedelta(days=1)
        path.append((lat, lon))
    return pd.DataFrame(path, columns=['lat', 'lon'])

paths = []
for ptt in tqdm(np.random.choice(data['ptt'].unique(), 50)):
    df = data[data['ptt'] == ptt].sort_values("date", ascending=True).iloc[[0]]
    lat = df['lat'].values[0]
    lon = df['lon'].values[0]
    date = datetime.strptime(df['date'].values[0], '%Y-%m-%d')
    path = create_random_track(
        lat, lon, date,
        results[0]['best_estimator'],
        data[data['ptt'] == ptt].shape[0]
    )
    path['case'] = 'fake'
    path['ptt'] = ptt
    old_path = data[data['ptt'] == ptt][['lat', 'lon']]
    old_path['case'] = 'real'
    old_path['ptt'] = ptt
    paths.append(pd.concat([path, old_path]))

100%|██████████| 50/50 [00:57<00:00,  1.15s/it]


In [261]:
df = pd.concat(paths)
px.scatter_geo(df, lat='lat', lon='lon', color='case')

# Auto Correlation

In [440]:
triplets = []
last_direction_lat = 0
last_direction_lon = 0
for ptt in tqdm(data["ptt"].unique()):
    rows = [row for _, row in data[data["ptt"] == ptt].sort_values("date", ascending=True).iterrows()]
    for start, middle, end in zip(rows[:-2], rows[1:-1], rows[2:]):
        middle_lat, middle_lon = h3.h3_to_geo(h3.geo_to_h3(middle['lat'], middle['lon'], RESOLUTION))
        start_lat, start_lon = h3.h3_to_geo(h3.geo_to_h3(start['lat'], start['lon'], RESOLUTION))
        direction_lat = middle_lat - start_lat
        direction_lon = middle_lon - start_lon
        direction_lat = direction_lat / np.linalg.norm([direction_lat, direction_lon])
        direction_lon = direction_lon / np.linalg.norm([direction_lat, direction_lon])
        triplets.append({
            "ptt": ptt,
            "start_date": middle["date"],
            "end_date": end["date"],
            "start_lat": middle["lat"],
            "start_lon": middle["lon"],
            "end_lat": end["lat"],
            "end_lon": end["lon"],
            "distance": geopy.distance.geodesic(
                (start["lat"], start["lon"]), (end["lat"], end["lon"])
            ).km,
            "prior_direction_lat": direction_lat if np.isfinite(direction_lat) else last_direction_lat,
            "prior_direction_lon": direction_lon if np.isfinite(direction_lon) else last_direction_lon,
            "remained": not np.isfinite(direction_lat) and not np.isfinite(direction_lon)
        })
        last_direction_lat = direction_lat if np.isfinite(direction_lat) else last_direction_lat
        last_direction_lon = direction_lon if np.isfinite(direction_lon) else last_direction_lon
triplets = pd.DataFrame(triplets)
print(triplets.shape)
triplets.head()


invalid value encountered in scalar divide

100%|██████████| 111/111 [00:01<00:00, 82.73it/s]

(7310, 11)





Unnamed: 0,ptt,start_date,end_date,start_lat,start_lon,end_lat,end_lon,distance,prior_direction_lat,prior_direction_lon,remained
0,129843,2013-12-20,2013-12-21,54.258072,-166.884086,54.312433,-166.910525,20.126011,0.988573,0.058506,False
1,129843,2013-12-21,2013-12-22,54.312433,-166.910525,54.35828,-166.817057,11.977147,0.988573,0.058506,True
2,129843,2013-12-22,2013-12-23,54.35828,-166.817057,54.389694,-166.676901,17.456323,0.988573,0.058506,True
3,129843,2013-12-23,2013-12-24,54.389694,-166.676901,54.407429,-166.526618,19.647607,0.988573,0.058506,True
4,129843,2013-12-24,2013-12-25,54.407429,-166.526618,54.407463,-166.528348,9.848835,0.988573,0.058506,True


In [441]:
training_triplets = triplets[triplets['ptt'].isin(training_ptt)]
testing_triplets = triplets[triplets['ptt'].isin(testing_ptt)]

In [442]:
rows = []
choice_id = -1
for _, row in tqdm(training_triplets.iterrows()):
    choice_id += 1
    start = h3.geo_to_h3(row['start_lat'], row['start_lon'], RESOLUTION)
    end = h3.geo_to_h3(row['end_lat'], row['end_lon'], RESOLUTION)
    date = datetime.strptime(row['end_date'], '%Y-%m-%d')
    
    start_lat, start_lon = h3.h3_to_geo(start)
    end_lat, end_lon = h3.h3_to_geo(end)
    direction_lat = end_lat - start_lat
    direction_lon = end_lon - start_lon
    direction_lat = direction_lat / np.linalg.norm([direction_lat, direction_lon])
    direction_lon = direction_lon / np.linalg.norm([direction_lat, direction_lon])

    rows.append({
        'h3_index': end, 
        'chosen': 1,
        'date': date,
        'year': date.year,
        'month': date.month,
        'day': date.day,
        'prior_direction_lat': row['prior_direction_lat'],
        'prior_direction_lon': row['prior_direction_lon'],
        'direction_lat': direction_lat if np.isfinite(direction_lat) else 0,
        'direction_lon': direction_lon if np.isfinite(direction_lon) else 0,
        'remained_before': row['remained'],
        'remain_now': not np.isfinite(direction_lat) and not np.isfinite(direction_lon),
        'choice_id': choice_id
    })
    for neighbor in neighbors_index[start]:
        end_lat, end_lon = h3.h3_to_geo(neighbor)
        direction_lat = end_lat - start_lat
        direction_lon = end_lon - start_lon
        direction_lat = direction_lat / np.linalg.norm([direction_lat, direction_lon])
        direction_lon = direction_lon / np.linalg.norm([direction_lat, direction_lon])
        if neighbor != end:
            rows.append({
                'h3_index': neighbor, 
                'chosen': 0,
                'date': date,
                'year': date.year,
                'month': date.month,
                'day': date.day,
                'prior_direction_lat': row['prior_direction_lat'],
                'prior_direction_lon': row['prior_direction_lon'],
                'direction_lat': direction_lat if np.isfinite(direction_lat) else 0,
                'direction_lon': direction_lon if np.isfinite(direction_lon) else 0,
                'remained_before': row['remained'],
                'remain_now': not np.isfinite(direction_lat) and not np.isfinite(direction_lon),
                'choice_id': choice_id
            })

train = pd.DataFrame(rows)
print(train.shape)
train.head()


invalid value encountered in scalar divide


invalid value encountered in scalar divide

3781it [00:01, 2994.31it/s]


(73185, 13)


Unnamed: 0,h3_index,chosen,date,year,month,day,prior_direction_lat,prior_direction_lon,direction_lat,direction_lon,remained_before,remain_now,choice_id
0,840c9a9ffffffff,1,2014-08-06,2014,8,6,0.354498,-0.85338,0.0,0.0,False,True,0
1,840c9adffffffff,0,2014-08-06,2014,8,6,0.354498,-0.85338,-0.232782,-0.947772,False,False,0
2,840c981ffffffff,0,2014-08-06,2014,8,6,0.354498,-0.85338,0.22226,0.987656,False,False,0
3,840c9a5ffffffff,0,2014-08-06,2014,8,6,0.354498,-0.85338,-0.569424,-0.81149,False,False,0
4,840c9a1ffffffff,0,2014-08-06,2014,8,6,0.354498,-0.85338,-0.962174,-0.111708,False,False,0


In [443]:
rows = []
choice_id = -1
for _, row in tqdm(testing_triplets.iterrows()):
    choice_id += 1
    start = h3.geo_to_h3(row['start_lat'], row['start_lon'], RESOLUTION)
    end = h3.geo_to_h3(row['end_lat'], row['end_lon'], RESOLUTION)
    date = datetime.strptime(row['end_date'], '%Y-%m-%d')
    
    start_lat, start_lon = h3.h3_to_geo(start)
    end_lat, end_lon = h3.h3_to_geo(end)
    direction_lat = end_lat - start_lat
    direction_lon = end_lon - start_lon
    direction_lat = direction_lat / np.linalg.norm([direction_lat, direction_lon])
    direction_lon = direction_lon / np.linalg.norm([direction_lat, direction_lon])

    rows.append({
        'h3_index': end, 
        'chosen': 1,
        'date': date,
        'year': date.year,
        'month': date.month,
        'day': date.day,
        'prior_direction_lat': row['prior_direction_lat'],
        'prior_direction_lon': row['prior_direction_lon'],
        'direction_lat': direction_lat if np.isfinite(direction_lat) else 0,
        'direction_lon': direction_lon if np.isfinite(direction_lon) else 0,
        'remained_before': row['remained'],
        'remain_now': not np.isfinite(direction_lat) and not np.isfinite(direction_lon),
        'choice_id': choice_id
    })
    for neighbor in neighbors_index[start]:
        end_lat, end_lon = h3.h3_to_geo(neighbor)
        direction_lat = end_lat - start_lat
        direction_lon = end_lon - start_lon
        direction_lat = direction_lat / np.linalg.norm([direction_lat, direction_lon])
        direction_lon = direction_lon / np.linalg.norm([direction_lat, direction_lon])
        if neighbor != end:
            rows.append({
                'h3_index': neighbor, 
                'chosen': 0,
                'date': date,
                'year': date.year,
                'month': date.month,
                'day': date.day,
                'prior_direction_lat': row['prior_direction_lat'],
                'prior_direction_lon': row['prior_direction_lon'],
                'direction_lat': direction_lat if np.isfinite(direction_lat) else 0,
                'direction_lon': direction_lon if np.isfinite(direction_lon) else 0,
                'remained_before': row['remained'],
                'remain_now': not np.isfinite(direction_lat) and not np.isfinite(direction_lon),
                'choice_id': choice_id
            })

test = pd.DataFrame(rows)
print(test.shape)
test.head()

0it [00:00, ?it/s]


invalid value encountered in scalar divide


invalid value encountered in scalar divide

3529it [00:01, 3022.73it/s]


(68861, 13)


Unnamed: 0,h3_index,chosen,date,year,month,day,prior_direction_lat,prior_direction_lon,direction_lat,direction_lon,remained_before,remain_now,choice_id
0,8422d03ffffffff,1,2013-12-21,2013,12,21,0.988573,0.058506,0.0,0.0,False,True,0
1,8422d55ffffffff,0,2013-12-21,2013,12,21,0.988573,0.058506,-0.988777,-0.115604,False,False,0
2,8422d17ffffffff,0,2013-12-21,2013,12,21,0.988573,0.058506,0.422882,-0.930957,False,False,0
3,8422d57ffffffff,0,2013-12-21,2013,12,21,0.988573,0.058506,-0.620017,-0.720337,False,False,0
4,8422d39ffffffff,0,2013-12-21,2013,12,21,0.988573,0.058506,0.988149,0.059349,False,False,0


In [444]:
train = train[['h3_index', 'chosen', 'date', 'year', 'month', 'day', 'prior_direction_lat', 'prior_direction_lon', 'direction_lat', 'direction_lon', 'choice_id', 'remained_before', 'remain_now']]

train = train.merge(elevation, on='h3_index')
train = train.merge(surface_temps[['h3_index', 'month', 'temp']], on=['h3_index', 'month'])
train = train.merge(primary_productivity[['h3_index', 'month', 'log_chla']], on=['h3_index', 'month'])

train['dot_product'] = train['prior_direction_lat'] * train['direction_lat'] + train['prior_direction_lon'] * train['direction_lon']

train['lat'] = train.apply(lambda row: h3.h3_to_geo(row['h3_index'])[0], axis=1)
train['lon'] = train.apply(lambda row: h3.h3_to_geo(row['h3_index'])[1], axis=1)
train.head()

Unnamed: 0,h3_index,chosen,date,year,month,day,prior_direction_lat,prior_direction_lon,direction_lat,direction_lon,choice_id,remained_before,remain_now,elevation,temp,log_chla,dot_product,lat,lon
0,8422ca3ffffffff,0,2014-09-01,2014,9,1,-0.383993,0.816144,-0.243084,-0.981042,26,False,False,-3470.9437,5.898167,5.521461,-0.707329,55.717197,-173.700322
1,8422ca3ffffffff,0,2014-09-02,2014,9,2,-0.383993,0.816144,-0.243084,-0.981042,27,True,False,-3470.9437,5.898167,5.521461,-0.707329,55.717197,-173.700322
2,840c9b1ffffffff,0,2014-09-01,2014,9,1,-0.383993,0.816144,0.380315,-0.944551,26,False,False,-2902.48537,5.596667,5.588496,-0.916927,56.475441,-173.56374
3,840c9b1ffffffff,0,2014-09-02,2014,9,2,-0.383993,0.816144,0.380315,-0.944551,27,True,False,-2902.48537,5.596667,5.588496,-0.916927,56.475441,-173.56374
4,840c9b7ffffffff,0,2014-09-01,2014,9,1,-0.383993,0.816144,0.060497,-0.998774,26,False,False,-3326.792843,5.753333,5.59136,-0.838373,56.099546,-173.690187


In [445]:
test = test[['h3_index', 'chosen', 'date', 'year', 'month', 'day', 'prior_direction_lat', 'prior_direction_lon', 'direction_lat', 'direction_lon', 'choice_id']]

test = test.merge(elevation, on='h3_index')
test = test.merge(surface_temps[['h3_index', 'month', 'temp']], on=['h3_index', 'month'])
test = test.merge(primary_productivity[['h3_index', 'month', 'log_chla']], on=['h3_index', 'month'])

test['remained_before'] = (test['prior_direction_lat'] == 0) & (test['prior_direction_lon'] == 0)
test['remain_now'] = (test['direction_lat'] == 0) & (test['direction_lon'] == 0)

test['dot_product'] = test['prior_direction_lat'] * test['direction_lat'] + test['prior_direction_lon'] * test['direction_lon']

test['lat'] = test.apply(lambda row: h3.h3_to_geo(row['h3_index'])[0], axis=1)
test['lon'] = test.apply(lambda row: h3.h3_to_geo(row['h3_index'])[1], axis=1)
test.head()

Unnamed: 0,h3_index,chosen,date,year,month,day,prior_direction_lat,prior_direction_lon,direction_lat,direction_lon,choice_id,elevation,temp,log_chla,remained_before,remain_now,dot_product,lat,lon
0,8422d03ffffffff,1,2013-12-21,2013,12,21,0.988573,0.058506,0.0,0.0,0,-771.670639,1.215667,8.805272,False,True,0.0,54.360925,-166.742418
1,8422d03ffffffff,1,2013-12-22,2013,12,22,0.988573,0.058506,0.0,0.0,1,-771.670639,1.215667,8.805272,False,True,0.0,54.360925,-166.742418
2,8422d03ffffffff,1,2013-12-23,2013,12,23,0.988573,0.058506,0.0,0.0,2,-771.670639,1.215667,8.805272,False,True,0.0,54.360925,-166.742418
3,8422d03ffffffff,1,2013-12-24,2013,12,24,0.988573,0.058506,0.0,0.0,3,-771.670639,1.215667,8.805272,False,True,0.0,54.360925,-166.742418
4,8422d03ffffffff,1,2013-12-25,2013,12,25,0.988573,0.058506,0.0,0.0,4,-771.670639,1.215667,8.805272,False,True,0.0,54.360925,-166.742418


In [518]:
features = ['elevation', 'temp', 'log_chla', 'dot_product', 'remain_now']

X = train[features]
y = train['chosen']

models = {
    "Random Forest": RandomForestRegressor(
        random_state=42, n_jobs=3
    )
}
param_grids = {
    "Random Forest": {"n_estimators": [10, 20, 50, 100], "min_samples_leaf": [100, 200]}#{"n_estimators": [10, 20, 50, 100], "min_samples_leaf": [5, 10, 20, 50]},
}
cv = KFold(n_splits=5, shuffle=True, random_state=42)

results = []
for name, model in models.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        return_train_score=True,
        cv=cv,
        refit=True,
    ).fit(X, y)
    result = {"model": name, "cv_results": pd.DataFrame(grid_search.cv_results_), "best_estimator": grid_search.best_estimator_}
    results.append(result)

In [519]:
sensory_model = results[0]["best_estimator"]

In [520]:
results[0]["best_estimator"].n_estimators, results[0]["best_estimator"].min_samples_leaf

(100, 100)

In [521]:
train["pred"] = results[0]["best_estimator"].predict(train[features])
train["error"] = train["chosen"] - train["pred"]
print(mean_squared_error(train["chosen"], train["pred"]))
print(explained_variance_score(train["chosen"], train["pred"]))

0.02702540046588606
0.46062396759570157


In [522]:
test["pred"] = results[0]["best_estimator"].predict(test[features])
test["error"] = test["chosen"] - test["pred"]
print(mean_squared_error(test["chosen"], test["pred"]))
print(explained_variance_score(test["chosen"], test["pred"]))

0.02803795019647415
0.43388270567568243


In [523]:
ptts = np.random.choice(data['ptt'].unique(), 50)

In [524]:
def create_random_track(lat, lon, date, model, steps):
    path = [(lat, lon, date)]
    for _ in range(steps):
        h3_index = h3.geo_to_h3(lat, lon, RESOLUTION)
        if h3_index not in neighbors_index:
            find_neighbors(h3_index, MAX_KM, neighbors_index)
        raw_possibilities = list(neighbors_index[h3_index])
        possibilities = pd.DataFrame(raw_possibilities, columns=['h3_index'])
        possibilities['month'] = date.month
        possibilities = possibilities.merge(full_features, on=['h3_index', 'month'])
        if possibilities.shape[0] == 0:
            break

        if len(path) > 1:
            prior_direction_lat = path[-1][0] - path[-2][0]
            prior_direction_lon = path[-1][1] - path[-2][1]
            prior_direction_lat = prior_direction_lat / np.linalg.norm([prior_direction_lat, prior_direction_lon])
            prior_direction_lon = prior_direction_lon / np.linalg.norm([prior_direction_lat, prior_direction_lon])
        else:
            prior_direction_lat = 0
            prior_direction_lon = 0
        possibilities['prior_direction_lat'] = prior_direction_lat if np.isfinite(prior_direction_lat) else 0
        possibilities['prior_direction_lon'] = prior_direction_lon if np.isfinite(prior_direction_lon) else 0

        possibilities['start_lat'] = lat
        possibilities['start_lon'] = lon
        possibilities['end_lat'] = possibilities.apply(lambda row: h3.h3_to_geo(row['h3_index'])[0], axis=1)
        possibilities['end_lon'] = possibilities.apply(lambda row: h3.h3_to_geo(row['h3_index'])[1], axis=1)
        possibilities['direction_lat'] = possibilities['end_lat'] - possibilities['start_lat']
        possibilities['direction_lon'] = possibilities['end_lon'] - possibilities['start_lon']
        possibilities['direction_lat'] = possibilities.apply(lambda r: r['direction_lat'] / np.linalg.norm([r['direction_lat'], r['direction_lon']]), axis=1).fillna(0)
        possibilities['direction_lon'] = possibilities.apply(lambda r: r['direction_lon'] / np.linalg.norm([r['direction_lat'], r['direction_lon']]), axis=1).fillna(0)

        possibilities['remained_before'] = (possibilities['prior_direction_lat'] == 0) & (possibilities['prior_direction_lon'] == 0)
        possibilities['remain_now'] = (possibilities['direction_lat'] == 0) & (possibilities['direction_lon'] == 0)
        possibilities['dot_product'] = possibilities['prior_direction_lat'] * possibilities['direction_lat'] + possibilities['prior_direction_lon'] * possibilities['direction_lon']

        X = possibilities[features]
        probs = model.predict(X)
        probs = probs / probs.sum()
        raw_possibilities = list(possibilities['h3_index'])
        selection = np.random.choice(range(len(raw_possibilities)), p=probs)
        lat, lon = h3.h3_to_geo(raw_possibilities[selection])
        date = date + pd.Timedelta(days=1)
        path.append((lat, lon, date))
    return pd.DataFrame(path, columns=['lat', 'lon', 'date'])

paths = []
for ptt in tqdm(ptts):
    df = data[data['ptt'] == ptt].sort_values("date", ascending=True).iloc[[0]]
    lat = df['lat'].values[0]
    lon = df['lon'].values[0]
    date = datetime.strptime(df['date'].values[0], '%Y-%m-%d')
    path = create_random_track(
        lat, lon, date,
        results[0]['best_estimator'],
        data[data['ptt'] == ptt].shape[0]
    )
    path['case'] = 'fake'
    path['ptt'] = ptt
    old_path = data[data['ptt'] == ptt][['lat', 'lon', 'date']]
    old_path['date'] = pd.to_datetime(old_path['date'])
    old_path['h3_index'] = old_path.apply(lambda row: h3.geo_to_h3(row['lat'], row['lon'], RESOLUTION), axis=1)
    old_path['lat'] = old_path.apply(lambda row: h3.h3_to_geo(row['h3_index'])[0], axis=1)
    old_path['lon'] = old_path.apply(lambda row: h3.h3_to_geo(row['h3_index'])[1], axis=1)
    old_path['case'] = 'real'
    old_path['ptt'] = ptt
    paths.append(pd.concat([path, old_path]))
    clear_output()

100%|██████████| 50/50 [00:50<00:00,  1.01s/it]


In [525]:
df = pd.concat(paths)
d = df[df['case'] == 'fake'].groupby('ptt').agg({'lat': 'count'}).reset_index().merge(
    df[df['case'] == 'real'].groupby('ptt').agg({'lat': 'count'}).reset_index(),
    on='ptt'
).rename({'lat_x': 'fake', 'lat_y': 'real'}, axis=1)
d = d[d['fake'] / d['real']  > 0.8]
df = df[df['ptt'].isin(d['ptt'])]

In [526]:
px.scatter_geo(df[df['case'] == 'real'], lat='lat', lon='lon', color='ptt')

In [527]:
px.scatter_geo(df[df['case'] == 'fake'], lat='lat', lon='lon', color='ptt')

In [530]:
px.scatter_geo(
    df[(df['ptt'] == '229202') & (df['case'] == 'real')].reset_index(drop=True).reset_index(),
    lat='lat', lon='lon', color='index'
)

In [531]:
px.scatter_geo(
    df[(df['ptt'] == '229202') & (df['case'] == 'fake')].reset_index(drop=True).reset_index(),
    lat='lat', lon='lon', color='index'
)

In [534]:
train.groupby('remain_now')[['chosen']].mean()

Unnamed: 0_level_0,chosen
remain_now,Unnamed: 1_level_1
False,0.018789
True,0.664267


In [507]:
d

Unnamed: 0,ptt,fake,real
0,142194,33,32
1,142196,68,66
2,142197,62,61
3,142198,53,52
4,142199,51,50
5,159002b,65,64
6,159006,37,36
7,159006b,61,60
8,159008,40,39
9,159008b,26,25


In [325]:
def create_random_track(lat, lon, date, model, steps):
    path = [(lat, lon)]
    for _ in range(steps):
        h3_index = h3.geo_to_h3(lat, lon, RESOLUTION)
        if h3_index not in neighbors_index:
            find_neighbors(h3_index, MAX_KM, neighbors_index)
        raw_possibilities = list(neighbors_index[h3_index])
        possibilities = pd.DataFrame(raw_possibilities, columns=['h3_index'])
        possibilities['month'] = date.month
        possibilities = possibilities.merge(full_features, on=['h3_index', 'month'])
        if possibilities.shape[0] == 0:
            break

        if len(path) > 1:
            prior_direction_lat = path[-1][0] - path[-2][0]
            prior_direction_lon = path[-1][1] - path[-2][1]
            prior_direction_lat = prior_direction_lat / np.linalg.norm([prior_direction_lat, prior_direction_lon])
            prior_direction_lon = prior_direction_lon / np.linalg.norm([prior_direction_lat, prior_direction_lon])
        else:
            prior_direction_lat = 0
            prior_direction_lon = 0
        possibilities['prior_direction_lat'] = prior_direction_lat if np.isfinite(prior_direction_lat) else 0
        possibilities['prior_direction_lon'] = prior_direction_lon if np.isfinite(prior_direction_lon) else 0

        possibilities['start_lat'] = lat
        possibilities['start_lon'] = lon
        possibilities['end_lat'] = possibilities.apply(lambda row: h3.h3_to_geo(row['h3_index'])[0], axis=1)
        possibilities['end_lon'] = possibilities.apply(lambda row: h3.h3_to_geo(row['h3_index'])[1], axis=1)
        possibilities['direction_lat'] = possibilities['end_lat'] - possibilities['start_lat']
        possibilities['direction_lon'] = possibilities['end_lon'] - possibilities['start_lon']
        possibilities['direction_lat'] = possibilities.apply(lambda r: r['direction_lat'] / np.linalg.norm([r['direction_lat'], r['direction_lon']]), axis=1).fillna(0)
        possibilities['direction_lon'] = possibilities.apply(lambda r: r['direction_lon'] / np.linalg.norm([r['direction_lat'], r['direction_lon']]), axis=1).fillna(0)

        possibilities['remained_before'] = (possibilities['prior_direction_lat'] == 0) & (possibilities['prior_direction_lon'] == 0)
        possibilities['remain_now'] = (possibilities['direction_lat'] == 0) & (possibilities['direction_lon'] == 0)
        possibilities['dot_product'] = possibilities['prior_direction_lat'] * possibilities['direction_lat'] + possibilities['prior_direction_lon'] * possibilities['direction_lon']

        X = possibilities[features]
        probs = np.ones(X.shape[0])
        probs = probs / probs.sum()
        raw_possibilities = list(possibilities['h3_index'])
        selection = np.random.choice(range(len(raw_possibilities)), p=probs)
        lat, lon = h3.h3_to_geo(raw_possibilities[selection])
        date = date + pd.Timedelta(days=1)
        path.append((lat, lon))
    return pd.DataFrame(path, columns=['lat', 'lon'])

paths = []
for ptt in tqdm(ptts):
    df = data[data['ptt'] == ptt].sort_values("date", ascending=True).iloc[[0]]
    lat = df['lat'].values[0]
    lon = df['lon'].values[0]
    date = datetime.strptime(df['date'].values[0], '%Y-%m-%d')
    path = create_random_track(
        lat, lon, date,
        results[0]['best_estimator'],
        data[data['ptt'] == ptt].shape[0]
    )
    path['case'] = 'fake'
    path['ptt'] = ptt
    old_path = data[data['ptt'] == ptt][['lat', 'lon']]
    old_path['h3_index'] = old_path.apply(lambda row: h3.geo_to_h3(row['lat'], row['lon'], RESOLUTION), axis=1)
    old_path['lat'] = old_path.apply(lambda row: h3.h3_to_geo(row['h3_index'])[0], axis=1)
    old_path['lon'] = old_path.apply(lambda row: h3.h3_to_geo(row['h3_index'])[1], axis=1)
    old_path['case'] = 'real'
    old_path['ptt'] = ptt
    paths.append(pd.concat([path, old_path]))
    clear_output()

100%|██████████| 50/50 [00:28<00:00,  1.75it/s]


In [326]:
df = pd.concat(paths)
px.scatter_geo(df[df['case'] == 'fake'], lat='lat', lon='lon', color='ptt')

## Second Level of the Brain??

In [369]:
ML = max(len(v) for v in neighbors_index.values())
print(ML)

def agg(x):
    x = list(x)
    if len(x) < ML:
        x += [0] * (ML - len(x))
    return x

train2 = train[['chosen', 'choice_id', 'pred']]
train2 = train2.groupby('choice_id').agg(agg).reset_index()

rows = []
samples = 10
for _, row in train2.iterrows():
    zipped = list(zip(row['pred'], row['chosen']))
    for _ in range(samples):
        np.random.shuffle(zipped)
        pred = [x[0] for x in zipped]
        chosen = np.array([x[1] for x in zipped])
        try:
            index = np.where(chosen == 1)[0][0]
        except Exception as e:
            continue # we have some where we didn't have features
        new_row = {
            f'pred_{i}': pred[i] for i in range(ML)
        }
        new_row['chosen'] = index 
        rows.append(new_row)

train2 = pd.DataFrame(rows)
train2

32


Unnamed: 0,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9,...,pred_23,pred_24,pred_25,pred_26,pred_27,pred_28,pred_29,pred_30,pred_31,chosen
0,0.022913,0.000000,0.042163,0.000000,0.017276,0.043215,0.009865,0.000000,0.011049,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.060314,0.041491,0.159019,0.000000,0.336097,31
1,0.000000,0.000000,0.000000,0.000000,0.336097,0.000000,0.000000,0.008387,0.000000,0.000000,...,0.022913,0.000000,0.000000,0.043215,0.000000,0.060314,0.000000,0.000000,0.041491,4
2,0.008387,0.017276,0.000000,0.336097,0.000000,0.022913,0.000000,0.000000,0.000000,0.000000,...,0.159019,0.000000,0.000000,0.034369,0.041831,0.041491,0.000000,0.000000,0.009865,3
3,0.000000,0.000000,0.000000,0.336097,0.000000,0.042236,0.159019,0.022913,0.000000,0.000000,...,0.000000,0.009865,0.011049,0.060314,0.042163,0.000000,0.000000,0.000000,0.043215,3
4,0.000000,0.000000,0.000000,0.000000,0.043215,0.008387,0.009865,0.000000,0.060314,0.000000,...,0.017276,0.042236,0.000000,0.000000,0.000000,0.042163,0.041831,0.000000,0.000000,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30585,0.002334,0.019749,0.567783,0.029776,0.000000,0.000000,0.016297,0.017830,0.000000,0.036376,...,0.034410,0.035135,0.011251,0.000000,0.000000,0.056584,0.018177,0.010920,0.000000,24
30586,0.029776,0.018177,0.016297,0.000000,0.000000,0.000000,0.034410,0.056584,0.000000,0.000000,...,0.000000,0.567783,0.036376,0.014227,0.024475,0.013477,0.035135,0.011251,0.019749,29
30587,0.567783,0.000000,0.029776,0.024475,0.010920,0.000000,0.000000,0.002334,0.000000,0.000000,...,0.014227,0.000000,0.000000,0.010749,0.013477,0.000000,0.035135,0.017830,0.000000,29
30588,0.000000,0.018177,0.014227,0.024475,0.017830,0.000000,0.028266,0.029776,0.567783,0.010749,...,0.000000,0.000000,0.010920,0.036376,0.000000,0.011251,0.000000,0.000000,0.019749,18


In [374]:
ML = max(len(v) for v in neighbors_index.values())
print(ML)

def agg(x):
    x = list(x)
    if len(x) < ML:
        x += [0] * (ML - len(x))
    return x

test2 = test[['chosen', 'choice_id', 'pred']]
test2 = test2.groupby('choice_id').agg(agg).reset_index()

rows = []
samples = 10
for _, row in test2.iterrows():
    zipped = list(zip(row['pred'], row['chosen']))
    for _ in range(samples):
        np.random.shuffle(zipped)
        pred = [x[0] for x in zipped]
        chosen = np.array([x[1] for x in zipped])
        try:
            index = np.where(chosen == 1)[0][0]
        except Exception as e:
            continue # we have some where we didn't have features
        new_row = {
            f'pred_{i}': pred[i] for i in range(ML)
        }
        new_row['chosen'] = index 
        rows.append(new_row)

test2 = pd.DataFrame(rows)
test2

32


Unnamed: 0,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9,...,pred_23,pred_24,pred_25,pred_26,pred_27,pred_28,pred_29,pred_30,pred_31,chosen
0,0.057496,0.000000,0.000000,0.021556,0.000092,0.000000,0.000000,0.014325,0.012585,0.000000,...,0.000000,0.023866,0.016170,0.778648,0.000000,0.011939,0.000000,0.009471,0.009014,26
1,0.011454,0.012589,0.000000,0.000000,0.000000,0.014325,0.011511,0.000000,0.009471,0.041323,...,0.016170,0.000000,0.023866,0.006430,0.000000,0.021055,0.000000,0.000092,0.021556,20
2,0.011511,0.014325,0.000092,0.000000,0.000000,0.000000,0.009014,0.023866,0.009471,0.000000,...,0.021055,0.011939,0.000000,0.000000,0.011454,0.000000,0.041323,0.011998,0.016170,22
3,0.011998,0.014325,0.000000,0.011511,0.000000,0.006430,0.000000,0.000000,0.021556,0.014268,...,0.011939,0.000000,0.000000,0.000000,0.009471,0.011454,0.778648,0.021055,0.000000,29
4,0.016170,0.023866,0.009014,0.012589,0.011511,0.000000,0.021556,0.000000,0.011998,0.006430,...,0.000000,0.011939,0.041323,0.000000,0.000092,0.000000,0.778648,0.000000,0.000000,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28825,0.000000,0.001487,0.000000,0.074696,0.041966,0.000000,0.063094,0.020757,0.026225,0.000000,...,0.060187,0.015279,0.000000,0.006285,0.000000,0.000000,0.000000,0.000696,0.000085,15
28826,0.000000,0.000000,0.000000,0.007643,0.001487,0.000000,0.043744,0.000000,0.000000,0.645310,...,0.000000,0.000696,0.000000,0.041966,0.060187,0.000000,0.063094,0.000000,0.018638,9
28827,0.000000,0.015279,0.006285,0.018638,0.063094,0.000696,0.060187,0.007643,0.000000,0.020757,...,0.000000,0.002047,0.000000,0.032619,0.000000,0.645310,0.000000,0.001487,0.000000,28
28828,0.074696,0.006285,0.000000,0.000000,0.000000,0.032619,0.018638,0.043744,0.063094,0.000000,...,0.060187,0.000000,0.000000,0.000000,0.000000,0.007643,0.041966,0.020757,0.000696,22


In [400]:
features = [f'pred_{i}' for i in range(ML)]

X = train2[features]
y = train2['chosen']

models = {
    "Random Forest": RandomForestClassifier(
        random_state=42, n_jobs=3
    )
}
param_grids = {
    "Random Forest": {"n_estimators": [10, 20, 50, 100], "min_samples_leaf": [200]}#{"n_estimators": [10, 20, 50, 100], "min_samples_leaf": [5, 10, 20, 50]},
}
cv = KFold(n_splits=5, shuffle=True, random_state=42)

results = []
for name, model in models.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        return_train_score=True,
        cv=cv,
        refit=True,
    ).fit(X, y)
    result = {"model": name, "cv_results": pd.DataFrame(grid_search.cv_results_), "best_estimator": grid_search.best_estimator_}
    results.append(result)

In [401]:
choice_model = results[0]["best_estimator"]

In [373]:
train2['pred'] = results[0]['best_estimator'].predict(X)
accuracy_score(train2['chosen'], train2['pred'])

0.6670480549199085

In [375]:
test2['pred'] = results[0]['best_estimator'].predict(test2[features])
accuracy_score(test2['chosen'], test2['pred'])

0.6666319805757891

In [391]:
features = ['elevation', 'temp', 'log_chla', 'remained_before', 'remain_now', 'dot_product']

def create_random_track(lat, lon, date, model, steps):
    path = [(lat, lon)]
    for _ in range(steps):
        h3_index = h3.geo_to_h3(lat, lon, RESOLUTION)
        if h3_index not in neighbors_index:
            find_neighbors(h3_index, MAX_KM, neighbors_index)
        raw_possibilities = list(neighbors_index[h3_index])
        possibilities = pd.DataFrame(raw_possibilities, columns=['h3_index'])
        possibilities['month'] = date.month
        possibilities = possibilities.merge(full_features, on=['h3_index', 'month'])
        if possibilities.shape[0] == 0:
            break

        if len(path) > 1:
            prior_direction_lat = path[-1][0] - path[-2][0]
            prior_direction_lon = path[-1][1] - path[-2][1]
            prior_direction_lat = prior_direction_lat / np.linalg.norm([prior_direction_lat, prior_direction_lon])
            prior_direction_lon = prior_direction_lon / np.linalg.norm([prior_direction_lat, prior_direction_lon])
        else:
            prior_direction_lat = 0
            prior_direction_lon = 0
        possibilities['prior_direction_lat'] = prior_direction_lat if np.isfinite(prior_direction_lat) else 0
        possibilities['prior_direction_lon'] = prior_direction_lon if np.isfinite(prior_direction_lon) else 0

        possibilities['start_lat'] = lat
        possibilities['start_lon'] = lon
        possibilities['end_lat'] = possibilities.apply(lambda row: h3.h3_to_geo(row['h3_index'])[0], axis=1)
        possibilities['end_lon'] = possibilities.apply(lambda row: h3.h3_to_geo(row['h3_index'])[1], axis=1)
        possibilities['direction_lat'] = possibilities['end_lat'] - possibilities['start_lat']
        possibilities['direction_lon'] = possibilities['end_lon'] - possibilities['start_lon']
        possibilities['direction_lat'] = possibilities.apply(lambda r: r['direction_lat'] / np.linalg.norm([r['direction_lat'], r['direction_lon']]), axis=1).fillna(0)
        possibilities['direction_lon'] = possibilities.apply(lambda r: r['direction_lon'] / np.linalg.norm([r['direction_lat'], r['direction_lon']]), axis=1).fillna(0)

        possibilities['remained_before'] = (possibilities['prior_direction_lat'] == 0) & (possibilities['prior_direction_lon'] == 0)
        possibilities['remain_now'] = (possibilities['direction_lat'] == 0) & (possibilities['direction_lon'] == 0)
        possibilities['dot_product'] = possibilities['prior_direction_lat'] * possibilities['direction_lat'] + possibilities['prior_direction_lon'] * possibilities['direction_lon']

        X = possibilities[features]
        probs = sensory_model.predict(X)

        utility = list(probs)
        if len(utility) < ML:
            utility += [0] * (ML - len(utility))
        decision_info = pd.DataFrame([utility], columns=[f'pred_{i}' for i in range(ML)])
        selection1 = results[0]['best_estimator'].predict(decision_info)[0]
        

        raw_possibilities = list(possibilities['h3_index'])
        if selection1 >= len(raw_possibilities):
            selection1 = np.random.choice(range(len(raw_possibilities)))


        probs = probs / probs.sum()
        selection2 = np.random.choice(range(len(raw_possibilities)), p=probs)

        if np.random.random() > 0.5:
            selection = selection1
        else:
            selection = selection2

        lat, lon = h3.h3_to_geo(raw_possibilities[selection])
        date = date + pd.Timedelta(days=1)
        path.append((lat, lon))
    return pd.DataFrame(path, columns=['lat', 'lon'])

paths = []
for ptt in tqdm(ptts):
    df = data[data['ptt'] == ptt].sort_values("date", ascending=True).iloc[[0]]
    lat = df['lat'].values[0]
    lon = df['lon'].values[0]
    date = datetime.strptime(df['date'].values[0], '%Y-%m-%d')
    path = create_random_track(
        lat, lon, date,
        results[0]['best_estimator'],
        data[data['ptt'] == ptt].shape[0]
    )
    path['case'] = 'fake'
    path['ptt'] = ptt
    old_path = data[data['ptt'] == ptt][['lat', 'lon']]
    old_path['h3_index'] = old_path.apply(lambda row: h3.geo_to_h3(row['lat'], row['lon'], RESOLUTION), axis=1)
    old_path['lat'] = old_path.apply(lambda row: h3.h3_to_geo(row['h3_index'])[0], axis=1)
    old_path['lon'] = old_path.apply(lambda row: h3.h3_to_geo(row['h3_index'])[1], axis=1)
    old_path['case'] = 'real'
    old_path['ptt'] = ptt
    paths.append(pd.concat([path, old_path]))
    clear_output()

100%|██████████| 50/50 [01:26<00:00,  1.72s/it]


In [392]:
df = pd.concat(paths)
px.scatter_geo(df[df['case'] == 'fake'], lat='lat', lon='lon', color='ptt')

In [393]:
px.scatter_geo(df[df['case'] == 'real'], lat='lat', lon='lon', color='ptt')

## One Last Model?

In [398]:
train3 = train2.copy()
train3['correct'] = train3['pred'] == train3['chosen']
train3

Unnamed: 0,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9,...,pred_25,pred_26,pred_27,pred_28,pred_29,pred_30,pred_31,chosen,pred,correct
0,0.022913,0.000000,0.042163,0.000000,0.017276,0.043215,0.009865,0.000000,0.011049,0.000000,...,0.000000,0.000000,0.060314,0.041491,0.159019,0.000000,0.336097,31,31,True
1,0.000000,0.000000,0.000000,0.000000,0.336097,0.000000,0.000000,0.008387,0.000000,0.000000,...,0.000000,0.043215,0.000000,0.060314,0.000000,0.000000,0.041491,4,4,True
2,0.008387,0.017276,0.000000,0.336097,0.000000,0.022913,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.034369,0.041831,0.041491,0.000000,0.000000,0.009865,3,3,True
3,0.000000,0.000000,0.000000,0.336097,0.000000,0.042236,0.159019,0.022913,0.000000,0.000000,...,0.011049,0.060314,0.042163,0.000000,0.000000,0.000000,0.043215,3,3,True
4,0.000000,0.000000,0.000000,0.000000,0.043215,0.008387,0.009865,0.000000,0.060314,0.000000,...,0.000000,0.000000,0.000000,0.042163,0.041831,0.000000,0.000000,11,18,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30585,0.002334,0.019749,0.567783,0.029776,0.000000,0.000000,0.016297,0.017830,0.000000,0.036376,...,0.011251,0.000000,0.000000,0.056584,0.018177,0.010920,0.000000,24,2,False
30586,0.029776,0.018177,0.016297,0.000000,0.000000,0.000000,0.034410,0.056584,0.000000,0.000000,...,0.036376,0.014227,0.024475,0.013477,0.035135,0.011251,0.019749,29,24,False
30587,0.567783,0.000000,0.029776,0.024475,0.010920,0.000000,0.000000,0.002334,0.000000,0.000000,...,0.000000,0.010749,0.013477,0.000000,0.035135,0.017830,0.000000,29,0,False
30588,0.000000,0.018177,0.014227,0.024475,0.017830,0.000000,0.028266,0.029776,0.567783,0.010749,...,0.010920,0.036376,0.000000,0.011251,0.000000,0.000000,0.019749,18,8,False


In [402]:
features = [f'pred_{i}' for i in range(ML)]

X = train3[features]
y = train3['correct']

models = {
    "Random Forest": RandomForestRegressor(
        random_state=42, n_jobs=3
    )
}
param_grids = {
    "Random Forest": {"n_estimators": [10, 20, 50, 100], "min_samples_leaf": [200]}#{"n_estimators": [10, 20, 50, 100], "min_samples_leaf": [5, 10, 20, 50]},
}
cv = KFold(n_splits=5, shuffle=True, random_state=42)

results = []
for name, model in models.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        return_train_score=True,
        cv=cv,
        refit=True,
    ).fit(X, y)
    result = {"model": name, "cv_results": pd.DataFrame(grid_search.cv_results_), "best_estimator": grid_search.best_estimator_}
    results.append(result)

In [403]:
train3['pred_conf'] = results[0]['best_estimator'].predict(X)
train3 

Unnamed: 0,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9,...,pred_26,pred_27,pred_28,pred_29,pred_30,pred_31,chosen,pred,correct,pred_conf
0,0.022913,0.000000,0.042163,0.000000,0.017276,0.043215,0.009865,0.000000,0.011049,0.000000,...,0.000000,0.060314,0.041491,0.159019,0.000000,0.336097,31,31,True,0.610040
1,0.000000,0.000000,0.000000,0.000000,0.336097,0.000000,0.000000,0.008387,0.000000,0.000000,...,0.043215,0.000000,0.060314,0.000000,0.000000,0.041491,4,4,True,0.591877
2,0.008387,0.017276,0.000000,0.336097,0.000000,0.022913,0.000000,0.000000,0.000000,0.000000,...,0.034369,0.041831,0.041491,0.000000,0.000000,0.009865,3,3,True,0.632930
3,0.000000,0.000000,0.000000,0.336097,0.000000,0.042236,0.159019,0.022913,0.000000,0.000000,...,0.060314,0.042163,0.000000,0.000000,0.000000,0.043215,3,3,True,0.599939
4,0.000000,0.000000,0.000000,0.000000,0.043215,0.008387,0.009865,0.000000,0.060314,0.000000,...,0.000000,0.000000,0.042163,0.041831,0.000000,0.000000,11,18,False,0.599766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30585,0.002334,0.019749,0.567783,0.029776,0.000000,0.000000,0.016297,0.017830,0.000000,0.036376,...,0.000000,0.000000,0.056584,0.018177,0.010920,0.000000,24,2,False,0.614702
30586,0.029776,0.018177,0.016297,0.000000,0.000000,0.000000,0.034410,0.056584,0.000000,0.000000,...,0.014227,0.024475,0.013477,0.035135,0.011251,0.019749,29,24,False,0.623628
30587,0.567783,0.000000,0.029776,0.024475,0.010920,0.000000,0.000000,0.002334,0.000000,0.000000,...,0.010749,0.013477,0.000000,0.035135,0.017830,0.000000,29,0,False,0.594143
30588,0.000000,0.018177,0.014227,0.024475,0.017830,0.000000,0.028266,0.029776,0.567783,0.010749,...,0.036376,0.000000,0.011251,0.000000,0.000000,0.019749,18,8,False,0.544902


In [405]:
train3['pred_conf'].describe()

count    30590.000000
mean         0.667564
std          0.077540
min          0.404334
25%          0.609412
50%          0.670919
75%          0.726956
max          0.886689
Name: pred_conf, dtype: float64

In [432]:
features = ['elevation', 'temp', 'log_chla', 'remained_before', 'remain_now', 'dot_product']

def create_random_track(lat, lon, date, model, steps):
    path = [(lat, lon, date)]
    for _ in range(steps):
        h3_index = h3.geo_to_h3(lat, lon, RESOLUTION)
        if h3_index not in neighbors_index:
            find_neighbors(h3_index, MAX_KM, neighbors_index)
        raw_possibilities = list(neighbors_index[h3_index])
        possibilities = pd.DataFrame(raw_possibilities, columns=['h3_index'])
        possibilities['month'] = date.month
        possibilities = possibilities.merge(full_features, on=['h3_index', 'month'])
        if possibilities.shape[0] == 0:
            break

        if len(path) > 1:
            prior_direction_lat = path[-1][0] - path[-2][0]
            prior_direction_lon = path[-1][1] - path[-2][1]
            prior_direction_lat = prior_direction_lat / np.linalg.norm([prior_direction_lat, prior_direction_lon])
            prior_direction_lon = prior_direction_lon / np.linalg.norm([prior_direction_lat, prior_direction_lon])
        else:
            prior_direction_lat = 0
            prior_direction_lon = 0
        possibilities['prior_direction_lat'] = prior_direction_lat if np.isfinite(prior_direction_lat) else 0
        possibilities['prior_direction_lon'] = prior_direction_lon if np.isfinite(prior_direction_lon) else 0

        possibilities['start_lat'] = lat
        possibilities['start_lon'] = lon
        possibilities['end_lat'] = possibilities.apply(lambda row: h3.h3_to_geo(row['h3_index'])[0], axis=1)
        possibilities['end_lon'] = possibilities.apply(lambda row: h3.h3_to_geo(row['h3_index'])[1], axis=1)
        possibilities['direction_lat'] = possibilities['end_lat'] - possibilities['start_lat']
        possibilities['direction_lon'] = possibilities['end_lon'] - possibilities['start_lon']
        possibilities['direction_lat'] = possibilities.apply(lambda r: r['direction_lat'] / np.linalg.norm([r['direction_lat'], r['direction_lon']]), axis=1).fillna(0)
        possibilities['direction_lon'] = possibilities.apply(lambda r: r['direction_lon'] / np.linalg.norm([r['direction_lat'], r['direction_lon']]), axis=1).fillna(0)

        possibilities['remained_before'] = (possibilities['prior_direction_lat'] == 0) & (possibilities['prior_direction_lon'] == 0)
        possibilities['remain_now'] = (possibilities['direction_lat'] == 0) & (possibilities['direction_lon'] == 0)
        possibilities['dot_product'] = possibilities['prior_direction_lat'] * possibilities['direction_lat'] + possibilities['prior_direction_lon'] * possibilities['direction_lon']

        X = possibilities[features]
        probs = sensory_model.predict(X)

        utility = list(probs)
        if len(utility) < ML:
            utility += [0] * (ML - len(utility))
        decision_info = pd.DataFrame([utility], columns=[f'pred_{i}' for i in range(ML)])
        selection1 = choice_model.predict(decision_info)[0]
        

        raw_possibilities = list(possibilities['h3_index'])
        if selection1 >= len(raw_possibilities):
            selection1 = np.random.choice(range(len(raw_possibilities)))


        probs = probs / probs.sum()
        selection2 = np.random.choice(range(len(raw_possibilities)), p=probs)

        confidence = results[0]['best_estimator'].predict(decision_info)[0]
        if np.random.random() <= 0:#confidence:
            selection = selection1
        else:
            selection = selection2

        lat, lon = h3.h3_to_geo(raw_possibilities[selection])
        date = date + pd.Timedelta(days=1)
        path.append((lat, lon, date))
    return pd.DataFrame(path, columns=['lat', 'lon', 'date'])

paths = []
for ptt in tqdm(ptts):
    df = data[data['ptt'] == ptt].sort_values("date", ascending=True).iloc[[0]]
    lat = df['lat'].values[0]
    lon = df['lon'].values[0]
    date = datetime.strptime(df['date'].values[0], '%Y-%m-%d')
    path = create_random_track(
        lat, lon, date,
        results[0]['best_estimator'],
        data[data['ptt'] == ptt].shape[0]
    )
    path['case'] = 'fake'
    path['ptt'] = ptt
    old_path = data[data['ptt'] == ptt][['lat', 'lon', 'date']]
    old_path['date'] = pd.to_datetime(old_path['date'])
    old_path['h3_index'] = old_path.apply(lambda row: h3.geo_to_h3(row['lat'], row['lon'], RESOLUTION), axis=1)
    old_path['lat'] = old_path.apply(lambda row: h3.h3_to_geo(row['h3_index'])[0], axis=1)
    old_path['lon'] = old_path.apply(lambda row: h3.h3_to_geo(row['h3_index'])[1], axis=1)
    old_path['case'] = 'real'
    old_path['ptt'] = ptt
    paths.append(pd.concat([path, old_path]))
    clear_output()

100%|██████████| 50/50 [01:47<00:00,  2.15s/it]


In [433]:
df = pd.concat(paths)
px.scatter_geo(df[df['case'] == 'fake'], lat='lat', lon='lon', color='ptt')

In [434]:
df = pd.concat(paths)
px.scatter_geo(df[df['case'] == 'real'], lat='lat', lon='lon', color='ptt')

In [437]:
px.scatter_geo(
    df[(df['ptt'] == '172912') & (df['case'] == 'fake')].reset_index(drop=True).reset_index(),
    lat='lat', lon='lon', color='index'
)


In [438]:
px.scatter_geo(
    df[(df['ptt'] == '172912') & (df['case'] == 'real')].reset_index(drop=True).reset_index(),
    lat='lat', lon='lon', color='index'
)

In [416]:
d = data[data['ptt'] == '210770'].sort_values('date', ascending=True).reset_index(drop=True).reset_index()
d['h3_index'] = df.apply(lambda row: h3.geo_to_h3(row['lat'], row['lon'], RESOLUTION), axis=1)
d['lat'] = df.apply(lambda row: h3.h3_to_geo(row['h3_index'])[0], axis=1)
d['lon'] = df.apply(lambda row: h3.h3_to_geo(row['h3_index'])[1], axis=1)
px.scatter_geo(df, lat='lat', lon='lon', color='index')

In [423]:
data[data['ptt'] == '210770'].sort_values('date', ascending=True).reset_index(drop=True).reset_index()

Unnamed: 0,index,ptt,lat,lon,date,year,month,day
0,0,210770,59.600,-139.875,2021-03-22,2021,3,22
1,1,210770,59.600,-139.950,2021-03-23,2021,3,23
2,2,210770,59.600,-140.325,2021-03-24,2021,3,24
3,3,210770,59.575,-140.825,2021-03-25,2021,3,25
4,4,210770,59.575,-141.300,2021-03-26,2021,3,26
...,...,...,...,...,...,...,...,...
87,87,210770,57.000,-136.700,2021-06-17,2021,6,17
88,88,210770,56.675,-136.450,2021-06-18,2021,6,18
89,89,210770,56.500,-136.275,2021-06-19,2021,6,19
90,90,210770,56.150,-136.050,2021-06-20,2021,6,20
