In [None]:
import os
# Select the AWS credentials profile and the database name for this session.
# These are set before the mirrorverse import below.
# NOTE(review): presumably mirrorverse's data access reads both variables -- confirm.
os.environ['AWS_PROFILE'] = 'admin'
os.environ['HAVEN_DATABASE'] = 'haven'

from mirrorverse.utils import read_data_w_cache
import numpy as np
import pandas as pd
import plotly.express as px
from tqdm import tqdm
from collections import defaultdict
import h3

In [None]:
# Long-format pull from the `plankton` table: one row per
# (h3_index, year, month) carrying the diatom and pzooplankton values,
# restricted to three years and to depth = -25.
plankton_query = '''
    select 
        h3_index,
        year,
        month,
        diatom,
        pzooplankton
    from
        plankton
    where 
        year in (1995, 2000, 2005)
        and depth = -25
    '''
data = read_data_w_cache(plankton_query)
print(data.shape)
data.head()

In [None]:
# Collect the long-format rows into one 12-slot list per (h3_index, year);
# slot m-1 holds the (diatom, pzooplankton) pair for month m.
# Slots for months that never appear in the query result keep a (NaN, NaN)
# pair, so every slot can be unpacked as a pair when the lists are flattened
# into columns, and a gap shows up as NaN rather than as a bare 0.
mapping = defaultdict(lambda: [(np.nan, np.nan)] * 12)
# itertuples yields lightweight namedtuples instead of building a Series per
# row the way iterrows does.
for row in tqdm(data.itertuples(index=False), total=data.shape[0]):
    mapping[(row.h3_index, row.year)][int(row.month) - 1] = (row.diatom, row.pzooplankton)

In [None]:
# Flatten each 12-slot monthly list into one wide record per (h3_index, year)
# with columns diatom_1, pzooplankton_1, ..., diatom_12, pzooplankton_12.
# Note: this rebinds `data` from the long-format frame to the wide one, so the
# cell that builds `mapping` cannot be re-run afterwards without reloading.
rows = []
for (cell_id, yr), monthly_pairs in tqdm(mapping.items()):
    record = {'h3_index': cell_id, 'year': yr}
    for month_no, (diatom_val, pzoo_val) in enumerate(monthly_pairs, start=1):
        record[f'diatom_{month_no}'] = diatom_val
        record[f'pzooplankton_{month_no}'] = pzoo_val
    rows.append(record)

data = pd.DataFrame(rows)
print(data.shape)
data.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score


# Feature window (inclusive on both ends) and the month whose pzooplankton
# value is the regression target.
start_month = 3
predict_month = 6

# The range includes predict_month itself, so the target month's diatom value
# is one of the features.
columns = [f'diatom_{i}' for i in range(start_month, predict_month + 1)]

X = data[columns]
# Select the target as a 1-D Series: passing a one-column DataFrame makes
# RandomForestRegressor.fit warn about a column-vector y.
y = data[f'pzooplankton_{predict_month}']

# Random row-level split; rows of the same h3_index from different years can
# land on both sides of the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

regr = RandomForestRegressor(min_samples_leaf=5, random_state=0)
regr.fit(X_train, y_train)
y_train_pred = regr.predict(X_train)
y_test_pred = regr.predict(X_test)
# Displayed as (train, test) explained variance.
(
    round(explained_variance_score(y_train, y_train_pred), 3),
    round(explained_variance_score(y_test, y_test_pred), 3)
)

In [None]:
# Build a per-cell, per-year table of observed vs. predicted target values.
# Take an explicit copy: adding columns to a slice of `data` would otherwise
# trigger pandas' SettingWithCopyWarning.
compare = data[['h3_index', 'year']].copy()

# Resolve each cell's centre once and split it into lat / lon, instead of
# calling h3_to_geo twice per row.
centers = [h3.h3_to_geo(cell) for cell in data['h3_index']]
compare['lat'] = [lat for lat, _ in centers]
compare['lon'] = [lon for _, lon in centers]

compare['true'] = data[f'pzooplankton_{predict_month}']
# Predictions are made for every row, including the rows the model was
# trained on.
compare['pred'] = regr.predict(data[columns])

# NOTE(review): grouping by the exact cell centre only averages rows when
# several rows share one centre; if the intent is to aggregate to resolution 5,
# the resolution-5 index may need to be computed before the groupby -- confirm.
compare = compare.groupby(['lat', 'lon', 'year'])[['true', 'pred']].mean().reset_index()
compare['h3_index'] = compare.apply(lambda r: h3.geo_to_h3(r['lat'], r['lon'], 5), axis=1)
print(compare.shape)
compare.head()

In [None]:
from mirrorverse.plotting import build_geojson
import plotly.graph_objects as go

# Map of the observed target values for a single year.
df = compare[compare['year'] == 2000]
# Year label for the title, derived from the rows actually plotted.
years_shown = ', '.join(str(yr) for yr in sorted(df['year'].unique()))

# `geojson` and `df` are reused by the prediction map in the next cell.
geojson = build_geojson(df, 'h3_index')

fig = go.Figure(
    go.Choroplethmapbox(
        geojson=geojson,
        locations=df['h3_index'],
        z=df['true'],
        visible=True,
        marker_line_color='rgba(255,255,255,0)',  # fully transparent cell outlines
        colorbar=dict(title=dict(text='pzooplankton')),
    )
)

fig.update_layout(
    # Title so this map can be told apart from the prediction map.
    title=dict(text=f'Observed pzooplankton, month {predict_month} ({years_shown})'),
    autosize=False,  # fixed 800x800 px canvas
    width=800,
    height=800,
    margin={"r": 0, "t": 30, "l": 0, "b": 0},
    mapbox=dict(style="carto-positron", zoom=4, center={"lat": 57, "lon": -150}),
)

fig.show()

In [None]:
# Map of the model's predictions for the same rows (`df`) and cell geometry
# (`geojson`) used by the observed-values map in the previous cell.
years_shown = ', '.join(str(yr) for yr in sorted(df['year'].unique()))

fig = go.Figure(
    go.Choroplethmapbox(
        geojson=geojson,
        locations=df['h3_index'],
        z=df['pred'],
        visible=True,
        marker_line_color='rgba(255,255,255,0)',  # fully transparent cell outlines
        colorbar=dict(title=dict(text='pzooplankton')),
    )
)

fig.update_layout(
    # Title so this map can be told apart from the observed-values map.
    title=dict(text=f'Predicted pzooplankton, month {predict_month} ({years_shown})'),
    autosize=False,  # fixed 800x800 px canvas
    width=800,
    height=800,
    margin={"r": 0, "t": 30, "l": 0, "b": 0},
    mapbox=dict(style="carto-positron", zoom=4, center={"lat": 57, "lon": -150}),
)

fig.show()