In [1]:
from pyspark.sql import SparkSession

# Location of the SQLite JDBC driver jar. The same file is supplied to Spark
# under both the "spark.jars" and "spark.driver.extraClassPath" settings.
sqlite_jdbc_jar = "/workspaces/mirrorverse/sqlite-jdbc-3.34.0.jar"

# Assemble the session step by step: local master, app name, then the jar settings.
builder = SparkSession.builder.master("local").appName("SQLite JDBC")
for jar_setting in ("spark.jars", "spark.driver.extraClassPath"):
    builder = builder.config(jar_setting, sqlite_jdbc_jar)
spark = builder.getOrCreate()

# JDBC URL of the SQLite database; later cells pass it to the project helpers
# together with `spark`.
ps_conn = "jdbc:sqlite:/workspaces/mirrorverse/mirrorverse.db"

24/07/01 14:14:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
from mirrorverse.models.chinook_depth.spark import build_training_data

# Depth class values shared by the training, tuning and inference cells.
# NOTE(review): the unit of these values is not stated in this notebook — confirm.
depth_classes = [
    25, 50, 75, 100, 150,
    200, 250, 300, 400, 500,
]

# Feature names handed to build_training_data, load_data, build_model and run_inference.
features = [
    "depth_class",
    "month",
    "daytime",
    "period_progress",
    "elevation",
]

# Directories passed to build_training_data and later read back through load_data.
train_dir, test_dir = "train", "test"

In [None]:
import numpy as np

# Gather the keyword options first so the call itself stays short.
# NOTE(review): split=0.8 is presumably the train fraction — confirm against
# build_training_data; overwrite=True is passed through unchanged.
training_data_options = dict(
    depth_classes=np.array(depth_classes),
    features=features,
    train_dir=train_dir,
    test_dir=test_dir,
    split=0.8,
    overwrite=True,
)
build_training_data(spark, ps_conn, **training_data_options)

In [40]:
from mirrorverse.models.chinook_depth.keras import load_data, build_model
from tensorflow.keras.layers import Dense

def build_randomized_param_sets(param_grids, M, max_attempts):
    """
    Randomly draw unique parameter sets from per-parameter grids of values.

    Each candidate set is built by picking one value per parameter. A candidate
    equal to an already collected set is discarded and counted as a failed
    attempt; the counter resets whenever a new unique set is accepted.

    Inputs:
    - param_grids: dict, parameter names to grid of values
    - M: int, number of parameter sets to generate
    - max_attempts: int, maximum number of consecutive duplicate draws
      tolerated before giving up

    Outputs:
    - list of dicts, parameter sets (values are the grid's own elements,
      returned unchanged)

    Raises:
    - AssertionError: if max_attempts consecutive draws were all duplicates
      (for example when M exceeds the number of distinct combinations)
    """
    param_sets = []
    attempts = 0
    while len(param_sets) < M:
        # Raised explicitly rather than via `assert`, so the guard survives
        # `python -O` (which strips assert statements) and the loop cannot
        # spin forever. The exception type is the same as before.
        if attempts >= max_attempts:
            raise AssertionError(
                f"gave up after {attempts} consecutive duplicate draws with "
                f"{len(param_sets)} of {M} parameter sets collected"
            )

        param_set = {}
        for param, grid in param_grids.items():
            if hasattr(grid, "__len__"):
                # Draw an index and take the grid's own element. Passing the
                # grid to np.random.choice would first coerce it to an array,
                # which stringifies mixed-type grids and rejects grids whose
                # values are themselves lists.
                param_set[param] = grid[np.random.randint(len(grid))]
            else:
                # Unsized grids (e.g. an int n, meaning arange(n)) keep the
                # np.random.choice behaviour.
                param_set[param] = np.random.choice(grid)

        if param_set in param_sets:
            attempts += 1
        else:
            attempts = 0
            param_sets.append(param_set)

    return param_sets

# Hyperparameter grid for the tuning runs. The "layers" entries are zero-argument
# factories rather than layer lists, so each tuning run can call one and get
# freshly constructed Dense layers.
grids = {
    "batch_size": [100, 500, 1000, 5000, 10000],
    "epochs": [10],
    "layers": [
        # Narrowing stacks.
        lambda: [Dense(64, activation='relu'), Dense(32, activation='relu'), Dense(16, activation='relu')],
        lambda: [Dense(32, activation='relu'), Dense(16, activation='relu'), Dense(8, activation='relu')],
        lambda: [Dense(16, activation='relu'), Dense(8, activation='relu'), Dense(4, activation='relu')],

        # Bottleneck, expanding-middle and constant-width stacks.
        lambda: [Dense(16, activation='relu'), Dense(8, activation='relu'), Dense(16, activation='relu')],
        lambda: [Dense(8, activation='relu'), Dense(16, activation='relu'), Dense(8, activation='relu')],
        lambda: [Dense(16, activation='relu'), Dense(16, activation='relu'), Dense(16, activation='relu')],
    ]
}

# Request every combination in the grid (5 batch sizes x 1 epoch setting x 6 layer
# stacks = 30), derived from the grid itself so the count cannot drift from it.
n_param_sets = 1
for grid_values in grids.values():
    n_param_sets *= len(grid_values)

# Collecting the *whole* grid by random draws needs a generous duplicate budget:
# once 29 of 30 sets are found, each draw is new with probability 1/30, so the
# previous limit of 10 consecutive duplicates aborted the cell most of the time.
param_sets = build_randomized_param_sets(grids, n_param_sets, 1_000)
param_sets


[{'batch_size': 10000, 'epochs': 5, 'layers': <function __main__.<lambda>()>},
 {'batch_size': 5000, 'epochs': 5, 'layers': <function __main__.<lambda>()>}]

In [None]:
import mlflow

# Send all runs to the local MLflow tracking server, under one experiment.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("/chinook_depth_model_tuning")

# Autologging is disabled; parameters, metrics and the model are logged by hand below.
mlflow.tensorflow.autolog(disable=True)

N = len(depth_classes)
for param_set in param_sets:
    batch_size, epochs = param_set["batch_size"], param_set["epochs"]
    # The grid stores layer factories; calling one yields the layer objects for this run.
    layers = param_set["layers"]()

    with mlflow.start_run() as run:
        # Both datasets are rebuilt per run because the batch size is a tuned parameter.
        loader_options = dict(batch_size=batch_size, shuffle_buffer_size=10000)
        train = load_data(train_dir, N, features, **loader_options)
        test = load_data(test_dir, N, features, **loader_options)

        # Record the width of every layer in the stack before the model is built.
        for position, layer in enumerate(layers):
            units = layer.get_config()['units']
            mlflow.log_param(f"layer_{position}_units", units)

        model, layers = build_model(N, features, layers)

        mlflow.log_param("batch_size", batch_size)
        mlflow.log_param("epochs", epochs)

        history = model.fit(train, validation_data=test, epochs=epochs)

        # Log each metric's per-epoch curve, plus its last and smallest values.
        for metric_name, values in history.history.items():
            for epoch_index, value in enumerate(values):
                mlflow.log_metric(metric_name, value, step=epoch_index)
            mlflow.log_metric(f"final_{metric_name}", values[-1])
            mlflow.log_metric(f"min_{metric_name}", min(values))

        mlflow.keras.log_model(model, "models")

In [None]:
# TODO: Run Full Inference

from mirrorverse.models.chinook_depth.spark import run_inference

# Inference inputs: every distinct H3 level-4 cell seen in tag_tracks, crossed
# with every distinct date_key in dates (exposed as `epoch`).
query = """
with h3 as (
    select distinct
        h3_level_4_key 
    from 
        tag_tracks
), time as (
    select distinct
        date_key as epoch
    from 
        dates
)
select 
    t.epoch,
    h3.h3_level_4_key
from 
    h3
    cross join time t
"""

# Name the positional string arguments so the call below is readable.
normalization_parameters_path = "train/normalization_parameters.json"
# Model artifact of one specific MLflow run, referenced by its absolute path.
model_path = "/workspaces/mirrorverse/mlartifacts/539696127796712602/4b66bdcb3d194395be7bf40225d1db2f/artifacts/models/data/model.keras"
# NOTE(review): presumably the output table; the next cell selects from
# `depth_predictions` — confirm against run_inference.
predictions_table = "depth_predictions"

run_inference(
    spark,
    ps_conn,
    depth_classes,
    features,
    query,
    normalization_parameters_path,
    model_path,
    predictions_table,
)

In [5]:
import pandas as pd

# Database URL for pandas; it points at the same mirrorverse.db file as `ps_conn`.
pd_conn = "sqlite:////workspaces/mirrorverse/mirrorverse.db"

# Keep only predictions whose epoch is later than 365 days (3600 * 24 * 365
# seconds) before the reference epoch 1676246400.
recent_predictions_sql = "select * from depth_predictions where epoch > (1676246400 -(3600 * 24 * 365)) "
df = pd.read_sql(recent_predictions_sql, pd_conn)
df

Unnamed: 0,h3_level_4_key,epoch,depth_class,probability
0,594692656546709503,1644796800,25.0,0.359553
1,594692656546709503,1644883200,25.0,0.359553
2,594692656546709503,1644969600,25.0,0.359553
3,594692656546709503,1645056000,25.0,0.359553
4,594692656546709503,1645142400,25.0,0.359553
...,...,...,...,...
2457535,594988459534319615,1674777600,500.0,0.001831
2457536,594988459534319615,1675123200,500.0,0.001831
2457537,594988459534319615,1675641600,500.0,0.001769
2457538,594988459534319615,1675728000,500.0,0.001769


## Notes

```bash
apt-get update
apt install default-jre

curl -O https://repo1.maven.org/maven2/org/xerial/sqlite-jdbc/3.34.0/sqlite-jdbc-3.34.0.jar


# conda was needed to get h5py installed correctly

mkdir -p ~/miniconda3
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-aarch64.sh -O ~/miniconda3/miniconda.sh
bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
rm -rf ~/miniconda3/miniconda.sh

~/miniconda3/bin/conda init bash
```