# Model Training

## Task 0 - Setup

In [None]:
import datetime
import os

from pathlib import Path

import pins
import vetiver
import polars as pl
import polars.selectors as cs

from posit.connect import Client
from sklearn.base import TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from rsconnect.api import RSConnectServer
from vetiver import VetiverModel

# When a `.env` file exists in the working directory, import python-dotenv and
# call `load_dotenv()`; otherwise the import is skipped entirely.
if Path(".env").exists():
    from dotenv import load_dotenv

    load_dotenv()

In [None]:
# Look up the username of the current Posit Connect user.
# NOTE(review): `Client()` is called without arguments — presumably it picks up
# the server URL and API key from the environment; confirm.
with Client() as client:
    username = client.me.username

# `username` is used below as a prefix for table names and for the pin name.
print(f"Connect username is: '{username}'")

## Task 1 - Reading the data

### 🔄 Task

- Read in and glimpse the vessel history data
- Read in and glimpse the vessel verbose data
- Read in and glimpse the weather data

### 🧑‍💻 Code

In [None]:
# Database connection URI; raises KeyError if DATABASE_URI_PYTHON is not set.
db_uri = os.environ["DATABASE_URI_PYTHON"]

In [None]:
# Read every row of `<username>_vessel_history_clean` into a Polars DataFrame
# using the ADBC engine. The table name is built from the Connect username.
vessel_history = pl.read_database_uri(
    query=f"SELECT * FROM {username}_vessel_history_clean;", uri=db_uri, engine="adbc"
)

# show the first three rows
vessel_history.head(3)

In [None]:
# Read every row of `<username>_vessel_verbose_clean` into a Polars DataFrame
# using the ADBC engine.
vessel_verbose = pl.read_database_uri(
    query=f"SELECT * FROM {username}_vessel_verbose_clean;", uri=db_uri, engine="adbc"
)

# show the first three rows
vessel_verbose.head(3)

In [None]:
# Read every row of `<username>_terminal_weather_clean` into a Polars DataFrame
# using the ADBC engine.
weather = pl.read_database_uri(
    query=f"SELECT * FROM {username}_terminal_weather_clean;", uri=db_uri, engine="adbc"
)

# show the first three rows
weather.head(3)

## Task 2 - Feature Engineering

### 🔄 Task

- Join the `vessel_history`, `vessel_verbose` and `weather` data into a form useful for modeling
- Transform the columns into new ones we can use for modeling

### 🧑‍💻 Code

In [None]:
# Derive the delay and calendar features from the trip history.
trips_combined = vessel_history.with_columns(
    # departure delay in seconds: actual minus scheduled departure
    # (negative when the vessel left before its scheduled time)
    (pl.col("ActualDepart") - pl.col("ScheduledDepart"))
    .dt.total_seconds()
    .alias("Delay"),
    # calendar parts of `Date`; these are treated as categorical features later
    pl.col("Date").dt.month().alias("Month"),
    pl.col("Date").dt.weekday().alias("Weekday"),
    pl.col("Date").dt.hour().alias("Hour"),
).drop("EstArrival")

# show the first three rows
trips_combined.head(3)

In [None]:
# Histogram of `Delay` (seconds) over the range -1800..7200, split into 30 bins.
trips_combined.plot.hist("Delay", bin_range=(-1800, 7200), bins=30)

In [None]:
# Replace `Delay` with its natural log, `LogDelay`. Delays below one second
# (including early departures, which are negative) are clamped to 1 first so the
# log is always defined and bottoms out at 0.
trips_combined = trips_combined.select(
    pl.exclude("Delay"),
    # Native `clip` does the clamping inside Polars instead of calling a Python
    # lambda once per row; null delays stay null, as they did before.
    pl.col("Delay").clip(lower_bound=1).cast(pl.Float64).log().alias("LogDelay"),
)

# distribution of the transformed target
trips_combined.plot.hist("LogDelay")

In [None]:
# Pick the vessel attributes used for modeling. Plain column names are selected
# as-is; the two build/rebuild columns are reduced to their year component.
vessel_info = vessel_verbose.select(
    "VesselName",
    "ClassName",
    "SpeedInKnots",
    "EngineCount",
    "Horsepower",
    "MaxPassengerCount",
    "PassengerOnly",
    "FastFerry",
    "PropulsionInfo",
    # a single `pl.col(...)` can address several columns at once
    pl.col("YearBuilt", "YearRebuilt").dt.year(),
)

# show the first three rows
vessel_info.head(3)

In [None]:
# Attach the vessel attributes to each trip by matching `Vessel` against
# `VesselName`. It is a left join, so trips without a matching vessel are kept.
trips_combined = trips_combined.join(
    vessel_info, left_on="Vessel", right_on="VesselName", how="left", coalesce=True
)

# show the first three rows
trips_combined.head(3)

In [None]:
# Keep only the weather columns that are used for modeling.
weather = weather.select(
    "time",
    "weather_code",
    "temperature_2m",
    "precipitation",
    "cloud_cover",
    "wind_speed_10m",
    "wind_direction_10m",
    "wind_gusts_10m",
    "terminal_name",
)

# Join the weather table twice — once for the departing terminal and once for
# the arriving terminal — matching on terminal name and on the trip's `Date`
# rounded to the hour. Each copy of the weather columns gets its own prefix so
# the two sets do not collide. The temporary `time` key is removed at the end.
trips_combined = (
    trips_combined.with_columns(time=pl.col("Date").dt.round("1h"))
    .join(
        weather.rename(lambda col_name: f"departing_{col_name}"),
        left_on=["Departing", "time"],
        right_on=["departing_terminal_name", "departing_time"],
        how="left",
        coalesce=True,
    )
    .join(
        weather.rename(lambda col_name: f"arriving_{col_name}"),
        left_on=["Arriving", "time"],
        right_on=["arriving_terminal_name", "arriving_time"],
        how="left",
        coalesce=True,
    )
    .drop("time")
)

# show the first three rows
trips_combined.head(3)

In [None]:
# Count nulls per column after the joins.
trips_combined.null_count()

In [None]:
# Drop rows that have a null in any column other than `YearRebuilt`;
# `YearRebuilt` is allowed to stay null.
trips_combined = trips_combined.drop_nulls(subset=cs.exclude("YearRebuilt"))

# re-check the per-column null counts
trips_combined.null_count()

In [None]:
# Columns handed to the model unchanged (the "passthrough" branch of the
# preprocessor). Commented-out entries are deliberately left out of the model.
numeric_features = [
    "SpeedInKnots",
    "EngineCount",
    "Horsepower",
    "MaxPassengerCount",
    # "PassengerOnly",
    # "FastFerry",
    "YearBuilt",
    "YearRebuilt",
    "departing_temperature_2m",
    # "departing_precipitation",
    "departing_cloud_cover",
    "departing_wind_speed_10m",
    "departing_wind_direction_10m",
    "departing_wind_gusts_10m",
    "arriving_temperature_2m",
    # "arriving_precipitation",
    "arriving_cloud_cover",
    "arriving_wind_speed_10m",
    "arriving_wind_direction_10m",
    "arriving_wind_gusts_10m",
]

# Columns that are one-hot encoded by the preprocessor.
categorical_features = [
    "Vessel",
    "Month",
    "Weekday",
    "Hour",
    "Departing",
    "Arriving",
    "ClassName",
    "PropulsionInfo",
    "departing_weather_code",
    "arriving_weather_code",
]

# For every categorical feature, print the number of rows per level with the
# rarest levels first.
for cf in categorical_features:
    print(f"feature: '{cf}', count:", trips_combined.group_by(cf).len().sort("len"))

In [None]:
# Weather codes that occur fewer than 300 times on either the departing or the
# arriving side; the two lists are merged into one set.
low_count_weather_codes = {
    *trips_combined.group_by("departing_weather_code")
    .len()
    .filter(pl.col("len") < 300)
    .get_column("departing_weather_code")
    .to_list(),
    *trips_combined.group_by("arriving_weather_code")
    .len()
    .filter(pl.col("len") < 300)
    .get_column("arriving_weather_code")
    .to_list(),
}

low_count_weather_codes

In [None]:
def recode_weather_codes(code):
    """Map a weather code to its string label, lumping rare codes together.

    Codes found in the module-level `low_count_weather_codes` set become
    "other"; every other code is returned as `str(code)`.
    """
    if code in low_count_weather_codes:
        return "other"
    return str(code)


# Recode both weather-code columns in place (no alias, so each expression
# overwrites the column it reads) and store the result as strings.
trips_combined = trips_combined.with_columns(
    pl.col("departing_weather_code").map_elements(
        recode_weather_codes, return_dtype=pl.String
    ),
    pl.col("arriving_weather_code").map_elements(
        recode_weather_codes, return_dtype=pl.String
    ),
)

In [None]:
# Drop the raw departure timestamps (already folded into the delay target) and
# reduce `Date` to a calendar date, which the train/monitoring split filters on.
trips_combined = trips_combined.select(cs.exclude("ScheduledDepart", "ActualDepart")).with_columns(pl.col("Date").dt.date())

In [None]:
# Display the final modeling table.
trips_combined

## Task 3 - Model Training

### 🔄 Task

Define a `scikit-learn` pipeline that

- Transforms the data for the model to ingest
- Trains a gradient boosted machine model to predict the logged departure delay

### 🧑‍💻 Code

In [None]:
# Column-wise preprocessing: numeric columns are forwarded untouched and
# categorical columns are one-hot encoded.
preprocessor = ColumnTransformer(
    [
        # this just passes the variables through as-is
        ("num", "passthrough", numeric_features),
        # this one-hot encodes the variables; a level that was not present when
        # fitting (e.g. a rare vessel that only landed in the test split, or a
        # new value sent to the deployed API) is encoded as all zeros instead
        # of raising at predict time
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

In [None]:
# Gradient-boosted regressor with a fixed seed; `verbose=2` makes it print
# progress while fitting.
regressor = HistGradientBoostingRegressor(verbose=2, random_state=2)

In [None]:
class DenseTransformer(TransformerMixin):
    """Pipeline step that turns a sparse matrix into a dense array.

    It sits between the preprocessor and the regressor in the pipeline. The
    step has no state: `fit` does nothing and `transform` only changes the
    container type.
    """

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return `self`; there is nothing to learn."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return `X` as a dense array.

        Sparse inputs (anything exposing `toarray`) are densified. Inputs that
        are already dense are returned unchanged — `ColumnTransformer` emits a
        dense array on its own when the stacked output is dense enough, and
        calling `.toarray()` on that would raise `AttributeError`.
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return X

In [None]:
# Full pipeline: column preprocessing -> conversion to a dense array -> regressor.
model = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("density", DenseTransformer()),
        ("regressor", regressor),
    ]
)

In [None]:
# Split on a single cutoff date two weeks before today. The cutoff is computed
# once so the two filters are exact complements even if the cell happens to run
# across midnight.
monitoring_cutoff = datetime.date.today() - datetime.timedelta(weeks=2)

# trips before the cutoff are used for training and testing
train_test_data = trips_combined.filter(pl.col("Date") < monitoring_cutoff)

# trips from the last two weeks are held back for monitoring
monitoring_data = trips_combined.filter(pl.col("Date") >= monitoring_cutoff)

In [None]:
# Persist the monitoring rows to `<username>_monitoring_data`, replacing the
# table if it already exists.
monitoring_data.write_database(
    table_name=f"{username}_monitoring_data",
    connection=db_uri,
    engine="adbc",
    if_table_exists="replace",
)

In [None]:
# Features are every column except the target (`LogDelay`) and `Date`.
X = train_test_data.drop("LogDelay", "Date")
y = train_test_data["LogDelay"]

# 80/20 train/test split with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

print(f"Number of rows of training data: {X_train.shape[0]}")
print(f"Number of rows testing data:  {X_test.shape[0]}")

In [None]:
# Persist the held-out features together with their target column to
# `<username>_test_data`, replacing the table if it already exists.
X_test.with_columns(y_test).write_database(
    table_name=f"{username}_test_data",
    connection=db_uri,
    engine="adbc",
    if_table_exists="replace",
)

In [None]:
%%time
# Fit the whole pipeline; the Polars training frame is converted to pandas first.
model.fit(X_train.to_pandas(), y_train)

In [None]:
# Score the pipeline on the held-out split. The frame is converted to pandas so
# the pipeline receives the same input type it was fitted on (`model.fit` is
# given `X_train.to_pandas()`), which keeps name-based column selection and
# feature-name checks consistent between fit and score.
model.score(X_test.to_pandas(), y_test)

## Task 4 - Model Deployment

### 🔄 Task

- Deploy the model using `vetiver` and `pins` onto Posit Connect
- Deploy an API around the model onto Posit Connect

### 🧑‍💻 Code

In [None]:
# Wrap the fitted pipeline as a vetiver model named `<username>/ferry_delay`.
# The full feature frame (as pandas) is supplied as the prototype data.
v = VetiverModel(
    model, model_name=f"{username}/ferry_delay", prototype_data=X.to_pandas()
)

In [None]:
# Create a pins board on Connect and write the vetiver model to it.
# NOTE(review): no server URL or API key is passed — presumably they are read
# from the environment; confirm.
model_board = pins.board_connect(allow_pickle_read=True)
vetiver.vetiver_pin_write(model_board, model=v)

In [None]:
# Read one specific pinned model version back from Connect.
# NOTE(review): both imports repeat imports already made at the top of the
# notebook. The server URL, the pin owner ('brooklynbagel') and the version
# ('266') are hard-coded rather than derived from `username` / `model_board`,
# so this cell looks tied to one user's deployment — confirm whether it should
# be kept.
import pins
from vetiver import VetiverModel

b = pins.board_connect(server_url='https://pub.ferryland.posit.team/', allow_pickle_read=True)
v2 = VetiverModel.from_pin(b, 'brooklynbagel/ferry_delay', version = '266')
v2

In [None]:
# Inspect a module named `model` and the names it exposes.
# NOTE(review): this imports a module called `model`, which is not created
# anywhere in this notebook and is distinct from the `model` pipeline variable
# (the `mo` alias means that variable is not rebound). Looks like a debugging
# cell — confirm the module exists next to the notebook.
import model as mo

mo.DenseTransformer

dir(mo)

In [None]:
# Generate `app.py`, an API app file for the pinned `<username>/ferry_delay`
# model on `model_board`.
vetiver.write_app(
    file="app.py",
    board=model_board,
    pin_name=f"{username}/ferry_delay"
)

In [None]:
%%time

# Shell out to the rsconnect CLI to deploy the FastAPI app in the `api` directory.
# NOTE(review): `vetiver.write_app` writes `app.py` to the working directory,
# not to `api/` — confirm the directory being deployed contains the generated app.
!rsconnect deploy fastapi api

In [None]:

# connect_server = RSConnectServer(url=os.environ["CONNECT_SERVER"], api_key=os.environ["CONNECT_API_KEY"])

# vetiver.deploy_rsconnect(
#     connect_server=connect_server,
#     board=model_board,
#     pin_name=f"{username}/ferry_delay",
#     extra_files=["requirements.txt"],
# )

## Task 5 - Model Card

### 🔄 Task

- Use a model card to describe various metrics for how the model performs
- Deploy the card to Connect

### 🧑‍💻 Code

In [None]:
# vetiver.templates.model_card()