# Model Training

## Task 0 - Setup

In [1]:
import datetime
import os

from pathlib import Path

import pins
import vetiver
import polars as pl
import polars.selectors as cs

from posit.connect import Client
from sklearn.base import TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from rsconnect.api import RSConnectServer
from vetiver import VetiverModel

# Load variables from a local `.env` file when one exists, so the
# `os.environ[...]` lookups later in the notebook can resolve on local runs.
if Path(".env").exists():
    # Imported lazily: `dotenv` is only required when a .env file is present.
    from dotenv import load_dotenv

    load_dotenv()

In [2]:
# Ask Posit Connect for the current account. `username` is used below as the
# prefix for this user's database tables and for the model pin name.
# NOTE(review): `Client()` is called without arguments, so it presumably picks
# up the server URL / API key from the environment - confirm.
with Client() as client:
    username = client.me.username

print(f"Connect username is: '{username}'")

Connect username is: 'brooklynbagel'


## Task 1 - Reading the data

### 🔄 Task

- Read in and glimpse the vessel history data
- Read in and glimpse the vessel verbose data
- Read in and glimpse the weather data

### 🧑‍💻 Code

In [3]:
# Database connection string, taken from the environment so that no credentials
# are stored in the notebook. Raises KeyError if DATABASE_URI_PYTHON is unset.
db_uri = os.environ["DATABASE_URI_PYTHON"]

In [4]:
# Load this user's cleaned vessel history table (one row per sailing).
vessel_history_query = f"SELECT * FROM {username}_vessel_history_clean;"
vessel_history = pl.read_database_uri(
    uri=db_uri,
    query=vessel_history_query,
    engine="adbc",
)

vessel_history.head(3)

Vessel,Departing,Arriving,ScheduledDepart,ActualDepart,EstArrival,Date
str,str,str,"datetime[μs, UTC]","datetime[μs, UTC]","datetime[μs, UTC]","datetime[μs, UTC]"
"""cathlamet""","""vashon island""","""southworth""",2024-03-01 08:15:00 UTC,2024-03-01 08:16:09 UTC,2024-03-01 08:33:19 UTC,2024-03-01 08:15:00 UTC
"""cathlamet""","""southworth""","""fauntleroy""",2024-03-01 08:35:00 UTC,2024-03-01 08:36:11 UTC,2024-03-01 08:55:49 UTC,2024-03-01 08:35:00 UTC
"""cathlamet""","""fauntleroy""","""vashon island""",2024-03-01 09:05:00 UTC,2024-03-01 09:07:30 UTC,2024-03-01 09:22:30 UTC,2024-03-01 09:05:00 UTC


In [5]:
# Load this user's cleaned vessel specification table (one row per vessel).
vessel_verbose_query = f"SELECT * FROM {username}_vessel_verbose_clean;"
vessel_verbose = pl.read_database_uri(
    uri=db_uri,
    query=vessel_verbose_query,
    engine="adbc",
)

vessel_verbose.head(3)

VesselID,VesselSubjectID,VesselName,VesselAbbrev,ClassID,ClassSubjectID,ClassName,SortSeq,DrawingImg,SilhouetteImg,PublicDisplayName,Status,OwnedByWSF,CarDeckRestroom,CarDeckShelter,Elevator,ADAAccessible,MainCabinGalley,MainCabinRestroom,PublicWifi,ADAInfo,AdditionalInfo,VesselNameDesc,VesselHistory,CityBuilt,SpeedInKnots,EngineCount,Horsepower,MaxPassengerCount,PassengerOnly,FastFerry,PropulsionInfo,TallDeckClearance,RegDeckSpace,TallDeckSpace,Tonnage,Displacement,YearBuilt,YearRebuilt,SolasCertified,MaxPassengerCountForInternational,BeamInches,LengthInches,DraftInches
i64,i64,str,str,i64,i64,str,i64,str,str,str,i64,bool,bool,bool,bool,bool,bool,bool,bool,str,str,str,str,str,i64,i64,i64,i64,bool,bool,str,i64,i64,i64,i64,i64,date,date,bool,i64,i64,i64,i64
1,1,"""cathlamet""","""cat""",10,310,"""issaquah 130""",40,"""https://www.wsdot.wa.gov/ferri…","""https://www.wsdot.wa.gov/ferri…","""Issaquah""",1,True,True,False,True,True,True,True,False,"""The MV Cathlamet has elevator …",,"""From the Kathlamet tribe, the …",,"""seattle, wa""",16,2,5000,1200,False,False,"""diesel""",186,124,26,2477,3310,1981-01-01,1993-01-01,False,,944,3936,198
2,2,"""chelan""","""che""",10,310,"""issaquah 130""",40,"""https://www.wsdot.wa.gov/ferri…","""https://www.wsdot.wa.gov/ferri…","""Issaquah""",1,True,True,False,True,True,True,True,False,"""The MV Chelan has elevator acc…",,"""From the Chelan language: Tsi…",,"""seattle, wa""",16,2,5000,1200,False,False,"""diesel""",188,124,30,2477,3405,1981-01-01,2005-01-01,True,1090.0,944,3936,201
65,428,"""chetzemoka""","""chz""",162,427,"""kwa-di tabil""",75,"""https://www.wsdot.wa.gov/ferri…","""https://www.wsdot.wa.gov/ferri…","""Kwa-di Tabil""",1,True,False,False,True,True,True,True,False,"""MV Chetzemoka has elevator acc…",,"""The name honors a friendly Nat…",,"""seattle""",15,2,6000,748,False,False,"""diesel""",192,64,9,4623,2415,2010-01-01,,False,,768,3284,132


In [6]:
# Load this user's cleaned hourly weather observations, keyed by terminal.
weather_query = f"SELECT * FROM {username}_terminal_weather_clean;"
weather = pl.read_database_uri(
    uri=db_uri,
    query=weather_query,
    engine="adbc",
)

weather.head(3)

latitude,longitude,generationtime_ms,utc_offset_seconds,timezone,timezone_abbreviation,elevation,time,weather_code,temperature_2m,precipitation,cloud_cover,wind_speed_10m,wind_direction_10m,wind_gusts_10m,terminal_name
f64,f64,f64,i64,str,str,f64,"datetime[μs, UTC]",i64,f64,f64,i64,f64,i64,f64,str
48.541298,-122.727264,6.186008,0,"""gmt""","""gmt""",0.0,2024-03-01 00:00:00 UTC,3,6.2,0.0,80,37.4,154,47.9,"""anacortes"""
48.541298,-122.727264,6.186008,0,"""gmt""","""gmt""",0.0,2024-03-01 01:00:00 UTC,51,5.9,0.1,94,36.7,159,48.6,"""anacortes"""
48.541298,-122.727264,6.186008,0,"""gmt""","""gmt""",0.0,2024-03-01 02:00:00 UTC,3,5.7,0.0,83,36.6,156,49.0,"""anacortes"""


## Task 2 - Feature Engineering

### 🔄 Task

- Join the `vessel_history`, `vessel_verbose` and `weather` data into a form useful for modeling
- Transform the columns into new ones we can use for modeling

### 🧑‍💻 Code

In [7]:
# Departure delay in whole seconds (negative when a vessel leaves early).
departure_delay_seconds = (
    pl.col("ActualDepart") - pl.col("ScheduledDepart")
).dt.total_seconds()

# Add the delay plus calendar features derived from the sailing's `Date`
# timestamp; `EstArrival` is not used for modeling and is dropped.
trips_combined = vessel_history.with_columns(
    Delay=departure_delay_seconds,
    Month=pl.col("Date").dt.month(),
    Weekday=pl.col("Date").dt.weekday(),
    Hour=pl.col("Date").dt.hour(),
).drop("EstArrival")

trips_combined.head(3)

Vessel,Departing,Arriving,ScheduledDepart,ActualDepart,Date,Delay,Month,Weekday,Hour
str,str,str,"datetime[μs, UTC]","datetime[μs, UTC]","datetime[μs, UTC]",i64,i8,i8,i8
"""cathlamet""","""vashon island""","""southworth""",2024-03-01 08:15:00 UTC,2024-03-01 08:16:09 UTC,2024-03-01 08:15:00 UTC,69,3,5,8
"""cathlamet""","""southworth""","""fauntleroy""",2024-03-01 08:35:00 UTC,2024-03-01 08:36:11 UTC,2024-03-01 08:35:00 UTC,71,3,5,8
"""cathlamet""","""fauntleroy""","""vashon island""",2024-03-01 09:05:00 UTC,2024-03-01 09:07:30 UTC,2024-03-01 09:05:00 UTC,150,3,5,9


In [8]:
# Histogram of the raw delay in seconds; `bin_range` limits the view to
# -1800 s .. 7200 s (30 minutes early to 2 hours late).
trips_combined.plot.hist("Delay", bin_range=(-1800, 7200), bins=30)

%opts magic unavailable (pyparsing cannot be imported)
%compositor magic unavailable (pyparsing cannot be imported)


In [9]:
# Replace the raw delay with its natural log as the modeling target.
# Delays below one second (early or on-time departures) are floored at 1 so the
# log is defined and those sailings map to LogDelay == 0.
# `clip` is a native polars expression, so no per-row Python callback is needed.
trips_combined = trips_combined.select(
    pl.exclude("Delay"),
    pl.col("Delay")
    .clip(lower_bound=1)
    .cast(pl.Float64)
    .log()
    .alias("LogDelay"),
)

trips_combined.plot.hist("LogDelay")

In [10]:
# Vessel specification columns carried into the model unchanged.
vessel_attribute_columns = [
    "SpeedInKnots",
    "EngineCount",
    "Horsepower",
    "MaxPassengerCount",
    "PassengerOnly",
    "FastFerry",
    "PropulsionInfo",
]

# Keep the vessel name (join key), its class, the attributes above, and the
# build / rebuild dates reduced to their year.
vessel_info = vessel_verbose.select(
    pl.col("VesselName", "ClassName"),
    # a list of names selects several columns with one `pl.col(...)`
    pl.col(vessel_attribute_columns),
    pl.col("YearBuilt", "YearRebuilt").dt.year(),
)

vessel_info.head(3)

VesselName,ClassName,SpeedInKnots,EngineCount,Horsepower,MaxPassengerCount,PassengerOnly,FastFerry,PropulsionInfo,YearBuilt,YearRebuilt
str,str,i64,i64,i64,i64,bool,bool,str,i32,i32
"""cathlamet""","""issaquah 130""",16,2,5000,1200,False,False,"""diesel""",1981,1993.0
"""chelan""","""issaquah 130""",16,2,5000,1200,False,False,"""diesel""",1981,2005.0
"""chetzemoka""","""kwa-di tabil""",15,2,6000,748,False,False,"""diesel""",2010,


In [11]:
# Attach the vessel attributes to every sailing. The key is renamed on the
# right-hand side so both frames share a single `Vessel` join column.
vessel_info_by_vessel = vessel_info.rename({"VesselName": "Vessel"})
trips_combined = trips_combined.join(
    vessel_info_by_vessel,
    on="Vessel",
    how="left",
    coalesce=True,
)

trips_combined.head(3)

Vessel,Departing,Arriving,ScheduledDepart,ActualDepart,Date,Month,Weekday,Hour,LogDelay,ClassName,SpeedInKnots,EngineCount,Horsepower,MaxPassengerCount,PassengerOnly,FastFerry,PropulsionInfo,YearBuilt,YearRebuilt
str,str,str,"datetime[μs, UTC]","datetime[μs, UTC]","datetime[μs, UTC]",i8,i8,i8,f64,str,i64,i64,i64,i64,bool,bool,str,i32,i32
"""cathlamet""","""vashon island""","""southworth""",2024-03-01 08:15:00 UTC,2024-03-01 08:16:09 UTC,2024-03-01 08:15:00 UTC,3,5,8,4.234107,"""issaquah 130""",16,2,5000,1200,False,False,"""diesel""",1981,1993
"""cathlamet""","""southworth""","""fauntleroy""",2024-03-01 08:35:00 UTC,2024-03-01 08:36:11 UTC,2024-03-01 08:35:00 UTC,3,5,8,4.26268,"""issaquah 130""",16,2,5000,1200,False,False,"""diesel""",1981,1993
"""cathlamet""","""fauntleroy""","""vashon island""",2024-03-01 09:05:00 UTC,2024-03-01 09:07:30 UTC,2024-03-01 09:05:00 UTC,3,5,9,5.010635,"""issaquah 130""",16,2,5000,1200,False,False,"""diesel""",1981,1993


In [12]:
# Only the observation time, the measurements, and the terminal are needed.
weather_columns = [
    "time",
    "weather_code",
    "temperature_2m",
    "precipitation",
    "cloud_cover",
    "wind_speed_10m",
    "wind_direction_10m",
    "wind_gusts_10m",
    "terminal_name",
]
weather = weather.select(weather_columns)

# One prefixed copy of the weather table per end of the trip, so the two sets
# of measurements do not collide after joining.
departing_weather = weather.rename(lambda col_name: f"departing_{col_name}")
arriving_weather = weather.rename(lambda col_name: f"arriving_{col_name}")

# Weather is hourly, so round each sailing to the nearest hour to get a key.
trips_hourly = trips_combined.with_columns(time=pl.col("Date").dt.round("1h"))

trips_with_departing = trips_hourly.join(
    departing_weather,
    left_on=["Departing", "time"],
    right_on=["departing_terminal_name", "departing_time"],
    how="left",
    coalesce=True,
)
trips_with_weather = trips_with_departing.join(
    arriving_weather,
    left_on=["Arriving", "time"],
    right_on=["arriving_terminal_name", "arriving_time"],
    how="left",
    coalesce=True,
)

# The rounded timestamp was only a join key.
trips_combined = trips_with_weather.drop("time")

trips_combined.head(3)

Vessel,Departing,Arriving,ScheduledDepart,ActualDepart,Date,Month,Weekday,Hour,LogDelay,ClassName,SpeedInKnots,EngineCount,Horsepower,MaxPassengerCount,PassengerOnly,FastFerry,PropulsionInfo,YearBuilt,YearRebuilt,departing_weather_code,departing_temperature_2m,departing_precipitation,departing_cloud_cover,departing_wind_speed_10m,departing_wind_direction_10m,departing_wind_gusts_10m,arriving_weather_code,arriving_temperature_2m,arriving_precipitation,arriving_cloud_cover,arriving_wind_speed_10m,arriving_wind_direction_10m,arriving_wind_gusts_10m
str,str,str,"datetime[μs, UTC]","datetime[μs, UTC]","datetime[μs, UTC]",i8,i8,i8,f64,str,i64,i64,i64,i64,bool,bool,str,i32,i32,i64,f64,f64,i64,f64,i64,f64,i64,f64,f64,i64,f64,i64,f64
"""cathlamet""","""vashon island""","""southworth""",2024-03-01 08:15:00 UTC,2024-03-01 08:16:09 UTC,2024-03-01 08:15:00 UTC,3,5,8,4.234107,"""issaquah 130""",16,2,5000,1200,False,False,"""diesel""",1981,1993,73,2.6,1.6,100,16.5,203,37.4,73,2.5,1.6,100,16.5,203,37.4
"""cathlamet""","""southworth""","""fauntleroy""",2024-03-01 08:35:00 UTC,2024-03-01 08:36:11 UTC,2024-03-01 08:35:00 UTC,3,5,8,4.26268,"""issaquah 130""",16,2,5000,1200,False,False,"""diesel""",1981,1993,73,3.0,0.7,100,19.3,204,41.4,75,3.5,2.8,100,20.6,196,39.2
"""cathlamet""","""fauntleroy""","""vashon island""",2024-03-01 09:05:00 UTC,2024-03-01 09:07:30 UTC,2024-03-01 09:05:00 UTC,3,5,9,5.010635,"""issaquah 130""",16,2,5000,1200,False,False,"""diesel""",1981,1993,75,3.5,2.8,100,20.6,196,39.2,73,3.1,0.7,100,19.3,204,41.4


In [13]:
# Count nulls per column to see which rows the left joins failed to match.
trips_combined.null_count()

Vessel,Departing,Arriving,ScheduledDepart,ActualDepart,Date,Month,Weekday,Hour,LogDelay,ClassName,SpeedInKnots,EngineCount,Horsepower,MaxPassengerCount,PassengerOnly,FastFerry,PropulsionInfo,YearBuilt,YearRebuilt,departing_weather_code,departing_temperature_2m,departing_precipitation,departing_cloud_cover,departing_wind_speed_10m,departing_wind_direction_10m,departing_wind_gusts_10m,arriving_weather_code,arriving_temperature_2m,arriving_precipitation,arriving_cloud_cover,arriving_wind_speed_10m,arriving_wind_direction_10m,arriving_wind_gusts_10m
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,28802,115,115,115,115,115,115,115,115,115,115,115,115,115,115


In [14]:
# Drop rows that have a null in any column other than `YearRebuilt`.
# `YearRebuilt` is excluded from the check, so its nulls are kept.
# NOTE(review): presumably a null YearRebuilt means "never rebuilt" - confirm.
trips_combined = trips_combined.drop_nulls(subset=cs.exclude("YearRebuilt"))

# Re-check: only `YearRebuilt` should still report nulls.
trips_combined.null_count()

Vessel,Departing,Arriving,ScheduledDepart,ActualDepart,Date,Month,Weekday,Hour,LogDelay,ClassName,SpeedInKnots,EngineCount,Horsepower,MaxPassengerCount,PassengerOnly,FastFerry,PropulsionInfo,YearBuilt,YearRebuilt,departing_weather_code,departing_temperature_2m,departing_precipitation,departing_cloud_cover,departing_wind_speed_10m,departing_wind_direction_10m,departing_wind_gusts_10m,arriving_weather_code,arriving_temperature_2m,arriving_precipitation,arriving_cloud_cover,arriving_wind_speed_10m,arriving_wind_direction_10m,arriving_wind_gusts_10m
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,28727,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [15]:
# Columns passed to the model as-is. Commented-out entries are columns that
# exist in `trips_combined` but are deliberately not used as features.
numeric_features = [
    "SpeedInKnots",
    "EngineCount",
    "Horsepower",
    "MaxPassengerCount",
    # "PassengerOnly",
    # "FastFerry",
    "YearBuilt",
    "YearRebuilt",
    "departing_temperature_2m",
    # "departing_precipitation",
    "departing_cloud_cover",
    "departing_wind_speed_10m",
    "departing_wind_direction_10m",
    "departing_wind_gusts_10m",
    "arriving_temperature_2m",
    # "arriving_precipitation",
    "arriving_cloud_cover",
    "arriving_wind_speed_10m",
    "arriving_wind_direction_10m",
    "arriving_wind_gusts_10m",
]

# Columns treated as categories (one-hot encoded later), including the integer
# calendar fields and the weather codes.
categorical_features = [
    "Vessel",
    "Month",
    "Weekday",
    "Hour",
    "Departing",
    "Arriving",
    "ClassName",
    "PropulsionInfo",
    "departing_weather_code",
    "arriving_weather_code",
]

# Print the row count of every level of each categorical feature, rarest
# first, to spot levels with too few rows to be useful.
for cf in categorical_features:
    print(
        f"feature: '{cf}', count:",
        trips_combined.group_by(cf).agg(pl.len()).sort("len"),
    )

feature: 'Vessel', count: shape: (19, 2)
┌────────────┬──────┐
│ Vessel     ┆ len  │
│ ---        ┆ ---  │
│ str        ┆ u32  │
╞════════════╪══════╡
│ salish     ┆ 872  │
│ chelan     ┆ 1715 │
│ puyallup   ┆ 1767 │
│ yakima     ┆ 1961 │
│ samish     ┆ 2084 │
│ …          ┆ …    │
│ tillikum   ┆ 3349 │
│ cathlamet  ┆ 3385 │
│ tokitae    ┆ 4221 │
│ chetzemoka ┆ 4947 │
│ kittitas   ┆ 5489 │
└────────────┴──────┘
feature: 'Month', count: shape: (6, 2)
┌───────┬───────┐
│ Month ┆ len   │
│ ---   ┆ ---   │
│ i8    ┆ u32   │
╞═══════╪═══════╡
│ 8     ┆ 328   │
│ 7     ┆ 10138 │
│ 6     ┆ 10292 │
│ 4     ┆ 11020 │
│ 3     ┆ 11168 │
│ 5     ┆ 11374 │
└───────┴───────┘
feature: 'Weekday', count: shape: (7, 2)
┌─────────┬──────┐
│ Weekday ┆ len  │
│ ---     ┆ ---  │
│ i8      ┆ u32  │
╞═════════╪══════╡
│ 4       ┆ 7639 │
│ 7       ┆ 7666 │
│ 6       ┆ 7688 │
│ 5       ┆ 7689 │
│ 3       ┆ 7730 │
│ 2       ┆ 7847 │
│ 1       ┆ 8061 │
└─────────┴──────┘
feature: 'Hour', count: shape: (24, 2)
┌──

In [16]:
# Collect every weather code that occurs on fewer than 300 sailings at either
# the departing or the arriving terminal.
low_count_weather_codes = set()
for weather_code_column in ("departing_weather_code", "arriving_weather_code"):
    code_counts = trips_combined.group_by(weather_code_column).agg(pl.len())
    rare_codes = code_counts.filter(pl.col("len") < 300)[weather_code_column]
    low_count_weather_codes.update(rare_codes.to_list())

low_count_weather_codes

{63, 65, 71, 73, 75}

In [17]:
def recode_weather_codes(code):
    """Return ``"other"`` for a low-count weather code, else the code as a string.

    Scalar form of the recoding rule. The column-wise expressions below apply
    the same rule natively, without a Python call per row.
    """
    return "other" if code in low_count_weather_codes else str(code)


# String labels of the rare codes, compared against the string-cast column so
# the comparison does not depend on the column's current dtype.
_low_count_code_labels = [str(code) for code in sorted(low_count_weather_codes)]


def _recode_weather_column(column_name):
    """Build a polars expression that lumps rare codes in one column into "other"."""
    code_as_text = pl.col(column_name).cast(pl.String)
    return (
        pl.when(code_as_text.is_in(_low_count_code_labels))
        .then(pl.lit("other"))
        .otherwise(code_as_text)
        .alias(column_name)
    )


# Both weather-code columns become strings, with the rare codes merged.
trips_combined = trips_combined.with_columns(
    _recode_weather_column("departing_weather_code"),
    _recode_weather_column("arriving_weather_code"),
)

In [18]:
# The raw departure timestamps are no longer needed once the delay target
# exists; `Date` is reduced to a calendar date for the time-based split later.
trips_combined = (
    trips_combined
    .select(cs.exclude("ScheduledDepart", "ActualDepart"))
    .with_columns(pl.col("Date").dt.date())
)

In [19]:
# Display the finished modeling table (the rendered output is truncated).
trips_combined

Vessel,Departing,Arriving,Date,Month,Weekday,Hour,LogDelay,ClassName,SpeedInKnots,EngineCount,Horsepower,MaxPassengerCount,PassengerOnly,FastFerry,PropulsionInfo,YearBuilt,YearRebuilt,departing_weather_code,departing_temperature_2m,departing_precipitation,departing_cloud_cover,departing_wind_speed_10m,departing_wind_direction_10m,departing_wind_gusts_10m,arriving_weather_code,arriving_temperature_2m,arriving_precipitation,arriving_cloud_cover,arriving_wind_speed_10m,arriving_wind_direction_10m,arriving_wind_gusts_10m
str,str,str,date,i8,i8,i8,f64,str,i64,i64,i64,i64,bool,bool,str,i32,i32,str,f64,f64,i64,f64,i64,f64,str,f64,f64,i64,f64,i64,f64
"""cathlamet""","""vashon island""","""southworth""",2024-03-01,3,5,8,4.234107,"""issaquah 130""",16,2,5000,1200,false,false,"""diesel""",1981,1993,"""other""",2.6,1.6,100,16.5,203,37.4,"""other""",2.5,1.6,100,16.5,203,37.4
"""cathlamet""","""southworth""","""fauntleroy""",2024-03-01,3,5,8,4.26268,"""issaquah 130""",16,2,5000,1200,false,false,"""diesel""",1981,1993,"""other""",3.0,0.7,100,19.3,204,41.4,"""other""",3.5,2.8,100,20.6,196,39.2
"""cathlamet""","""fauntleroy""","""vashon island""",2024-03-01,3,5,9,5.010635,"""issaquah 130""",16,2,5000,1200,false,false,"""diesel""",1981,1993,"""other""",3.5,2.8,100,20.6,196,39.2,"""other""",3.1,0.7,100,19.3,204,41.4
"""cathlamet""","""vashon island""","""southworth""",2024-03-01,3,5,9,3.401197,"""issaquah 130""",16,2,5000,1200,false,false,"""diesel""",1981,1993,"""3""",3.5,0.0,100,21.1,197,41.4,"""3""",3.4,0.0,100,21.1,197,41.4
"""cathlamet""","""vashon island""","""fauntleroy""",2024-03-01,3,5,12,4.553877,"""issaquah 130""",16,2,5000,1200,false,false,"""diesel""",1981,1993,"""3""",3.2,0.0,99,18.4,191,43.2,"""3""",3.3,0.0,91,19.7,189,40.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""yakima""","""lopez island""","""anacortes""",2024-08-01,8,4,16,7.753194,"""super""",17,4,8000,2000,false,false,"""diesel-electric (dc)""",1967,2000,"""0""",19.7,0.0,0,19.6,347,42.1,"""0""",14.5,0.0,20,16.3,354,20.9
"""yakima""","""orcas island""","""shaw island""",2024-08-01,8,4,19,8.408494,"""super""",17,4,8000,2000,false,false,"""diesel-electric (dc)""",1967,2000,"""0""",22.8,0.0,13,11.8,348,34.2,"""0""",23.3,0.0,13,11.8,348,34.2
"""yakima""","""shaw island""","""orcas island""",2024-08-01,8,4,19,7.32975,"""super""",17,4,8000,2000,false,false,"""diesel-electric (dc)""",1967,2000,"""0""",23.3,0.0,13,11.8,348,34.2,"""0""",22.8,0.0,13,11.8,348,34.2
"""yakima""","""shaw island""","""anacortes""",2024-08-01,8,4,19,8.404472,"""super""",17,4,8000,2000,false,false,"""diesel-electric (dc)""",1967,2000,"""0""",24.4,0.0,17,11.4,349,31.7,"""0""",16.2,0.0,13,13.6,338,14.8


## Task 3 - Model Training

### 🔄 Task

Define a `scikit-learn` pipeline that

- Transforms the data for the model to ingest
- Trains a gradient boosted machine model to predict the logged departure delay

### 🧑‍💻 Code

In [20]:
# Column-wise preprocessing. Columns not listed in either feature list are
# dropped by ColumnTransformer's default `remainder="drop"`.
preprocessor = ColumnTransformer(
    [
        # this just passes the variables through as-is
        ("num", "passthrough", numeric_features),
        # this one-hot encodes the variables; categories that were not seen
        # during fit (e.g. a new month or weather code in the monitoring data
        # or in API requests) are encoded as all zeros instead of raising
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

In [21]:
# Gradient boosted trees with library defaults; `random_state` is fixed so
# repeated fits are reproducible, and `verbose=2` prints training progress.
regressor = HistGradientBoostingRegressor(verbose=2, random_state=2)

In [22]:
class DenseTransformer(TransformerMixin):
    """Pipeline step that converts a sparse matrix into a dense array.

    The preprocessing step can emit a sparse matrix; this step densifies it
    before it reaches the regressor. Input that is already dense is returned
    unchanged, so the step is safe whichever form the preprocessor produces.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense array.

        Objects exposing ``toarray()`` (scipy sparse matrices) are converted;
        anything else is assumed to be dense already and is passed through.
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return X

In [23]:
# Full modeling pipeline: encode the features, densify the encoded matrix,
# then fit the gradient boosted regressor.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("density", DenseTransformer()),
    ("regressor", regressor),
]
model = Pipeline(steps=pipeline_steps)

In [24]:
# Sailings from the most recent two weeks are held back for monitoring; all
# older sailings are used for training and testing.
# The cutoff is computed once so both filters use the same date even if the
# cell runs across midnight, which keeps the two sets disjoint and exhaustive.
monitoring_cutoff = datetime.date.today() - datetime.timedelta(weeks=2)

train_test_data = trips_combined.filter(pl.col("Date") < monitoring_cutoff)

monitoring_data = trips_combined.filter(pl.col("Date") >= monitoring_cutoff)

In [25]:
# Persist the held-back sailings to this user's monitoring table, replacing
# any table left over from a previous run.
monitoring_table = f"{username}_monitoring_data"
monitoring_data.write_database(
    monitoring_table,
    connection=db_uri,
    engine="adbc",
    if_table_exists="replace",
)

2478

In [26]:
# Separate the target from the features; `Date` was only needed for the
# time-based split and is not a feature.
target_column = "LogDelay"
X = train_test_data.drop(target_column, "Date")
y = train_test_data.get_column(target_column)

# Random 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

print(f"Number of rows of training data: {X_train.shape[0]}")
print(f"Number of rows testing data:  {X_test.shape[0]}")

Number of rows of training data: 41473
Number of rows testing data:  10369


In [27]:
# Persist the held-out features together with their target to this user's
# test table, replacing any table left over from a previous run.
test_data = X_test.with_columns(y_test)
test_table = f"{username}_test_data"
test_data.write_database(
    test_table,
    connection=db_uri,
    engine="adbc",
    if_table_exists="replace",
)

10369

In [28]:
%%time
# Fit the whole pipeline; the polars features are converted to pandas first.
model.fit(X_train.to_pandas(), y_train)

Binning 0.041 GB of training data: 0.192 s
Binning 0.005 GB of validation data: 0.018 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 31 leaves, max depth = 15, train loss: 1.40675, val loss: 1.53945, in 0.047s
[2/100] 1 tree, 31 leaves, max depth = 15, train loss: 1.37022, val loss: 1.50228, in 0.046s
[3/100] 1 tree, 31 leaves, max depth = 16, train loss: 1.33868, val loss: 1.46987, in 0.046s
[4/100] 1 tree, 31 leaves, max depth = 10, train loss: 1.31324, val loss: 1.44388, in 0.051s
[5/100] 1 tree, 31 leaves, max depth = 13, train loss: 1.29165, val loss: 1.42140, in 0.082s
[6/100] 1 tree, 31 leaves, max depth = 18, train loss: 1.27156, val loss: 1.39932, in 0.052s
[7/100] 1 tree, 31 leaves, max depth = 17, train loss: 1.25477, val loss: 1.38222, in 0.036s
[8/100] 1 tree, 31 leaves, max depth = 16, train loss: 1.23925, val loss: 1.36628, in 0.039s
[9/100] 1 tree, 31 leaves, max depth = 13, train loss: 1.22522, val loss: 1.35237, in 0.060s
[10/100] 1 tree, 31 leaves, max depth = 11

In [29]:
# Score the pipeline on the held-out split (R^2 for a regressor pipeline).
# The features are converted to pandas so the pipeline sees the same input
# type at scoring time as it did in `model.fit(X_train.to_pandas(), ...)`.
model.score(X_test.to_pandas(), y_test)

0.2596678547018034

## Task 4 - Model Deployment

### 🔄 Task

- Deploy the model using `vetiver` and `pins` onto Posit Connect
- Deploy an API around the model onto Posit Connect

### 🧑‍💻 Code

In [30]:
# Wrap the fitted pipeline for deployment. The model name is namespaced by the
# Connect username, and the pandas feature frame is passed as prototype data.
# NOTE(review): presumably vetiver uses the prototype to describe the expected
# input schema for the API - confirm.
v = VetiverModel(
    model, model_name=f"{username}/ferry_delay", prototype_data=X.to_pandas()
)

In [31]:
# Connect-backed pin board for the model.
# NOTE(review): `allow_pickle_read=True` permits loading pickled pins from this
# board; unpickling runs arbitrary code, so only use it with a trusted board.
model_board = pins.board_connect(allow_pickle_read=True)
# Write the wrapped model to the board as a new pin version.
vetiver.vetiver_pin_write(model_board, model=v)

Model Cards provide a framework for transparent, responsible reporting. 
 Use the vetiver `.qmd` Quarto template as a place to start, 
 with vetiver.model_card()
Writing pin:
Name: 'brooklynbagel/ferry_delay'
Version: 20240809T201245Z-1a3a1


In [34]:
# Generate `app.py`, the API entry point for the pinned model on this board.
vetiver.write_app(
    file="app.py",
    board=model_board,
    pin_name=f"{username}/ferry_delay"
)

In [41]:
%%time

# Deploy the current directory to Connect as a FastAPI app, bundling the
# listed extra files.
# NOTE(review): the `vetiver.write_app` cell writes `app.py`, but `model.py` is
# listed here - confirm `model.py` exists and is meant to be bundled.
!rsconnect deploy fastapi . requirements.txt model.py

[0mValidating server...[0m[32;20m 	[OK]
[0m[0mValidating app mode...[0m[32;20m 	[OK]
[0m[0mMaking bundle ...[0m[32;20m 	[OK]
[0m[0mDeploying bundle ...[0m[32;20m 	[OK]
[0m[0mSaving deployed information...[0m[32;20m 	[OK]
[0m[0mBuilding FastAPI application...[0m
[0mBundle created with Python version 3.12.3 is compatible with environment Kubernetes::ghcr.io/rstudio/content-pro:r4.4.1-py3.12.4-ubuntu2204 with Python version 3.12.4 from /opt/python/3.12.4/bin/python3 [0m
[0mBundle requested Python version 3.12.3; using /opt/python/3.12.4/bin/python3 from Kubernetes::ghcr.io/rstudio/content-pro:r4.4.1-py3.12.4-ubuntu2204 which has version 3.12.4[0m
[0mDetermining session server location ...[0m
[0m2024/08/09 20:47:03.812561868 [rsc-session] Content GUID: 95f4a9f3-a0d5-43dc-9304-af32a18daae7
2024/08/09 20:47:03.812612984 [rsc-session] Content ID: 120
2024/08/09 20:47:03.812618041 [rsc-session] Bundle ID: 281
2024/08/09 20:47:03.812621815 [rsc-session] Job Key: zFk

In [None]:

# connect_server = RSConnectServer(url=os.environ["CONNECT_SERVER"], api_key=os.environ["CONNECT_API_KEY"])

# vetiver.deploy_rsconnect(
#     connect_server=connect_server,
#     board=model_board,
#     pin_name=f"{username}/ferry_delay",
#     extra_files=["requirements.txt"],
# )

## Task 5 - Model Card

### 🔄 Task

- Use a model card to describe various metrics for how the model performs
- Deploy the card to Connect

### 🧑‍💻 Code

In [None]:
# vetiver.templates.model_card()