In [1]:
# Load IPython's autoreload extension and switch it to mode 2, so that modules
# edited on disk are re-imported before each cell execution.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

# Add the parent of the current working directory to the import path.
# NOTE(review): this assumes the kernel's working directory sits one level
# below the directory that contains the `src` package - confirm.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Guard the append so that re-running this cell does not pile up duplicate
# entries on sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [3]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

# Read the "tabular_data.parquet" file located under TRANSFORMED_DATA_DIR.
tabular_data_path = TRANSFORMED_DATA_DIR / "tabular_data.parquet"
df = pd.read_parquet(tabular_data_path)

In [4]:
# Show the loaded frame as this cell's output (pandas elides the middle
# rows and columns in the rendered preview).
df

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,start_hour,start_station_id,target
0,0,5,7,3,2,1,2,0,2,2,...,30,25,13,12,9,9,11,2023-01-29,5905.140137,10
1,2,0,0,0,1,1,0,0,6,6,...,24,17,7,11,14,7,4,2023-01-30,5905.140137,3
2,1,2,1,0,0,0,1,3,12,12,...,40,27,28,23,10,7,7,2023-01-31,5905.140137,3
3,2,5,0,0,1,0,4,7,34,22,...,34,34,22,22,12,17,4,2023-02-01,5905.140137,0
4,0,2,0,1,1,2,3,5,18,14,...,16,23,25,11,10,6,2,2023-02-02,5905.140137,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2374,4,1,0,1,0,2,8,21,16,11,...,70,19,13,8,9,1,5,2025-03-27,6822.089844,0
2375,0,0,1,0,1,1,3,12,23,6,...,61,43,19,11,9,4,6,2025-03-28,6822.089844,3
2376,3,4,1,1,1,1,2,4,16,9,...,49,33,33,8,7,2,4,2025-03-29,6822.089844,2
2377,0,3,0,0,0,0,1,2,7,10,...,12,7,16,6,3,1,7,2025-03-30,6822.089844,0


In [6]:
import numpy as np

# `start_station_id` holds floating-point ids (the frame preview shows values
# such as 5905.140137 and 6822.089844). An exact `==` against a typed-in
# decimal literal only matches when the column dtype and NumPy's scalar
# promotion rules happen to line up, so match within a small absolute
# tolerance instead.
STATION_ID_ATOL = 1e-3
station_ids = df["start_station_id"].to_numpy()

# One sub-frame per station; rtol=0 keeps the comparison purely absolute.
df_5905 = df[np.isclose(station_ids, 5905.140137, rtol=0.0, atol=STATION_ID_ATOL)]
df_6140 = df[np.isclose(station_ids, 6140.049805, rtol=0.0, atol=STATION_ID_ATOL)]
df_6822 = df[np.isclose(station_ids, 6822.089844, rtol=0.0, atol=STATION_ID_ATOL)]

In [8]:
from datetime import datetime

from src.data_utils import split_time_series_data

# Split the station-5905 rows into train/test features and targets around a
# 2025-01-01 00:00 cutoff; how rows on either side of the cutoff are assigned
# is decided by split_time_series_data.
X_train_5905, y_train_5905, X_test_5905, y_test_5905 = split_time_series_data(
    df_5905,
    cutoff_date=datetime(2025, 1, 1),
    target_column="target",
)

# Print the four shapes, one per line: train X, train y, test X, test y.
print(
    *(part.shape for part in (X_train_5905, y_train_5905, X_test_5905, y_test_5905)),
    sep="\n",
)

(703, 674)
(703,)
(90, 674)
(90,)


In [9]:
# Split the station-6140 rows into train/test features and targets around a
# 2025-01-01 00:00 cutoff; how rows on either side of the cutoff are assigned
# is decided by split_time_series_data.
X_train_6140, y_train_6140, X_test_6140, y_test_6140 = split_time_series_data(
    df_6140,
    cutoff_date=datetime(2025, 1, 1),
    target_column="target",
)

# Print the four shapes, one per line: train X, train y, test X, test y.
print(
    *(part.shape for part in (X_train_6140, y_train_6140, X_test_6140, y_test_6140)),
    sep="\n",
)

(703, 674)
(703,)
(90, 674)
(90,)


In [10]:
# Split the station-6822 rows into train/test features and targets around a
# 2025-01-01 00:00 cutoff; how rows on either side of the cutoff are assigned
# is decided by split_time_series_data.
X_train_6822, y_train_6822, X_test_6822, y_test_6822 = split_time_series_data(
    df_6822,
    cutoff_date=datetime(2025, 1, 1),
    target_column="target",
)

# Print the four shapes, one per line: train X, train y, test X, test y.
print(
    *(part.shape for part in (X_train_6822, y_train_6822, X_test_6822, y_test_6822)),
    sep="\n",
)

(703, 674)
(703,)
(90, 674)
(90,)


In [12]:
# Keep only the columns whose name starts with "rides_" as model inputs; the
# other feature columns are dropped (the frame preview shows `start_hour` and
# `start_station_id`). The column list is derived once from the station-5905
# training frame and reused for every station.
train_columns_5905 = X_train_5905.columns
past_ride_columns = train_columns_5905[train_columns_5905.str.startswith("rides_")].tolist()

# Station 5905
X_train_only_numeric_5905 = X_train_5905.loc[:, past_ride_columns]
X_test_only_numeric_5905 = X_test_5905.loc[:, past_ride_columns]

# Station 6140
X_train_only_numeric_6140 = X_train_6140.loc[:, past_ride_columns]
X_test_only_numeric_6140 = X_test_6140.loc[:, past_ride_columns]

# Station 6822
X_train_only_numeric_6822 = X_train_6822.loc[:, past_ride_columns]
X_test_only_numeric_6822 = X_test_6822.loc[:, past_ride_columns]

In [14]:
import lightgbm as lgb

# Fit one default-parameter LightGBM regressor per station on that station's
# "rides_*" training columns. `fit` returns the estimator itself, so the
# construction and the fit can be chained.
model_5905 = lgb.LGBMRegressor().fit(X_train_only_numeric_5905, y_train_5905)
model_6140 = lgb.LGBMRegressor().fit(X_train_only_numeric_6140, y_train_6140)
model_6822 = lgb.LGBMRegressor().fit(X_train_only_numeric_6822, y_train_6822)

# Leave the last fitted estimator as the cell's displayed value.
model_6822

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006572 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25729
[LightGBM] [Info] Number of data points in the train set: 703, number of used features: 672
[LightGBM] [Info] Start training from score 4.533428
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006742 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25180
[LightGBM] [Info] Number of data points in the train set: 703, number of used features: 672
[LightGBM] [Info] Start training from score 2.739687
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006308 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20480
[LightGBM] [Info] Number of data points in the train set: 703, number of used features: 672
[LightGBM] [Info] Start tra

In [15]:
from sklearn.metrics import mean_absolute_error

# Score each station's model on its own held-out rows: predict, then take the
# mean absolute error against the true targets.
preds_5905 = model_5905.predict(X_test_only_numeric_5905)
test_mae_5905 = mean_absolute_error(y_true=y_test_5905, y_pred=preds_5905)

preds_6140 = model_6140.predict(X_test_only_numeric_6140)
test_mae_6140 = mean_absolute_error(y_true=y_test_6140, y_pred=preds_6140)

preds_6822 = model_6822.predict(X_test_only_numeric_6822)
test_mae_6822 = mean_absolute_error(y_true=y_test_6822, y_pred=preds_6822)

# Report the three errors, one line per station, rounded to two decimals.
print(
    f"MAE for 5905: {test_mae_5905:.2f}",
    f"MAE for 6140: {test_mae_6140:.2f}",
    f"MAE for 6822: {test_mae_6822:.2f}",
    sep="\n",
)

MAE for 5905: 1.78
MAE for 6140: 1.38
MAE for 6822: 1.18


In [16]:
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow
from dotenv import load_dotenv
import os
# Read variables from a .env file into the process environment before the
# tracking setup runs.
# NOTE(review): presumably this supplies the credentials that
# set_mlflow_tracking() needs - confirm against src.experiment_utils.
load_dotenv()

mlflow = set_mlflow_tracking()

# One entry per station: (fitted model, test features, experiment name, test MAE).
# NOTE(review): the captured log shows all three models being registered under
# the same registered-model name "LGBMRegressor" (versions 1-3), so the
# registry cannot tell the stations apart by name. Check whether
# log_model_to_mlflow accepts a per-station model name.
station_runs = (
    (model_5905, X_test_only_numeric_5905, "LGBMRegressor_5905", test_mae_5905),
    (model_6140, X_test_only_numeric_6140, "LGBMRegressor_6140", test_mae_6140),
    (model_6822, X_test_only_numeric_6822, "LGBMRegressor_6822", test_mae_6822),
)

# Log in a loop rather than three copy-pasted calls. Because the loop is the
# last statement, the cell no longer ends on a bare call whose returned
# ModelInfo repr (with its memory address) would be stored as cell output.
for station_model, station_X_test, experiment_name, station_mae in station_runs:
    log_model_to_mlflow(
        station_model,
        station_X_test,
        experiment_name,
        "mean_absolute_error",
        score=station_mae,
    )

INFO:src.experiment_utils:MLflow tracking URI and credentials set.
2025/05/02 16:20:11 INFO mlflow.tracking.fluent: Experiment with name 'LGBMRegressor_5905' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: LGBMRegressor_5905
INFO:src.experiment_utils:Logged mean_absolute_error: 1.778158488292354
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/05/02 16:20:19 INFO mlflow.models.model: Found the following environment variables used during model inference: [HOPSWORKS_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
Successfully registered model 'LGBMRegressor'.
2025/05/02 16:20:25 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LGBMRegressor, version 1
Created version '1' of model 'LGBMRegressor'.
INFO:src.experiment_utils:Model logged with name: LGBMRegressor


🏃 View run aged-goose-832 at: https://dagshub.com/nolantphillips/citibike.mlflow/#/experiments/3/runs/e38027dc5c33416aba3fd2e11d231f7b
🧪 View experiment at: https://dagshub.com/nolantphillips/citibike.mlflow/#/experiments/3


2025/05/02 16:20:25 INFO mlflow.tracking.fluent: Experiment with name 'LGBMRegressor_6140' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: LGBMRegressor_6140
INFO:src.experiment_utils:Logged mean_absolute_error: 1.3806080379332206
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'LGBMRegressor' already exists. Creating a new version of this model...
2025/05/02 16:20:37 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LGBMRegressor, version 2
Created version '2' of model 'LGBMRegressor'.
INFO:src.experiment_utils:Model logged with name: LGBMRegressor


🏃 View run youthful-grouse-710 at: https://dagshub.com/nolantphillips/citibike.mlflow/#/experiments/4/runs/8311dcd51a88495498098b64f0411911
🧪 View experiment at: https://dagshub.com/nolantphillips/citibike.mlflow/#/experiments/4


2025/05/02 16:20:39 INFO mlflow.tracking.fluent: Experiment with name 'LGBMRegressor_6822' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: LGBMRegressor_6822
INFO:src.experiment_utils:Logged mean_absolute_error: 1.1820546887260097
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'LGBMRegressor' already exists. Creating a new version of this model...
2025/05/02 16:20:53 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LGBMRegressor, version 3
Created version '3' of model 'LGBMRegressor'.
INFO:src.experiment_utils:Model logged with name: LGBMRegressor


🏃 View run handsome-loon-850 at: https://dagshub.com/nolantphillips/citibike.mlflow/#/experiments/5/runs/4969606e8b6e445a95b2c9fadd1e6831
🧪 View experiment at: https://dagshub.com/nolantphillips/citibike.mlflow/#/experiments/5


<mlflow.models.model.ModelInfo at 0x182a991c610>

In [18]:
from pathlib import Path
import pickle

# Destination files for the three fitted models, relative to the current
# working directory.
path5905 = Path('..') / "models" / "lgbm_5905.pkl"
path6140 = Path('..') / "models" / "lgbm_6140.pkl"
path6822 = Path('..') / "models" / "lgbm_6822.pkl"

# Serialize each model to its own pickle file. The parent directory is created
# first, because open(..., 'wb') raises FileNotFoundError when it is missing
# (e.g. on a fresh checkout).
for pickle_path, station_model in (
    (path5905, model_5905),
    (path6140, model_6140),
    (path6822, model_6822),
):
    pickle_path.parent.mkdir(parents=True, exist_ok=True)
    with open(pickle_path, 'wb') as f:
        pickle.dump(station_model, f)

In [20]:
# Read the station-5905 pickle back from disk and deserialize it.
# Only unpickle files you trust: unpickling can execute arbitrary code.
loaded = pickle.loads(path5905.read_bytes())

In [22]:
# Score the reloaded model on the station-5905 test rows and display its mean
# absolute error as the cell's output.
preds = loaded.predict(X_test_only_numeric_5905)
mae = mean_absolute_error(y_true=y_test_5905, y_pred=preds)
mae

1.778158488292354