In [2]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import sys
import os

# Make the parent of the current working directory importable so that the
# `src.*` imports used below resolve. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

In [4]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

# Read the tabular feature table from the project's transformed-data directory.
tabular_data_path = TRANSFORMED_DATA_DIR / "tabular_data.parquet"
df = pd.read_parquet(tabular_data_path)

In [5]:
# The station ids are compared against float literals (they look like float32
# round-trips of 5905.14, 6140.05 and 6822.09 -- TODO confirm the column dtype).
# An exact `==` only matches when the stored bits agree with the typed-in decimal,
# so select each station with a small absolute tolerance instead.
# NOTE(review): assumes distinct station ids differ by far more than the tolerance.
STATION_ID_ATOL = 1e-3
station_ids = df["start_station_id"]

df_5905 = df[(station_ids - 5905.140137).abs() < STATION_ID_ATOL]
df_6140 = df[(station_ids - 6140.049805).abs() < STATION_ID_ATOL]
df_6822 = df[(station_ids - 6822.089844).abs() < STATION_ID_ATOL]

In [6]:
from datetime import datetime

from src.data_utils import split_time_series_data

# Split station 5905's rows with split_time_series_data, using a 2025-01-01
# cutoff date and "target" as the label column.
cutoff_5905 = datetime(2025, 1, 1, 0, 0, 0)
X_train_5905, y_train_5905, X_test_5905, y_test_5905 = split_time_series_data(
    df_5905, cutoff_date=cutoff_5905, target_column="target"
)

# Print one shape per line, in the order X_train, y_train, X_test, y_test.
print(
    *(part.shape for part in (X_train_5905, y_train_5905, X_test_5905, y_test_5905)),
    sep="\n",
)

(703, 674)
(703,)
(90, 674)
(90,)


In [7]:
# Split station 6140's rows with split_time_series_data, using a 2025-01-01
# cutoff date and "target" as the label column.
cutoff_6140 = datetime(2025, 1, 1, 0, 0, 0)
X_train_6140, y_train_6140, X_test_6140, y_test_6140 = split_time_series_data(
    df_6140, cutoff_date=cutoff_6140, target_column="target"
)

# Print one shape per line, in the order X_train, y_train, X_test, y_test.
print(
    *(part.shape for part in (X_train_6140, y_train_6140, X_test_6140, y_test_6140)),
    sep="\n",
)

(703, 674)
(703,)
(90, 674)
(90,)


In [8]:
# Split station 6822's rows with split_time_series_data, using a 2025-01-01
# cutoff date and "target" as the label column.
cutoff_6822 = datetime(2025, 1, 1, 0, 0, 0)
X_train_6822, y_train_6822, X_test_6822, y_test_6822 = split_time_series_data(
    df_6822, cutoff_date=cutoff_6822, target_column="target"
)

# Print one shape per line, in the order X_train, y_train, X_test, y_test.
print(
    *(part.shape for part in (X_train_6822, y_train_6822, X_test_6822, y_test_6822)),
    sep="\n",
)

(703, 674)
(703,)
(90, 674)
(90,)


In [9]:
# Names of every feature column whose label starts with "rides_", taken from
# station 5905's training frame and reused for all three stations.
past_ride_columns = list(
    filter(lambda name: name.startswith("rides_"), X_train_5905.columns)
)

# Keep only those columns in each station's train and test frames.
X_train_only_numeric_5905 = X_train_5905.loc[:, past_ride_columns]
X_test_only_numeric_5905 = X_test_5905.loc[:, past_ride_columns]

X_train_only_numeric_6140 = X_train_6140.loc[:, past_ride_columns]
X_test_only_numeric_6140 = X_test_6140.loc[:, past_ride_columns]

X_train_only_numeric_6822 = X_train_6822.loc[:, past_ride_columns]
X_test_only_numeric_6822 = X_test_6822.loc[:, past_ride_columns]

In [24]:
from pathlib import Path
import pickle

# Locations of the three pickled per-station models, relative to the working directory.
models_dir = Path('..') / "models"
path5905 = models_dir / "lgbm_5905.pkl"
path6140 = models_dir / "lgbm_6140.pkl"
path6822 = models_dir / "lgbm_6822.pkl"

# NOTE(review): pickle.load can execute arbitrary code from the file, so these
# artifacts must come from a trusted source.
with path5905.open('rb') as model_file:
    model_5905 = pickle.load(model_file)

with path6140.open('rb') as model_file:
    model_6140 = pickle.load(model_file)

with path6822.open('rb') as model_file:
    model_6822 = pickle.load(model_file)

In [21]:
# Rank features by the importance scores of the model currently bound to model_5905.
# The names come from `past_ride_columns` (the full "rides_" column list) rather than
# from `X_train_only_numeric_5905.columns`: that frame is reassigned further down to a
# 10-column subset, after which its columns would no longer line up with these scores
# when this cell is re-run.
importances_5905 = model_5905.feature_importances_
feature_names_5905 = pd.Index(past_ride_columns)

# Build the table already sorted instead of mutating it in place.
importance_df = pd.DataFrame(
    {'feature': feature_names_5905, 'importance': importances_5905}
).sort_values(by='importance', ascending=False)

# Display top features
importance_df.head(10)

Unnamed: 0,feature,importance
671,rides_t-1,46
663,rides_t-9,34
661,rides_t-11,29
168,rides_t-504,28
670,rides_t-2,27
516,rides_t-156,23
659,rides_t-13,21
662,rides_t-10,21
258,rides_t-414,20
10,rides_t-662,20


In [30]:
# Feature subset used for station 5905, written as the numeric suffix of each
# "rides_t-<n>" column name; the order matches the original list.
features_5905 = [
    f"rides_t-{lag}" for lag in (1, 9, 11, 504, 2, 156, 13, 10, 414, 662)
]

In [22]:
# Rank features by the importance scores of the model currently bound to model_6140.
# The names come from `past_ride_columns` (the full "rides_" column list) rather than
# from `X_train_only_numeric_6140.columns`: that frame is reassigned further down to a
# 10-column subset, after which its columns would no longer line up with these scores
# when this cell is re-run.
importances_6140 = model_6140.feature_importances_
feature_names_6140 = pd.Index(past_ride_columns)

# Build the table already sorted instead of mutating it in place.
importance_df = pd.DataFrame(
    {'feature': feature_names_6140, 'importance': importances_6140}
).sort_values(by='importance', ascending=False)

# Display top features
importance_df.head(10)

Unnamed: 0,feature,importance
661,rides_t-11,33
664,rides_t-8,23
19,rides_t-653,20
667,rides_t-5,19
670,rides_t-2,18
633,rides_t-39,16
644,rides_t-28,16
376,rides_t-296,15
617,rides_t-55,14
564,rides_t-108,14


In [29]:
# Feature subset used for station 6140, written as the numeric suffix of each
# "rides_t-<n>" column name; the order matches the original list.
features_6140 = [
    f"rides_t-{lag}" for lag in (11, 8, 653, 5, 2, 39, 28, 296, 55, 108)
]

In [25]:
# Rank features by the importance scores of the model currently bound to model_6822.
# The names come from `past_ride_columns` (the full "rides_" column list) rather than
# from `X_train_only_numeric_6822.columns`: that frame is reassigned further down to a
# 10-column subset, after which its columns would no longer line up with these scores
# when this cell is re-run.
importances_6822 = model_6822.feature_importances_
feature_names_6822 = pd.Index(past_ride_columns)

# Build the table already sorted instead of mutating it in place.
importance_df = pd.DataFrame(
    {'feature': feature_names_6822, 'importance': importances_6822}
).sort_values(by='importance', ascending=False)

# Display top features
importance_df.head(10)

Unnamed: 0,feature,importance
671,rides_t-1,26
595,rides_t-77,24
667,rides_t-5,17
540,rides_t-132,16
654,rides_t-18,15
377,rides_t-295,15
594,rides_t-78,15
158,rides_t-514,15
403,rides_t-269,14
561,rides_t-111,14


In [33]:
# Feature subset used for station 6822, written as the numeric suffix of each
# "rides_t-<n>" column name; the order matches the original list.
features_6822 = [
    f"rides_t-{lag}" for lag in (1, 77, 5, 132, 18, 295, 78, 514, 269, 111)
]

In [34]:
# Rebind each station's train/test feature frames to that station's selected
# feature list. NOTE: this replaces the frames of the same names built earlier
# from `past_ride_columns`.
X_train_only_numeric_5905, X_test_only_numeric_5905 = (
    frame[features_5905] for frame in (X_train_5905, X_test_5905)
)

X_train_only_numeric_6140, X_test_only_numeric_6140 = (
    frame[features_6140] for frame in (X_train_6140, X_test_6140)
)

X_train_only_numeric_6822, X_test_only_numeric_6822 = (
    frame[features_6822] for frame in (X_train_6822, X_test_6822)
)

In [35]:
import lightgbm as lgb

# Fit a default-parameter LGBMRegressor per station on that station's selected
# features. NOTE: these assignments replace the models loaded from the pickle files.
model_5905 = lgb.LGBMRegressor().fit(X_train_only_numeric_5905, y_train_5905)
model_6140 = lgb.LGBMRegressor().fit(X_train_only_numeric_6140, y_train_6140)
model_6822 = lgb.LGBMRegressor().fit(X_train_only_numeric_6822, y_train_6822)

# Keep the fitted 6822 estimator as the cell's displayed value.
model_6822

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000089 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 453
[LightGBM] [Info] Number of data points in the train set: 703, number of used features: 10
[LightGBM] [Info] Start training from score 4.533428
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000075 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 544
[LightGBM] [Info] Number of data points in the train set: 703, number of used features: 10
[LightGBM] [Info] Start training from score 2.739687
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000082 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 407
[LightGBM] [Info] Number of data points in the train set: 703, number of used features: 10
[LightGBM] [Info] Start training fro

In [36]:
from sklearn.metrics import mean_absolute_error

# Station 5905: predict on the test features and score with mean absolute error.
preds_5905 = model_5905.predict(X_test_only_numeric_5905)
test_mae_5905 = mean_absolute_error(y_test_5905, preds_5905)

# Station 6140.
preds_6140 = model_6140.predict(X_test_only_numeric_6140)
test_mae_6140 = mean_absolute_error(y_test_6140, preds_6140)

# Station 6822.
preds_6822 = model_6822.predict(X_test_only_numeric_6822)
test_mae_6822 = mean_absolute_error(y_test_6822, preds_6822)

# Report the three test errors, rounded to two decimals.
print(f"MAE for 5905: {test_mae_5905:.2f}")
print(f"MAE for 6140: {test_mae_6140:.2f}")
print(f"MAE for 6822: {test_mae_6822:.2f}")

MAE for 5905: 1.94
MAE for 6140: 1.49
MAE for 6822: 1.46


In [37]:
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow
from dotenv import load_dotenv
import os
# Load environment variables from a local .env file before configuring MLflow.
load_dotenv()

# Configure MLflow tracking through the project helper and keep the returned handle.
mlflow = set_mlflow_tracking()

# The same metric name is reported for every station.
metric_name = "mean_absolute_error"

# Log one run per station, each under its own experiment name.
# NOTE(review): the captured output shows all three models registered under the
# single name 'LGBMRegressor' (versions 4-6) -- confirm that is intended.
log_model_to_mlflow(
    model_5905,
    X_test_only_numeric_5905,
    "LGBMRegressor_FI_5905",
    metric_name,
    score=test_mae_5905,
)
log_model_to_mlflow(
    model_6140,
    X_test_only_numeric_6140,
    "LGBMRegressor_FI_6140",
    metric_name,
    score=test_mae_6140,
)
log_model_to_mlflow(
    model_6822,
    X_test_only_numeric_6822,
    "LGBMRegressor_FI_6822",
    metric_name,
    score=test_mae_6822,
)

INFO:src.experiment_utils:MLflow tracking URI and credentials set.
2025/05/02 17:09:10 INFO mlflow.tracking.fluent: Experiment with name 'LGBMRegressor_FI_5905' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: LGBMRegressor_FI_5905
INFO:src.experiment_utils:Logged mean_absolute_error: 1.936080812067457
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/05/02 17:09:16 INFO mlflow.models.model: Found the following environment variables used during model inference: [HOPSWORKS_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
Registered model 'LGBMRegressor' already exists. Creating a new version of this model...
2025/05/02 17:09:23 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LGBMRegressor, version 4
Created version '4' of model 'LGBMRegressor'.
INFO:src.experiment_utils:Model logged with name: LGBMRegressor


🏃 View run sneaky-deer-67 at: https://dagshub.com/nolantphillips/citibike.mlflow/#/experiments/9/runs/e3beaf5631834df4b587cd1aa348351c
🧪 View experiment at: https://dagshub.com/nolantphillips/citibike.mlflow/#/experiments/9


2025/05/02 17:09:23 INFO mlflow.tracking.fluent: Experiment with name 'LGBMRegressor_FI_6140' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: LGBMRegressor_FI_6140
INFO:src.experiment_utils:Logged mean_absolute_error: 1.4865047024291524
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'LGBMRegressor' already exists. Creating a new version of this model...
2025/05/02 17:09:35 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LGBMRegressor, version 5
Created version '5' of model 'LGBMRegressor'.
INFO:src.experiment_utils:Model logged with name: LGBMRegressor


🏃 View run crawling-donkey-516 at: https://dagshub.com/nolantphillips/citibike.mlflow/#/experiments/10/runs/14bda0a81e01488da2d2621f8f59f122
🧪 View experiment at: https://dagshub.com/nolantphillips/citibike.mlflow/#/experiments/10


2025/05/02 17:09:37 INFO mlflow.tracking.fluent: Experiment with name 'LGBMRegressor_FI_6822' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: LGBMRegressor_FI_6822
INFO:src.experiment_utils:Logged mean_absolute_error: 1.4611374542620283
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'LGBMRegressor' already exists. Creating a new version of this model...
2025/05/02 17:09:51 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LGBMRegressor, version 6
Created version '6' of model 'LGBMRegressor'.
INFO:src.experiment_utils:Model logged with name: LGBMRegressor


🏃 View run mercurial-boar-73 at: https://dagshub.com/nolantphillips/citibike.mlflow/#/experiments/11/runs/8e3be6f8c2f344648b1114bb1648827b
🧪 View experiment at: https://dagshub.com/nolantphillips/citibike.mlflow/#/experiments/11


<mlflow.models.model.ModelInfo at 0x2cb59c9be50>