In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

In [5]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")

In [6]:
df

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,start_hour,start_station_id,target
0,0,5,7,3,2,1,2,0,2,2,...,30,25,13,12,9,9,11,2023-01-29,5905.140137,10
1,2,0,0,0,1,1,0,0,6,6,...,24,17,7,11,14,7,4,2023-01-30,5905.140137,3
2,1,2,1,0,0,0,1,3,12,12,...,40,27,28,23,10,7,7,2023-01-31,5905.140137,3
3,2,5,0,0,1,0,4,7,34,22,...,34,34,22,22,12,17,4,2023-02-01,5905.140137,0
4,0,2,0,1,1,2,3,5,18,14,...,16,23,25,11,10,6,2,2023-02-02,5905.140137,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2374,4,1,0,1,0,2,8,21,16,11,...,70,19,13,8,9,1,5,2025-03-27,6822.089844,0
2375,0,0,1,0,1,1,3,12,23,6,...,61,43,19,11,9,4,6,2025-03-28,6822.089844,3
2376,3,4,1,1,1,1,2,4,16,9,...,49,33,33,8,7,2,4,2025-03-29,6822.089844,2
2377,0,3,0,0,0,0,1,2,7,10,...,12,7,16,6,3,1,7,2025-03-30,6822.089844,0


In [7]:
df_5905 = df[df["start_station_id"] == 5905.140137]
df_6140 = df[df["start_station_id"] == 6140.049805]
df_6822 = df[df["start_station_id"] == 6822.089844]

In [8]:
from datetime import datetime

from src.data_utils import split_time_series_data

X_train_5905, y_train_5905, X_test_5905, y_test_5905 = split_time_series_data(
    df_5905,
    cutoff_date=datetime(2025, 1, 1, 0, 0, 0),
    target_column="target"
)

print(X_train_5905.shape)
print(y_train_5905.shape)
print(X_test_5905.shape)
print(y_test_5905.shape)

(703, 674)
(703,)
(90, 674)
(90,)


In [9]:
X_train_6140, y_train_6140, X_test_6140, y_test_6140 = split_time_series_data(
    df_6140,
    cutoff_date=datetime(2025, 1, 1, 0, 0, 0),
    target_column="target"
)

print(X_train_6140.shape)
print(y_train_6140.shape)
print(X_test_6140.shape)
print(y_test_6140.shape)

(703, 674)
(703,)
(90, 674)
(90,)


In [10]:
X_train_6822, y_train_6822, X_test_6822, y_test_6822 = split_time_series_data(
    df_6822,
    cutoff_date=datetime(2025, 1, 1, 0, 0, 0),
    target_column="target"
)

print(X_train_6822.shape)
print(y_train_6822.shape)
print(X_test_6822.shape)
print(y_test_6822.shape)

(703, 674)
(703,)
(90, 674)
(90,)


In [11]:
past_ride_columns = [c for c in X_train_5905.columns if c.startswith("rides_")]
X_train_only_numeric_5905 = X_train_5905[past_ride_columns]
X_test_only_numeric_5905 = X_test_5905[past_ride_columns]

X_train_only_numeric_6140 = X_train_6140[past_ride_columns]
X_test_only_numeric_6140 = X_test_6140[past_ride_columns]

X_train_only_numeric_6822 = X_train_6822[past_ride_columns]
X_test_only_numeric_6822 = X_test_6822[past_ride_columns]

In [12]:
def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    last_4_weeks_columns = [
            f"rides_t-{7*24}",  # 1 week ago
            f"rides_t-{14*24}", # 2 weeks ago
            f"rides_t-{21*24}", # 3 weeks ago
            f"rides_t-{28*24}"  # 4 weeks ago
        ]

        # Ensure the required columns exist in the test DataFrame
    for col in last_4_weeks_columns:
        if col not in X.columns:
            raise ValueError(f"Missing required column: {col}")

    # Calculate the average of the last 4 weeks
    X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)

    return X

In [13]:
from sklearn.preprocessing import FunctionTransformer

add_feature_average_rides_last_4_weeks = FunctionTransformer(
    average_rides_last_4_weeks, validate=False
)

In [14]:
add_feature_average_rides_last_4_weeks.fit_transform(X_train_5905)

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,start_hour,start_station_id,average_rides_last_4_weeks
0,0,5,7,3,2,1,2,0,2,2,...,30,25,13,12,9,9,11,2023-01-29,5905.140137,2.00
1,2,0,0,0,1,1,0,0,6,6,...,24,17,7,11,14,7,4,2023-01-30,5905.140137,1.25
2,1,2,1,0,0,0,1,3,12,12,...,40,27,28,23,10,7,7,2023-01-31,5905.140137,2.00
3,2,5,0,0,1,0,4,7,34,22,...,34,34,22,22,12,17,4,2023-02-01,5905.140137,2.50
4,0,2,0,1,1,2,3,5,18,14,...,16,23,25,11,10,6,2,2023-02-02,5905.140137,1.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
698,2,4,0,1,1,0,2,2,6,11,...,11,15,8,5,1,0,7,2024-12-27,5905.140137,2.00
699,5,0,0,0,1,2,1,2,3,6,...,5,3,17,5,9,7,4,2024-12-28,5905.140137,4.25
700,7,4,0,3,0,1,4,9,5,6,...,5,7,5,5,5,7,4,2024-12-29,5905.140137,4.75
701,0,0,0,1,0,1,6,7,22,25,...,16,26,6,6,15,9,1,2024-12-30,5905.140137,1.00


In [15]:
from sklearn.base import BaseEstimator, TransformerMixin

class TemporalFeatureEngineer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_["hour"] = X_["start_hour"].dt.hour
        X_["day_of_week"] = X_["start_hour"].dt.dayofweek

        return X_.drop(columns=["start_hour", "start_station_id"])

In [16]:
add_temporal_features = TemporalFeatureEngineer()
add_temporal_features.fit_transform(X_train_5905)

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,average_rides_last_4_weeks,hour,day_of_week
0,0,5,7,3,2,1,2,0,2,2,...,30,25,13,12,9,9,11,2.00,0,6
1,2,0,0,0,1,1,0,0,6,6,...,24,17,7,11,14,7,4,1.25,0,0
2,1,2,1,0,0,0,1,3,12,12,...,40,27,28,23,10,7,7,2.00,0,1
3,2,5,0,0,1,0,4,7,34,22,...,34,34,22,22,12,17,4,2.50,0,2
4,0,2,0,1,1,2,3,5,18,14,...,16,23,25,11,10,6,2,1.75,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
698,2,4,0,1,1,0,2,2,6,11,...,11,15,8,5,1,0,7,2.00,0,4
699,5,0,0,0,1,2,1,2,3,6,...,5,3,17,5,9,7,4,4.25,0,5
700,7,4,0,3,0,1,4,9,5,6,...,5,7,5,5,5,7,4,4.75,0,6
701,0,0,0,1,0,1,6,7,22,25,...,16,26,6,6,15,9,1,1.00,0,0


In [17]:
import lightgbm as lgb

from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,
    add_temporal_features,
    lgb.LGBMRegressor()
)

In [18]:
pipeline.fit(X_train_5905, y_train_5905)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006925 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25785
[LightGBM] [Info] Number of data points in the train set: 703, number of used features: 674
[LightGBM] [Info] Start training from score 4.533428


In [20]:
from sklearn.metrics import mean_absolute_error

preds_5905 = pipeline.predict(X_test_5905)
test_mae_5905 = mean_absolute_error(y_test_5905, preds_5905)
print(f"{test_mae_5905:.4f}")

1.7812


In [21]:
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow
from dotenv import load_dotenv
import os
load_dotenv()

mlflow = set_mlflow_tracking()
log_model_to_mlflow(pipeline, X_test_5905, "LGBMRegressorWFE_5905", "mean_absolute_error", score=test_mae_5905)

INFO:src.experiment_utils:MLflow tracking URI and credentials set.
2025/05/02 16:34:21 INFO mlflow.tracking.fluent: Experiment with name 'LGBMRegressorWFE_5905' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: LGBMRegressorWFE_5905
INFO:src.experiment_utils:Logged mean_absolute_error: 1.7811957443002566
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/05/02 16:34:29 INFO mlflow.models.model: Found the following environment variables used during model inference: [HOPSWORKS_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
Successfully registered model 'Pipeline'.
2025/05/02 16:34:35 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Pipeline, version 1
Created version '1' of model 'Pipeline'.
INFO:src.experiment_utils:Model logged with name: Pipeline


🏃 View run inquisitive-asp-136 at: https://dagshub.com/nolantphillips/citibike.mlflow/#/experiments/6/runs/b477380aa9834991be53d04d62a23c21
🧪 View experiment at: https://dagshub.com/nolantphillips/citibike.mlflow/#/experiments/6


<mlflow.models.model.ModelInfo at 0x1c8fc800f50>

In [22]:
pipeline.fit(X_train_6140, y_train_6140)

preds_6140 = pipeline.predict(X_test_6140)
test_mae_6140 = mean_absolute_error(y_test_6140, preds_6140)
print(f"{test_mae_6140:.4f}")

mlflow = set_mlflow_tracking()
log_model_to_mlflow(pipeline, X_test_6140, "LGBMRegressorWFE_6140", "mean_absolute_error", score=test_mae_6140)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006367 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25224
[LightGBM] [Info] Number of data points in the train set: 703, number of used features: 674
[LightGBM] [Info] Start training from score 2.739687


INFO:src.experiment_utils:MLflow tracking URI and credentials set.


1.4034


2025/05/02 16:36:01 INFO mlflow.tracking.fluent: Experiment with name 'LGBMRegressorWFE_6140' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: LGBMRegressorWFE_6140
INFO:src.experiment_utils:Logged mean_absolute_error: 1.4033525502355684
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'Pipeline' already exists. Creating a new version of this model...
2025/05/02 16:36:11 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Pipeline, version 2
Created version '2' of model 'Pipeline'.
INFO:src.experiment_utils:Model logged with name: Pipeline


🏃 View run powerful-robin-498 at: https://dagshub.com/nolantphillips/citibike.mlflow/#/experiments/7/runs/17b0b17781e141c2a8bae979dc5af33e
🧪 View experiment at: https://dagshub.com/nolantphillips/citibike.mlflow/#/experiments/7


<mlflow.models.model.ModelInfo at 0x1c8fc8e35d0>

In [23]:
pipeline.fit(X_train_6822, y_train_6822)

preds_6822 = pipeline.predict(X_test_6822)
test_mae_6822 = mean_absolute_error(y_test_6822, preds_6822)
print(f"{test_mae_6822:.4f}")

mlflow = set_mlflow_tracking()
log_model_to_mlflow(pipeline, X_test_6822, "LGBMRegressorWFE_6822", "mean_absolute_error", score=test_mae_6822)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006366 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20514
[LightGBM] [Info] Number of data points in the train set: 703, number of used features: 674
[LightGBM] [Info] Start training from score 2.318634


INFO:src.experiment_utils:MLflow tracking URI and credentials set.


1.2076


2025/05/02 16:37:08 INFO mlflow.tracking.fluent: Experiment with name 'LGBMRegressorWFE_6822' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: LGBMRegressorWFE_6822
INFO:src.experiment_utils:Logged mean_absolute_error: 1.2076095356052092
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'Pipeline' already exists. Creating a new version of this model...
2025/05/02 16:37:18 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Pipeline, version 3
Created version '3' of model 'Pipeline'.
INFO:src.experiment_utils:Model logged with name: Pipeline


🏃 View run efficient-moth-811 at: https://dagshub.com/nolantphillips/citibike.mlflow/#/experiments/8/runs/a0bc4e7be7cd4f7d87f8a0b6e8286080
🧪 View experiment at: https://dagshub.com/nolantphillips/citibike.mlflow/#/experiments/8


<mlflow.models.model.ModelInfo at 0x1c8fc81e090>