In [2]:
# Load IPython's autoreload extension and switch it to mode 2 so that edits to
# imported modules are picked up live instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
import sys
import os

# Add the parent directory of the current working directory to the Python
# path (presumably so the `src` package imported in later cells resolves --
# this assumes the notebook is launched from its own folder).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [4]:
import pandas as pd

In [5]:
from src.config import PROCESSED_DATA_DIR

# Read the processed time-series table from the project's processed-data
# directory, then show it as the cell's output.
ts_data_path = PROCESSED_DATA_DIR / "ts_data.parquet"
ts_data = pd.read_parquet(ts_data_path)
ts_data

Unnamed: 0,start_hour,start_station_id,rides
0,2023-01-01 00:00:00,5905.140137,0
1,2023-01-01 01:00:00,5905.140137,5
2,2023-01-01 02:00:00,5905.140137,7
3,2023-01-01 03:00:00,5905.140137,3
4,2023-01-01 04:00:00,5905.140137,2
...,...,...,...
59107,2025-03-31 19:00:00,6822.089844,23
59108,2025-03-31 20:00:00,6822.089844,14
59109,2025-03-31 21:00:00,6822.089844,5
59110,2025-03-31 22:00:00,6822.089844,2


In [8]:
# Count rows per station id as a sanity check; the captured output shows three
# stations with the same number of hourly rows each.
ts_data["start_station_id"].value_counts()

start_station_id
5905.140137    19704
6140.049805    19704
6822.089844    19704
Name: count, dtype: int64

In [11]:
from datetime import datetime

# Prepare per-station data for the simple-mean baseline models: isolate each
# station's rows, then split them chronologically at `cutoff_date`.
# NOTE(review): station ids are matched by exact float equality against these
# literals; the later cell outputs show the filters do select rows, but the
# match depends on the stored float values -- confirm if the data changes.
cutoff_date = datetime(2025, 1, 1)

station_ids = ts_data["start_station_id"]
ts_data_5905 = ts_data.loc[station_ids == 5905.140137]
ts_data_6140 = ts_data.loc[station_ids == 6140.049805]
ts_data_6822 = ts_data.loc[station_ids == 6822.089844]


def _split_at_cutoff(station_frame):
    """Return (rows before cutoff_date, rows at or after cutoff_date)."""
    hours = station_frame["start_hour"]
    # Both comparisons are spelled out (rather than negating one mask) so that
    # rows with a missing start_hour land in neither split, as before.
    return station_frame.loc[hours < cutoff_date], station_frame.loc[hours >= cutoff_date]


ts_data_5905_train, ts_data_5905_test = _split_at_cutoff(ts_data_5905)
ts_data_6140_train, ts_data_6140_test = _split_at_cutoff(ts_data_6140)
ts_data_6822_train, ts_data_6822_test = _split_at_cutoff(ts_data_6822)

In [13]:
# Display the held-out rows for station 6822 (start_hour >= cutoff_date) to
# eyeball that the chronological split starts where expected.
ts_data_6822_test

Unnamed: 0,start_hour,start_station_id,rides
56952,2025-01-01 00:00:00,6822.089844,0
56953,2025-01-01 01:00:00,6822.089844,3
56954,2025-01-01 02:00:00,6822.089844,2
56955,2025-01-01 03:00:00,6822.089844,2
56956,2025-01-01 04:00:00,6822.089844,3
...,...,...,...
59107,2025-03-31 19:00:00,6822.089844,23
59108,2025-03-31 20:00:00,6822.089844,14
59109,2025-03-31 21:00:00,6822.089844,5
59110,2025-03-31 22:00:00,6822.089844,2


In [15]:
import numpy as np

class MeanModel:
    """Baseline forecaster that predicts one constant: the training mean of ``rides``.

    Call ``fit`` with a training DataFrame first; ``predict`` then returns that
    mean repeated once per row of the frame it is given.
    """

    def __init__(self):
        # Mean of the training 'rides' column; stays None until fit() succeeds.
        self.mean_value = None
        # Flipped to True by a successful fit(); predict() refuses to run before that.
        self.fitted = False

    def fit(self, ts_train: pd.DataFrame) -> "MeanModel":
        """
        Fit the model by computing the mean of the time series.

        Args:
            ts_train: Training data containing a numeric 'rides' column.

        Returns:
            The fitted model itself, so calls can be chained.

        Raises:
            ValueError: If ``ts_train`` is not a DataFrame, or if its 'rides'
                column is empty or entirely missing (the mean would be NaN and
                every prediction would silently be NaN).
            KeyError: If ``ts_train`` has no 'rides' column (raised by pandas).
        """
        if not isinstance(ts_train, pd.DataFrame):
            raise ValueError("Input must be a pandas DataFrame.")

        rides = ts_train['rides']
        # Validate before touching model state so a failed fit leaves any
        # previously fitted value intact.
        mean_value = rides.mean() if len(rides) else float("nan")
        if pd.isna(mean_value):
            raise ValueError(
                "Cannot fit MeanModel: 'rides' column is empty or contains only missing values."
            )

        self.mean_value = mean_value
        self.fitted = True
        print(f"Model fitted. Mean = {self.mean_value:.2f}")
        return self

    def predict(self, ts_test: pd.DataFrame) -> np.ndarray:
        """
        Predict the fitted mean for every row of ``ts_test``.

        Args:
            ts_test: Frame to predict for; only its length is used.

        Returns:
            A 1-D array of length ``len(ts_test)`` filled with the fitted mean.

        Raises:
            ValueError: If the model has not been fitted yet.
        """
        if not self.fitted:
            raise ValueError("Model must be fitted before predicting.")
        return np.full(shape=len(ts_test), fill_value=self.mean_value)

In [19]:
# Build, fit and apply one constant-mean baseline per station. Stations are
# handled in the order 5905, 6140, 6822 so the fit messages print in that order.

# Station 5905
model_5905 = MeanModel()
model_5905.fit(ts_data_5905_train)
preds_5905 = model_5905.predict(ts_data_5905_test)

# Station 6140
model_6140 = MeanModel()
model_6140.fit(ts_data_6140_train)
preds_6140 = model_6140.predict(ts_data_6140_test)

# Station 6822
model_6822 = MeanModel()
model_6822.fit(ts_data_6822_train)
preds_6822 = model_6822.predict(ts_data_6822_test)

Model fitted. Mean = 14.02
Model fitted. Mean = 17.21
Model fitted. Mean = 10.68


In [22]:
from sklearn.metrics import mean_absolute_error

# Score each station's constant forecast against the actual held-out ride
# counts using mean absolute error, then report the three scores.
test_mae_5905 = mean_absolute_error(
    y_true=ts_data_5905_test["rides"],
    y_pred=preds_5905,
)
test_mae_6140 = mean_absolute_error(
    y_true=ts_data_6140_test["rides"],
    y_pred=preds_6140,
)
test_mae_6822 = mean_absolute_error(
    y_true=ts_data_6822_test["rides"],
    y_pred=preds_6822,
)

print(f"MAE for 5905: {test_mae_5905:.2f}")
print(f"MAE for 6140: {test_mae_6140:.2f}")
print(f"MAE for 6822: {test_mae_6822:.2f}")

MAE for 5905: 8.56
MAE for 6140: 11.56
MAE for 6822: 8.40


In [23]:
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow
from dotenv import load_dotenv
# Presumably loads tracking credentials from a local .env file into the
# process environment -- confirm which variables the project expects.
load_dotenv()

# Configure MLflow through the project helper (its log line reports that the
# tracking URI and credentials were set) and keep what it returns as `mlflow`.
mlflow = set_mlflow_tracking()

INFO:src.experiment_utils:MLflow tracking URI and credentials set.


In [24]:
# Log each station's baseline model together with its test-set MAE, using a
# separate experiment name per station.
# NOTE(review): the captured output shows all three models registered under the
# same registry name 'MeanModel' (versions 1-3), so the registry alone does not
# say which version belongs to which station -- check whether
# log_model_to_mlflow accepts a per-station model name.
# The final call's return value is the cell's last expression, so it is
# displayed as the cell output.
log_model_to_mlflow(model_5905, ts_data_5905_test, "BaselineSimpleMean5905", "mean_absolute_error", score=test_mae_5905)
log_model_to_mlflow(model_6140, ts_data_6140_test, "BaselineSimpleMean6140", "mean_absolute_error", score=test_mae_6140)
log_model_to_mlflow(model_6822, ts_data_6822_test, "BaselineSimpleMean6822", "mean_absolute_error", score=test_mae_6822)

2025/05/02 15:52:00 INFO mlflow.tracking.fluent: Experiment with name 'BaselineSimpleMean5905' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: BaselineSimpleMean5905
INFO:src.experiment_utils:Logged mean_absolute_error: 8.555065042812991
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/05/02 15:52:07 INFO mlflow.models.model: Found the following environment variables used during model inference: [HOPSWORKS_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
Successfully registered model 'MeanModel'.
2025/05/02 15:52:13 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: MeanModel, version 1
Created version '1' of model 'MeanModel'.
INFO:src.experiment_utils:Model logged with name: MeanModel


🏃 View run polite-fly-297 at: https://dagshub.com/nolantphillips/citibike.mlflow/#/experiments/0/runs/48da260dd10e440bb4b839aab054f5e1
🧪 View experiment at: https://dagshub.com/nolantphillips/citibike.mlflow/#/experiments/0


2025/05/02 15:52:13 INFO mlflow.tracking.fluent: Experiment with name 'BaselineSimpleMean6140' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: BaselineSimpleMean6140
INFO:src.experiment_utils:Logged mean_absolute_error: 11.559781121751024
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'MeanModel' already exists. Creating a new version of this model...
2025/05/02 15:52:25 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: MeanModel, version 2
Created version '2' of model 'MeanModel'.
INFO:src.experiment_utils:Model logged with name: MeanModel


🏃 View run unequaled-pig-783 at: https://dagshub.com/nolantphillips/citibike.mlflow/#/experiments/1/runs/deb29f8e65dc4ea6baa2579fbd606296
🧪 View experiment at: https://dagshub.com/nolantphillips/citibike.mlflow/#/experiments/1


2025/05/02 15:52:27 INFO mlflow.tracking.fluent: Experiment with name 'BaselineSimpleMean6822' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: BaselineSimpleMean6822
INFO:src.experiment_utils:Logged mean_absolute_error: 8.404859316681023
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'MeanModel' already exists. Creating a new version of this model...
2025/05/02 15:52:41 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: MeanModel, version 3
Created version '3' of model 'MeanModel'.
INFO:src.experiment_utils:Model logged with name: MeanModel


🏃 View run agreeable-horse-486 at: https://dagshub.com/nolantphillips/citibike.mlflow/#/experiments/2/runs/26506207cdb84a9892865cf0defadea2
🧪 View experiment at: https://dagshub.com/nolantphillips/citibike.mlflow/#/experiments/2


<mlflow.models.model.ModelInfo at 0x29844e64b10>