In [1]:
#!/usr/bin/env python3
"""
register_models.py

Register three existing MLflow runs (each with a logged "model" artifact)
as versions of the same Registered Model on DagsHub.
"""

import os
import mlflow
from mlflow import MlflowClient

# ──────────────────────────────────────────────────────────────────────────────
# DagsHub MLflow configuration
MLFLOW_TRACKING_URI      = "https://dagshub.com/kaushal-shivaprakashan/final_project.mlflow"

# The username is not a secret, so the previous value is kept as the default;
# an already-exported MLFLOW_TRACKING_USERNAME takes precedence.
MLFLOW_TRACKING_USERNAME = os.environ.get("MLFLOW_TRACKING_USERNAME") or "kaushal-shivaprakashan"

# SECURITY: the access token must never be stored in the notebook. It is read
# from the environment instead. A token that was ever committed in source
# should be treated as leaked and revoked.
MLFLOW_TRACKING_PASSWORD = os.environ.get("MLFLOW_TRACKING_PASSWORD", "")
if not MLFLOW_TRACKING_PASSWORD:
    raise RuntimeError(
        "MLFLOW_TRACKING_PASSWORD is not set. Export your DagsHub access token "
        "in the environment before starting the kernel."
    )

# replace these with your actual run IDs
BASELINE_RUN_ID = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
LAG28_RUN_ID    = "yyyyyyyyyyyyyyyyyyyyyyyyyyy"
TOP10IMP_RUN_ID = "zzzzzzzzzzzzzzzzzzzzzzzzzzz"

# The name of the Registered Model to create/use
REGISTERED_MODEL_NAME = "CitiBikeForecasting"
# ──────────────────────────────────────────────────────────────────────────────

# 1️⃣ Set up MLflow to point at your DagsHub endpoint with basic auth
os.environ["MLFLOW_TRACKING_USERNAME"] = MLFLOW_TRACKING_USERNAME
os.environ["MLFLOW_TRACKING_PASSWORD"] = MLFLOW_TRACKING_PASSWORD
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

def register_run(run_id: str, model_name: str, client: MlflowClient):
    """
    Create a new version of `model_name` from the "model" artifact of a run.

    Args:
        run_id: ID of the run whose "model" artifact is the version source.
        model_name: name of the Registered Model that receives the version.
        client: object exposing `create_model_version(name=, source=, run_id=)`.

    Returns:
        Whatever `client.create_model_version` returned; its `.version`
        attribute is printed.
    """
    source_uri = "runs:/{}/model".format(run_id)
    print("Registering {} as a new version of '{}'…".format(source_uri, model_name))

    version_info = client.create_model_version(
        name=model_name, source=source_uri, run_id=run_id
    )

    print(" → Registered as version {}".format(version_info.version))
    return version_info

def main():
    """
    Ensure the Registered Model exists, then register each configured run as a
    new version and move that version to the 'Staging' stage.

    Raises:
        mlflow.exceptions.RestException: re-raised when the registered-model
            lookup fails for any reason other than the model not existing
            (for example bad credentials), so such failures are not mistaken
            for "model missing".
    """
    client = MlflowClient()

    # Create the Registered Model only when the lookup reports it is missing.
    try:
        client.get_registered_model(REGISTERED_MODEL_NAME)
        print(f"Registered Model '{REGISTERED_MODEL_NAME}' already exists.")
    except mlflow.exceptions.RestException as exc:
        # Any other REST failure (auth, permissions, server error) must surface
        # instead of silently falling through to a create call.
        if getattr(exc, "error_code", None) != "RESOURCE_DOES_NOT_EXIST":
            raise
        print(f"Creating Registered Model '{REGISTERED_MODEL_NAME}'…")
        client.create_registered_model(REGISTERED_MODEL_NAME)

    # Register each run and transition to Staging.
    # Only the model lookup/creation above is repeat-safe: every execution of
    # this loop adds one new version per run ID.
    # NOTE(review): the captured cell output shows a warning pointing at
    # transition_model_version_stage; it appears to be deprecated in recent
    # MLflow releases — confirm and consider migrating to aliases/tags.
    for run_id in [BASELINE_RUN_ID, LAG28_RUN_ID, TOP10IMP_RUN_ID]:
        mv = register_run(run_id, REGISTERED_MODEL_NAME, client)
        client.transition_model_version_stage(
            name=REGISTERED_MODEL_NAME,
            version=mv.version,
            stage="Staging",
            archive_existing_versions=False,
        )
        print(f" → Transitioned version {mv.version} to 'Staging'\n")

    print("✅ All done. Check your DagsHub Model Registry under", REGISTERED_MODEL_NAME)

if __name__ == "__main__":
    main()

Creating Registered Model 'CitiBikeForecasting'…
Registering runs:/xxxxxxxxxxxxxxxxxxxxxxxxxxxxx/model as a new version of 'CitiBikeForecasting'…


2025/05/10 15:28:18 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: CitiBikeForecasting, version 1
  client.transition_model_version_stage(


 → Registered as version 1


2025/05/10 15:28:18 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: CitiBikeForecasting, version 2


 → Transitioned version 1 to 'Staging'

Registering runs:/yyyyyyyyyyyyyyyyyyyyyyyyyyy/model as a new version of 'CitiBikeForecasting'…
 → Registered as version 2


2025/05/10 15:28:19 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: CitiBikeForecasting, version 3


 → Transitioned version 2 to 'Staging'

Registering runs:/zzzzzzzzzzzzzzzzzzzzzzzzzzz/model as a new version of 'CitiBikeForecasting'…
 → Registered as version 3
 → Transitioned version 3 to 'Staging'

✅ All done. Check your DagsHub Model Registry under CitiBikeForecasting


In [2]:
#!/usr/bin/env python3
"""
train_and_log_all.py

Loads cleaned Citibike data, then:
  1) logs a baseline mean model
  2) logs a LightGBM on 28 lag features
  3) logs a LightGBM on top-10 importance features

All runs go to your DagsHub MLflow server.
"""

import os
import pandas as pd
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
import mlflow

# ──────────────────────────────────────────────────────────────────────────────
# DagsHub MLflow settings
# The username is not a secret, so the previous value is kept as the default;
# an already-exported MLFLOW_TRACKING_USERNAME takes precedence.
os.environ.setdefault("MLFLOW_TRACKING_USERNAME", "kaushal-shivaprakashan")

# SECURITY: the access token must never be stored in the notebook. It has to be
# present in the environment already. A token that was ever committed in source
# should be treated as leaked and revoked.
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    raise RuntimeError(
        "MLFLOW_TRACKING_PASSWORD is not set. Export your DagsHub access token "
        "in the environment before starting the kernel."
    )
mlflow.set_tracking_uri("https://dagshub.com/kaushal-shivaprakashan/final_project.mlflow")

# Experiment name under which the three runs will appear
EXPERIMENT_NAME = "CitiBike_Remote_Experiment"
mlflow.set_experiment(EXPERIMENT_NAME)

# Data & split config
# CITIBIKE_PARQUET_PATH lets other machines point at their own copy of the
# data; the original absolute path remains the default.
PARQUET_PATH = os.environ.get(
    "CITIBIKE_PARQUET_PATH",
    "/Users/kaushalshivaprakash/Desktop/project3/data/processed/cleaned_citibike/citibike_2023_top3.parquet",
)
TRAIN_FRAC   = 0.8   # share of rows (from the start) used for training
MAX_LAG      = 28    # number of lag features lag_1 .. lag_28
TOP_K        = 10    # number of features kept by importance ranking
# ──────────────────────────────────────────────────────────────────────────────

def load_and_agg(path):
    """
    Read the parquet file at `path` and count its rows per hour.

    Args:
        path: location of a parquet file containing a `started_at` column;
            the column must be datetime-like because `.dt.floor` is applied.

    Returns:
        DataFrame with columns `datetime` (`started_at` floored to the hour)
        and `count` (number of rows in that hour), sorted by `datetime`.
        Hours with no rows do not appear in the result.
    """
    trips = pd.read_parquet(path)
    # "h" is the current hourly alias; the upper-case "H" spelling is
    # deprecated in recent pandas releases and emits a FutureWarning.
    # `assign` leaves the loaded frame untouched instead of adding a column.
    bucketed = trips.assign(datetime=trips["started_at"].dt.floor("h"))
    agg = bucketed.groupby("datetime").size().reset_index(name="count")
    return agg.sort_values("datetime")

def train_test_split_ts(df, frac):
    """
    Split `df` by position into a leading part and the remainder.

    The cut point is `int(len(df) * frac)`; rows before it form the first
    element of the returned tuple, rows from it onward form the second.
    """
    cut = int(len(df) * frac)
    head = df.iloc[:cut]
    tail = df.iloc[cut:]
    return head, tail

def log_baseline(train, test):
    """
    Log a constant-forecast baseline as the MLflow run "baseline_mean".

    The forecast for every test row is the mean of `train["count"]`; the
    resulting MAE against `test["count"]` is logged as metric "mae" and printed.
    """
    run_label = "baseline_mean"
    with mlflow.start_run(run_name=run_label):
        train_mean = train["count"].mean()
        constant_forecast = [train_mean for _ in range(len(test))]
        mae = mean_absolute_error(test["count"], constant_forecast)

        mlflow.log_param("model_type", run_label)
        mlflow.log_metric("mae", mae)
        print("[baseline] MAE = {:.2f}".format(mae))

def log_lag_model(df):
    """
    Train a LightGBM regressor on lag features and log it as run "lgbm_28lag".

    Columns `lag_1` .. `lag_{MAX_LAG}` are built by shifting `count`, rows with
    missing values are dropped, and the result is split with
    `train_test_split_ts` using `TRAIN_FRAC`. The run records the model type,
    the number of lags, the test MAE and the fitted model (artifact "model").
    """
    lag_cols = [f"lag_{k}" for k in range(1, MAX_LAG + 1)]

    lagged = df.copy()
    for k, col in enumerate(lag_cols, start=1):
        lagged[col] = lagged["count"].shift(k)
    # The first rows have no history for every lag, so they are discarded.
    lagged = lagged.dropna().reset_index(drop=True)

    fit_part, eval_part = train_test_split_ts(lagged, TRAIN_FRAC)
    X_fit, y_fit = fit_part[lag_cols], fit_part["count"]
    X_eval, y_eval = eval_part[lag_cols], eval_part["count"]

    with mlflow.start_run(run_name="lgbm_28lag"):
        mlflow.log_param("model_type", "lgbm_28lag")
        mlflow.log_param("num_lags", MAX_LAG)

        regressor = LGBMRegressor(random_state=42)
        regressor.fit(X_fit, y_fit)
        mae = mean_absolute_error(y_eval, regressor.predict(X_eval))

        mlflow.log_metric("mae", mae)
        mlflow.sklearn.log_model(regressor, "model")
        print("[lgbm_28lag] MAE = {:.2f}".format(mae))

def log_topk_model(df):
    """
    Train LightGBM on the `TOP_K` most important lags; log as "lgbm_top10_imp".

    A first regressor is fitted on all `lag_1` .. `lag_{MAX_LAG}` columns to
    obtain `feature_importances_`; the `TOP_K` largest are kept and a second
    regressor is fitted on just those. The run records the model type, number
    of lags, the selected features, the test MAE and the fitted model.
    """
    lag_cols = [f"lag_{k}" for k in range(1, MAX_LAG + 1)]

    # Full lag table, then a positional split.
    lagged = df.copy()
    for k, col in enumerate(lag_cols, start=1):
        lagged[col] = lagged["count"].shift(k)
    lagged = lagged.dropna().reset_index(drop=True)

    fit_part, eval_part = train_test_split_ts(lagged, TRAIN_FRAC)
    X_fit, y_fit = fit_part[lag_cols], fit_part["count"]
    X_eval, y_eval = eval_part[lag_cols], eval_part["count"]

    # Rank the lags with a throwaway model fitted on the training part only.
    ranker = LGBMRegressor(random_state=42)
    ranker.fit(X_fit, y_fit)
    ranking = pd.Series(ranker.feature_importances_, index=lag_cols)
    chosen = ranking.nlargest(TOP_K).index.tolist()

    with mlflow.start_run(run_name="lgbm_top10_imp"):
        for key, value in (
            ("model_type", "lgbm_top10_imp"),
            ("num_lags", MAX_LAG),
            ("selected_feats", chosen),
        ):
            mlflow.log_param(key, value)

        regressor = LGBMRegressor(random_state=42)
        regressor.fit(X_fit[chosen], y_fit)
        mae = mean_absolute_error(y_eval, regressor.predict(X_eval[chosen]))

        mlflow.log_metric("mae", mae)
        mlflow.sklearn.log_model(regressor, "model")
        print("[lgbm_top10_imp] MAE = {:.2f}".format(mae))

def main():
    """
    Load the hourly counts from `PARQUET_PATH` and log the three runs in order:
    the mean baseline, the lag model and the top-importance model.
    """
    hourly = load_and_agg(PARQUET_PATH)
    fit_part, eval_part = train_test_split_ts(hourly, TRAIN_FRAC)

    # Step 1: the baseline works on the raw split.
    log_baseline(fit_part, eval_part)

    # Steps 2 and 3: both LightGBM variants build their own lagged split.
    for log_step in (log_lag_model, log_topk_model):
        log_step(hourly)

    print("\n✅ All three models have been logged under experiment '{}'".format(EXPERIMENT_NAME))


if __name__ == "__main__":
    main()

2025/05/10 15:30:07 INFO mlflow.tracking.fluent: Experiment with name 'CitiBike_Remote_Experiment' does not exist. Creating a new experiment.


[baseline] MAE = 31.20
🏃 View run baseline_mean at: https://dagshub.com/kaushal-shivaprakashan/final_project.mlflow/#/experiments/0/runs/bc294ae85d6a41d19346bafdce4518c5
🧪 View experiment at: https://dagshub.com/kaushal-shivaprakashan/final_project.mlflow/#/experiments/0
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000781 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5326
[LightGBM] [Info] Number of data points in the train set: 6582, number of used features: 28
[LightGBM] [Info] Start training from score 45.662413




[lgbm_28lag] MAE = 8.22
🏃 View run lgbm_28lag at: https://dagshub.com/kaushal-shivaprakashan/final_project.mlflow/#/experiments/0/runs/f205531465e347f4ac2719fb2df7b3b1
🧪 View experiment at: https://dagshub.com/kaushal-shivaprakashan/final_project.mlflow/#/experiments/0
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000948 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5326
[LightGBM] [Info] Number of data points in the train set: 6582, number of used features: 28
[LightGBM] [Info] Start training from score 45.662413
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000825 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1904
[LightGBM] [Info] Number of data points in the train set: 6582, number of used features: 10
[LightGBM] [Info] Start training from score 45.662413




[lgbm_top10_imp] MAE = 8.33
🏃 View run lgbm_top10_imp at: https://dagshub.com/kaushal-shivaprakashan/final_project.mlflow/#/experiments/0/runs/c4b45d6d42024e72b74aab6ecef5b727
🧪 View experiment at: https://dagshub.com/kaushal-shivaprakashan/final_project.mlflow/#/experiments/0

✅ All three models have been logged under experiment 'CitiBike_Remote_Experiment'


In [3]:
#!/usr/bin/env python3
"""
train_and_log_all.py

Loads cleaned Citibike data, then:
  1) logs a baseline mean model
  2) logs a LightGBM on 28 lag features
  3) logs a LightGBM on top-10 importance features

Each run logs:
  - mae
The two LightGBM runs additionally log:
  - mae_improvement = baseline_mae - mae
  - pct_improvement = (baseline_mae - mae) / baseline_mae

At the end, prints out which model performed best (lowest MAE).
"""

import os
import pandas as pd
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
import mlflow

# ──────────────────────────────────────────────────────────────────────────────
# DagsHub MLflow settings
# The username is not a secret, so the previous value is kept as the default;
# an already-exported MLFLOW_TRACKING_USERNAME takes precedence.
os.environ.setdefault("MLFLOW_TRACKING_USERNAME", "kaushal-shivaprakashan")

# SECURITY: the access token must never be stored in the notebook. It has to be
# present in the environment already. A token that was ever committed in source
# should be treated as leaked and revoked.
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    raise RuntimeError(
        "MLFLOW_TRACKING_PASSWORD is not set. Export your DagsHub access token "
        "in the environment before starting the kernel."
    )
mlflow.set_tracking_uri("https://dagshub.com/kaushal-shivaprakashan/final_project.mlflow")

EXPERIMENT_NAME = "CitiBike_Remote_Experiment"
mlflow.set_experiment(EXPERIMENT_NAME)

# Data & split config
# CITIBIKE_PARQUET_PATH lets other machines point at their own copy of the
# data; the original absolute path remains the default.
PARQUET_PATH = os.environ.get(
    "CITIBIKE_PARQUET_PATH",
    "/Users/kaushalshivaprakash/Desktop/project3/data/processed/cleaned_citibike/citibike_2023_top3.parquet",
)
TRAIN_FRAC   = 0.8   # share of rows (from the start) used for training
MAX_LAG      = 28    # number of lag features lag_1 .. lag_28
TOP_K        = 10    # number of features kept by importance ranking
# ──────────────────────────────────────────────────────────────────────────────

def load_and_agg(path):
    """
    Read the parquet file at `path` and count its rows per hour.

    Args:
        path: location of a parquet file containing a `started_at` column;
            the column must be datetime-like because `.dt.floor` is applied.

    Returns:
        DataFrame with columns `datetime` (`started_at` floored to the hour)
        and `count` (number of rows in that hour), sorted by `datetime` with a
        fresh 0..n-1 index. Hours with no rows do not appear in the result.
    """
    trips = pd.read_parquet(path)
    # "h" is the current hourly alias; the upper-case "H" spelling is
    # deprecated in recent pandas releases and emits a FutureWarning.
    # `assign` leaves the loaded frame untouched instead of adding a column.
    bucketed = trips.assign(datetime=trips["started_at"].dt.floor("h"))
    agg = bucketed.groupby("datetime").size().reset_index(name="count")
    return agg.sort_values("datetime").reset_index(drop=True)

def train_test_split_ts(df, frac):
    """
    Positional two-way split of `df`.

    Returns a `(first, rest)` tuple where `first` holds the rows before
    position `int(len(df) * frac)` and `rest` holds the rows from there on.
    """
    boundary = int(len(df) * frac)
    first_rows = slice(None, boundary)
    remaining_rows = slice(boundary, None)
    return df.iloc[first_rows], df.iloc[remaining_rows]

def log_baseline(train, test):
    """
    Log a constant-forecast baseline as the MLflow run "baseline_mean".

    The forecast for every test row is the mean of `train["count"]`.

    Returns:
        The MAE of that forecast against `test["count"]`, which is also logged
        as metric "mae" and printed.
    """
    run_label = "baseline_mean"
    with mlflow.start_run(run_name=run_label):
        train_mean = train["count"].mean()
        constant_forecast = [train_mean for _ in range(len(test))]
        mae = mean_absolute_error(test["count"], constant_forecast)

        mlflow.log_param("model_type", run_label)
        mlflow.log_metric("mae", mae)
        print("[baseline_mean] MAE = {:.2f}".format(mae))
        return mae

def log_lag_model(df, baseline_mae):
    """
    Train a LightGBM regressor on lag features and log it as run "lgbm_28lag".

    Columns `lag_1` .. `lag_{MAX_LAG}` are built by shifting `count`, rows with
    missing values are dropped, and the result is split with
    `train_test_split_ts` using `TRAIN_FRAC`.

    Args:
        df: frame with a `count` column.
        baseline_mae: reference MAE used for the improvement metrics; a falsy
            value makes the relative improvement 0.0 instead of dividing.

    Returns:
        The test MAE of the fitted model.
    """
    lag_cols = [f"lag_{k}" for k in range(1, MAX_LAG + 1)]

    lagged = df.copy()
    for k, col in enumerate(lag_cols, start=1):
        lagged[col] = lagged["count"].shift(k)
    # The first rows have no history for every lag, so they are discarded.
    lagged = lagged.dropna().reset_index(drop=True)

    fit_part, eval_part = train_test_split_ts(lagged, TRAIN_FRAC)
    X_fit, y_fit = fit_part[lag_cols], fit_part["count"]
    X_eval, y_eval = eval_part[lag_cols], eval_part["count"]

    with mlflow.start_run(run_name="lgbm_28lag"):
        mlflow.log_param("model_type", "lgbm_28lag")
        mlflow.log_param("num_lags", MAX_LAG)

        regressor = LGBMRegressor(random_state=42)
        regressor.fit(X_fit, y_fit)
        mae = mean_absolute_error(y_eval, regressor.predict(X_eval))

        # Absolute and relative gain over the baseline.
        gain = baseline_mae - mae
        if baseline_mae:
            gain_ratio = gain / baseline_mae
        else:
            gain_ratio = 0.0

        for metric_name, metric_value in (
            ("mae", mae),
            ("mae_improvement", gain),
            ("pct_improvement", gain_ratio),
        ):
            mlflow.log_metric(metric_name, metric_value)
        mlflow.sklearn.log_model(regressor, "model")

        print(f"[lgbm_28lag] MAE = {mae:.2f} | Δ = {gain:.2f} ({gain_ratio:.1%})")
        return mae

def log_topk_model(df, baseline_mae):
    """
    Train LightGBM on the `TOP_K` most important lags; log as "lgbm_top10_imp".

    A first regressor is fitted on all `lag_1` .. `lag_{MAX_LAG}` columns to
    obtain `feature_importances_`; the `TOP_K` largest are kept and a second
    regressor is fitted on just those.

    Args:
        df: frame with a `count` column.
        baseline_mae: reference MAE used for the improvement metrics; a falsy
            value makes the relative improvement 0.0 instead of dividing.

    Returns:
        The test MAE of the model fitted on the selected features.
    """
    lag_cols = [f"lag_{k}" for k in range(1, MAX_LAG + 1)]

    lagged = df.copy()
    for k, col in enumerate(lag_cols, start=1):
        lagged[col] = lagged["count"].shift(k)
    lagged = lagged.dropna().reset_index(drop=True)

    fit_part, eval_part = train_test_split_ts(lagged, TRAIN_FRAC)
    X_fit, y_fit = fit_part[lag_cols], fit_part["count"]
    X_eval, y_eval = eval_part[lag_cols], eval_part["count"]

    # Rank the lags with a throwaway model fitted on the training part only.
    ranker = LGBMRegressor(random_state=42)
    ranker.fit(X_fit, y_fit)
    ranking = pd.Series(ranker.feature_importances_, index=lag_cols)
    chosen = ranking.nlargest(TOP_K).index.tolist()

    with mlflow.start_run(run_name="lgbm_top10_imp"):
        for key, value in (
            ("model_type", "lgbm_top10_imp"),
            ("num_lags", MAX_LAG),
            ("selected_feats", chosen),
        ):
            mlflow.log_param(key, value)

        regressor = LGBMRegressor(random_state=42)
        regressor.fit(X_fit[chosen], y_fit)
        mae = mean_absolute_error(y_eval, regressor.predict(X_eval[chosen]))

        # Absolute and relative gain over the baseline.
        gain = baseline_mae - mae
        if baseline_mae:
            gain_ratio = gain / baseline_mae
        else:
            gain_ratio = 0.0

        for metric_name, metric_value in (
            ("mae", mae),
            ("mae_improvement", gain),
            ("pct_improvement", gain_ratio),
        ):
            mlflow.log_metric(metric_name, metric_value)
        mlflow.sklearn.log_model(regressor, "model")

        print(f"[lgbm_top10_imp] MAE = {mae:.2f} | Δ = {gain:.2f} ({gain_ratio:.1%})")
        return mae

def main():
    """
    Log the baseline and both LightGBM runs, then print the name and MAE of
    the run with the lowest MAE.
    """
    hourly = load_and_agg(PARQUET_PATH)
    fit_part, eval_part = train_test_split_ts(hourly, TRAIN_FRAC)

    # The baseline MAE feeds the improvement metrics of the other two runs.
    baseline_mae = log_baseline(fit_part, eval_part)
    scores = {
        "baseline_mean": baseline_mae,
        "lgbm_28lag": log_lag_model(hourly, baseline_mae),
        "lgbm_top10_imp": log_topk_model(hourly, baseline_mae),
    }

    # Lowest MAE wins; on ties the earliest-inserted entry is kept.
    winner, winner_mae = min(scores.items(), key=lambda entry: entry[1])
    print("\n✅ Best model: '{}' with MAE = {:.2f}".format(winner, winner_mae))


if __name__ == "__main__":
    main()

[baseline_mean] MAE = 31.20
🏃 View run baseline_mean at: https://dagshub.com/kaushal-shivaprakashan/final_project.mlflow/#/experiments/0/runs/26072956d953438a9f1274ce4ab9d1e3
🧪 View experiment at: https://dagshub.com/kaushal-shivaprakashan/final_project.mlflow/#/experiments/0
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001244 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5326
[LightGBM] [Info] Number of data points in the train set: 6582, number of used features: 28
[LightGBM] [Info] Start training from score 45.662413




[lgbm_28lag] MAE = 8.22 | Δ = 22.98 (73.7%)
🏃 View run lgbm_28lag at: https://dagshub.com/kaushal-shivaprakashan/final_project.mlflow/#/experiments/0/runs/305b68c9cd9c463fb359e5b715a94872
🧪 View experiment at: https://dagshub.com/kaushal-shivaprakashan/final_project.mlflow/#/experiments/0
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000639 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5326
[LightGBM] [Info] Number of data points in the train set: 6582, number of used features: 28
[LightGBM] [Info] Start training from score 45.662413
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000783 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1904
[LightGBM] [Info] Number of data points in the train set: 6582, number of used features: 10
[LightGBM] [Info] Start training from score 45.662413




[lgbm_top10_imp] MAE = 8.33 | Δ = 22.86 (73.3%)
🏃 View run lgbm_top10_imp at: https://dagshub.com/kaushal-shivaprakashan/final_project.mlflow/#/experiments/0/runs/a23fe245e27341f182e28050d5b60992
🧪 View experiment at: https://dagshub.com/kaushal-shivaprakashan/final_project.mlflow/#/experiments/0

✅ Best model: 'lgbm_28lag' with MAE = 8.22
