In [1]:
# model2_lag_features.py

import pandas as pd
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
import mlflow

# ──────────────────────────────────────────────────────────────────────────────
PARQUET_PATH = "/Users/manu/Desktop/cda_final/data/processed/cleaned_citibike/citibike_2023_top3.parquet"
EXPERIMENT   = "CitiBike_Forecasting"
TRAIN_FRAC   = 0.8
MAX_LAG      = 28
# ──────────────────────────────────────────────────────────────────────────────

def load_agg(path):
    df = pd.read_parquet(path)
    df["datetime"] = df["started_at"].dt.floor("H")
    agg = df.groupby("datetime").size().reset_index(name="count")
    agg = agg.sort_values("datetime").reset_index(drop=True)
    return agg

def make_lag_features(df, max_lag):
    for lag in range(1, max_lag + 1):
        df[f"lag_{lag}"] = df["count"].shift(lag)
    return df.dropna().reset_index(drop=True)

def split(df, frac):
    idx = int(len(df) * frac)
    return df.iloc[:idx], df.iloc[idx:]

def main():
    mlflow.set_experiment(EXPERIMENT)
    df = load_agg(PARQUET_PATH)
    df = make_lag_features(df, MAX_LAG)
    train, test = split(df, TRAIN_FRAC)

    feats = [f"lag_{i}" for i in range(1, MAX_LAG + 1)]
    X_train, y_train = train[feats], train["count"]
    X_test,  y_test  = test[feats],  test["count"]

    with mlflow.start_run(run_name="lgbm_28lag"):
        mlflow.log_param("model_type",    "lgbm_28lag")
        mlflow.log_param("num_lag_features", MAX_LAG)

        model = LGBMRegressor(random_state=42)
        model.fit(X_train, y_train)

        preds = model.predict(X_test)
        mae   = mean_absolute_error(y_test, preds)

        mlflow.log_metric("mae", mae)
        mlflow.sklearn.log_model(model, "model")

    print(f"[lgbm_28lag] MAE = {mae:.2f}")

if __name__ == "__main__":
    main()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000324 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5326
[LightGBM] [Info] Number of data points in the train set: 6582, number of used features: 28
[LightGBM] [Info] Start training from score 45.662413




PermissionError: [Errno 13] Permission denied: '/Users/kaushalshivaprakash'

In [3]:
import hopsworks
import mlflow

# ——— Option A: Pass secrets & settings directly to hopsworks.login ———
project = hopsworks.login(
    host="c.app.hopsworks.ai",               
    project="ny_taxi_manognat",               
    api_key_value="akHPBh7lX0c8lAkU.Ki9JT00tDfdQYirHAG2lIWuKnp17150mQ9LWZFFsLh0tfVoP9W7YUIqUAxfvPkAB"  
)

# Grab the MLflow tracking URI from Hopsworks
tracking_uri = project.get_mlflow_tracking_uri()

# Point MLflow at your Hopsworks-managed server
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment("CitiBike_Forecasting")

# Now you can start runs & log to Hopsworks:
with mlflow.start_run(run_name="lgbm_28lag"):
    # … train your model …
    mlflow.log_metric("mae", mae)
    mlflow.sklearn.log_model(model, "model")



2025-05-10 19:35:06,938 INFO: Initializing external client
2025-05-10 19:35:06,939 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-10 19:35:07,717 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1215653


AttributeError: 'Project' object has no attribute 'get_mlflow_tracking_uri'