In [17]:
%load_ext autoreload
%autoreload 2

import sys
import os
from datetime import datetime, timedelta
import pandas as pd

# Add project root (the parent of the notebook's working directory) to the
# Python path so the `src` package can be imported.
# Guarded so that re-running this cell does not keep appending the same entry.
_project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if _project_root not in sys.path:
    sys.path.append(_project_root)

import src.config as config
from src.inference import get_feature_store, load_model_from_registry, get_model_predictions
from src.data_utils import transform_ts_data_info_features


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
import pandas as pd
import numpy as np
import joblib
import mlflow
import hopsworks
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
from datetime import timedelta

from src import config as c
from src.pipeline_utils import create_lag_features
# Select the MLflow experiment that runs from this notebook are logged under.
mlflow.set_experiment("xgboost-lag3")

# Step 2: Load Features from Hopsworks
# Project name and API key are read from `src.config` (imported as `c`),
# so no credentials appear in the notebook itself.
print("🔐 Logging in to Hopsworks...")
project = hopsworks.login(project=c.HOPSWORKS_PROJECT_NAME, api_key_value=c.HOPSWORKS_API_KEY)
fs = project.get_feature_store()

# Read the configured feature group (name + version from config) into `df`.
# `project` and `df` are reused by later cells (model registry, lag features).
fg = fs.get_feature_group(name=c.FEATURE_GROUP_NAME, version=c.FEATURE_GROUP_VERSION)
df = fg.read()
print("✅ Loaded features:", df.shape)

🔐 Logging in to Hopsworks...
2025-05-10 10:32:35,765 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-05-10 10:32:35,866 INFO: Initializing external client
2025-05-10 10:32:35,869 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-10 10:32:36,975 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1215672
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.70s) 
✅ Loaded features: (26535, 3)


In [19]:
# Step 3: Preprocess Features (create lags)
# Order rows by station and hour, add the 1/2/3-step lag columns of `rides`
# per `pickup_location_id`, then discard rows left with missing values.
# NOTE(review): `create_lag_features` is a project helper; presumably it relies
# on the rows already being time-ordered within each station — confirm.
df = (
    df.sort_values(["pickup_location_id", "pickup_hour"])
    .pipe(
        create_lag_features,
        lags=[1, 2, 3],
        group_col="pickup_location_id",
        target_col="rides",
    )
    .dropna()
)
print("✅ Data with lags:", df.shape)

✅ Data with lags: (26526, 6)


In [20]:
# Step 4: Train/Test Split (last 7 days as test)
# Rows whose pickup hour falls within 7 days of the latest timestamp form the
# test set; everything strictly earlier is used for training.
df["pickup_hour"] = pd.to_datetime(df["pickup_hour"])
max_date = df["pickup_hour"].max()
split_date = max_date - timedelta(days=7)

is_train = df["pickup_hour"] < split_date
is_test = df["pickup_hour"] >= split_date
train_df = df.loc[is_train]
test_df = df.loc[is_test]

# Model inputs are the three lag columns; the target is the ride count.
lag_columns = ["rides_t-1", "rides_t-2", "rides_t-3"]
X_train, y_train = train_df[lag_columns], train_df["rides"]
X_test, y_test = test_df[lag_columns], test_df["rides"]

In [21]:
# Step 5: Model Training and MLflow Tracking
# Trains an XGBoost regressor on the lag features, evaluates it on the held-out
# test split, logs parameters/metrics/model to MLflow and saves a local pickle
# (`model_path`) that the registry cell uploads afterwards.
mlflow.set_experiment("xgboost-lag3")

# Single source of truth for the hyperparameters: the same values are passed to
# the model and logged to MLflow, so the two cannot drift apart.
model_params = {"n_estimators": 100, "learning_rate": 0.1}

with mlflow.start_run():
    model = XGBRegressor(**model_params, random_state=42)
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
    # `mean_squared_error` was deprecated in scikit-learn 1.4 and removed in
    # 1.6, so this form works on both old and new versions.
    rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
    mae = mean_absolute_error(y_test, preds)

    mlflow.log_params({"model_type": "XGBoost", **model_params})
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)

    # Save model
    model_path = c.MODELS_DIR / "xgb_model.pkl"
    joblib.dump(model, model_path)
    mlflow.sklearn.log_model(model, "model")

print("✅ Model trained. RMSE:", rmse, "MAE:", mae)



🏃 View run likeable-moose-352 at: https://dagshub.com/nivesharath/citi_bike.mlflow/#/experiments/7/runs/44333edf28524987a23f95c33ddb622e
🧪 View experiment at: https://dagshub.com/nivesharath/citi_bike.mlflow/#/experiments/7
✅ Model trained. RMSE: 4.8068401157508545 MAE: 3.301681501743121


In [22]:
# Re-display the evaluation metrics; `rmse` and `mae` are the globals assigned
# by the training cell, so that cell must have been run first.
print("✅ Model trained. RMSE:", rmse, "MAE:", mae)

✅ Model trained. RMSE: 4.8068401157508545 MAE: 3.301681501743121


In [23]:
# Step 6: Register Model in Hopsworks
# Create a new model entry in the project's model registry and attach both
# evaluation metrics (RMSE and MAE) from the training cell, so a single
# registered version carries the complete evaluation.
mr = project.get_model_registry()

model_hops = mr.python.create_model(
    name=c.MODEL_NAME,
    metrics={"rmse": rmse, "mae": mae},
    description="XGBoost model trained on lag features",
)

# Upload the pickled model written by the training cell; without this call the
# registry entry would have no model file attached.
model_hops.save(str(model_path))

print("📦 Model registered in Hopsworks:", model_hops.name)




  0%|          | 0/6 [00:00<?, ?it/s]

Uploading C:\Users\nives\Downloads\sp25_citi_bike\models\xgb_model.pkl: 0.000%|          | 0/411655 elapsed<00…

Model created, explore it at https://c.app.hopsworks.ai:443/p/1215672/models/citi_bike_data/7
📦 Model registered in Hopsworks: citi_bike_data


In [24]:
# Step 6 (repeat): Register Model in Hopsworks, recording MAE
# Same registration flow as the previous cell, but the metric attached is MAE.
# Each run of this cell creates a further registry version under the same
# model name (the captured outputs show versions 7 and 8).
# NOTE(review): if one version carrying both metrics is intended, this cell is
# redundant and is a candidate for removal — confirm before deleting.
mr = project.get_model_registry()

model_hops = mr.python.create_model(
    name=c.MODEL_NAME,
    metrics={"mae": mae},
    description="XGBoost model trained on lag features"
)

# Upload the pickled model file (`model_path` from the training cell) to this
# registry entry.
model_hops.save(str(model_path))

print("📦 Model registered in Hopsworks:", model_hops.name)


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading C:\Users\nives\Downloads\sp25_citi_bike\models\xgb_model.pkl: 0.000%|          | 0/411655 elapsed<00…

Model created, explore it at https://c.app.hopsworks.ai:443/p/1215672/models/citi_bike_data/8
📦 Model registered in Hopsworks: citi_bike_data
