In [8]:
import pandas as pd
import mlflow
import joblib
from dotenv import load_dotenv
import os

# Step 1: Load environment variables
load_dotenv()
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))
mlflow.set_experiment("CitiBikeTripPrediction")

# Step 2: Load your feature dataset
df = pd.read_csv("./data/features/citibike_features.csv", parse_dates=["datetime"])
df = df.sort_values("datetime")

# Step 3: Select recent time window for inference
recent_df = df.groupby("start_station_id").tail(48)  # last 48 hours per station

feature_cols = [col for col in df.columns if col.startswith("lag_") or 
                col.startswith("rolling_") or col in ["hour", "weekday", "is_weekend"]]
X_recent = recent_df[feature_cols]

# Step 4: Load best model from MLflow (DagsHub)
client = mlflow.tracking.MlflowClient()
experiment = client.get_experiment_by_name("CitiBikeTripPrediction")
runs = client.search_runs(experiment.experiment_id, order_by=["metrics.mae ASC"], max_results=1)
best_run_id = runs[0].info.run_id
model_uri = f"runs:/{best_run_id}/model"

model = mlflow.pyfunc.load_model(model_uri)

# Step 5: Predict
recent_df["predicted_ride_count"] = model.predict(X_recent)

# Step 6: Save predictions
output_path = "./data/predictions/predictions.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
recent_df[["start_station_id", "datetime", "predicted_ride_count"]].to_csv(output_path, index=False)

print(f"✅ Predictions saved to {output_path}")


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

✅ Predictions saved to ./data/predictions/predictions.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recent_df["predicted_ride_count"] = model.predict(X_recent)
