In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
import mlflow
import os

# Load the engineered feature table; "datetime" is parsed as a timestamp so
# the rows can be ordered and split chronologically.
df = pd.read_csv("./data/features/citibike_features.csv", parse_dates=["datetime"])

# Select features and target: every lag_/rolling_ column plus the three
# calendar columns.
feature_cols = [
    col
    for col in df.columns
    if col.startswith(("lag_", "rolling_")) or col in ["hour", "weekday", "is_weekend"]
]
target_col = "ride_count"

# Fail early with a clear message instead of an obscure error during training.
if df.empty:
    raise ValueError("Feature dataset is empty; nothing to split or train on.")
if not feature_cols:
    raise ValueError("No lag_/rolling_/calendar feature columns found in the dataset.")

# Train-test split by time.
# Rebind instead of sorting in place, and use a stable sort so rows that share
# a timestamp keep their relative order from the file.
df = df.sort_values("datetime", kind="mergesort")
train_size = int(len(df) * 0.8)  # row position of the 80% mark

# Cut on the timestamp found at the 80% mark rather than on the raw row
# position. If several rows share that timestamp (e.g. one row per station per
# hour -- NOTE(review): confirm against the feature pipeline), a positional cut
# would put the same moment in both train and test. When timestamps are unique
# this yields exactly the same split as a positional cut.
split_time = df["datetime"].iloc[train_size]
in_train = df["datetime"] < split_time
train_df = df[in_train]
test_df = df[~in_train]  # complement, so rows with a missing datetime stay in test

if train_df.empty:
    raise ValueError("Chronological split left no training rows; check the 'datetime' column.")

X_train = train_df[feature_cols]
y_train = train_df[target_col]
X_test = test_df[feature_cols]
y_test = test_df[target_col]

# Baseline model: predict the training-set mean of the target for every test
# row. Any trained model should beat this reference MAE.
train_target_mean = y_train.mean()
baseline_pred = [train_target_mean for _ in range(len(y_test))]
baseline_mae = mean_absolute_error(y_true=y_test, y_pred=baseline_pred)

# Select the MLflow experiment (also used by the runs started after this one)
# and record the baseline as its own run.
mlflow.set_experiment("CitiBikeTripPrediction")
with mlflow.start_run(run_name="baseline_model"):
    mlflow.log_metric(key="mae", value=baseline_mae)
    mlflow.log_param(key="model_type", value="mean_baseline")

def train_and_log_lightgbm(run_name, features, X_train, y_train, X_test, y_test):
    """Fit a LightGBM regressor on a feature subset, score it, and log the run to MLflow.

    Parameters
    ----------
    run_name : str
        Name given to the MLflow run.
    features : list of str
        Columns of ``X_train`` / ``X_test`` used for fitting and prediction.
    X_train, y_train
        Training feature frame and target.
    X_test, y_test
        Held-out feature frame and target used to compute the MAE.

    Returns
    -------
    tuple
        ``(model, predictions, mae, run)``: the fitted ``LGBMRegressor``, its
        predictions on ``X_test[features]``, the test MAE, and the MLflow run
        object the results were logged under.
    """
    model = LGBMRegressor(n_estimators=100, random_state=42)
    model.fit(X_train[features], y_train)
    predictions = model.predict(X_test[features])
    mae = mean_absolute_error(y_test, predictions)

    with mlflow.start_run(run_name=run_name) as run:
        mlflow.log_metric("mae", mae)
        mlflow.log_params(model.get_params())
        # Record which columns the model saw so the run can be reproduced
        # from MLflow alone. The list is stored as an artifact, the count as
        # a parameter.
        mlflow.log_param("n_features", len(features))
        mlflow.log_dict({"features": list(features)}, "features.json")
        mlflow.sklearn.log_model(model, "model")

    return model, predictions, mae, run


# Model 1: LightGBM with all features
model1, pred1, mae1, run1 = train_and_log_lightgbm(
    "lightgbm_all_features", feature_cols, X_train, y_train, X_test, y_test
)

# Model 2: Top 10 important features, ranked by model 1's feature importances
importances = pd.Series(model1.feature_importances_, index=feature_cols).sort_values(ascending=False)
top10_features = importances.head(10).index.tolist()

model2, pred2, mae2, run2 = train_and_log_lightgbm(
    "lightgbm_top10_features", top10_features, X_train, y_train, X_test, y_test
)

# Summarise the test MAE of the baseline and both LightGBM models, one per line.
mae_report = (
    f"Baseline MAE: {baseline_mae:.2f}",
    f"LightGBM All Features MAE: {mae1:.2f}",
    f"LightGBM Top 10 Features MAE: {mae2:.2f}",
)
for report_line in mae_report:
    print(report_line)


2025/05/10 19:45:10 INFO mlflow.tracking.fluent: Experiment with name 'CitiBikeTripPrediction' does not exist. Creating a new experiment.


🏃 View run baseline_model at: https://dagshub.com/omsalunke19/cda500-final.mlflow/#/experiments/0/runs/6f6a2ad4f69d4f58b726394ec209cd60
🧪 View experiment at: https://dagshub.com/omsalunke19/cda500-final.mlflow/#/experiments/0
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004837 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 762
[LightGBM] [Info] Number of data points in the train set: 52532, number of used features: 35
[LightGBM] [Info] Start training from score 1.184364




🏃 View run lightgbm_all_features at: https://dagshub.com/omsalunke19/cda500-final.mlflow/#/experiments/0/runs/08573c402b584765ad2ab96fec42f0a7
🧪 View experiment at: https://dagshub.com/omsalunke19/cda500-final.mlflow/#/experiments/0
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001168 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 481
[LightGBM] [Info] Number of data points in the train set: 52532, number of used features: 10
[LightGBM] [Info] Start training from score 1.184364




🏃 View run lightgbm_top10_features at: https://dagshub.com/omsalunke19/cda500-final.mlflow/#/experiments/0/runs/4deb22d300af4d44942e65a7aaf789ef
🧪 View experiment at: https://dagshub.com/omsalunke19/cda500-final.mlflow/#/experiments/0
Baseline MAE: 0.31
LightGBM All Features MAE: 0.28
LightGBM Top 10 Features MAE: 0.28
