In [1]:
# model3_feature_reduction.py

import pandas as pd
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
import mlflow

# ──────────────────────────────────────────────────────────────────────────────
PARQUET_PATH = "/Users/kaushalshivaprakash/Desktop/project3/data/processed/cleaned_citibike/citibike_2023_top3.parquet"
EXPERIMENT   = "CitiBike_Forecasting"
TRAIN_FRAC   = 0.8
MAX_LAG      = 28
TOP_K        = 10   # how many features to keep
# ──────────────────────────────────────────────────────────────────────────────

def load_and_prepare(path):
    df = pd.read_parquet(path)
    df["datetime"] = df["started_at"].dt.floor("H")
    agg = df.groupby("datetime").size().reset_index(name="count").sort_values("datetime")
    # lag features
    for lag in range(1, MAX_LAG + 1):
        agg[f"lag_{lag}"] = agg["count"].shift(lag)
    agg = agg.dropna().reset_index(drop=True)
    return agg

def split(df, frac):
    idx = int(len(df) * frac)
    return df.iloc[:idx], df.iloc[idx:]

def main():
    mlflow.set_experiment(EXPERIMENT)
    df = load_and_prepare(PARQUET_PATH)
    train, test = split(df, TRAIN_FRAC)

    feats = [f"lag_{i}" for i in range(1, MAX_LAG + 1)]
    X_train, y_train = train[feats], train["count"]
    X_test,  y_test  = test[feats],  test["count"]

    # 1) initial model to get importances
    base = LGBMRegressor(random_state=42)
    base.fit(X_train, y_train)
    importances = pd.Series(base.feature_importances_, index=feats)
    top_features = importances.nlargest(TOP_K).index.tolist()

    # 2) retrain on top-K
    with mlflow.start_run(run_name="lgbm_top10_imp"):
        mlflow.log_param("model_type",      "lgbm_top10_importance")
        mlflow.log_param("num_lag_features", MAX_LAG)
        mlflow.log_param("selected_features", top_features)

        model = LGBMRegressor(random_state=42)
        model.fit(X_train[top_features], y_train)

        preds = model.predict(X_test[top_features])
        mae   = mean_absolute_error(y_test, preds)

        mlflow.log_param("num_selected_features", len(top_features))
        mlflow.log_metric("mae", mae)
        mlflow.sklearn.log_model(model, "model")

    print(f"[lgbm_top10_imp] MAE = {mae:.2f}")

if __name__ == "__main__":
    main()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000504 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5326
[LightGBM] [Info] Number of data points in the train set: 6582, number of used features: 28
[LightGBM] [Info] Start training from score 45.662413
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000301 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1904
[LightGBM] [Info] Number of data points in the train set: 6582, number of used features: 10
[LightGBM] [Info] Start training from score 45.662413




[lgbm_top10_imp] MAE = 8.33
