In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [7]:
import pandas as pd
import numpy as np
from pathlib import Path

BASE_DIR = Path().resolve().parent.parent  # works in Jupyter notebook in /ml/
DATA_PATH = BASE_DIR / "dataset" / "card_usage" / "card_subway_transform_cleaned.csv"
DATE_COL = "date"
TARGET_COL = "total_flow"

df = pd.read_csv(DATA_PATH, parse_dates=[DATE_COL], low_memory=False)
# ...rest of your code...
# numeric cleanup
for col in ["boardings","alightings","latitude","longitude","station_code","seoulmetro_code", TARGET_COL]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# weekend -> int
if "is_weekend" in df.columns:
    df["is_weekend"] = pd.to_numeric(df["is_weekend"], errors="coerce").fillna(0).astype(int)
else:
    df["is_weekend"] = (df[DATE_COL].dt.weekday >= 5).astype(int)

# station_key
code = None
if "seoulmetro_code" in df.columns:
    code = df["seoulmetro_code"]
elif "station_code" in df.columns:
    code = df["station_code"]

if code is not None:
    code_int = code.fillna(-1).astype(int)
    df["station_key"] = np.where(
        code.notna() & (code.astype(float) > 0),
        code_int.astype(str),
        df["line"].astype(str) + "|" + df["station_kr"].astype(str)
    )
else:
    df["station_key"] = df["line"].astype(str) + "|" + df["station_kr"].astype(str)

df = df.sort_values(["station_key", DATE_COL]).reset_index(drop=True)


FileNotFoundError: [Errno 2] No such file or directory: '/Users/melvinang/Documents/NUS/Y4 Winter/Team7/dataset/card_usage/card_subway_transform_cleaned.csv'

In [None]:
# FEATURES
df["day_of_week_num"] = df[DATE_COL].dt.dayofweek
df["day_of_month"] = df[DATE_COL].dt.day
df["week_of_year"] = df[DATE_COL].dt.isocalendar().week.astype(int)

df["flow_ratio"] = df["boardings"] / (df["alightings"] + 1)
df["flow_diff"] = df["boardings"] - df["alightings"]

LAGS = [1, 7, 14]
for lag in LAGS:
    df[f"flow_lag_{lag}"] = df.groupby("station_key")[TARGET_COL].shift(lag)

ROLL_WINDOWS = [7, 14]
for w in ROLL_WINDOWS:
    df[f"flow_roll_mean_{w}"] = df.groupby("station_key")[TARGET_COL].transform(
        lambda s: s.shift(1).rolling(w, min_periods=w).mean()
    )
    df[f"flow_roll_std_{w}"] = df.groupby("station_key")[TARGET_COL].transform(
        lambda s: s.shift(1).rolling(w, min_periods=w).std()
    )

# target: next day flow
df["target"] = df.groupby("station_key")[TARGET_COL].shift(-1)

FEATURES = [
    "flow_lag_1","flow_lag_7","flow_lag_14",
    "flow_roll_mean_7","flow_roll_mean_14",
    "flow_roll_std_7","flow_roll_std_14",
    "flow_ratio","flow_diff",
    "day_of_week_num","day_of_month","week_of_year",
    "is_weekend"
]
for g in ["latitude", "longitude"]:
    if g in df.columns:
        FEATURES.append(g)

needed = FEATURES + ["target", DATE_COL, "station_key"]
df_model = df.dropna(subset=needed).reset_index(drop=True)

In [None]:
# TIME SPLIT
split_date = df_model[DATE_COL].quantile(0.8)
train_df = df_model[df_model[DATE_COL] <= split_date].copy()
val_df   = df_model[df_model[DATE_COL] >  split_date].copy()

X_train, y_train = train_df[FEATURES], train_df["target"]
X_val,   y_val   = val_df[FEATURES],   val_df["target"]


In [None]:
# TRAIN MODEL
model = LGBMRegressor(
    n_estimators=5000,
    learning_rate=0.03,
    num_leaves=63,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
model.fit(X_train, y_train)

val_df["pred"] = model.predict(X_val)
val_df["residual"] = val_df["pred"] - val_df["target"]

mae = mean_absolute_error(y_val, val_df["pred"])
rmse = np.sqrt(mean_squared_error(y_val, val_df["pred"]))

print(f"VAL MAE : {mae:,.2f}")
print(f"VAL RMSE: {rmse:,.2f}")

In [None]:
# PLOT 1: Actual vs Pred (scatter)
plt.figure()
plt.scatter(val_df["target"], val_df["pred"], alpha=0.2)
plt.xlabel("Actual (target)")
plt.ylabel("Predicted")
plt.title("Validation: Actual vs Predicted (Next-day Total Flow)")
plt.tight_layout()
plt.show()

# PLOT 2: Residual distribution
plt.figure()
plt.hist(val_df["residual"], bins=60)
plt.xlabel("Residual (Pred - Actual)")
plt.ylabel("Count")
plt.title("Residual Distribution (Validation)")
plt.tight_layout()
plt.show()

# PLOT 3: Time-series plot for one station
# Choose a station with many rows in val set
station_counts = val_df["station_key"].value_counts()
example_station = station_counts.index[0]  # most frequent in val
station_df = val_df[val_df["station_key"] == example_station].sort_values(DATE_COL)

plt.figure()
plt.plot(station_df[DATE_COL], station_df["target"], label="Actual")
plt.plot(station_df[DATE_COL], station_df["pred"], label="Predicted")
plt.xlabel("Date")
plt.ylabel("Next-day Total Flow")
plt.title(f"Station Forecast (station_key={example_station})")
plt.legend()
plt.tight_layout()
plt.show()

# PLOT 4: Feature importance (top 15)
imp = pd.DataFrame({
    "feature": FEATURES,
    "importance": model.feature_importances_
}).sort_values("importance", ascending=False).head(15)

plt.figure()
plt.barh(imp["feature"][::-1], imp["importance"][::-1])
plt.xlabel("Importance")
plt.title("Top 15 Feature Importances (LightGBM)")
plt.tight_layout()
plt.show()
