In [33]:
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from pathlib import Path
from numpy import sqrt

In [None]:
# ==============================
# Predictor Class
# ==============================
class TimeSeriesRequestPredictor30MinHybrid:
    def __init__(self, sequence_length=48, prediction_horizon=48, rare_threshold=10):
        self.sequence_length = sequence_length
        self.prediction_horizon = prediction_horizon
        self.rare_threshold = rare_threshold
        self.scaler = StandardScaler()
        self.geo_encoder = LabelEncoder()
        self.model = None
        self.rf_models = {}
        self.slot_counts = {}

    # Preprocess
    def load_and_preprocess(self, df):
        if df.empty:
            # buat kolom tetap ada
            for col in ["request_date","origin_geo_hash","time_slot","request_count"]:
                if col not in df.columns:
                    df[col] = pd.NA
            return df

        # convert date
        df["request_date"] = pd.to_datetime(df["request_date"], errors="coerce", dayfirst=True)

        # slot_30min
        def extract_slot(val):
            if pd.isna(val):
                return -1
            if isinstance(val, str) and "-" in val:
                try:
                    h, m = map(int, val.split("-")[0].strip().split(":"))
                    return h*2 + m//30
                except:
                    return -1
            return -1
        df["slot_30min"] = df["time_slot"].apply(extract_slot)
        df["is_missing_slot"] = (df["slot_30min"]==-1).astype(int)

        # Encode geo
        if df["origin_geo_hash"].notna().sum() > 0:
            df["geo_encoded"] = self.geo_encoder.fit_transform(df["origin_geo_hash"])
        else:
            df["geo_encoded"] = 0

        df["day_of_week"] = df["request_date"].dt.dayofweek.fillna(0).astype(int)
        df["is_weekend"] = (df["day_of_week"] >=5).astype(int)

        df = df.sort_values(["origin_geo_hash","request_date","slot_30min"])
        return df

    # Merge histori + new, tambah geo baru jika perlu
    def load_and_preprocess_with_new_geos(self, historical_df, new_df):
        df_hist = historical_df.copy() if historical_df is not None else pd.DataFrame()
        df_new = new_df.copy() if new_df is not None else pd.DataFrame()

        # pastikan kolom
        for df in [df_hist, df_new]:
            for col in ["request_date","origin_geo_hash","time_slot","request_count"]:
                if col not in df.columns:
                    df[col] = pd.NA

        # semua geo unik
        all_geos = pd.concat([df_hist.get("origin_geo_hash", pd.Series(dtype=str)),
                      df_new.get("origin_geo_hash", pd.Series(dtype=str))]).dropna().unique()
        last_date_hist = df_hist["request_date"].max() if not df_hist.empty else pd.Timestamp.today()

        # tambahkan row geo baru
        new_rows = []
        for geo in all_geos:
            if geo not in df_hist["origin_geo_hash"].unique():
                for slot in range(48):
                    new_rows.append({
                        "origin_geo_hash": geo,
                        "request_date": last_date_hist,
                        "time_slot": f"{slot//2:02d}:{(slot%2)*30:02d}-{slot//2:02d}:{(slot%2+1)*30:02d}",
                        "request_count": 0
                    })
        if new_rows:
            df_hist = pd.concat([df_hist, pd.DataFrame(new_rows)], ignore_index=True)

        df_full = pd.concat([df_hist, df_new], ignore_index=True) if not df_new.empty else df_hist
        return self.load_and_preprocess(df_full)

    # Fill missing slots untuk tanggal terakhir
    @staticmethod
    def fill_missing_slots(df_last_day):
        all_slots = np.arange(48)
        all_geos = df_last_day["origin_geo_hash"].unique()
        new_rows = []
        for geo in all_geos:
            geo_slots = df_last_day[df_last_day["origin_geo_hash"]==geo]["slot_30min"].unique()
            missing_slots = set(all_slots) - set(geo_slots)
            for slot in missing_slots:
                new_rows.append({
                    "origin_geo_hash": geo,
                    "request_date": df_last_day["request_date"].max(),
                    "slot_30min": slot,
                    "request_count": 0
                })
        if new_rows:
            df_last_day = pd.concat([df_last_day, pd.DataFrame(new_rows)], ignore_index=True)
        return df_last_day

    # Train
    def train(self, df=None, epochs=20, validation_split=0.2):
        if df is None or df.empty:
            print("⚠️ Tidak ada data untuk train.")
            return df, None

        # hitung slot counts
        counts = df.groupby(["origin_geo_hash","slot_30min"]).size()
        self.slot_counts = counts.to_dict()

        # sequence generator
        X, y = self.create_sequences(df)
        if X.size != 0:
            X_scaled = self.scaler.fit_transform(X.reshape(-1,X.shape[-1])).reshape(X.shape)
            split_index = int(len(X_scaled)*(1-validation_split))
            X_train, X_val = X_scaled[:split_index], X_scaled[split_index:]
            y_train, y_val = y[:split_index], y[split_index:]

            # build LSTM
            self.model = self.build_model((X.shape[1], X.shape[2]))
            callbacks = [
                EarlyStopping(patience=10, restore_best_weights=True),
                ReduceLROnPlateau(factor=0.5, patience=5, min_lr=1e-6)
            ]
            self.model.fit(X_train, y_train, validation_data=(X_val,y_val),
                           epochs=epochs, batch_size=32, callbacks=callbacks, verbose=1)
        else:
            print("⚠️ Tidak cukup data untuk LSTM.")

        # RF untuk rare slots
        for (geo, slot), cnt in self.slot_counts.items():
            if cnt < self.rare_threshold:
                slot_df = df[(df["origin_geo_hash"]==geo) & (df["slot_30min"]==slot)]
                if len(slot_df) >= 3:
                    X_rf = slot_df[["day_of_week","is_weekend","geo_encoded"]]
                    y_rf = slot_df["request_count"]
                    rf = RandomForestRegressor(n_estimators=50, random_state=42)
                    rf.fit(X_rf, y_rf)
                    self.rf_models[(geo, slot)] = rf

        print(f"🔹 RF models trained: {len(self.rf_models)}")
        return df, None

    # Buat sequences
    def create_sequences(self, data):
        if data.empty:
            return np.array([]), np.array([])
        sequences, targets = [], []
        features = ["request_count","slot_30min","day_of_week","is_weekend","geo_encoded"]
        for geo in data["origin_geo_hash"].unique():
            geo_data = data[data["origin_geo_hash"]==geo].sort_values(["request_date","slot_30min"])
            feature_data = geo_data[features].fillna(0).values.copy()
            feature_data[:,0] = np.log1p(feature_data[:,0] + 1e-6)
            for i in range(len(feature_data)-self.sequence_length-self.prediction_horizon+1):
                seq = feature_data[i:i+self.sequence_length]
                target = feature_data[i+self.sequence_length:i+self.sequence_length+self.prediction_horizon,0]
                sequences.append(seq)
                targets.append(target)
        if len(sequences)==0:
            return np.array([]), np.array([])
        return np.array(sequences), np.array(targets)

    # Build LSTM
    def build_model(self, input_shape):
        model = Sequential([
            Input(shape=input_shape),
            LSTM(128, return_sequences=True, recurrent_dropout=0.2),
            LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
            LSTM(32, dropout=0.2, recurrent_dropout=0.2),
            Dense(64, activation="relu"),
            Dropout(0.3),
            Dense(32, activation="relu"),
            Dropout(0.2),
            Dense(self.prediction_horizon, activation="linear")
        ])
        model.compile(optimizer=Adam(0.001), loss="mse", metrics=["mae"])
        return model

    # Prediksi semua geo untuk H+1
    def predict_all_next_day_30min_filtered(self, df_last_day, fallback_value=0.0):
        if df_last_day.empty:
            return pd.DataFrame(columns=["geo_hash","date","slot_30min","predicted_request_count"])

        results = []
        for geo in df_last_day["origin_geo_hash"].unique():
            geo_data = df_last_day[df_last_day["origin_geo_hash"]==geo].sort_values("slot_30min")
            last_date = geo_data["request_date"].max()
            tomorrow_date = last_date + timedelta(days=1)
            all_slots = np.arange(48)

            out = {"geo_hash":[],"date":[],"slot_30min":[],"predicted_request_count":[]}
            for slot in all_slots:
                key = (geo,int(slot))
                if key in self.rf_models:
                    X_new = pd.DataFrame({
                        "day_of_week":[tomorrow_date.dayofweek],
                        "is_weekend":[int(tomorrow_date.dayofweek>=5)],
                        "geo_encoded":[self.geo_encoder.transform([geo])[0]]
                    })
                    try:
                        yhat = float(self.rf_models[key].predict(X_new)[0])
                    except:
                        yhat = fallback_value
                else:
                    yhat = fallback_value
                out["geo_hash"].append(geo)
                out["date"].append(tomorrow_date)
                out["slot_30min"].append(slot)
                out["predicted_request_count"].append(max(0,yhat))
            results.append(pd.DataFrame(out))
        return pd.concat(results, ignore_index=True)


In [38]:
# ==============================
# Pipeline main
# ==============================
historical_file = "data_historis.csv"
file_new_data = "data_29_09_2025.csv"

# Load histori
historical_df = pd.read_csv(historical_file) if Path(historical_file).exists() else pd.DataFrame()
# Load data baru
new_df = pd.read_csv(file_new_data) if Path(file_new_data).exists() else pd.DataFrame()

# Pastikan kolom ada walau kosong
for df in [historical_df,new_df]:
    for col in ["request_date","origin_geo_hash","time_slot","request_count"]:
        if col not in df.columns:
            df[col] = pd.NA

# Gabung data baru ke histori
if not new_df.empty:
    historical_df = pd.concat([historical_df,new_df], ignore_index=True)
    historical_df.to_csv(historical_file, index=False)
    print(f"✅ Data baru ditambahkan. Total rows: {len(historical_df)}")
else:
    print("⚠️ File data baru tidak ditemukan. Menggunakan histori yang ada saja.")

# Inisialisasi predictor
predictor = TimeSeriesRequestPredictor30MinHybrid()

# Preprocess
df_full = predictor.load_and_preprocess_with_new_geos(historical_df, new_df)

# Training
train_data,_ = predictor.train(df=df_full, epochs=20)

# Ambil tanggal terakhir & fill missing slot
max_date = pd.to_datetime(df_full["request_date"]).max()
df_last_day = df_full[df_full["request_date"]==max_date].copy()
df_last_day = predictor.fill_missing_slots(df_last_day)

# Prediksi semua geo untuk H+1
df_pred = predictor.predict_all_next_day_30min_filtered(df_last_day)
df_pred["date"] = pd.to_datetime(df_pred["date"]).dt.strftime("%d-%m-%Y")

# Simpan prediksi
prediksi_file = f"prediksi_{max_date.strftime('%d_%m_%Y')}.csv"
df_pred.to_csv(prediksi_file, index=False)
print(f"✅ Prediksi tersimpan di {prediksi_file}")

⚠️ File data baru tidak ditemukan. Menggunakan histori yang ada saja.
Epoch 1/20
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 55ms/step - loss: 0.8262 - mae: 0.7272 - val_loss: 0.4783 - val_mae: 0.5215 - learning_rate: 0.0010
Epoch 2/20
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 41ms/step - loss: 0.5071 - mae: 0.5574 - val_loss: 0.4223 - val_mae: 0.4668 - learning_rate: 0.0010
Epoch 3/20
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 42ms/step - loss: 0.4706 - mae: 0.5316 - val_loss: 0.4178 - val_mae: 0.4619 - learning_rate: 0.0010
Epoch 4/20
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 41ms/step - loss: 0.4562 - mae: 0.5187 - val_loss: 0.4406 - val_mae: 0.5044 - learning_rate: 0.0010
Epoch 5/20
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 40ms/step - loss: 0.4498 - mae: 0.5123 - val_loss: 0.4078 - val_mae: 0.4382 - learning_rate: 0.0010
Epoch 6/20
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [34]:
def evaluate_prediction(pred_file: str, actual_file: str, output_file="evaluasi_geo_hash_sama.csv"):
    import pandas as pd
    from pathlib import Path
    from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

    # --- Baca file prediksi ---
    if not Path(pred_file).exists():
        print(f"⚠️ File {pred_file} tidak ditemukan.")
        return
    df_pred = pd.read_csv(pred_file)
    df_pred["date"] = pd.to_datetime(df_pred["date"], dayfirst=True)

    # --- Baca file actual ---
    if not Path(actual_file).exists():
        print(f"⚠️ File {actual_file} tidak ditemukan.")
        return
    df_actual = pd.read_csv(actual_file)

    # Buat slot_30min dari time_slot
    def extract_slot(val):
        try:
            h, m = map(int, val.split("-")[0].strip().split(":"))
            return h * 2 + (m // 30)
        except:
            return -1

    df_actual = pd.read_csv(actual_file, skipinitialspace=True)
    df_actual.columns = df_actual.columns.str.strip()  # hapus spasi
    if "request_date" not in df_actual.columns:
        print("⚠️ Kolom 'request_date' tidak ditemukan di CSV actual!")
        return

    df_actual["slot_30min"] = df_actual["time_slot"].apply(extract_slot)
    df_actual["geo_hash"] = df_actual["origin_geo_hash"]
    df_actual["date"] = pd.to_datetime(df_actual["request_date"], dayfirst=True)
    df_actual["actual_request_count"] = df_actual["request_count"]

    # Pilih kolom penting
    df_actual_proc = df_actual[["geo_hash","date","slot_30min","actual_request_count"]]

    # Filter hanya geo_hash yang ada di prediksi
    valid_geo = df_pred["geo_hash"].unique()
    df_actual_proc = df_actual_proc[df_actual_proc["geo_hash"].isin(valid_geo)]

    # Merge inner
    merged = pd.merge(
        df_pred,
        df_actual_proc,
        on=["geo_hash","date","slot_30min"],
        how="inner"
    )

    if not merged.empty:
        mae = mean_absolute_error(merged["actual_request_count"], merged["predicted_request_count"])
        r2  = r2_score(merged["actual_request_count"], merged["predicted_request_count"])
        rmse = sqrt(mean_squared_error(merged["actual_request_count"], merged["predicted_request_count"]))
        print(f"📊 Evaluasi (geo_hash sama): MAE={mae:.4f}, R²={r2:.4f}, RMSE={rmse:.4f}, N={len(merged)}")
    else:
        print("⚠️ Tidak ada slot/geo_hash yang cocok untuk evaluasi.")

    # Simpan hasil evaluasi
    merged.to_csv(output_file, index=False)
    print(f"✅ Hasil evaluasi disimpan di {output_file}")


In [None]:
evaluate_prediction(
    pred_file="prediksi_29_09_2025.csv",
    actual_file="data_29_09_202.csv",
    output_file="evaluasi_manual.csv"
)

📊 Evaluasi (geo_hash sama): MAE=3.0974, R²=-0.6117, RMSE=4.5352, N=1083
✅ Hasil evaluasi disimpan di evaluasi_manual.csv
