In [2]:
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from pathlib import Path

In [5]:

class TimeSeriesRequestPredictor30Min:
    """LSTM forecaster of per-geohash request counts on 30-minute slots.

    The model consumes windows of ``sequence_length`` rows (one row per
    geohash/date/slot) and emits ``prediction_horizon`` future values of
    ``log1p(request_count)``.
    """

    # Column order of the model input. Column 0 is the target and is
    # log1p-transformed before scaling.
    FEATURES = ["request_count", "slot_30min", "day_of_week", "is_weekend", "geo_encoded"]

    def __init__(self, sequence_length=144, prediction_horizon=48):
        """
        sequence_length: length of the historical window (in 30-minute slots)
        prediction_horizon: how many 30-minute slots ahead to predict
        Default 144 slots = 3 days * 48 slots/day
        """
        self.sequence_length = sequence_length
        self.prediction_horizon = prediction_horizon
        self.scaler = StandardScaler()
        self.geo_encoder = LabelEncoder()
        self.model = None

    # ================================
    # Preprocessing
    # ================================
    @staticmethod
    def _extract_slot(val):
        """Map a ``time_slot`` value to a 30-minute slot index (hour*2 + minute//30).

        Accepts either a range string whose start is ``H:M`` (e.g.
        ``"08:30-09:00"``) or anything ``pd.to_datetime`` can parse.
        Returns -1 for missing or unparseable values.
        """
        if pd.isna(val):
            return -1

        if isinstance(val, str) and "-" in val:
            parts = val.split("-")[0].strip().split(":")
            try:
                hour, minute = int(parts[0]), int(parts[1])
            except (ValueError, IndexError):
                # Not an "H:M-H:M" range (e.g. an ISO date containing "-");
                # fall through to the generic datetime parser below.
                pass
            else:
                return hour * 2 + minute // 30

        try:
            parsed = pd.to_datetime(val)
        except (ValueError, TypeError, OverflowError):
            return -1
        if pd.isna(parsed):
            # e.g. an empty string parses to NaT, which has no usable hour.
            return -1
        return parsed.hour * 2 + parsed.minute // 30

    def load_and_preprocess_data(self, csv_file_path):
        """Read the CSV and derive the model's feature columns.

        Adds ``slot_30min``, ``is_missing_slot``, ``geo_encoded``,
        ``day_of_week`` and ``is_weekend`` and returns the frame sorted by
        geohash, date and slot.
        """
        df = pd.read_csv(csv_file_path)
        df["request_date"] = pd.to_datetime(df["request_date"], errors="coerce", dayfirst=True)

        # Extract the 30-minute slot; -1 marks a missing/unparseable slot.
        df["slot_30min"] = df["time_slot"].apply(self._extract_slot)
        df["is_missing_slot"] = (df["slot_30min"] == -1).astype(int)

        # Encode geo_hash.
        # NOTE: the encoder is refit on every call, so loading a different
        # file after training changes the geohash -> integer mapping.
        df["geo_encoded"] = self.geo_encoder.fit_transform(df["origin_geo_hash"])

        # Temporal features
        df["day_of_week"] = df["request_date"].dt.dayofweek
        df["is_weekend"] = (df["day_of_week"] >= 5).astype(int)

        df = df.sort_values(["origin_geo_hash", "request_date", "slot_30min"])
        return df

    # ================================
    # Sequence generator with log-transform
    # ================================
    def create_sequences(self, data):
        """Build sliding windows per geohash.

        Returns ``(X, y)`` where ``X`` has shape
        ``(n, sequence_length, len(FEATURES))`` and ``y`` has shape
        ``(n, prediction_horizon)`` holding ``log1p(request_count)``.
        When no geohash has enough rows, correctly shaped empty arrays are
        returned.
        """
        window = self.sequence_length
        horizon = self.prediction_horizon
        sequences, targets = [], []

        # sort=False keeps geohashes in order of first appearance.
        for _, geo_data in data.groupby("origin_geo_hash", sort=False):
            geo_data = geo_data.sort_values(["request_date", "slot_30min"])
            # Force a float copy: with all-integer columns the array would be
            # int64 and assigning log1p() into it would silently truncate.
            feature_data = geo_data[self.FEATURES].to_numpy(dtype=float, copy=True)

            # Log-transform request_count
            feature_data[:, 0] = np.log1p(feature_data[:, 0])

            for start in range(len(feature_data) - window - horizon + 1):
                split = start + window
                sequences.append(feature_data[start:split])
                targets.append(feature_data[split:split + horizon, 0])

        if not sequences:
            return np.empty((0, window, len(self.FEATURES))), np.empty((0, horizon))
        return np.array(sequences), np.array(targets)

    # ================================
    # Build model
    # ================================
    def build_model(self, input_shape):
        """Return a compiled 3-layer LSTM regressor with ``prediction_horizon`` outputs."""
        model = Sequential([
            Input(shape=input_shape),
            LSTM(128, return_sequences=True, recurrent_dropout=0.2),
            LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
            LSTM(32, dropout=0.2, recurrent_dropout=0.2),
            Dense(64, activation="relu"),
            Dropout(0.3),
            Dense(32, activation="relu"),
            Dropout(0.2),
            Dense(self.prediction_horizon, activation="linear")
        ])
        model.compile(optimizer=Adam(0.001), loss="mse", metrics=["mae"])
        return model

    # ================================
    # Temporal split
    # ================================
    def temporal_split(self, X, y, val_ratio=0.2):
        """Split ``X``/``y`` positionally: the last ``val_ratio`` share is validation.

        NOTE: ``create_sequences`` emits windows geohash by geohash, so the
        tail of ``X`` is the last geohashes rather than the latest dates.
        """
        split_index = int(len(X) * (1 - val_ratio))
        return X[:split_index], X[split_index:], y[:split_index], y[split_index:]

    def _scale(self, X):
        """Apply the fitted scaler to a 3-D window array, preserving its shape."""
        if len(X) == 0:
            return X
        return self.scaler.transform(X.reshape(-1, X.shape[-1])).reshape(X.shape)

    # ================================
    # Training
    # ================================
    def train(self, csv_file_path, validation_split=0.2, epochs=50):
        """Preprocess the CSV, fit the scaler and the LSTM.

        Returns ``(data, history)``: the preprocessed frame and the Keras
        ``History`` object.

        Raises:
            ValueError: if the data yields no training windows.
        """
        data = self.load_and_preprocess_data(csv_file_path)
        X, y = self.create_sequences(data)

        X_train, X_val, y_train, y_val = self.temporal_split(X, y, validation_split)
        if len(X_train) == 0:
            raise ValueError(
                "Not enough history to build training windows: each geohash needs at "
                f"least {self.sequence_length + self.prediction_horizon} rows."
            )

        # Fit the scaler on the training windows only so that validation
        # statistics do not leak into training.
        self.scaler.fit(X_train.reshape(-1, X_train.shape[-1]))
        X_train = self._scale(X_train)
        X_val = self._scale(X_val)

        self.model = self.build_model((X.shape[1], X.shape[2]))

        callbacks = [
            EarlyStopping(patience=10, restore_best_weights=True),
            ReduceLROnPlateau(factor=0.5, patience=5, min_lr=1e-6)
        ]

        history = self.model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=32,
            callbacks=callbacks,
            verbose=1
        )
        return data, history

    # ================================
    # Prediction per geo_hash
    # ================================
    def predict_next_day_30min_filtered(self, data, geo_hash, fallback_value=0):
        """
        Predict only the slots that already occur in the geohash's history.

        If the geohash has fewer than ``sequence_length`` rows, or no model
        has been trained, its historical slots are still returned, with
        ``predicted_request_count`` set to ``fallback_value`` (default 0).

        NOTE(review): output index ``i`` of the model is reported as slot
        ``i`` of the day after the last date, which presumes the history ends
        on the last slot of a day and has no gaps; confirm against the data.
        """
        geo_data = data[data["origin_geo_hash"] == geo_hash].sort_values(["request_date", "slot_30min"])

        # Slots present in the history
        existing_slots = geo_data["slot_30min"].unique()
        tomorrow_date = geo_data["request_date"].max() + timedelta(days=1) if len(geo_data) > 0 else pd.NaT

        # Too little data (or no model): return the fallback frame
        if len(geo_data) < self.sequence_length or self.model is None:
            return pd.DataFrame({
                "geo_hash": [geo_hash] * len(existing_slots),
                "date": [tomorrow_date] * len(existing_slots),
                "slot_30min": existing_slots,
                "predicted_request_count": [fallback_value] * len(existing_slots)
            })

        # Take the last window for prediction. A float copy is required so
        # that the log1p assignment is not truncated to integers.
        last_seq = geo_data[self.FEATURES].tail(self.sequence_length).to_numpy(dtype=float, copy=True)
        last_seq[:, 0] = np.log1p(last_seq[:, 0])  # log-transform
        seq_scaled = self.scaler.transform(last_seq).reshape(1, self.sequence_length, len(self.FEATURES))
        pred_full = self.model.predict(seq_scaled, verbose=0)[0]

        # Inverse log-transform, clipped at zero
        pred_full = np.maximum(0, np.expm1(pred_full))

        # Keep only slots that were seen historically
        known_slots = set(existing_slots.tolist())
        kept = [slot for slot in range(len(pred_full)) if slot in known_slots]

        df_result = pd.DataFrame({
            "geo_hash": [geo_hash] * len(kept),
            "date": [tomorrow_date] * len(kept),
            "slot_30min": kept,
            "predicted_request_count": [pred_full[slot] for slot in kept]
        })
        return df_result.sort_values(["geo_hash", "date", "slot_30min"]).reset_index(drop=True)

    # ================================
    # Prediction for all geo_hash values
    # ================================
    def predict_all_next_day_30min_filtered(self, data, fallback_value=np.nan):
        """
        Predict for every geohash, restricted to slots present in its history.

        Geohashes with too little history get the per-geohash default (0).
        ``fallback_value`` is used only when the per-geohash prediction comes
        back empty (none of the predicted slots occur in its history).
        """
        columns = ["geo_hash", "date", "slot_30min", "predicted_request_count"]
        results = []
        for geo_hash in data["origin_geo_hash"].unique():
            df_pred = self.predict_next_day_30min_filtered(data, geo_hash)

            # Empty result: build placeholder rows for the historical slots
            if df_pred.empty:
                geo_data = data[data["origin_geo_hash"] == geo_hash]
                existing_slots = geo_data["slot_30min"].unique()
                tomorrow_date = geo_data["request_date"].max() + timedelta(days=1) if len(geo_data) > 0 else pd.NaT
                df_pred = pd.DataFrame({
                    "geo_hash": [geo_hash] * len(existing_slots),
                    "date": [tomorrow_date] * len(existing_slots),
                    "slot_30min": existing_slots,
                    "predicted_request_count": [fallback_value] * len(existing_slots)
                })

            # Skip frames that are still empty so they do not affect the
            # dtypes of the concatenated result.
            if not df_pred.empty:
                results.append(df_pred)

        if not results:
            return pd.DataFrame(columns=columns)

        df_all = pd.concat(results, ignore_index=True)
        return df_all.sort_values(["geo_hash", "date", "slot_30min"]).reset_index(drop=True)

    # ================================
    # Evaluation
    # ================================
    def evaluate_yesterday_30min(self, data, yesterday_predictions):
        """Compare earlier predictions with the actual counts in ``data``.

        Rows are matched on geohash, date and slot. Returns
        ``(merged, {"MAE": ..., "RMSE": ...})``, or ``None`` when nothing can
        be scored. Rows with a missing actual or predicted value are left in
        ``merged`` but excluded from the metrics.
        """
        key_cols = ["geo_hash", "date", "slot_30min"]

        # Predictions reloaded from CSV carry dates as strings; normalise so
        # the merge against the datetime ``request_date`` column works.
        preds = yesterday_predictions.copy()
        preds["date"] = pd.to_datetime(preds["date"], errors="coerce")

        actuals = data.merge(
            preds[key_cols],
            left_on=["origin_geo_hash", "request_date", "slot_30min"],
            right_on=key_cols,
            how="inner"
        )
        actuals = actuals.rename(columns={"request_count": "actual_request_count"})
        merged = pd.merge(preds, actuals, on=key_cols, how="inner")

        # The sklearn metrics reject NaN, so score only complete rows.
        scored = merged.dropna(subset=["actual_request_count", "predicted_request_count"])

        if scored.empty:
            print("⚠️ Tidak ada data aktual untuk dievaluasi.")
            return None

        mae = mean_absolute_error(scored["actual_request_count"], scored["predicted_request_count"])
        # sqrt(MSE) instead of the ``squared=False`` keyword, which newer
        # scikit-learn releases no longer accept.
        rmse = np.sqrt(mean_squared_error(scored["actual_request_count"], scored["predicted_request_count"]))
        print(f"Evaluasi untuk {scored['date'].iloc[0].date()}: MAE={mae:.2f}, RMSE={rmse:.2f}")
        return merged, {"MAE": mae, "RMSE": rmse}


In [6]:
# =============================
# 1️⃣ Load and update the history
# =============================
historical_file = "data_historis.csv"
file_new_data = "data_29_09_2025.csv"  # change to match today's data file

# Load the historical data
if Path(historical_file).exists():
    historical_data = pd.read_csv(historical_file)
else:
    historical_data = pd.DataFrame()  # no history yet: start from an empty frame
    print("⚠️ File histori belum ada, membuat dataframe kosong.")

# Append the new data if it is available
if Path(file_new_data).exists():
    new_data = pd.read_csv(file_new_data)
    # Drop exact duplicate rows so that re-running this cell with the same
    # daily file does not append the same records to the history again.
    historical_data = pd.concat([historical_data, new_data], ignore_index=True).drop_duplicates(ignore_index=True)
    historical_data.to_csv(historical_file, index=False)
    print(f"✅ Data baru dari {file_new_data} ditambahkan ke histori. Total rows: {len(historical_data)}")
else:
    print(f"⚠️ File {file_new_data} tidak ditemukan, menggunakan histori yang ada saja.")

# =============================
# 2️⃣ Initialise and train the model
# =============================
predictor = TimeSeriesRequestPredictor30Min(sequence_length=144, prediction_horizon=48)

if len(historical_data) > 0:
    data_preprocessed, history = predictor.train(historical_file, epochs=20)
    print("✅ Model selesai di-train.")
else:
    data_preprocessed = pd.DataFrame()  # empty when there is no data

# =============================
# 3️⃣ Predict the next day
# =============================
if len(data_preprocessed) > 0:
    tomorrow_date = (data_preprocessed["request_date"].max() + timedelta(days=1)).strftime("%d_%m_%Y")
    predictions = predictor.predict_all_next_day_30min_filtered(data_preprocessed)

    # Save the predictions
    pred_file = f"prediksi_{tomorrow_date}.csv"
    predictions.to_csv(pred_file, index=False)
    print(f"✅ Prediksi untuk {tomorrow_date} tersimpan di {pred_file}")
else:
    print("⚠️ Tidak ada data untuk prediksi.")

# =============================
# 4️⃣ Evaluate yesterday's prediction
# =============================
yesterday_date = (data_preprocessed["request_date"].max()).strftime("%d_%m_%Y") if len(data_preprocessed) > 0 else None
file_yesterday_actual = f"data_{yesterday_date}.csv" if yesterday_date else None
# The forecast to score is the one saved earlier FOR the latest actual date.
# The frame produced in step 3 targets the following day, so it can never
# match any actual row.
file_yesterday_pred = f"prediksi_{yesterday_date}.csv" if yesterday_date else None

if file_yesterday_actual and Path(file_yesterday_actual).exists() and len(data_preprocessed) > 0:
    if Path(file_yesterday_pred).exists():
        actual_yesterday = predictor.load_and_preprocess_data(file_yesterday_actual)
        yesterday_predictions = pd.read_csv(file_yesterday_pred, parse_dates=["date"])
        evaluation = predictor.evaluate_yesterday_30min(actual_yesterday, yesterday_predictions)
        # evaluate_yesterday_30min returns None when nothing could be matched,
        # so only unpack an actual result.
        if evaluation is not None:
            eval_results, metrics = evaluation
    else:
        print(f"⚠️ File prediksi {file_yesterday_pred} tidak ditemukan, evaluasi dilewati.")
else:
    print(f"⚠️ Data aktual untuk {yesterday_date} belum tersedia atau data kosong, evaluasi dilewati.")

⚠️ File data_29_09_2025.csv tidak ditemukan, menggunakan histori yang ada saja.
Epoch 1/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 189ms/step - loss: 1.2939 - mae: 0.9672 - val_loss: 1.4443 - val_mae: 1.0402 - learning_rate: 0.0010
Epoch 2/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 117ms/step - loss: 1.0542 - mae: 0.8539 - val_loss: 0.9475 - val_mae: 0.7953 - learning_rate: 0.0010
Epoch 3/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 118ms/step - loss: 0.8254 - mae: 0.7280 - val_loss: 0.6999 - val_mae: 0.6651 - learning_rate: 0.0010
Epoch 4/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 118ms/step - loss: 0.6573 - mae: 0.6442 - val_loss: 0.5674 - val_mae: 0.5800 - learning_rate: 0.0010
Epoch 5/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 117ms/step - loss: 0.5688 - mae: 0.5920 - val_loss: 0.4974 - val_mae: 0.5225 - learning_rate: 0.0010
Epoch 6/20
[1m11/11[0m [32m━━━━━━━━━━