In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')




In [8]:

class TimeSeriesRequestPredictor:
    """LSTM pipeline that forecasts the daily request count per location.

    The pipeline aggregates raw request rows to one row per
    (request_date, origin_geo_hash), turns each location's daily history into
    fixed-length windows, trains a stacked LSTM on them and can then predict
    and evaluate the next day's request count for every location.
    """

    # Column order of the per-day feature vector. Index 0 ("request_count")
    # is also the prediction target, so it must stay first.
    FEATURES = [
        "request_count",
        "active_slots",
        "day_of_week",
        "day_of_month",
        "month",
        "is_weekend",
        "geo_encoded",
    ]

    def __init__(self, sequence_length=7, prediction_horizon=1):
        """
        Set up the (still untrained) predictor.

        Args:
            sequence_length: length of the historical window, in days.
            prediction_horizon: how many days ahead the model predicts.

        Raises:
            ValueError: if either argument is smaller than 1.
        """
        if sequence_length < 1 or prediction_horizon < 1:
            raise ValueError("sequence_length dan prediction_horizon harus >= 1")
        self.sequence_length = sequence_length
        self.prediction_horizon = prediction_horizon
        self.scaler = StandardScaler()
        self.geo_encoder = LabelEncoder()
        self.model = None

    def load_and_preprocess_data(self, csv_file_path, fit_encoder=None):
        """
        Load a CSV file and aggregate it to one row per day and location.

        Args:
            csv_file_path: path of the CSV file. It must contain the columns
                "request_date", "origin_geo_hash" and "request_count"; the
                column "time_slot" is optional.
            fit_encoder: True to (re)fit the geo-hash encoder on this file,
                False to reuse the encoder fitted earlier. The default None
                fits the encoder only while no trained model exists, so data
                loaded after training keeps the location codes the model
                was trained with.

        Returns:
            DataFrame sorted by location and date with the columns
            request_date, origin_geo_hash, request_count, active_slots,
            day_of_week, day_of_month, month, is_weekend and geo_encoded.
            When the encoder is reused, locations it has never seen get
            geo_encoded == -1.
        """
        df = pd.read_csv(csv_file_path)

        # Parse the dates and drop any time-of-day part so that the groupby
        # below really aggregates per calendar day.
        df["request_date"] = pd.to_datetime(
            df["request_date"], errors="coerce"
        ).dt.normalize()
        df = df.dropna(subset=["request_date"])

        # Extract the hour from time_slot (if the column exists)
        if "time_slot" in df.columns:
            def extract_hour(val):
                if pd.isna(val):
                    return 0
                if isinstance(val, str) and "-" in val:
                    # Range such as "08:00-09:00": use the hour at which the
                    # range starts. Anything that is not a plausible hour
                    # (e.g. the year of "2024-01-01 08:00") falls through to
                    # the datetime parser instead of raising.
                    head = val.split("-")[0].split(":")[0]
                    try:
                        hour = int(head)
                    except ValueError:
                        hour = None
                    if hour is not None and 0 <= hour <= 23:
                        return hour
                try:
                    return pd.to_datetime(val, errors="coerce").hour
                except Exception:
                    return 0

            df["time_slot"] = df["time_slot"].apply(extract_hour)
        else:
            df["time_slot"] = 0

        # Aggregate per day and location
        daily_data = (
            df.groupby(["request_date", "origin_geo_hash"])
            .agg({
                "request_count": "sum",
                "time_slot": "count",  # number of active slots
            })
            .reset_index()
            .rename(columns={"time_slot": "active_slots"})
        )

        # Calendar features
        daily_data["day_of_week"] = daily_data["request_date"].dt.dayofweek
        daily_data["day_of_month"] = daily_data["request_date"].dt.day
        daily_data["month"] = daily_data["request_date"].dt.month
        daily_data["is_weekend"] = (daily_data["day_of_week"] >= 5).astype(int)

        # Encode the geo hash. Refitting on every file would silently change
        # the integer assigned to a location between training and prediction.
        if fit_encoder is None:
            fit_encoder = self.model is None or not hasattr(
                self.geo_encoder, "classes_"
            )
        if fit_encoder:
            daily_data["geo_encoded"] = self.geo_encoder.fit_transform(
                daily_data["origin_geo_hash"]
            )
        else:
            lookup = {
                label: code
                for code, label in enumerate(self.geo_encoder.classes_)
            }
            codes = daily_data["origin_geo_hash"].map(lookup)
            n_unseen = daily_data.loc[codes.isna(), "origin_geo_hash"].nunique()
            if n_unseen:
                print(
                    f"Peringatan: {n_unseen} geo_hash tidak ada di data "
                    "training; geo_encoded diisi -1."
                )
            daily_data["geo_encoded"] = codes.fillna(-1).astype(int)

        # Sort by location, then date
        daily_data = daily_data.sort_values(["origin_geo_hash", "request_date"])

        return daily_data

    def _fill_missing_days(self, geo_data):
        """
        Return one location's rows on a gap-free daily calendar.

        Days without any row get request_count = 0 and active_slots = 0.
        The calendar features are recomputed from the date and geo_encoded /
        origin_geo_hash are copied from the location, so an inserted day is
        not mistaken for "Monday, day 0, month 0" of location code 0.

        Args:
            geo_data: non-empty rows of a single location.
        """
        indexed = geo_data.sort_values("request_date").set_index("request_date")
        days = pd.date_range(start=indexed.index.min(), end=indexed.index.max(), freq="D")

        filled = indexed[["request_count", "active_slots"]].reindex(days, fill_value=0)
        filled["day_of_week"] = np.asarray(days.dayofweek)
        filled["day_of_month"] = np.asarray(days.day)
        filled["month"] = np.asarray(days.month)
        filled["is_weekend"] = (filled["day_of_week"] >= 5).astype(int)
        filled["geo_encoded"] = indexed["geo_encoded"].iloc[0]
        filled["origin_geo_hash"] = indexed["origin_geo_hash"].iloc[0]

        return filled.rename_axis("request_date").reset_index()

    def create_sequences(self, data, return_dates=False):
        """
        Build the sliding windows the LSTM is trained on.

        Args:
            data: DataFrame as returned by load_and_preprocess_data.
            return_dates: when True, additionally return the date of the first
                predicted day of every window (needed to order the windows
                chronologically across locations).

        Returns:
            (X, y), or (X, y, target_dates) when return_dates is True.
            X has shape (n, sequence_length, len(FEATURES)); y has shape
            (n, prediction_horizon) and holds the request counts following
            each window.
        """
        sequences, targets, target_dates = [], [], []

        for geo_hash in data["origin_geo_hash"].unique():
            geo_rows = data[data["origin_geo_hash"] == geo_hash]
            if geo_rows.empty:  # e.g. a NaN geo hash never equals itself
                continue
            geo_data = self._fill_missing_days(geo_rows)

            feature_data = geo_data[self.FEATURES].values
            dates = geo_data["request_date"].values

            # Number of windows that still leave room for a complete target
            n_windows = (
                len(feature_data) - self.sequence_length - self.prediction_horizon + 1
            )
            for start in range(max(0, n_windows)):
                end = start + self.sequence_length
                sequences.append(feature_data[start:end])
                targets.append(feature_data[end : end + self.prediction_horizon, 0])
                target_dates.append(dates[end])

        X, y = np.array(sequences), np.array(targets)
        if return_dates:
            return X, y, np.array(target_dates, dtype="datetime64[ns]")
        return X, y

    def build_model(self, input_shape):
        """
        Build and compile the stacked LSTM regressor.

        Args:
            input_shape: (sequence_length, number_of_features).

        Returns:
            Compiled Keras model with prediction_horizon linear outputs,
            MSE loss and MAE as metric.
        """
        model = Sequential(
            [
                Input(shape=input_shape),
                LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
                LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
                LSTM(32, dropout=0.2, recurrent_dropout=0.2),
                Dense(64, activation="relu"),
                Dropout(0.3),
                Dense(32, activation="relu"),
                Dropout(0.2),
                Dense(self.prediction_horizon, activation="linear"),
            ]
        )

        model.compile(optimizer=Adam(0.001), loss="mse", metrics=["mae"])
        return model

    def temporal_split(self, X, y, val_ratio=0.2):
        """
        Split without shuffling: the last val_ratio share becomes validation.

        The split is only temporal if X and y are already ordered
        chronologically; this method does not reorder anything.

        Returns:
            X_train, X_val, y_train, y_val
        """
        split_index = int(len(X) * (1 - val_ratio))
        X_train, X_val = X[:split_index], X[split_index:]
        y_train, y_val = y[:split_index], y[split_index:]
        return X_train, X_val, y_train, y_val

    def train(self, csv_file_path, validation_split=0.2, epochs=50):
        """
        Train the model on a CSV file, validating on the most recent windows.

        Args:
            csv_file_path: training CSV (see load_and_preprocess_data).
            validation_split: share of the newest windows held out.
            epochs: maximum number of epochs (early stopping may end sooner).

        Returns:
            (data, history): the preprocessed daily DataFrame and the Keras
            History object.

        Raises:
            ValueError: if the data is too short to build a training window.
        """
        print("Loading data...")
        data = self.load_and_preprocess_data(csv_file_path, fit_encoder=True)

        print("Creating sequences...")
        X, y, target_dates = self.create_sequences(data, return_dates=True)

        print(f"Total sequences: {len(X)}")
        print(f"X shape: {X.shape}, y shape: {y.shape}")

        if len(X) == 0:
            raise ValueError(
                "Tidak ada sequence yang bisa dibuat: data terlalu pendek untuk "
                f"sequence_length={self.sequence_length} dan "
                f"prediction_horizon={self.prediction_horizon}."
            )

        # The windows come out grouped by location. Order them by target date
        # so the split below holds out the newest days of every location
        # instead of the last few locations.
        order = np.argsort(target_dates, kind="stable")
        X, y = X[order], y[order]

        # Temporal split
        X_train, X_val, y_train, y_val = self.temporal_split(X, y, validation_split)
        if len(X_train) == 0:
            raise ValueError("Data training kosong; kurangi validation_split.")

        # Scale: fit on the training windows only so that no validation
        # statistics leak into training.
        n_features = X.shape[-1]
        X_train = self.scaler.fit_transform(
            X_train.reshape(-1, n_features)
        ).reshape(X_train.shape)
        has_validation = len(X_val) > 0
        if has_validation:
            X_val = self.scaler.transform(
                X_val.reshape(-1, n_features)
            ).reshape(X_val.shape)

        # Build model
        self.model = self.build_model((X.shape[1], X.shape[2]))
        self.model.summary()  # prints by itself and returns None

        # Without validation data there is no val_loss to monitor
        monitor = "val_loss" if has_validation else "loss"
        callbacks = [
            EarlyStopping(monitor=monitor, patience=10, restore_best_weights=True),
            ReduceLROnPlateau(monitor=monitor, factor=0.5, patience=5, min_lr=1e-6),
        ]

        history = self.model.fit(
            X_train,
            y_train,
            validation_data=(X_val, y_val) if has_validation else None,
            epochs=epochs,
            batch_size=32,
            callbacks=callbacks,
            verbose=1,
        )

        return data, history

    def predict_next_day(self, data, geo_hash):
        """
        Predict the request count of the day after the last known day of one
        location.

        Args:
            data: DataFrame as returned by load_and_preprocess_data.
            geo_hash: value of origin_geo_hash to predict for.

        Returns:
            One-row DataFrame with the columns geo_hash, date and
            predicted_request_count (never negative).

        Raises:
            ValueError: if the model is not trained, the location was not part
                of the training data, or it spans fewer than sequence_length
                days.
        """
        if self.model is None:
            raise ValueError("Model belum di-train!")

        # Take the location's history on the same gap-free daily calendar
        # that was used to build the training windows.
        geo_data = data[data["origin_geo_hash"] == geo_hash]
        if not geo_data.empty:
            if (geo_data["geo_encoded"] < 0).any():
                raise ValueError(f"geo_hash '{geo_hash}' tidak ada di data training.")
            geo_data = self._fill_missing_days(geo_data)

        # Check that there is enough data for the configured window length
        if len(geo_data) < self.sequence_length:
            raise ValueError(f"Tidak cukup data untuk geo_hash '{geo_hash}'. Dibutuhkan {self.sequence_length} hari, tapi hanya tersedia {len(geo_data)} hari.")

        last_sequence = geo_data[self.FEATURES].tail(self.sequence_length).values

        # The scaler was fitted on 2-D (rows, features) data, so scale first
        # and add the batch dimension afterwards.
        seq_scaled = self.scaler.transform(last_sequence)
        seq_scaled = seq_scaled.reshape(1, self.sequence_length, len(self.FEATURES))

        # Predict
        pred = self.model.predict(seq_scaled, verbose=0)[0]

        # Date of the predicted day
        tomorrow = geo_data["request_date"].max() + timedelta(days=1)

        # Return the result as a table (DataFrame)
        result_df = pd.DataFrame({
            "geo_hash": [geo_hash],
            "date": [tomorrow],
            # Clamp at zero and use a plain float so the column dtype is stable
            "predicted_request_count": [float(max(0.0, pred[0]))],
        })
        return result_df

    def predict_all_next_day(self, data):
        """
        Predict the next day's request count for every location in data.

        Locations that cannot be predicted are reported on stdout and skipped.

        Returns:
            DataFrame with the columns geo_hash, date and
            predicted_request_count; empty if no location could be predicted.
        """
        results = []

        for geo_hash in data["origin_geo_hash"].unique():
            try:
                pred_df = self.predict_next_day(data, geo_hash)
                results.append(pred_df)
            except Exception as e:
                print(f"Skipping {geo_hash}, error: {e}")

        if results:
            return pd.concat(results, ignore_index=True)
        else:
            return pd.DataFrame(columns=["geo_hash", "date", "predicted_request_count"])

    def evaluate_yesterday_prediction(self, data, yesterday_predictions):
        """
        Compare earlier predictions with the actual counts that arrived since.

        Args:
            data: preprocessed DataFrame that already contains the new days.
            yesterday_predictions: DataFrame with the columns geo_hash, date
                and predicted_request_count.

        Returns:
            (merged, {"MAE": ..., "RMSE": ...}) where merged holds prediction
            and actual value side by side, or None when no actual value
            matches any prediction.
        """
        # Make sure the join key is a datetime even if the predictions were
        # reloaded from disk or are an empty, untyped frame.
        predictions = yesterday_predictions.copy()
        predictions["date"] = pd.to_datetime(predictions["date"])

        # Actual values on the predicted dates
        actuals = data.loc[
            data["request_date"].isin(predictions["date"]),
            ["origin_geo_hash", "request_date", "request_count"],
        ].rename(columns={
            "origin_geo_hash": "geo_hash",
            "request_date": "date",
            "request_count": "actual_request_count"
        })

        # Merge predictions with actuals
        merged = pd.merge(predictions, actuals, on=["geo_hash", "date"], how="inner")

        if merged.empty:
            print("⚠️ Tidak ada data aktual untuk dievaluasi.")
            return None

        # Metrics. The RMSE is derived from the MSE because the `squared`
        # argument of mean_squared_error no longer exists in current
        # scikit-learn releases.
        mae = mean_absolute_error(merged["actual_request_count"], merged["predicted_request_count"])
        rmse = float(np.sqrt(
            mean_squared_error(merged["actual_request_count"], merged["predicted_request_count"])
        ))

        print(f"Evaluasi untuk {merged['date'].iloc[0].date()}:")
        print(f"  MAE  = {mae:.2f}")
        print(f"  RMSE = {rmse:.2f}")

        return merged, {"MAE": mae, "RMSE": rmse}

In [None]:
if __name__ == "__main__":
    # End-to-end walkthrough: train, predict tomorrow, then (once the next
    # day's data has arrived) score that prediction and predict again.
    predictor = TimeSeriesRequestPredictor(sequence_length=7, prediction_horizon=1)

    # Initial training
    data, history = predictor.train("./studio_results_20250925_1926.csv", epochs=30)

    # Predict the next day (e.g. the 8th)
    yesterday_predictions = predictor.predict_all_next_day(data)
    print("Prediksi kemarin:")
    print(yesterday_predictions)

    # ==== The following day the new data (the 8th) is available ====
    # Reload the data including the update
    new_data = predictor.load_and_preprocess_data("./New_Data.csv")

    # Score yesterday's prediction against the actuals of the 8th.
    # evaluate_yesterday_prediction returns None when no actual row matches
    # a prediction, so the result must not be unpacked unconditionally.
    evaluation = predictor.evaluate_yesterday_prediction(new_data, yesterday_predictions)
    if evaluation is not None:
        eval_results, metrics = evaluation
        print(eval_results)

    # Now predict the 9th
    today_predictions = predictor.predict_all_next_day(new_data)
    print("Prediksi hari ini:")
    print(today_predictions)

Loading data...
Creating sequences...
Total sequences: 2064
X shape: (2064, 7, 7), y shape: (2064, 1)
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 7, 128)            69632     
                                                                 
 lstm_4 (LSTM)               (None, 7, 64)             49408     
                                                                 
 lstm_5 (LSTM)               (None, 32)                12416     
                                                                 
 dense_3 (Dense)             (None, 64)                2112      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                  

FileNotFoundError: [Errno 2] No such file or directory: './studio_results_20250925_1927.csv'