In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')




In [16]:
class TimeSeriesRequestPredictor:
    def __init__(self, sequence_length=7, prediction_horizon=1):
        """
        Pipeline prediksi request count dengan LSTM
        Args:
            sequence_length: panjang window historis (hari)
            prediction_horizon: horizon prediksi (berapa hari ke depan)
        """
        self.sequence_length = sequence_length
        self.prediction_horizon = prediction_horizon
        self.scaler = StandardScaler()
        self.geo_encoder = LabelEncoder()
        self.model = None
        self.last_trained_date = None   # simpan tanggal terakhir data training

    # -------------------- LOAD & PREPROCESS --------------------
    def load_and_preprocess_data(self, csv_file_path):
        df = pd.read_csv(csv_file_path)

        # Pastikan request_date datetime
        df["request_date"] = pd.to_datetime(df["request_date"], errors="coerce")

        # Ekstrak jam
        if "time_slot" in df.columns:
            def extract_hour(val):
                if pd.isna(val):
                    return -1   # NaN -> dummy
                if isinstance(val, str) and "-" in val:
                    return int(val.split(":")[0])
                try:
                    return pd.to_datetime(val, errors="coerce").hour
                except Exception:
                    return -1
            df["time_slot"] = df["time_slot"].apply(extract_hour)
        else:
            df["time_slot"] = -1

        # Dummy flag
        df["is_missing_slot"] = (df["time_slot"] == -1).astype(int)

        # Aggregate per hari per lokasi
        daily_data = df.groupby(["request_date", "origin_geo_hash"]).agg({
            "request_count": "sum",
            "time_slot": "count"
        }).reset_index()

        daily_data.rename(columns={"time_slot": "active_slots"}, inplace=True)

        # Feature engineering
        daily_data["day_of_week"] = daily_data["request_date"].dt.dayofweek
        daily_data["day_of_month"] = daily_data["request_date"].dt.day
        daily_data["month"] = daily_data["request_date"].dt.month
        daily_data["is_weekend"] = (daily_data["day_of_week"] >= 5).astype(int)

        # Encode geo
        daily_data["geo_encoded"] = self.geo_encoder.fit_transform(daily_data["origin_geo_hash"])

        # Sort
        daily_data = daily_data.sort_values(["origin_geo_hash", "request_date"])
        return daily_data

    # -------------------- SEQUENCE CREATION --------------------
    def create_sequences(self, data):
        sequences, targets = [], []
        features = [
            "request_count",
            "active_slots",
            "day_of_week",
            "day_of_month",
            "month",
            "is_weekend",
            "geo_encoded",
        ]

        for geo_hash in data["origin_geo_hash"].unique():
            geo_data = data[data["origin_geo_hash"] == geo_hash].sort_values("request_date")

            # Lengkapi missing date
            date_range = pd.date_range(
                start=geo_data["request_date"].min(),
                end=geo_data["request_date"].max(),
                freq="D"
            )
            geo_data = (
                geo_data.set_index("request_date")
                .reindex(date_range, fill_value=0)
                .reset_index()
                .rename(columns={"index": "request_date"})
            )

            feature_data = geo_data[features].values

            for i in range(len(feature_data) - self.sequence_length):
                if i + self.sequence_length + self.prediction_horizon <= len(feature_data):
                    seq = feature_data[i:i + self.sequence_length]
                    target = feature_data[
                        i + self.sequence_length:i + self.sequence_length + self.prediction_horizon, 0
                    ]
                    sequences.append(seq)
                    targets.append(target)

        return np.array(sequences), np.array(targets)

    # -------------------- MODEL --------------------
    def build_model(self, input_shape):
        model = Sequential([
            Input(shape=input_shape),
            LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
            LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
            LSTM(32, dropout=0.2, recurrent_dropout=0.2),
            Dense(64, activation="relu"),
            Dropout(0.3),
            Dense(32, activation="relu"),
            Dropout(0.2),
            Dense(self.prediction_horizon, activation="linear"),
        ])
        model.compile(optimizer=Adam(0.001), loss="mse", metrics=["mae"])
        return model

    # -------------------- TRAIN --------------------
    def temporal_split(self, X, y, val_ratio=0.2):
        split_index = int(len(X) * (1 - val_ratio))
        return X[:split_index], X[split_index:], y[:split_index], y[split_index:]

    def train(self, csv_file_path, validation_split=0.2, epochs=50):
        print("Loading data...")
        data = self.load_and_preprocess_data(csv_file_path)

        print("Creating sequences...")
        X, y = self.create_sequences(data)

        X_scaled = self.scaler.fit_transform(X.reshape(-1, X.shape[-1]))
        X_scaled = X_scaled.reshape(X.shape)

        X_train, X_val, y_train, y_val = self.temporal_split(X_scaled, y, validation_split)

        self.model = self.build_model((X.shape[1], X.shape[2]))
        print(self.model.summary())

        callbacks = [
            EarlyStopping(patience=10, restore_best_weights=True),
            ReduceLROnPlateau(factor=0.5, patience=5, min_lr=1e-6),
        ]

        history = self.model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=32,
            callbacks=callbacks,
            verbose=1
        )

        self.last_trained_date = data["request_date"].max()
        return data, history

    # -------------------- UPDATE (FINE TUNE) --------------------
    def update_with_new_data(self, new_csv_file, epochs=10):
        if self.model is None:
            raise ValueError("Model belum di-train!")

        new_data = self.load_and_preprocess_data(new_csv_file)
        X_new, y_new = self.create_sequences(new_data)

        X_new_scaled = self.scaler.transform(X_new.reshape(-1, X_new.shape[-1]))
        X_new_scaled = X_new_scaled.reshape(X_new.shape)

        print(f"Fine-tuning model dengan {len(X_new_scaled)} sequences dari data baru...")
        history = self.model.fit(
            X_new_scaled, y_new,
            epochs=epochs,
            batch_size=32,
            verbose=1
        )
        self.last_trained_date = new_data["request_date"].max()
        return new_data, history

    # -------------------- PREDICTION --------------------
    def predict_next_day(self, data, geo_hash):
        if self.model is None:
            raise ValueError("Model belum di-train!")

        geo_data = data[data["origin_geo_hash"] == geo_hash].sort_values("request_date")
        features = [
            "request_count", "active_slots", "day_of_week",
            "day_of_month", "month", "is_weekend", "geo_encoded"
        ]

        if len(geo_data) < self.sequence_length:
            raise ValueError(f"Data geo_hash '{geo_hash}' kurang dari {self.sequence_length} hari.")

        last_sequence = geo_data[features].tail(self.sequence_length).values
        seq_scaled = self.scaler.transform(last_sequence)
        seq_scaled = seq_scaled.reshape(1, self.sequence_length, len(features))

        pred = self.model.predict(seq_scaled, verbose=0)[0]
        tomorrow = geo_data["request_date"].max() + timedelta(days=1)

        return pd.DataFrame({
            "geo_hash": [geo_hash],
            "date": [tomorrow],
            "predicted_request_count": [max(0, pred[0])]
        })

    def predict_all_next_day(self, data):
        results = []
        for geo_hash in data["origin_geo_hash"].unique():
            try:
                pred_df = self.predict_next_day(data, geo_hash)
                results.append(pred_df)
            except Exception as e:
                print(f"Skipping {geo_hash}, error: {e}")

        return pd.concat(results, ignore_index=True) if results else pd.DataFrame()

    # -------------------- EVALUATION --------------------
    def evaluate_yesterday_prediction(self, data, yesterday_predictions):
        actuals = data[data["request_date"].isin(yesterday_predictions["date"])][
            ["origin_geo_hash", "request_date", "request_count"]
        ]
        actuals = actuals.rename(columns={
            "origin_geo_hash": "geo_hash",
            "request_date": "date",
            "request_count": "actual_request_count"
        })

        merged = pd.merge(yesterday_predictions, actuals, on=["geo_hash", "date"], how="inner")
        if merged.empty:
            print("⚠️ Tidak ada data aktual untuk evaluasi.")
            return None

        mae = mean_absolute_error(merged["actual_request_count"], merged["predicted_request_count"])
        rmse = mean_squared_error(merged["actual_request_count"], merged["predicted_request_count"], squared=False)

        print(f"Evaluasi {merged['date'].iloc[0].date()}: MAE={mae:.2f}, RMSE={rmse:.2f}")
        return merged, {"MAE": mae, "RMSE": rmse}


In [17]:
# ===============================
# 1. TRAIN PERTAMA
# ===============================
predictor = TimeSeriesRequestPredictor(sequence_length=7, prediction_horizon=1)

# Train model dengan dataset awal
data, history = predictor.train("data/train_awal.csv", epochs=30)

# Prediksi besok untuk semua lokasi
predictions_dayN = predictor.predict_all_next_day(data)
print("Prediksi besok (training awal):")
print(predictions_dayN.head())

# Simpan prediksi ke file (misalnya buat evaluasi besok)
predictions_dayN.to_csv("prediksi_hariN.csv", index=False)

Loading data...


FileNotFoundError: [Errno 2] No such file or directory: 'data/train_awal.csv'