In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')




In [4]:
class TimeSeriesRequestPredictor:
    """Daily request-count forecaster for one location at a time, built on a stacked LSTM.

    The pipeline aggregates the raw CSV to one row per (date, geo hash), turns each
    location's history into fixed-length windows, and trains a Keras model on them.

    NOTE(review): a later cell of this notebook defines another class with the same
    name; once that cell has run it shadows this definition.
    """

    # Column order of the model input. Index 0 (request_count) is the prediction target;
    # predict() relies on these positions when it writes a new row.
    FEATURES = (
        'request_count', 'active_slots', 'day_of_week',
        'day_of_month', 'month', 'is_weekend', 'geo_encoded'
    )

    def __init__(self, sequence_length=7, prediction_horizon=1):
        """
        Args:
            sequence_length: Number of historical days fed to the model (window size).
            prediction_horizon: Number of future days the model outputs per window.
        """
        self.sequence_length = sequence_length
        self.prediction_horizon = prediction_horizon
        self.scaler = StandardScaler()
        self.geo_encoder = LabelEncoder()
        self.model = None
        self.feature_columns = []

    @staticmethod
    def _parse_slot_start_hour(time_slot):
        """Return the starting hour of each time-slot label (NaN where it cannot be parsed).

        A strptime format cannot contain ``%H``/``%M`` twice, so the slot label is not
        parsed with ``pd.to_datetime``; the leading ``HH:MM`` is extracted instead.
        """
        # assumes labels look like "HH:MM-HH:MM" (taken from the original format string) - TODO confirm
        start = time_slot.astype(str).str.extract(r'^\s*(\d{1,2}):\d{2}', expand=False)
        hours = pd.to_numeric(start, errors='coerce')
        return hours.where(hours.between(0, 23))

    def load_and_preprocess_data(self, csv_file_path):
        """Load the CSV and aggregate it to one row per (request_date, origin_geo_hash).

        Args:
            csv_file_path: Path (or file-like object) accepted by ``pd.read_csv``. The file
                must provide ``request_date``, ``time_slot``, ``origin_geo_hash`` and
                ``request_count`` columns.

        Returns:
            DataFrame sorted by geo hash and date with the columns ``request_count`` (daily
            sum), ``active_slots`` (number of rows with a parseable time slot),
            ``day_of_week``, ``day_of_month``, ``month``, ``is_weekend`` and ``geo_encoded``.
        """
        df = pd.read_csv(csv_file_path)

        # Drop any time-of-day component so that grouping really is per calendar day.
        df['request_date'] = pd.to_datetime(df['request_date']).dt.normalize()
        df['time_slot'] = self._parse_slot_start_hour(df['time_slot'])

        # Aggregate per day and location; 'count' ignores NaN, i.e. unparseable slots.
        daily_data = (
            df.groupby(['request_date', 'origin_geo_hash'])
            .agg({'request_count': 'sum', 'time_slot': 'count'})
            .reset_index()
            .rename(columns={'time_slot': 'active_slots'})
        )

        # Calendar features
        daily_data['day_of_week'] = daily_data['request_date'].dt.dayofweek
        daily_data['day_of_month'] = daily_data['request_date'].dt.day
        daily_data['month'] = daily_data['request_date'].dt.month
        daily_data['is_weekend'] = (daily_data['day_of_week'] >= 5).astype(int)

        # Integer id per location (this re-fits the encoder on every call).
        daily_data['geo_encoded'] = self.geo_encoder.fit_transform(daily_data['origin_geo_hash'])

        return daily_data.sort_values(['origin_geo_hash', 'request_date'])

    def create_sequences(self, data):
        """Build sliding-window samples for every location.

        Each location's history is expanded to a gap-free daily calendar first. Days that
        had no row get ``request_count = active_slots = 0``; their calendar features are
        derived from the date itself and ``geo_encoded`` is carried over from the
        location's existing rows.

        Args:
            data: DataFrame as returned by ``load_and_preprocess_data`` (one row per
                date and geo hash).

        Returns:
            Tuple ``(X, y)`` of NumPy arrays. ``X`` has shape
            ``(n_samples, sequence_length, len(FEATURES))`` and ``y`` has shape
            ``(n_samples, prediction_horizon)`` holding the unscaled request counts that
            follow each window. Both are empty when no location has enough history.
        """
        features = list(self.FEATURES)
        window = self.sequence_length + self.prediction_horizon
        sequences = []
        targets = []

        # sort=False keeps the locations in order of first appearance.
        for _, geo_rows in data.groupby('origin_geo_hash', sort=False):
            geo_rows = geo_rows.sort_values('request_date')

            date_range = pd.date_range(
                start=geo_rows['request_date'].min(),
                end=geo_rows['request_date'].max(),
                freq='D'
            )

            # Reindex WITHOUT fill_value: a blanket 0 would also zero the calendar
            # columns and geo_encoded on the inserted days.
            filled = geo_rows.set_index('request_date').reindex(date_range)
            filled.index.name = 'request_date'

            count_cols = ['request_count', 'active_slots']
            filled[count_cols] = filled[count_cols].fillna(0)
            filled['day_of_week'] = filled.index.dayofweek
            filled['day_of_month'] = filled.index.day
            filled['month'] = filled.index.month
            filled['is_weekend'] = (filled.index.dayofweek >= 5).astype(int)
            filled['geo_encoded'] = filled['geo_encoded'].ffill().bfill()

            feature_data = filled[features].to_numpy(dtype=float)

            # One sample per position where a full input window plus target fits.
            for start in range(max(len(feature_data) - window + 1, 0)):
                split = start + self.sequence_length
                sequences.append(feature_data[start:split])
                targets.append(feature_data[split:split + self.prediction_horizon, 0])

        return np.array(sequences), np.array(targets)

    def build_model(self, input_shape):
        """Create and compile the LSTM regressor.

        Args:
            input_shape: ``(sequence_length, n_features)`` of one input window.

        Returns:
            Compiled Keras ``Sequential`` model (MSE loss, MAE metric) with
            ``prediction_horizon`` linear outputs.
        """
        model = Sequential([
            Input(shape=input_shape),

            # Stacked LSTM layers; only the last one collapses the time axis.
            LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
            LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
            LSTM(32, dropout=0.2, recurrent_dropout=0.2),

            # Dense head
            Dense(64, activation='relu'),
            Dropout(0.3),
            Dense(32, activation='relu'),
            Dropout(0.2),
            Dense(self.prediction_horizon, activation='linear')  # linear output for regression
        ])

        model.compile(
            optimizer=Adam(learning_rate=0.001),
            loss='mse',
            metrics=['mae']
        )

        return model

    def train(self, csv_file_path, validation_split=0.2, epochs=100):
        """Run the full pipeline: load, window, split, scale and fit.

        Args:
            csv_file_path: CSV passed on to ``load_and_preprocess_data``.
            validation_split: Fraction of the windows held out for validation.
            epochs: Maximum number of training epochs (early stopping may end sooner).

        Returns:
            The Keras ``History`` object returned by ``model.fit``.

        Raises:
            ValueError: If no location has enough history to form a single window.
        """
        print("Loading and preprocessing data...")
        data = self.load_and_preprocess_data(csv_file_path)

        print("Creating sequences...")
        X, y = self.create_sequences(data)

        if len(X) == 0:
            raise ValueError(
                "No training sequences could be built: every location has fewer than "
                f"{self.sequence_length + self.prediction_horizon} days of data"
            )

        print(f"Total sequences: {len(X)}")
        print(f"Input shape: {X.shape}")
        print(f"Target shape: {y.shape}")

        # NOTE(review): this is a random split over overlapping windows, so validation
        # windows share days with training windows and the validation score is
        # optimistic. A chronological split would need the window dates, which
        # create_sequences does not return.
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=validation_split, random_state=42
        )

        # Fit the scaler on the training windows only, then apply it to both parts.
        # The scaler works per feature, hence the flattening to (rows, n_features).
        n_features = X.shape[-1]
        self.scaler.fit(X_train.reshape(-1, n_features))
        X_train = self.scaler.transform(X_train.reshape(-1, n_features)).reshape(X_train.shape)
        X_val = self.scaler.transform(X_val.reshape(-1, n_features)).reshape(X_val.shape)

        self.model = self.build_model((X.shape[1], X.shape[2]))
        self.model.summary()  # prints by itself; wrapping it in print() would add "None"

        callbacks = [
            EarlyStopping(patience=15, restore_best_weights=True),
            ReduceLROnPlateau(factor=0.5, patience=7, min_lr=1e-7)
        ]

        print("Training model...")
        history = self.model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=32,
            callbacks=callbacks,
            verbose=1
        )

        return history

    def predict(self, data, geo_hash, start_date, days_ahead=7):
        """Roll the model forward one day at a time for a single location.

        Args:
            data: DataFrame containing the ``FEATURES`` columns plus ``origin_geo_hash``
                and ``request_date``.
            geo_hash: Location to forecast.
            start_date: Date of the first predicted day (``plot_predictions`` draws the
                first prediction on this date). Needs ``weekday()``, ``day`` and
                ``month``; only used when ``days_ahead > 1``.
            days_ahead: Number of consecutive days to predict.

        Returns:
            List with ``days_ahead`` predicted request counts (first model output of
            each step).

        Raises:
            ValueError: If the model is not trained or the location has fewer than
                ``sequence_length`` rows of history.
        """
        if self.model is None:
            raise ValueError("Model belum di-train!")

        features = list(self.FEATURES)
        geo_data = data[data['origin_geo_hash'] == geo_hash].sort_values('request_date')

        if len(geo_data) < self.sequence_length:
            raise ValueError(
                f"Need at least {self.sequence_length} rows of history for geo hash "
                f"{geo_hash!r}, got {len(geo_data)}"
            )

        # Float dtype matters: with an integer array the predicted count written back
        # below would be silently truncated.
        current_sequence = geo_data[features].tail(self.sequence_length).to_numpy(dtype=float)

        predictions = []

        for i in range(days_ahead):
            # The scaler was fitted on rows of n_features values, so transform the
            # (sequence_length, n_features) window as is and add the batch axis after.
            seq_scaled = self.scaler.transform(current_sequence)
            seq_scaled = seq_scaled.reshape(1, self.sequence_length, len(features))

            pred = self.model.predict(seq_scaled, verbose=0)[0]
            # The window advances one day per iteration, so only the first horizon
            # step belongs to this day.
            predictions.append(pred[0])

            if i < days_ahead - 1:
                # The row fed back represents the day that was just predicted.
                step_date = start_date + timedelta(days=i)

                new_row = current_sequence[-1].copy()  # keeps active_slots and geo_encoded
                new_row[0] = pred[0]  # request_count
                new_row[2] = step_date.weekday()  # day_of_week
                new_row[3] = step_date.day  # day_of_month
                new_row[4] = step_date.month  # month
                new_row[5] = 1 if step_date.weekday() >= 5 else 0  # is_weekend

                # Slide the window: drop the oldest day, append the predicted one.
                current_sequence = np.vstack([current_sequence[1:], new_row.reshape(1, -1)])

        return predictions

    def plot_predictions(self, actual_data, predictions, geo_hash, start_date, days_ahead):
        """Plot the last 30 historical rows of a location next to its predictions.

        Args:
            actual_data: DataFrame with ``origin_geo_hash``, ``request_date`` and
                ``request_count`` columns.
            predictions: Sequence of ``days_ahead`` predicted values.
            geo_hash: Location to plot.
            start_date: Date of the first prediction.
            days_ahead: Number of predicted days to draw.
        """
        plt.figure(figsize=(12, 6))

        # Historical data
        geo_data = actual_data[actual_data['origin_geo_hash'] == geo_hash].sort_values('request_date')
        historical_dates = geo_data['request_date'].tail(30)
        historical_counts = geo_data['request_count'].tail(30)

        plt.plot(historical_dates, historical_counts, label='Historical', marker='o')

        # Predictions
        pred_dates = [start_date + timedelta(days=i) for i in range(days_ahead)]
        plt.plot(pred_dates, predictions, label='Predictions', marker='s', color='red')

        plt.title(f'Request Count Prediction - {geo_hash}')
        plt.xlabel('Date')
        plt.ylabel('Request Count')
        plt.legend()
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()


In [5]:
class TimeSeriesRequestPredictor:
    """Hourly multivariate request forecaster using tf.data windows and a stacked LSTM.

    Every location's history is expanded to a gap-free hourly series. Each series is
    split chronologically into train / validation / test parts, and sliding windows are
    only formed inside one part of one location.

    NOTE(review): this definition reuses the name of the class defined in an earlier
    cell and shadows it once this cell has run.
    """

    # Column order of the model input and output.
    FEATURES = (
        'request_count', 'hour', 'day_of_week',
        'day_of_month', 'month', 'is_weekend',
        'is_peak_hour', 'geo_encoded'
    )

    def __init__(self, n_past=24, n_future=24, batch_size=32, shift=1):
        """
        Args:
            n_past: Number of time steps in the input (lookback) window.
            n_future: Number of time steps in the output (forecast) window.
            batch_size: Batch size used for the tf.data datasets.
            shift: Step between the starts of consecutive sliding windows.
        """
        self.n_past = n_past
        self.n_future = n_future
        self.batch_size = batch_size
        self.shift = shift
        self.scaler = StandardScaler()
        self.geo_encoder = LabelEncoder()
        self.model = None
        self.feature_columns = []
        # Filled in by prepare_data()/train(); defined here so evaluate_model() can
        # report missing data instead of failing with AttributeError.
        self.test_data = None
        self.scaled_series = None
        self._test_segment_ids = None

    @staticmethod
    def _parse_slot_start_hour(time_slot):
        """Return the starting hour of each time-slot label (NaN where it cannot be parsed).

        A strptime format cannot contain ``%H``/``%M`` twice, so the slot label is not
        parsed with ``pd.to_datetime``; the leading ``HH:MM`` is extracted instead.
        """
        # assumes labels look like "HH:MM-HH:MM" (taken from the original format string) - TODO confirm
        start = time_slot.astype(str).str.extract(r'^\s*(\d{1,2}):\d{2}', expand=False)
        hours = pd.to_numeric(start, errors='coerce')
        return hours.where(hours.between(0, 23))

    @staticmethod
    def _stack_segments(segments):
        """Concatenate 2-D segments and return ``(values, ids)``.

        ``ids[k]`` is the position in ``segments`` of the segment that row ``k`` came
        from, so rows of one segment form one contiguous run of equal ids.
        """
        values = np.concatenate(segments, axis=0)
        ids = np.concatenate([
            np.full(len(segment), position, dtype=np.int64)
            for position, segment in enumerate(segments)
        ])
        return values, ids

    def windowed_dataset(self, series, batch_size, n_past=24, n_future=24, shift=1,
                         segment_ids=None):
        """Turn a time series into batched ``(input, target)`` sliding windows.

        Args:
            series: Array of shape ``(time_steps, n_features)``.
            batch_size: Number of windows per batch.
            n_past: Rows of each window used as model input.
            n_future: Rows of each window used as target.
            shift: Step between the starts of consecutive windows.
            segment_ids: Optional 1-D integer array with one id per row of ``series``
                in which every segment is a contiguous run of equal ids. When given,
                windows that span two segments are dropped.

        Returns:
            tf.data.Dataset yielding ``(batch, n_past, n_features)`` inputs and
            ``(batch, n_future, n_features)`` targets.
        """
        window = n_past + n_future

        if segment_ids is None:
            ds = tf.data.Dataset.from_tensor_slices(series)
            ds = ds.window(window, shift=shift, drop_remainder=True)
            ds = ds.flat_map(lambda w: w.batch(window))
        else:
            ds = tf.data.Dataset.from_tensor_slices((series, segment_ids))
            ds = ds.window(window, shift=shift, drop_remainder=True)
            ds = ds.flat_map(
                lambda values, ids: tf.data.Dataset.zip((values.batch(window), ids.batch(window)))
            )
            # Ids form contiguous runs, so equal first and last id means that the
            # whole window lies inside one segment.
            ds = ds.filter(lambda values, ids: tf.equal(ids[0], ids[-1]))
            ds = ds.map(lambda values, ids: values)

        ds = ds.map(lambda w: (w[:n_past], w[n_past:]))

        return ds.batch(batch_size).prefetch(1)

    def load_and_preprocess_data(self, csv_file_path):
        """Load the CSV and add an hourly timestamp plus calendar features.

        Args:
            csv_file_path: Path (or file-like object) accepted by ``pd.read_csv``. The file
                must provide ``request_date``, ``time_slot``, ``origin_geo_hash`` and
                ``request_count`` columns.

        Returns:
            DataFrame sorted by geo hash and timestamp with the added columns
            ``datetime``, ``hour``, ``day_of_week``, ``day_of_month``, ``month``,
            ``is_weekend``, ``is_peak_hour`` and ``geo_encoded``. ``time_slot`` is
            replaced by its starting hour. Rows whose time slot cannot be parsed are
            dropped (their number is printed).
        """
        df = pd.read_csv(csv_file_path)

        df['request_date'] = pd.to_datetime(df['request_date']).dt.normalize()
        df['time_slot'] = self._parse_slot_start_hour(df['time_slot'])

        # Timestamp = calendar day + starting hour of the slot.
        df['datetime'] = df['request_date'] + pd.to_timedelta(df['time_slot'], unit='h')

        n_unparsed = int(df['datetime'].isna().sum())
        if n_unparsed:
            print(f"Dropping {n_unparsed} rows without a parseable date/time slot")
            df = df.dropna(subset=['datetime'])

        df = df.sort_values(['origin_geo_hash', 'datetime'])

        # Calendar features
        df['hour'] = df['datetime'].dt.hour
        df['day_of_week'] = df['datetime'].dt.dayofweek
        df['day_of_month'] = df['datetime'].dt.day
        df['month'] = df['datetime'].dt.month
        df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
        # Peak hours: 07-09 and 17-19 (inclusive).
        df['is_peak_hour'] = (
            ((df['hour'] >= 7) & (df['hour'] <= 9))
            | ((df['hour'] >= 17) & (df['hour'] <= 19))
        ).astype(int)

        # Integer id per location (this re-fits the encoder on every call).
        df['geo_encoded'] = self.geo_encoder.fit_transform(df['origin_geo_hash'])

        return df

    def _build_geo_series(self, data):
        """Return one gap-free hourly feature array per location.

        Rows sharing a timestamp are summed. Hours without any row get
        ``request_count = 0``, calendar features are derived from the timestamp and
        ``geo_encoded`` is carried over from the location's existing rows. Locations
        with fewer than ``n_past + n_future`` hours are skipped.
        """
        features = list(self.FEATURES)
        min_length = self.n_past + self.n_future
        series_per_geo = []

        # sort=False keeps the locations in order of first appearance.
        for _, geo_rows in data.groupby('origin_geo_hash', sort=False):
            geo_rows = geo_rows.dropna(subset=['datetime'])
            if geo_rows.empty:
                continue

            # Collapsing duplicates first keeps reindex() from failing on them.
            hourly = geo_rows.groupby('datetime').agg(
                {'request_count': 'sum', 'geo_encoded': 'first'}
            )

            full_range = pd.date_range(
                start=hourly.index.min(),
                end=hourly.index.max(),
                freq='h'
            )
            hourly = hourly.reindex(full_range)

            hourly['request_count'] = hourly['request_count'].fillna(0)
            hourly['geo_encoded'] = hourly['geo_encoded'].ffill().bfill()

            hour = hourly.index.hour
            hourly['hour'] = hour
            hourly['day_of_week'] = hourly.index.dayofweek
            hourly['day_of_month'] = hourly.index.day
            hourly['month'] = hourly.index.month
            hourly['is_weekend'] = (hourly.index.dayofweek >= 5).astype(int)
            hourly['is_peak_hour'] = (
                ((hour >= 7) & (hour <= 9)) | ((hour >= 17) & (hour <= 19))
            ).astype(int)

            if len(hourly) >= min_length:
                series_per_geo.append(hourly[features].to_numpy(dtype=float))

        return series_per_geo

    def create_multivariate_series(self, data):
        """Build the multivariate series of all locations, stacked one after another.

        Args:
            data: DataFrame with ``origin_geo_hash``, ``datetime``, ``request_count``
                and ``geo_encoded`` columns (as returned by
                ``load_and_preprocess_data``).

        Returns:
            Array of shape ``(total_hours, len(FEATURES))``. Rows of different locations
            are adjacent, so windows over this array can straddle two locations;
            ``prepare_data`` therefore windows per location instead.

        Raises:
            ValueError: If no location has at least ``n_past + n_future`` hours.
        """
        series_per_geo = self._build_geo_series(data)
        if not series_per_geo:
            raise ValueError("No sufficient data for windowing")

        return np.concatenate(series_per_geo, axis=0)

    def prepare_data(self, csv_file_path, train_split=0.7, validation_split=0.2):
        """Load the data and build chronologically split, windowed datasets.

        Each location's series is cut in time order into train / validation / test
        parts. The scaler is fitted on the training rows only. Windows never span two
        locations or two parts.

        Args:
            csv_file_path: CSV passed on to ``load_and_preprocess_data``.
            train_split: Fraction of each location's hours used for training.
            validation_split: Fraction of each location's hours used for validation;
                the remainder is the test part.

        Returns:
            Tuple ``(train_set, val_set, test_data, series_scaled)``: two tf.data
            datasets, the scaled test rows of all locations stacked, and the scaled
            rows of the complete series.

        Raises:
            ValueError: If no location is long enough, or if no training/validation
                part can hold a single window.
        """
        print("Loading and preprocessing data...")
        data = self.load_and_preprocess_data(csv_file_path)

        print("Creating multivariate series...")
        series_per_geo = self._build_geo_series(data)
        if not series_per_geo:
            raise ValueError("No sufficient data for windowing")

        series = np.concatenate(series_per_geo, axis=0)
        print(f"Total time steps: {len(series)}")
        print(f"Features: {series.shape[1]}")

        # Chronological split inside every location.
        train_parts, val_parts, test_parts = [], [], []
        for geo_series in series_per_geo:
            train_end = int(len(geo_series) * train_split)
            val_end = train_end + int(len(geo_series) * validation_split)
            train_parts.append(geo_series[:train_end])
            val_parts.append(geo_series[train_end:val_end])
            test_parts.append(geo_series[val_end:])

        window = self.n_past + self.n_future
        for part_name, parts in (('training', train_parts), ('validation', val_parts)):
            if not any(len(part) >= window for part in parts):
                raise ValueError(
                    f"No location has {window} consecutive hours in its {part_name} part; "
                    "use smaller n_past/n_future or more data"
                )

        train_raw, train_ids = self._stack_segments(train_parts)
        val_raw, val_ids = self._stack_segments(val_parts)
        test_raw, test_ids = self._stack_segments(test_parts)

        # Fit on training rows only so that no validation/test statistics leak in.
        self.scaler.fit(train_raw)
        train_data = self.scaler.transform(train_raw)
        val_data = self.scaler.transform(val_raw)
        # The scaler rejects empty input; the test part is empty when the two split
        # fractions add up to 1.
        test_data = self.scaler.transform(test_raw) if len(test_raw) else test_raw
        series_scaled = self.scaler.transform(series)

        print(f"Train size: {len(train_data)}")
        print(f"Validation size: {len(val_data)}")
        print(f"Test size: {len(test_data)}")

        train_set = self.windowed_dataset(
            train_data, self.batch_size, self.n_past, self.n_future, self.shift,
            segment_ids=train_ids
        )

        val_set = self.windowed_dataset(
            val_data, self.batch_size, self.n_past, self.n_future, self.shift,
            segment_ids=val_ids
        )

        # Remembered so evaluate_model() can keep test windows inside one location.
        self._test_segment_ids = test_ids

        return train_set, val_set, test_data, series_scaled

    def build_model(self, n_features):
        """Create and compile the LSTM that maps ``n_past`` steps to ``n_future`` steps.

        Args:
            n_features: Number of features per time step (input and output).

        Returns:
            Compiled Keras ``Sequential`` model (MSE loss, MAE metric) whose output has
            shape ``(batch, n_future, n_features)``.
        """
        model = Sequential([
            Input(shape=(self.n_past, n_features)),

            # Stacked LSTM layers; only the last one collapses the time axis.
            LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
            LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
            LSTM(32, dropout=0.2, recurrent_dropout=0.2),

            # Dense head
            Dense(64, activation='relu'),
            Dropout(0.3),
            Dense(32, activation='relu'),
            Dropout(0.2),

            # One value per future step and feature ...
            Dense(self.n_future * n_features, activation='linear'),

            # ... reshaped to (batch, n_future, n_features)
            tf.keras.layers.Reshape((self.n_future, n_features))
        ])

        model.compile(
            optimizer=Adam(learning_rate=0.001),
            loss='mse',
            metrics=['mae']
        )

        return model

    def train(self, csv_file_path, epochs=100):
        """Prepare the data, build the model and fit it.

        Args:
            csv_file_path: CSV passed on to ``prepare_data``.
            epochs: Maximum number of training epochs (early stopping may end sooner).

        Returns:
            The Keras ``History`` object returned by ``model.fit``.
        """
        train_set, val_set, test_data, scaled_series = self.prepare_data(csv_file_path)

        # Stored before fitting so the held-out data survives an interrupted run.
        self.test_data = test_data
        self.scaled_series = scaled_series

        n_features = scaled_series.shape[1]
        self.model = self.build_model(n_features)

        print("Model Summary:")
        self.model.summary()  # prints by itself; wrapping it in print() would add "None"

        callbacks = [
            EarlyStopping(
                patience=15,
                restore_best_weights=True,
                monitor='val_loss'
            ),
            ReduceLROnPlateau(
                factor=0.5,
                patience=7,
                min_lr=1e-7,
                monitor='val_loss'
            )
        ]

        print("Training model...")
        history = self.model.fit(
            train_set,
            validation_data=val_set,
            epochs=epochs,
            callbacks=callbacks,
            verbose=1
        )

        return history

    def predict_future(self, input_sequence, steps_ahead=24):
        """Predict ``steps_ahead`` future steps by feeding predictions back in.

        Args:
            input_sequence: Scaled array holding the last ``n_past`` steps, shape
                ``(n_past, n_features)``.
            steps_ahead: Number of future steps wanted. The model is called as often as
                needed (``n_future`` steps per call) and the result is cut to this length.

        Returns:
            Array of shape ``(steps_ahead, n_features)`` in scaled units, or an empty
            array when ``steps_ahead`` is not positive.

        Raises:
            ValueError: If the model has not been trained.
        """
        if self.model is None:
            raise ValueError("Model belum di-train!")

        if steps_ahead <= 0:
            return np.array([])

        n_rounds = -(-steps_ahead // self.n_future)  # ceiling division
        current_sequence = np.asarray(input_sequence, dtype=float).reshape(self.n_past, -1)
        predictions = []

        for _ in range(n_rounds):
            pred = self.model.predict(
                current_sequence.reshape(1, self.n_past, -1),
                verbose=0
            )[0]

            predictions.append(pred)

            # Keep the newest n_past rows; this also works when n_future > n_past.
            current_sequence = np.vstack([current_sequence, pred])[-self.n_past:]

        return np.concatenate(predictions, axis=0)[:steps_ahead]

    def evaluate_model(self):
        """Evaluate the trained model on the held-out test rows.

        Returns:
            ``(test_loss, test_mae)`` in scaled units, or ``None`` (after printing a
            message) when there is not enough test data for a single window.

        Raises:
            ValueError: If test data is available but the model has not been trained.
        """
        window = self.n_past + self.n_future
        if self.test_data is None or len(self.test_data) < window:
            print("Insufficient test data for evaluation")
            return None

        if self.model is None:
            raise ValueError("Model belum di-train!")

        # Only use the stored ids when they describe the current test_data.
        segment_ids = self._test_segment_ids
        if segment_ids is not None and len(segment_ids) != len(self.test_data):
            segment_ids = None

        if segment_ids is not None:
            _, run_lengths = np.unique(segment_ids, return_counts=True)
            if run_lengths.max() < window:
                print("Insufficient test data for evaluation")
                return None

        test_set = self.windowed_dataset(
            self.test_data, self.batch_size, self.n_past, self.n_future, self.shift,
            segment_ids=segment_ids
        )

        test_loss, test_mae = self.model.evaluate(test_set, verbose=0)
        print(f"Test Loss (MSE): {test_loss:.4f}")
        print(f"Test MAE: {test_mae:.4f}")

        return test_loss, test_mae

    def plot_training_history(self, history):
        """Plot training/validation loss and MAE per epoch side by side.

        Args:
            history: Keras ``History`` whose ``history`` dict holds ``loss``,
                ``val_loss``, ``mae`` and ``val_mae``.
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

        # Loss
        ax1.plot(history.history['loss'], label='Training Loss')
        ax1.plot(history.history['val_loss'], label='Validation Loss')
        ax1.set_title('Model Loss')
        ax1.set_xlabel('Epoch')
        ax1.set_ylabel('Loss')
        ax1.legend()

        # MAE
        ax2.plot(history.history['mae'], label='Training MAE')
        ax2.plot(history.history['val_mae'], label='Validation MAE')
        ax2.set_title('Model MAE')
        ax2.set_xlabel('Epoch')
        ax2.set_ylabel('MAE')
        ax2.legend()

        plt.tight_layout()
        plt.show()



In [15]:
# Load the raw export for a quick look. The path is relative to the notebook's
# working directory; a missing file is reported with a hint instead of a bare errno.
RAW_CSV_PATH = 'studio_results_20250925_1926.csv'

try:
    df = pd.read_csv(RAW_CSV_PATH)
except FileNotFoundError as exc:
    raise FileNotFoundError(
        f"Data file {RAW_CSV_PATH!r} was not found relative to the notebook's working "
        "directory. Copy the export next to the notebook or point RAW_CSV_PATH at it."
    ) from exc


FileNotFoundError: [Errno 2] No such file or directory: 'studio_results_20250925_1926.csv'

In [None]:
# Constants
BATCH_SIZE = 32
N_PAST = 24      # 24 hours lookback
N_FUTURE = 24    # 24 hours forecast
SHIFT = 1        # 1 hour shift
EPOCHS = 50      # upper bound; early stopping may end training sooner
SEED = 42        # fixed seed so repeated runs start from the same initial weights
DATA_PATH = 'studio_results_20250925_1926.csv'  # relative to the notebook's working directory

# Example usage
if __name__ == "__main__":
    # Seed NumPy and TensorFlow before the model is built.
    np.random.seed(SEED)
    tf.random.set_seed(SEED)

    # Initialise the predictor that works on windowed datasets
    predictor = TimeSeriesRequestPredictor(
        n_past=N_PAST,
        n_future=N_FUTURE,
        batch_size=BATCH_SIZE,
        shift=SHIFT
    )

    # Train the model with a temporal split
    print("Starting training with temporal split...")
    history = predictor.train(DATA_PATH, epochs=EPOCHS)

    # Plot training history
    predictor.plot_training_history(history)

    # Evaluate on the held-out test data (prints loss and MAE itself)
    test_results = predictor.evaluate_model()

    print("\n" + "="*50)
    print("Training completed!")
    print("Using temporal split instead of random split")
    print(f"Window size: {N_PAST} -> {N_FUTURE}")
    print(f"Batch size: {BATCH_SIZE}")
    print("="*50)

Starting training with temporal split...
Loading and preprocessing data...


FileNotFoundError: [Errno 2] No such file or directory: 'studio_results_20250925_1926.csv'