In [None]:
import io
import os
import tempfile
from datetime import datetime

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from google.cloud import storage
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# Google Cloud settings read by the GCS helper functions in this notebook.
# 'your-project-id' and 'your-bucket-name' are placeholders: replace them with
# real values before running any cell that talks to Cloud Storage.
PROJECT_ID = 'your-project-id'
BUCKET_NAME = 'your-bucket-name'
# Object paths for a single model and scaler artifact.
# NOTE(review): neither constant is referenced in the visible cells (models are
# uploaded under 'models/aqi_prediction_h{h}_model.pkl') — confirm they are
# still needed.
MODEL_PATH = 'models/aqi_prediction_model.pkl'
SCALER_PATH = 'models/aqi_prediction_scaler.pkl'

In [None]:
def read_data_from_gcs(bucket_name, blob_name):
    """Download a CSV object from Google Cloud Storage into a DataFrame.

    Args:
        bucket_name: Name of the GCS bucket that holds the object.
        blob_name: Path of the CSV object inside the bucket.

    Returns:
        pandas.DataFrame parsed from the object's UTF-8 decoded contents.
    """
    client = storage.Client(project=PROJECT_ID)
    bucket = client.get_bucket(bucket_name)
    blob = bucket.blob(blob_name)

    # download_as_bytes() is the supported replacement for the deprecated
    # download_as_string() alias; both return the raw object bytes.
    raw_bytes = blob.download_as_bytes()

    # Use the standard-library StringIO instead of pandas.io.common.StringIO,
    # which is an internal re-export and not part of the public pandas API.
    return pd.read_csv(io.StringIO(raw_bytes.decode('utf-8')))

In [None]:
def save_model_to_gcs(model, bucket_name, blob_name):
    """Serialize an object with joblib and upload it to Google Cloud Storage.

    Args:
        model: Object to serialize with ``joblib.dump``.
        bucket_name: Name of the destination GCS bucket.
        blob_name: Destination object path inside the bucket.

    Returns:
        None. Prints the ``gs://`` URI of the uploaded object.
    """
    client = storage.Client(project=PROJECT_ID)
    bucket = client.get_bucket(bucket_name)
    blob = bucket.blob(blob_name)

    # A private temporary directory replaces the fixed '/tmp/model.pkl' path:
    # it works on platforms without /tmp, cannot collide with a concurrent
    # run, and is removed automatically once the upload finishes (or fails).
    with tempfile.TemporaryDirectory() as tmp_dir:
        temp_path = os.path.join(tmp_dir, 'model.pkl')
        joblib.dump(model, temp_path)
        blob.upload_from_filename(temp_path)

    print(f"Model saved to gs://{bucket_name}/{blob_name}")

In [None]:
def create_time_features(df):
    """Add calendar and time-of-day columns derived from ``df['timestamp']``.

    The frame is modified in place and the same object is returned.

    Columns added, in this order: ``hour``, ``day_of_week``, ``month``,
    ``is_weekend``, ``hour_sin``, ``hour_cos``, ``is_morning``,
    ``is_afternoon``, ``is_evening``, ``is_night``.

    Args:
        df: DataFrame with a ``timestamp`` column. The column is converted
            with ``pd.to_datetime`` when it is not already datetime-typed.

    Returns:
        The same DataFrame, with the columns above added.
    """
    # Parse the timestamp column only when it is not already a datetime dtype.
    if not pd.api.types.is_datetime64_any_dtype(df['timestamp']):
        df['timestamp'] = pd.to_datetime(df['timestamp'])

    stamp = df['timestamp'].dt
    df['hour'] = stamp.hour
    df['day_of_week'] = stamp.dayofweek
    df['month'] = stamp.month
    # dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday.
    df['is_weekend'] = stamp.dayofweek >= 5

    # Cyclical encoding of the hour on a 24-hour circle.
    hour = df['hour']
    hour_angle = 2 * np.pi * hour / 24
    df['hour_sin'] = np.sin(hour_angle)
    df['hour_cos'] = np.cos(hour_angle)

    # Four day-part flags covering 06-12, 12-18, 18-22 and 22-06.
    day_parts = {
        'is_morning': (hour >= 6) & (hour < 12),
        'is_afternoon': (hour >= 12) & (hour < 18),
        'is_evening': (hour >= 18) & (hour < 22),
        'is_night': (hour >= 22) | (hour < 6),
    }
    for flag_name, flag_values in day_parts.items():
        df[flag_name] = flag_values

    return df

In [None]:
def create_lag_features(df, columns, lag_periods=[1, 2, 3, 6]):
    """Append row-shifted copies of the given columns to ``df``.

    For every column ``c`` and lag ``k`` a new column ``{c}_lag_{k}`` is added
    holding ``df[c].shift(k)``. The frame is modified in place and returned.

    Args:
        df: DataFrame containing every name in ``columns``.
        columns: Names of the columns to lag.
        lag_periods: Row offsets passed to ``Series.shift``.

    Returns:
        The same DataFrame with the lag columns added.
    """
    # Enumerate (column, lag) pairs column-major so new columns are appended
    # grouped by source column.
    pairs = [(source, offset) for source in columns for offset in lag_periods]
    for source, offset in pairs:
        df[f'{source}_lag_{offset}'] = df[source].shift(offset)
    return df

In [None]:
def create_rolling_features(df, columns, windows=[2, 3, 6]):
    """Append rolling mean and standard deviation columns to ``df``.

    For every column ``c`` and window ``w`` two columns are added:
    ``{c}_rolling_mean_{w}`` and ``{c}_rolling_std_{w}``, both computed from
    ``df[c].rolling(window=w)``. The frame is modified in place and returned.

    Args:
        df: DataFrame containing every name in ``columns``.
        columns: Names of the columns to aggregate.
        windows: Window sizes (in rows) passed to ``Series.rolling``.

    Returns:
        The same DataFrame with the rolling columns added.
    """
    for name in columns:
        prefix = f'{name}_rolling'
        for size in windows:
            # One rolling object feeds both statistics for this window size.
            roller = df[name].rolling(window=size)
            df[f'{prefix}_mean_{size}'] = roller.mean()
            df[f'{prefix}_std_{size}'] = roller.std()
    return df

In [None]:
def preprocess_data(df):
    """Impute, order and feature-engineer the raw sensor/weather frame.

    Steps, in order:
      1. Copy the input so the caller's frame is left untouched.
      2. Fill missing values in float64/int64 columns with the column median.
      3. Parse ``timestamp`` and sort the rows chronologically, because the
         lag and rolling features below are positional (``shift`` /
         ``rolling``) and only look back in time when rows are in time order.
      4. Add time, lag, rolling and interaction features.
      5. Drop every row that still contains a missing value. This includes
         the leading rows whose lag/rolling windows are incomplete.

    Args:
        df: DataFrame with a ``timestamp`` column plus the sensor columns
            (``pm2_5``, ``pm10``, ``o3``, ``co``, ``no2``) and weather columns
            (``temperature``, ``humidity``, ``wind_speed``,
            ``wind_direction``).

    Returns:
        A new, chronologically ordered DataFrame with the engineered feature
        columns and no missing values.
    """
    processed_df = df.copy()

    # Median imputation for plain numeric columns.
    for col in processed_df.columns:
        if processed_df[col].dtype in [np.float64, np.int64]:
            processed_df[col] = processed_df[col].fillna(processed_df[col].median())

    # Order rows by time before any positional feature is computed. A stable
    # sort leaves already-ordered input (and ties) in their original order.
    processed_df['timestamp'] = pd.to_datetime(processed_df['timestamp'])
    processed_df = processed_df.sort_values('timestamp', kind='stable')

    processed_df = create_time_features(processed_df)

    sensor_cols = ['pm2_5', 'pm10', 'o3', 'co', 'no2']
    weather_cols = ['temperature', 'humidity', 'wind_speed', 'wind_direction']

    processed_df = create_lag_features(processed_df, sensor_cols + weather_cols)
    processed_df = create_rolling_features(processed_df, sensor_cols + weather_cols)
    # NOTE(review): create_interaction_features is not defined in the visible
    # cells — confirm it is defined before this function is called.
    processed_df = create_interaction_features(processed_df)

    processed_df = processed_df.dropna()

    return processed_df

In [None]:
def build_prediction_model(df, forecast_horizon=6, target_col='aqi'):
    """Train one LightGBM regression pipeline per forecast horizon.

    For each horizon ``h`` in ``1..forecast_horizon`` the target is the value
    of ``target_col`` ``h`` rows later (``shift(-h)``). Rows are therefore
    expected to be in chronological order; the horizons are labelled as hours
    in the log output, which assumes one row per hour — TODO confirm against
    the data source.

    Features are every column except ``target_col``, ``timestamp`` and the
    shifted target columns. int64/float64 features are standardized; all
    other features (for example booleans) are one-hot encoded.

    The evaluation split is chronological: the first 80% of rows train the
    model and the last 20% test it. A shuffled split would put rows adjacent
    in time on both sides of the split, and with lag/rolling features that
    leaks test information into training and inflates the reported metrics.

    Args:
        df: DataFrame with historical data, containing ``target_col`` and a
            ``timestamp`` column. It is not modified.
        forecast_horizon: Number of steps ahead to build models for
            (default: 6).
        target_col: Name of the target column (default: 'aqi').

    Returns:
        Tuple ``(models, metrics)``. ``models`` maps each horizon to its
        fitted ``Pipeline``; ``metrics`` maps each horizon to a dict with
        keys ``'rmse'``, ``'mae'`` and ``'r2'`` computed on the held-out rows.
    """
    # Work on a copy so the shifted target columns are not added to the
    # caller's frame as a side effect.
    df = df.copy()

    # Build the shifted target for every forecast horizon.
    for h in range(1, forecast_horizon + 1):
        df[f'{target_col}_plus_{h}h'] = df[target_col].shift(-h)

    models = {}
    metrics = {}

    for h in range(1, forecast_horizon + 1):
        target = f'{target_col}_plus_{h}h'
        print(f"\nBuilding model for {h}-hour ahead prediction")

        # The last h rows have no future value for this horizon.
        horizon_df = df.dropna(subset=[target])

        # Exclude all future targets, the current target and the raw timestamp.
        future_cols = [col for col in horizon_df.columns if col.startswith(f'{target_col}_plus_')]
        X = horizon_df.drop(columns=future_cols + [target_col, 'timestamp'])
        y = horizon_df[target]

        # Chronological hold-out: no shuffling, the test set is the tail.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

        numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
        categorical_cols = [col for col in X.columns if col not in numeric_cols]

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numeric_cols),
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
            ])

        model = lgb.LGBMRegressor(
            objective='regression',
            n_estimators=100,
            learning_rate=0.1,
            max_depth=7,
            num_leaves=31,
            # min_child_samples is the scikit-learn API name for LightGBM's
            # min_data_in_leaf; passing the alias alongside the wrapper's own
            # default makes LightGBM warn about conflicting parameters.
            min_child_samples=20,
            reg_alpha=0.1,
            reg_lambda=0.1,
            random_state=42
        )

        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', model)
        ])

        pipeline.fit(X_train, y_train)

        y_pred = pipeline.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        metrics[h] = {'rmse': rmse, 'mae': mae, 'r2': r2}
        models[h] = pipeline

        print(f"Horizon: +{h} jam | RMSE: {rmse:.2f} | MAE: {mae:.2f} | R²: {r2:.2f}")

    return models, metrics

In [None]:
def train_and_save_models(forecast_horizon=6):
    """Train the per-horizon models and upload models and metrics to GCS.

    Reads ``processed_data/sensors_weather_combined.csv`` from
    ``BUCKET_NAME``, preprocesses it, trains one model per forecast horizon,
    and uploads to the same bucket:

      * ``model_evaluation/forecast_metrics.png`` — RMSE, MAE and R² plotted
        against the horizon;
      * ``models/aqi_prediction_h{h}_model.pkl`` — one fitted pipeline per
        horizon;
      * ``model_evaluation/model_metrics.csv`` — the same metrics as a table.

    Args:
        forecast_horizon: Number of horizons to train (default: 6).

    Returns:
        Tuple ``(models, metrics)`` as returned by ``build_prediction_model``.
    """
    # Read the combined data set from GCS.
    sensors_weather_data = read_data_from_gcs(BUCKET_NAME, 'processed_data/sensors_weather_combined.csv')

    print("Data shape:", sensors_weather_data.shape)
    print("Columns:", sensors_weather_data.columns.tolist())

    print("Preprocessing data...")
    processed_df = preprocess_data(sensors_weather_data)

    print(f"Building models for {forecast_horizon}-hour forecasting horizon...")
    models, metrics = build_prediction_model(processed_df, forecast_horizon=forecast_horizon)

    # Collect each metric as a list aligned with the horizons.
    horizons = list(metrics.keys())
    metric_values = {
        key: [metrics[h][key] for h in horizons]
        for key in ('rmse', 'mae', 'r2')
    }

    # One panel per metric. The figure is left open so the notebook still
    # renders it inline after the cell finishes.
    panels = [('rmse', 'RMSE'), ('mae', 'MAE'), ('r2', 'R²')]
    fig, axes = plt.subplots(3, 1, figsize=(12, 8))
    for ax, (key, label) in zip(axes, panels):
        ax.plot(horizons, metric_values[key], 'o-', linewidth=2)
        ax.set_title(f'{label} by Forecast Horizon')
        ax.set_xlabel('Hours Ahead')
        ax.set_ylabel(label)
        ax.grid(True)
    fig.tight_layout()

    metrics_df = pd.DataFrame({
        'horizon': horizons,
        'rmse': metric_values['rmse'],
        'mae': metric_values['mae'],
        'r2': metric_values['r2']
    })

    client = storage.Client(project=PROJECT_ID)
    bucket = client.get_bucket(BUCKET_NAME)

    # Stage local artifacts in a private temporary directory instead of fixed
    # '/tmp/...' paths: portable, collision-free, and cleaned up on exit.
    with tempfile.TemporaryDirectory() as tmp_dir:
        plot_path = os.path.join(tmp_dir, 'forecast_metrics.png')
        fig.savefig(plot_path)
        bucket.blob('model_evaluation/forecast_metrics.png').upload_from_filename(plot_path)

        for h, model in models.items():
            save_model_to_gcs(model, BUCKET_NAME, f'models/aqi_prediction_h{h}_model.pkl')

        metrics_path = os.path.join(tmp_dir, 'model_metrics.csv')
        metrics_df.to_csv(metrics_path, index=False)
        bucket.blob('model_evaluation/model_metrics.csv').upload_from_filename(metrics_path)

    print("Training and evaluation complete. Models saved to GCS bucket.")
    return models, metrics