In [None]:
# Setup dan Import
!pip install -q google-cloud-aiplatform
!pip install -q google-cloud-storage
!pip install -q pandas numpy scipy scikit-learn
!pip install -q matplotlib seaborn

import io
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from google.cloud import storage

In [None]:
# Setup project
# The three values below are placeholders; replace them before running the notebook.
PROJECT_ID = "your-project-id"
BUCKET = "your-bucket"  # bucket name passed to the load/save helpers in later cells
REGION = "your-region"  # NOTE(review): not referenced anywhere else in the visible cells

# Export the project id as an environment variable
# (presumably so the Google Cloud client picks it up by default - confirm).
os.environ["GOOGLE_CLOUD_PROJECT"] = PROJECT_ID

In [None]:
# Load data from Cloud Storage
def load_data_from_gcs(bucket_name, blob_name):
    """Download a CSV object from Cloud Storage and parse it into a DataFrame.

    Args:
        bucket_name: Name of the bucket that holds the object.
        blob_name: Path of the CSV object inside the bucket.

    Returns:
        pandas.DataFrame parsed from the UTF-8 encoded CSV content.
    """
    storage_client = storage.Client()
    blob = storage_client.bucket(bucket_name).blob(blob_name)

    # download_as_bytes() replaces the deprecated download_as_string().
    raw_bytes = blob.download_as_bytes()

    # pandas has no `pd.StringIO` attribute (the old code raised AttributeError);
    # wrap the payload in a standard-library buffer instead.
    return pd.read_csv(io.BytesIO(raw_bytes), encoding='utf-8')

In [None]:
# Load the sensor, reference and weather-API CSV files from the bucket
sensor_data, reference_data, weather_data = [
    load_data_from_gcs(BUCKET, blob_name)
    for blob_name in (
        'raw/sensor_data.csv',
        'raw/reference_data.csv',
        'raw/weather_api_data.csv',
    )
]

In [None]:
# Data Exploration
def explore_data(df, title):
    """Print a quick textual overview of a DataFrame.

    The report contains the shape, column names, dtypes, per-column count of
    missing values and the output of ``describe()``.

    Args:
        df: DataFrame to summarise.
        title: Label used in the report heading.
    """
    print(f"\n{title} Data Exploration")
    print("-" * 50)

    report_sections = (
        ("\nShape:", df.shape),
        ("\nColumns:", df.columns.tolist()),
        ("\nData Types:\n", df.dtypes),
        ("\nMissing Values:\n", df.isnull().sum()),
        ("\nSummary Statistics:\n", df.describe()),
    )
    for heading, content in report_sections:
        print(heading, content)

# Print the same overview for each of the three raw datasets
for dataset, label in (
    (sensor_data, "Sensor"),
    (reference_data, "Reference"),
    (weather_data, "Weather API"),
):
    explore_data(dataset, label)

In [None]:
# Preprocessing Weather Data
def preprocess_weather_data(weather_df):
    """Parse timestamps and impute missing values in the weather data.

    Args:
        weather_df: DataFrame with a 'timestamp' column plus the columns
            'wind_direction', 'wind_speed', 'temperature' and 'precipitation'.

    Returns:
        A new DataFrame with 'timestamp' converted to datetime, missing
        wind/temperature values replaced by the column median and missing
        precipitation replaced by 0. The caller's frame is not modified.
    """
    # Work on a copy so re-running the cell never sees a half-processed input.
    weather_df = weather_df.copy()

    # Convert timestamps
    weather_df['timestamp'] = pd.to_datetime(weather_df['timestamp'])

    # Rename columns for consistency (if needed), e.g. when the weather API
    # uses different column names. The mapping is currently the identity.
    column_mapping = {
        'wind_direction': 'wind_direction',
        'wind_speed': 'wind_speed',
        'temperature': 'temperature',
        'precipitation': 'precipitation'
    }
    weather_df = weather_df.rename(columns=column_mapping)

    # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form acts on a temporary object and silently does
    # nothing once pandas Copy-on-Write is active.
    # NOTE(review): a plain median is questionable for a circular quantity such
    # as wind direction; kept to preserve the existing imputation rule.
    for column in ('wind_direction', 'wind_speed', 'temperature'):
        weather_df[column] = weather_df[column].fillna(weather_df[column].median())

    # Missing precipitation readings are assumed to mean "no rain".
    weather_df['precipitation'] = weather_df['precipitation'].fillna(0)

    return weather_df

In [None]:
# Data Preprocessing
def preprocess_data(sensor_df, reference_df, weather_df):
    """Clean the sensor readings and join them with reference and weather data.

    Args:
        sensor_df: Sensor readings with 'timestamp' and the pollutant columns
            'pm25', 'pm10', 'o3', 'co', 'no2'.
        reference_df: Reference readings with a 'timestamp' column; columns
            shared with ``sensor_df`` get the '_sensor' / '_reference' suffixes.
        weather_df: Weather data with 'timestamp', 'wind_direction',
            'wind_speed', 'temperature' and 'precipitation'.

    Returns:
        A new DataFrame sorted by 'timestamp' with a fresh 0..n-1 index.
        Rows exist only for timestamps present in both sensor and reference
        data; weather gaps are forward-filled in chronological order.
        None of the input frames is modified.
    """
    pollutant_cols = ['pm25', 'pm10', 'o3', 'co', 'no2']
    weather_cols = ['wind_direction', 'wind_speed', 'temperature', 'precipitation']

    # Convert timestamps on new frames (assign) so the caller's data stays untouched.
    sensor_df = sensor_df.assign(timestamp=pd.to_datetime(sensor_df['timestamp']))
    reference_df = reference_df.assign(timestamp=pd.to_datetime(reference_df['timestamp']))
    weather_df = weather_df.assign(timestamp=pd.to_datetime(weather_df['timestamp']))

    # Remove invalid values: keep rows where every pollutant reading is >= 0
    # (rows with a missing reading fail the comparison and are dropped too).
    is_valid = (sensor_df[pollutant_cols] >= 0).all(axis=1)
    sensor_df = sensor_df[is_valid]

    # Match timestamps between sensor and reference data first
    merged_df = pd.merge(sensor_df, reference_df,
                         on='timestamp',
                         suffixes=('_sensor', '_reference'))

    # Now merge with weather data
    final_df = pd.merge(merged_df, weather_df, on='timestamp', how='left')

    # Forward fill only makes sense in time order, so sort before filling.
    # (The weather data may be sampled at a different interval than the sensors.)
    final_df = final_df.sort_values('timestamp', kind='stable').reset_index(drop=True)

    # `.ffill()` with assignment replaces the deprecated
    # `fillna(method='ffill', inplace=True)` applied to a column selection.
    final_df[weather_cols] = final_df[weather_cols].ffill()

    return final_df

In [None]:
# Preprocess weather data
# (timestamp parsing and missing-value imputation, see preprocess_weather_data)
processed_weather_data = preprocess_weather_data(weather_data)

# Merge all datasets
# (sensor + reference joined on 'timestamp', then left-joined with the cleaned weather data)
processed_data = preprocess_data(sensor_data, reference_data, processed_weather_data)

In [None]:
# Visualization - correlation between pollution and weather
def plot_weather_correlations(df, pollutants, weather_params):
    """Draw a grid of scatter plots: one row per pollutant, one column per weather parameter.

    Each panel plots ``<pollutant>_sensor`` against the weather parameter and
    is annotated with the correlation coefficient returned by ``Series.corr``.

    Args:
        df: DataFrame containing the ``<pollutant>_sensor`` and weather columns.
        pollutants: Pollutant base names (without the '_sensor' suffix).
        weather_params: Names of the weather columns.
    """
    rows = len(pollutants)
    cols = len(weather_params)

    # squeeze=False always returns a 2-D (rows, cols) array of Axes. The
    # previous manual reshaping produced a (1, 1, 1) array for a 1x1 grid and
    # left a 1-D array for a single column, so axes[i, j] failed in both cases.
    fig, axes = plt.subplots(rows, cols, figsize=(cols*4, rows*3), squeeze=False)

    for i, pollutant in enumerate(pollutants):
        sensor_col = f"{pollutant}_sensor"
        for j, weather_param in enumerate(weather_params):
            ax = axes[i, j]
            sns.scatterplot(data=df, x=weather_param, y=sensor_col, ax=ax)
            ax.set_title(f'{pollutant} vs {weather_param}')

            # Add correlation coefficient in the upper-left corner of the panel
            corr = df[sensor_col].corr(df[weather_param])
            ax.text(0.05, 0.95, f'r = {corr:.2f}', transform=ax.transAxes)

    plt.tight_layout()
    plt.show()

In [None]:
# Extended visualization: wind-rose style plot of a pollutant by wind direction
def plot_wind_rose(df, pollutant):
    """Plot the mean sensor value of a pollutant for each of eight compass sectors.

    Args:
        df: DataFrame with a 'wind_direction' column and ``<pollutant>_sensor``.
            # assumes wind_direction is in degrees clockwise from north - TODO confirm
        pollutant: Pollutant base name (without the '_sensor' suffix).
    """
    labels = ['N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW']
    n_sectors = len(labels)
    sector_width = 360.0 / n_sectors

    # Assign every reading to a fixed 45-degree sector centred on its compass
    # label (N covers 337.5-22.5). The previous pd.cut(bins=8) split the
    # *observed* range into eight parts, so the labels did not correspond to
    # real directions unless the data happened to span exactly 0-360.
    direction = df['wind_direction'] % 360
    sector = np.floor((direction + sector_width / 2) / sector_width) % n_sectors

    # Reindex to all eight sectors: a sector without readings would otherwise
    # be missing from the result and break the 8-bar plot below.
    mean_pollutant = (
        df[f'{pollutant}_sensor']
        .groupby(sector)
        .mean()
        .reindex(np.arange(n_sectors, dtype=float))
    )
    # Sectors without data are drawn as zero-height bars.
    heights = mean_pollutant.fillna(0.0).to_numpy()

    plt.figure(figsize=(10, 10))

    # Create polar plot oriented like a compass: north on top, angles clockwise
    ax = plt.subplot(111, projection='polar')
    ax.set_theta_zero_location('N')
    ax.set_theta_direction(-1)

    # Sector centres in radians
    theta = np.linspace(0, 2*np.pi, n_sectors, endpoint=False)

    # Plot as bars in polar coordinates
    bars = ax.bar(theta, heights, width=2*np.pi/n_sectors, bottom=0.0)

    # Set the direction labels
    ax.set_xticks(theta)
    ax.set_xticklabels(labels)

    # Set title
    ax.set_title(f'Mean {pollutant} by Wind Direction', y=1.08)

    # Colorize based on value (scale taken from the sectors that have data)
    norm = plt.Normalize(mean_pollutant.min(), mean_pollutant.max())
    colors = plt.cm.viridis(norm(heights))

    for bar, color in zip(bars, colors):
        bar.set_facecolor(color)

    plt.tight_layout()
    plt.show()

In [None]:
# Plot correlations
pollutant_params = ['pm25', 'pm10', 'o3', 'co', 'no2']
weather_params = ['wind_direction', 'wind_speed', 'temperature', 'precipitation']

plot_weather_correlations(processed_data, pollutant_params, weather_params)

# Plot wind rose for PM2.5
plot_wind_rose(processed_data, 'pm25')

# Correlation heatmap of all variables
plt.figure(figsize=(12, 10))

# Select only the relevant columns. Match pollutant columns by name or by
# "<pollutant>_" prefix: a plain substring test lets 'co' match unrelated
# columns (e.g. 'hour_cos' once the time features exist).
pollutant_prefixes = tuple(f"{param}_" for param in pollutant_params)
corr_columns = [
    col for col in processed_data.columns
    if col in pollutant_params
    or col.startswith(pollutant_prefixes)
    or col in weather_params
]

# Restrict to numeric columns so a stray text column cannot make .corr() fail.
corr_matrix = processed_data[corr_columns].select_dtypes(include='number').corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap between Pollutants and Weather Parameters')
plt.tight_layout()
plt.show()

In [None]:
# Time-series feature preparation for the prediction model
def prepare_time_series_features(df):
    """Derive calendar, lag, rolling-window and difference features.

    Lags and windows are counted in rows after sorting by 'timestamp'; the
    "<n>h" column names are only accurate if there is exactly one row per
    hour (assumption - TODO confirm against the data).

    Args:
        df: DataFrame with a datetime 'timestamp' column and the columns
            'pm25_sensor', 'wind_speed' and 'temperature'.

    Returns:
        A new DataFrame sorted by 'timestamp' with the feature columns added
        and missing values in numeric columns back- then forward-filled.
        The input frame is not modified.
    """
    # Make sure the rows are in chronological order (sort_values returns a new frame)
    df = df.sort_values('timestamp')

    # Calendar features
    timestamp_parts = df['timestamp'].dt
    df['hour'] = timestamp_parts.hour
    df['day'] = timestamp_parts.day
    df['day_of_week'] = timestamp_parts.dayofweek  # 0=Monday, 6=Sunday
    df['month'] = timestamp_parts.month
    df['is_weekend'] = (df['day_of_week'] >= 5).astype('int64')

    # Cyclical encoding (sine/cosine) for periodic variables so that
    # e.g. hour 23 and hour 0 end up close to each other
    hour_angle = 2 * np.pi * df['hour'] / 24
    weekday_angle = 2 * np.pi * df['day_of_week'] / 7
    df['hour_sin'] = np.sin(hour_angle)
    df['hour_cos'] = np.cos(hour_angle)
    df['day_of_week_sin'] = np.sin(weekday_angle)
    df['day_of_week_cos'] = np.cos(weekday_angle)

    pm25 = df['pm25_sensor']

    # Lag features (example for PM2.5)
    for lag in (1, 3, 6, 12, 24):
        df[f'pm25_sensor_lag{lag}h'] = pm25.shift(lag)

    # Rolling windows to capture the trend (example for PM2.5)
    for window in (3, 6, 12):
        df[f'pm25_sensor_rolling_mean_{window}h'] = pm25.rolling(window=window).mean()
    for window in (3, 12):
        df[f'pm25_sensor_rolling_std_{window}h'] = pm25.rolling(window=window).std()

    # Lag features for the weather parameters
    df['wind_speed_lag3h'] = df['wind_speed'].shift(3)
    df['temperature_lag3h'] = df['temperature'].shift(3)

    # Trend (slope) detection via differencing
    df['pm25_sensor_diff_1h'] = pm25.diff(1)
    df['pm25_sensor_diff_3h'] = pm25.diff(3)

    # Fill the NaNs produced by shift/rolling/diff. `.bfill().ffill()` replaces
    # the deprecated `fillna(method=...)` form.
    # NOTE(review): back-filling copies later values into the first rows
    # (look-ahead); consider dropping the warm-up rows before model training.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].bfill().ffill()

    return df

In [None]:
# Prepare data for time-series cross-validation
def prepare_time_series_splits(df, target_col, n_splits=5):
    """
    Build the folds for a time-series cross-validation that respects
    the chronological order of the rows.

    Args:
        df: DataFrame with a 'timestamp' column, already in the desired order.
        target_col: Accepted for interface compatibility; not used by the body.
        n_splits: Number of folds passed to ``TimeSeriesSplit``.

    Returns:
        DataFrame with one row per fold: first/last timestamp of the train and
        test ranges plus the positional index arrays 'train_idx' / 'test_idx'.
    """
    from sklearn.model_selection import TimeSeriesSplit

    timestamps = df['timestamp']
    splitter = TimeSeriesSplit(n_splits=n_splits)

    def describe_fold(train_idx, test_idx):
        # Boundary timestamps are looked up by position, not by index label.
        return {
            'train_start': timestamps.iloc[train_idx[0]],
            'train_end': timestamps.iloc[train_idx[-1]],
            'test_start': timestamps.iloc[test_idx[0]],
            'test_end': timestamps.iloc[test_idx[-1]],
            'train_idx': train_idx,
            'test_idx': test_idx
        }

    # One record per fold, collected for use during model development
    fold_records = [
        describe_fold(train_idx, test_idx)
        for train_idx, test_idx in splitter.split(df)
    ]
    return pd.DataFrame(fold_records)

In [None]:
# Apply the time-series feature engineering
# (rebinds processed_data to the sorted, feature-enriched frame)
processed_data = prepare_time_series_features(processed_data)

# Prepare the split information for time-series cross-validation
# (assuming the prediction target is pm25_sensor)
splits_info = prepare_time_series_splits(processed_data, 'pm25_sensor')

In [None]:
# Visualize the distribution of the time-series features
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

processed_data.groupby('hour')['pm25_sensor'].mean().plot(kind='bar', ax=axes[0, 0])
axes[0, 0].set_title('Mean PM2.5 by Hour of Day')

# Reindex to all seven weekdays so each bar sits under its own label even
# when some weekday is absent from the data (that slot is simply left empty).
mean_by_weekday = (
    processed_data.groupby('day_of_week')['pm25_sensor'].mean()
    .reindex(pd.RangeIndex(7, name='day_of_week'))
)
mean_by_weekday.plot(kind='bar', ax=axes[0, 1])
axes[0, 1].set_title('Mean PM2.5 by Day of Week')
axes[0, 1].set_xticks(range(7))
axes[0, 1].set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])

processed_data.groupby('month')['pm25_sensor'].mean().plot(kind='bar', ax=axes[0, 2])
axes[0, 2].set_title('Mean PM2.5 by Month')

# order=[0, 1] pins weekday to position 0 and weekend to position 1, so the
# tick labels stay correct even if only one of the two classes is present.
sns.boxplot(data=processed_data, x='is_weekend', y='pm25_sensor',
            order=[0, 1], ax=axes[1, 0])
axes[1, 0].set_title('PM2.5 Distribution: Weekday vs Weekend')
axes[1, 0].set_xticks([0, 1])
axes[1, 0].set_xticklabels(['Weekday', 'Weekend'])

axes[1, 1].scatter(processed_data['wind_speed'], processed_data['pm25_sensor'], alpha=0.5)
axes[1, 1].set_title('PM2.5 vs Wind Speed')
axes[1, 1].set_xlabel('Wind Speed')
axes[1, 1].set_ylabel('PM2.5')

processed_data['pm25_sensor'].plot(kind='hist', bins=30, ax=axes[1, 2])
axes[1, 2].set_title('PM2.5 Distribution')

fig.tight_layout()

# Time-series plot for PM2.5
fig_ts, ax_ts = plt.subplots(figsize=(12, 6))
ax_ts.plot(processed_data['timestamp'], processed_data['pm25_sensor'])
ax_ts.set_title('PM2.5 Time Series')
ax_ts.set_xlabel('Timestamp')
ax_ts.set_ylabel('PM2.5 Value')
ax_ts.tick_params(axis='x', rotation=45)
fig_ts.tight_layout()
plt.show()

In [None]:
# Simpan data time series yang sudah diproses
save_to_gcs(processed_data, BUCKET, 'processed/time_series_training_data.csv')

# Simpan informasi split untuk digunakan pada model development
save_to_gcs(splits_info, BUCKET, 'processed/time_series_splits.csv')

In [None]:
# Save processed data including weather parameters
def save_to_gcs(df, bucket_name, blob_name):
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    
    # Save to CSV
    blob.upload_from_string(df.to_csv(index=False))

save_to_gcs(processed_data, BUCKET, 'processed/training_data_with_weather.csv')