In [None]:
# Setup and imports
!pip install -q google-cloud-aiplatform
!pip install -q google-cloud-storage
!pip install -q google-cloud-bigquery
!pip install -q pandas numpy scipy scikit-learn
!pip install -q matplotlib seaborn

import io
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from google.cloud import bigquery
from google.cloud import storage

In [None]:
# Project configuration: replace the placeholder values before running.
PROJECT_ID = "your-project-id"
BUCKET = "your-bucket"
REGION = "your-region"

# Publish the project id to this process's environment.
os.environ.update({"GOOGLE_CLOUD_PROJECT": PROJECT_ID})

In [None]:
# Load data from Cloud Storage
def load_data_from_gcs(bucket_name, blob_name):
    """Download a CSV object from Cloud Storage and parse it into a DataFrame.

    Args:
        bucket_name: Name of the bucket that holds the object.
        blob_name: Name (path) of the CSV object inside the bucket.

    Returns:
        pandas.DataFrame parsed from the object's UTF-8 decoded contents.

    Raises:
        UnicodeDecodeError: If the object's bytes are not valid UTF-8.
    """
    storage_client = storage.Client()
    blob = storage_client.bucket(bucket_name).blob(blob_name)

    # download_as_bytes() is the current name of the deprecated
    # download_as_string() alias; both return the payload as bytes.
    raw_bytes = blob.download_as_bytes()

    # pandas exposes no StringIO attribute; the in-memory text buffer
    # comes from the standard-library io module.
    return pd.read_csv(io.StringIO(raw_bytes.decode('utf-8')))

In [None]:
# Load the raw sensor and reference CSVs from the bucket
sensor_data = load_data_from_gcs(bucket_name=BUCKET, blob_name='raw/sensor_data.csv')
reference_data = load_data_from_gcs(bucket_name=BUCKET, blob_name='raw/reference_data.csv')

In [None]:
# Data Exploration
def explore_data(df, title):
    """Print a plain-text overview of a DataFrame to stdout.

    Output order: a header line containing ``title``, a rule of 50 dashes,
    then the frame's shape, column names, dtypes, per-column null counts and
    ``describe()`` summary.

    Args:
        df: DataFrame to inspect.
        title: Label inserted into the header line.
    """
    # Each value is wrapped in a callable so it is computed only when its
    # section is about to be printed.
    sections = (
        ("\nShape:", lambda: df.shape),
        ("\nColumns:", lambda: df.columns.tolist()),
        ("\nData Types:\n", lambda: df.dtypes),
        ("\nMissing Values:\n", lambda: df.isnull().sum()),
        ("\nSummary Statistics:\n", lambda: df.describe()),
    )

    print(f"\n{title} Data Exploration")
    print("-" * 50)
    for heading, compute in sections:
        print(heading, compute())

explore_data(df=sensor_data, title="Sensor")
explore_data(df=reference_data, title="Reference")

In [None]:
# Data Preprocessing
def preprocess_data(sensor_df, reference_df,
                    value_columns=('pm25', 'pm10', 'o3', 'co', 'no2')):
    """Clean the sensor readings and join them with the reference readings.

    Steps:
      1. Parse the ``timestamp`` column of both frames with ``pd.to_datetime``.
      2. Keep only sensor rows where every column in ``value_columns`` is
         ``>= 0`` (rows with NaN in any of those columns are dropped too,
         because the comparison is False for NaN).
      3. Inner-join the two frames on ``timestamp``; columns present in both
         frames get the suffixes ``_sensor`` / ``_reference``.

    The input frames are left unmodified; all work happens on copies.

    Args:
        sensor_df: Sensor readings with a ``timestamp`` column and every
            column named in ``value_columns``.
        reference_df: Reference readings with a ``timestamp`` column.
        value_columns: Sensor columns that must be non-negative. Defaults to
            the five pollutant columns used by this notebook.

    Returns:
        pandas.DataFrame holding the merged rows.

    Raises:
        KeyError: If ``timestamp`` or any of ``value_columns`` is missing.
    """
    # .assign returns a new frame, so the caller's DataFrames keep their
    # original timestamp column instead of being overwritten in place.
    sensor_df = sensor_df.assign(timestamp=pd.to_datetime(sensor_df['timestamp']))
    reference_df = reference_df.assign(
        timestamp=pd.to_datetime(reference_df['timestamp']))

    # One combined mask replaces the chain of per-column filters.
    is_valid = (sensor_df[list(value_columns)] >= 0).all(axis=1)
    sensor_df = sensor_df[is_valid]

    # Match timestamps between sensor and reference data
    return pd.merge(sensor_df, reference_df,
                    on='timestamp',
                    suffixes=('_sensor', '_reference'))

processed_data = preprocess_data(sensor_df=sensor_data, reference_df=reference_data)

In [None]:
# Visualization
def plot_correlations(df, parameters, n_cols=3):
    """Scatter-plot sensor vs. reference values, one panel per parameter.

    For each name ``p`` in ``parameters`` the columns ``p_sensor`` and
    ``p_reference`` of ``df`` are plotted against each other and the
    correlation returned by ``Series.corr`` is written into the panel.

    The subplot grid is sized from the number of parameters (``n_cols`` panels
    per row, 5x5 inches per panel), and panels that are not needed are hidden.
    With the default ``n_cols`` and up to six parameters this yields the same
    2x3, 15x10 inch figure layout as a fixed grid would.

    Args:
        df: DataFrame containing ``<param>_sensor`` and ``<param>_reference``
            columns for every entry of ``parameters``.
        parameters: Iterable of parameter base names.
        n_cols: Number of panels per row (must be >= 1).

    Raises:
        ValueError: If ``n_cols`` is smaller than 1.
        KeyError: If an expected column is missing from ``df``.
    """
    if n_cols < 1:
        raise ValueError("n_cols must be at least 1")

    parameters = list(parameters)
    if not parameters:
        # Nothing to draw; avoid asking matplotlib for a zero-row grid.
        return

    n_rows = -(-len(parameters) // n_cols)  # ceiling division
    # squeeze=False guarantees a 2-D array even for a 1x1 grid, so ravel() works.
    fig, axes = plt.subplots(n_rows, n_cols,
                             figsize=(5 * n_cols, 5 * n_rows),
                             squeeze=False)
    axes = axes.ravel()

    for ax, param in zip(axes, parameters):
        sensor_col = f"{param}_sensor"
        ref_col = f"{param}_reference"

        sns.scatterplot(data=df, x=sensor_col, y=ref_col, ax=ax)
        ax.set_title(f'{param} Correlation')

        # Add correlation coefficient
        corr = df[sensor_col].corr(df[ref_col])
        ax.text(0.05, 0.95, f'r = {corr:.2f}', transform=ax.transAxes)

    # Hide the leftover panels of the last row instead of showing empty axes.
    for ax in axes[len(parameters):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

parameters = ['pm25', 'pm10', 'o3', 'co', 'no2']
plot_correlations(df=processed_data, parameters=parameters)

In [None]:
# Save processed data
def save_to_gcs(df, bucket_name, blob_name):
    """Serialize a DataFrame to CSV and upload it to Cloud Storage.

    Args:
        df: DataFrame to upload; written without its index column.
        bucket_name: Name of the destination bucket.
        blob_name: Name (path) of the object to create or overwrite.
    """
    storage_client = storage.Client()
    blob = storage_client.bucket(bucket_name).blob(blob_name)

    # upload_from_string labels the object 'text/plain' unless told otherwise;
    # pass the CSV media type so the stored object is described correctly.
    blob.upload_from_string(df.to_csv(index=False), content_type='text/csv')

save_to_gcs(df=processed_data, bucket_name=BUCKET, blob_name='processed/training_data.csv')