In [None]:
# Setup and imports
!pip install -q google-cloud-aiplatform
!pip install -q google-cloud-storage
!pip install -q scikit-learn
!pip install -q pandas numpy matplotlib seaborn joblib

import io
import os
import tempfile

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from google.cloud import storage
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Setup project
# Placeholder Google Cloud settings: replace each value before running the notebook.
PROJECT_ID = "your-project-id"  # not referenced by the other cells shown here
BUCKET = "your-bucket"  # GCS bucket passed to load_data_from_gcs and save_models
REGION = "your-region"  # not referenced by the other cells shown here

In [None]:
# Load processed data
def load_data_from_gcs(bucket_name, blob_name):
    """Download a CSV object from Google Cloud Storage into a DataFrame.

    Args:
        bucket_name: Name of the GCS bucket that holds the object.
        blob_name: Path of the CSV object inside the bucket.

    Returns:
        pandas.DataFrame parsed from the object's UTF-8 encoded contents.
    """
    storage_client = storage.Client()
    blob = storage_client.bucket(bucket_name).blob(blob_name)

    # pandas has no top-level StringIO, so the payload is wrapped with the
    # standard-library io module instead. download_as_bytes() replaces the
    # deprecated download_as_string() and returns the same raw bytes.
    raw_bytes = blob.download_as_bytes()
    return pd.read_csv(io.BytesIO(raw_bytes), encoding='utf-8')

# Read the processed training CSV from BUCKET; later cells split and train on it.
processed_data = load_data_from_gcs(BUCKET, 'processed/training_data.csv')

In [None]:
# Model Class
class AirQualityCalibrator:
    """Train and score one tuned random-forest regressor per pollutant.

    Attributes:
        parameters: Pollutant names this calibrator is set up for.
        models: Fitted estimators keyed by pollutant name; filled by ``train``.
    """

    def __init__(self):
        self.parameters = ['pm25', 'pm10', 'o3', 'co', 'no2']
        self.models = {}

    def prepare_features(self, data):
        """Select the sensor, temperature and humidity columns from ``data``."""
        wanted_columns = [
            'pm25_sensor', 'pm10_sensor', 'o3_sensor',
            'co_sensor', 'no2_sensor', 'temperature', 'humidity',
        ]
        return data[wanted_columns]

    def train(self, X_train, y_train, param):
        """Grid-search a RandomForestRegressor and keep the best one for ``param``.

        Args:
            X_train: Training features handed to ``GridSearchCV.fit``.
            y_train: Training target for this pollutant.
            param: Key under which the best estimator is stored in ``models``.
        """
        print(f"Training model for {param}...")

        # Hyper-parameter combinations explored by the search.
        search_space = {
            'n_estimators': [50, 100, 200],
            'max_depth': [10, 20, 30],
            'min_samples_split': [2, 5, 10],
        }

        # cv=5 search scored on negative RMSE, using every available core.
        search = GridSearchCV(
            RandomForestRegressor(random_state=42),
            search_space,
            scoring='neg_root_mean_squared_error',
            cv=5,
            n_jobs=-1,
        )
        search.fit(X_train, y_train)

        print(f"Best parameters for {param}: {search.best_params_}")
        self.models[param] = search.best_estimator_

    def evaluate(self, X_test, y_test, param):
        """Score the stored model for ``param`` on held-out data.

        Returns:
            dict with keys ``'rmse'``, ``'r2'`` and ``'predictions'``.

        Raises:
            KeyError: If no model has been stored for ``param``.
        """
        fitted = self.models[param]
        y_hat = fitted.predict(X_test)
        return {
            'rmse': np.sqrt(mean_squared_error(y_test, y_hat)),
            'r2': r2_score(y_test, y_hat),
            'predictions': y_hat,
        }

In [None]:
# Data Preparation
def prepare_training_data(df, test_size=0.2, random_state=42):
    """Split ``df`` into train/test feature and target frames.

    Args:
        df: DataFrame holding the ``*_sensor``, ``temperature``, ``humidity``
            feature columns and the ``*_reference`` target columns.
        test_size: Forwarded to ``train_test_split``; defaults to the
            previously hard-coded 0.2.
        random_state: Seed forwarded to ``train_test_split``; defaults to the
            previously hard-coded 42.

    Returns:
        The output of ``train_test_split``: X_train, X_test, y_train, y_test.

    Raises:
        KeyError: If any required feature or target column is absent.
    """
    feature_columns = [
        'pm25_sensor', 'pm10_sensor', 'o3_sensor',
        'co_sensor', 'no2_sensor', 'temperature', 'humidity',
    ]
    target_columns = [
        'pm25_reference', 'pm10_reference', 'o3_reference',
        'co_reference', 'no2_reference',
    ]

    # Report every absent column at once rather than failing on the first
    # selection; KeyError matches what the plain pandas lookup would raise.
    missing = [c for c in feature_columns + target_columns if c not in df.columns]
    if missing:
        raise KeyError(f"Training data is missing required columns: {missing}")

    return train_test_split(
        df[feature_columns],
        df[target_columns],
        test_size=test_size,
        random_state=random_state,
    )

# Split once up front; the per-pollutant models trained below all use these same rows.
X_train, X_test, y_train, y_test = prepare_training_data(processed_data)

In [None]:
# Train Models
calibrator = AirQualityCalibrator()
results = {}

# For each pollutant: fit on its reference column, then score on the held-out rows.
for param in calibrator.parameters:
    target_column = f'{param}_reference'
    calibrator.train(X_train, y_train[target_column], param)
    results[param] = calibrator.evaluate(X_test, y_test[target_column], param)

In [None]:
# Visualize Results
def plot_results(results, parameters, targets=None, n_cols=3):
    """Draw a true-vs-predicted scatter panel for each parameter.

    Args:
        results: Mapping of parameter name to a dict with ``'predictions'``,
            ``'rmse'`` and ``'r2'`` entries.
        parameters: Iterable of parameter names to plot, one panel each.
        targets: Frame-like object providing a ``'<param>_reference'`` column
            of true values per parameter. When omitted, the notebook-level
            ``y_test`` is used, which keeps two-argument calls working.
        n_cols: Number of panels per row.

    Raises:
        ValueError: If ``n_cols`` is smaller than 1.
    """
    if n_cols < 1:
        raise ValueError("n_cols must be at least 1")

    parameters = list(parameters)
    if targets is None:
        # Backward-compatible fallback to the global defined by the split cell.
        targets = y_test

    # Size the grid from the input so more than six parameters no longer
    # overflow a fixed 2x3 layout. Five parameters still give 2x3 at 15x10.
    n_rows = max(1, (len(parameters) + n_cols - 1) // n_cols)
    _, axes = plt.subplots(
        n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False
    )
    axes = axes.ravel()

    for ax, param in zip(axes, parameters):
        y_true = targets[f'{param}_reference']
        y_pred = results[param]['predictions']

        sns.scatterplot(x=y_true, y=y_pred, ax=ax)

        # Diagonal reference line: points on it are perfect predictions.
        low = min(y_true.min(), y_pred.min())
        high = max(y_true.max(), y_pred.max())
        ax.plot([low, high], [low, high], 'r--')

        rmse = results[param]["rmse"]
        r2 = results[param]["r2"]
        ax.set_title(f'{param} Calibration\nRMSE: {rmse:.2f}\nR2: {r2:.2f}')
        ax.set_xlabel('True Values')
        ax.set_ylabel('Predicted Values')

    # Hide grid cells that have no parameter so no empty frame is drawn.
    for ax in axes[len(parameters):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Draw one true-vs-predicted panel for each pollutant the calibrator handles.
plot_results(results, calibrator.parameters)

In [None]:
# Save Models
def save_models(calibrator, bucket_name):
    """Serialize every trained model and upload it to Google Cloud Storage.

    Each model in ``calibrator.models`` is dumped with joblib and uploaded as
    ``models/<param>_model.joblib`` in the given bucket.

    Args:
        calibrator: Object exposing a ``models`` dict of name -> estimator.
        bucket_name: Name of the destination GCS bucket.
    """
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    # Stage files in a private temporary directory rather than a fixed /tmp
    # path: this works on platforms without /tmp, avoids clashing with other
    # processes, and removes the local copies once the uploads are done.
    with tempfile.TemporaryDirectory() as staging_dir:
        for param, model in calibrator.models.items():
            filename = f'{param}_model.joblib'
            local_path = os.path.join(staging_dir, filename)
            joblib.dump(model, local_path)

            blob = bucket.blob(f'models/{filename}')
            blob.upload_from_filename(local_path)

            print(f"Saved model for {param}")

# Upload every trained model under the models/ prefix of BUCKET.
save_models(calibrator, BUCKET)