In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.5.0


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
import joblib
import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Global plot styling: seaborn white grid theme and 300-dpi figures
sns.set_style("whitegrid")
plt.rcParams.update({'figure.dpi': 300})

# LOAD AND PREPARE DATA

# Load data
df = pd.read_csv('/content/drive/MyDrive/final_dataset.csv')
print(f"Loaded {len(df)} samples")


def _has_columns(*cols):
    """Return True when every named column is present in df."""
    return all(c in df.columns for c in cols)


# Calendar features derived from the observation date
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.month
df['season'] = df['date'].dt.quarter  # calendar quarter (1-4) is used as the season code
df['day_of_year'] = df['date'].dt.dayofyear
df['year'] = df['date'].dt.year

# Cyclical encoding of the month so that December and January are neighbours
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

# Latitude features. include_lowest=True keeps lat == -90 inside the first bin
# instead of turning it into a missing category.
df['abs_lat'] = np.abs(df['lat'])
df['lat_zone'] = pd.cut(df['lat'], bins=[-90, -30, 0, 30, 90],
                        labels=['southern', 'tropical_south', 'tropical_north', 'northern'],
                        include_lowest=True)
# Integer codes follow the label order above; unbinned/missing latitudes become -1
df['lat_zone'] = df['lat_zone'].astype('category').cat.codes

# Interaction features. Each one is guarded by exactly the columns it reads, so a
# dataset lacking e.g. VH_backscatter no longer raises KeyError inside a guard
# that only checked VV_backscatter and sst.
if _has_columns('VV_backscatter', 'sst'):
    df['VV_SST_interaction'] = df['VV_backscatter'] * df['sst']
if _has_columns('VH_backscatter', 'sst'):
    df['VH_SST_interaction'] = df['VH_backscatter'] * df['sst']
if _has_columns('roughness_proxy', 'sst'):
    df['roughness_SST'] = df['roughness_proxy'] * df['sst']
if _has_columns('sst'):
    # 'sst_squared' is requested by the feature list below but was never computed,
    # so it used to be dropped silently during feature selection.
    df['sst_squared'] = df['sst'] ** 2
if _has_columns('VH_backscatter', 'chlorophyll_a'):
    df['VH_chl_interaction'] = df['VH_backscatter'] * df['chlorophyll_a']

# Chlorophyll transforms
if _has_columns('chlorophyll_a'):
    df['log_chlorophyll'] = np.log1p(df['chlorophyll_a'])
    df['chl_squared'] = df['chlorophyll_a'] ** 2
    df['chl_cubed'] = df['chlorophyll_a'] ** 3

# Polarisation combinations
if _has_columns('VV_backscatter', 'VH_backscatter'):
    df['backscatter_diff'] = df['VV_backscatter'] - df['VH_backscatter']
    df['backscatter_sum'] = df['VV_backscatter'] + df['VH_backscatter']

# Feature Selection
base_features = ['VV_backscatter','roughness_proxy', 'cross_pol_ratio']
engineered_features = ['month_cos', 'season', 'day_of_year', 'abs_lat', 'lat_zone', 'VV_SST_interaction', 'VH_SST_interaction', 'VH_chl_interaction', 'sst_squared', 'chl_squared', 'backscatter_diff', 'backscatter_sum', 'roughness_SST']
full_features = base_features + engineered_features

# Keep only the requested features that exist in the frame, and report the rest
# instead of dropping them silently.
available_features = [f for f in full_features if f in df.columns]
missing_features = [f for f in full_features if f not in df.columns]
if missing_features:
    print(f"Warning: requested features not found in data: {missing_features}")

if len(df) == 0:
    raise ValueError("Dataset is empty; cannot compute feature coverage.")

# Coverage = percentage of rows with a non-missing value; keep features with >= 50 %
feature_coverage = {f: df[f].notna().sum() / len(df) * 100 for f in available_features}
good_features = [f for f, cov in feature_coverage.items() if cov >= 50]
if not good_features:
    raise ValueError("No feature reaches 50% coverage; nothing to train on.")
print(f"Selected {len(good_features)} features")

# Prepare Datasets: only rows that carry a target measurement are usable
df_valid = df.loc[df['mp_measurement'].notna()].copy()
X = df_valid.loc[:, good_features].copy()
y = df_valid.loc[:, 'mp_measurement'].copy()

# Train-Test Split (BEFORE any preprocessing) - 80/20 with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

# Fit preprocessing ONLY on training data, then apply the fitted transformers
# to both partitions so no test-set statistics leak into training.
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()

imputer.fit(X_train)
X_train_imputed = pd.DataFrame(
    imputer.transform(X_train), index=X_train.index, columns=good_features
)
X_test_imputed = pd.DataFrame(
    imputer.transform(X_test), index=X_test.index, columns=good_features
)

scaler.fit(X_train_imputed)
X_train_scaled = scaler.transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# PART 2: OPTUNA HYPERPARAMETER TUNING


def objective(trial):
    """Optuna objective: mean 5-fold cross-validated R² of a candidate forest.

    Samples one random-forest configuration from the trial, wraps it in a
    pipeline with a median imputer, and scores it with 5-fold cross-validation
    on the module-level training split (X_train, y_train).

    Args:
        trial: the Optuna trial used to suggest hyperparameter values.

    Returns:
        The mean R² over the five folds (the study maximises this value).
    """
    # Hyperparameters explored by the search
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 500, step=50),
        max_depth=trial.suggest_int('max_depth', 5, 30),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        min_impurity_decrease=trial.suggest_float('min_impurity_decrease', 0.0, 0.01),
        bootstrap=trial.suggest_categorical('bootstrap', [True, False]),
    )
    # Settings held constant for every trial (not part of the search space)
    fixed = dict(random_state=42, n_jobs=-1)

    # Imputation lives inside the pipeline so it is re-fitted on each CV training fold
    forest = RandomForestRegressor(**sampled, **fixed)
    candidate = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('model', forest),
    ])

    fold_r2 = cross_val_score(candidate, X_train, y_train, cv=5, scoring='r2', n_jobs=-1)
    return fold_r2.mean()

# Create Optuna study
# The sampler is seeded so that the sequence of trials - and therefore the
# selected hyperparameters - is the same on every Restart & Run All.
study = optuna.create_study(
    direction='maximize',
    study_name='rf_optimization',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100, show_progress_bar=True)

# Print optimization results
print(f"Best R² Score: {study.best_value:.4f}")
print("Best Parameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

# PART 3: TRAIN FINAL MODEL WITH BEST PARAMETERS

# study.best_params holds only the hyperparameters suggested by the trial. The
# fixed settings used inside objective() (random_state=42, n_jobs=-1) are not
# part of it and must be re-applied; otherwise the final forest is unseeded and
# is not the configuration that was actually tuned.
rf_model = RandomForestRegressor(**study.best_params, random_state=42, n_jobs=-1)
rf_model.fit(X_train_imputed, y_train)

# Predictions
y_pred_rf_train = rf_model.predict(X_train_imputed)
y_pred_rf_test = rf_model.predict(X_test_imputed)

# Metrics: R² on both partitions, MAE and RMSE on the held-out test set
rf_r2_train = r2_score(y_train, y_pred_rf_train)
rf_r2_test = r2_score(y_test, y_pred_rf_test)
rf_mae = mean_absolute_error(y_test, y_pred_rf_test)
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf_test))

print(f"R² (train): {rf_r2_train:.4f}")
print(f"R² (test):  {rf_r2_test:.4f}")
print(f"MAE:        {rf_mae:.4f}")
print(f"RMSE:       {rf_rmse:.4f}")

# Cross-validation with best parameters
print("\n--- Cross-Validation ---")
# Re-apply the fixed settings from objective() (random_state, n_jobs): they are
# not included in study.best_params, and without the seed this score would not
# be reproducible nor comparable with the value reported by the study.
final_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('model', RandomForestRegressor(**study.best_params, random_state=42, n_jobs=-1))
])

rf_cv = cross_val_score(final_pipeline, X_train, y_train, cv=5, scoring='r2', n_jobs=-1)
# The reported spread is two standard deviations of the fold scores
print(f"CV R² (on training set): {rf_cv.mean():.4f} (+/- {rf_cv.std() * 2:.4f})")

# Feature Importance: one row per feature, sorted from most to least important
rf_importance = pd.DataFrame({
    'feature': good_features,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)
print("\nTop 10 Features:")
# Number the rows by their position in the sorted table. The DataFrame index
# still holds each feature's original position in good_features, so using it
# as the rank printed misleading numbers such as "10." for the top feature.
for rank, (_, row) in enumerate(rf_importance.head(10).iterrows(), start=1):
    print(f"  {rank:2d}. {row['feature']:30s}: {row['importance']:.4f}")

# PART 4: VISUALIZATION

# Feature Importance: horizontal bars for the leading features, largest on top
top_n = 10
rf_top = rf_importance.head(top_n)
positions = range(len(rf_top))

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(positions, rf_top['importance'], color='steelblue', alpha=0.8)
ax.set_yticks(positions)
ax.set_yticklabels(rf_top['feature'])
ax.invert_yaxis()  # first (most important) row at the top
ax.set_xlabel('Importance', fontweight='bold')
ax.set_title('Random Forest - Top 10 Feature Importance', fontweight='bold')
ax.grid(True, alpha=0.3, axis='x')
fig.tight_layout()
fig.savefig('rf_feature_importance.png', dpi=300, bbox_inches='tight')
print("Saved: rf_feature_importance.png")
plt.close(fig)

# Cross-Validation Scores: single bar at the mean R² with a +/- 2 std error bar
cv_mean = rf_cv.mean()
cv_spread = rf_cv.std() * 2

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(0, cv_mean, yerr=cv_spread, capsize=10, alpha=0.8, color='teal')
ax.set_xticks([0])
ax.set_xticklabels(['Random Forest (Optuna)'])
ax.set_ylabel('Cross-Validation R² Score', fontweight='bold')
ax.set_title('Random Forest - Cross-Validation Performance', fontweight='bold')
ax.grid(True, alpha=0.3, axis='y')
# Numeric label placed just above the upper end of the error bar
ax.text(0, cv_mean + cv_spread + 0.02, f'{cv_mean:.3f}',
        ha='center', va='bottom', fontweight='bold')
fig.tight_layout()
fig.savefig('rf_cv_scores.png', dpi=300, bbox_inches='tight')
print("Saved: rf_cv_scores.png")
plt.close(fig)

# PART 5: SAVE MODEL AND PREPROCESSING TOOLS

# Save model
joblib.dump(rf_model, 'model_randomforest.pkl')
print("Saved: model_randomforest.pkl")

# Save preprocessing: the model was fitted on median-imputed inputs in the
# column order of good_features, so inference needs the same fitted imputer
# and the same feature order alongside the model.
joblib.dump({'imputer': imputer, 'features': good_features}, 'preprocessing_randomforest.pkl')
print("Saved: preprocessing_randomforest.pkl")

# Save predictions for the held-out test rows, with location/date for mapping
predictions_df = pd.DataFrame({
    'lat': df_valid.loc[y_test.index, 'lat'].values,
    'lon': df_valid.loc[y_test.index, 'lon'].values,
    'date': df_valid.loc[y_test.index, 'date'].values,
    'actual_mp': y_test.values,
    'pred_rf': y_pred_rf_test,
    'error_rf': y_test.values - y_pred_rf_test  # residual = actual - predicted
})
predictions_df.to_csv('rf_predictions.csv', index=False)
print("Saved: rf_predictions.csv")

print("\n=== All Done ===")

Loaded 341 samples
Selected 15 features


  0%|          | 0/100 [00:00<?, ?it/s]

Best R² Score: 0.7542
Best Parameters:
  n_estimators: 100
  max_depth: 13
  min_samples_split: 16
  min_samples_leaf: 2
  max_features: sqrt
  min_impurity_decrease: 0.0002311561027774259
  bootstrap: False
R² (train): 0.8432
R² (test):  0.8051
MAE:        0.0531
RMSE:       0.1678

--- Cross-Validation ---
CV R² (on training set): 0.7522 (+/- 0.2615)

Top 10 Features:
  10. VH_SST_interaction            : 0.2777
  13. backscatter_diff              : 0.1355
   7. abs_lat                       : 0.1308
  11. VH_chl_interaction            : 0.0990
   3. cross_pol_ratio               : 0.0666
   9. VV_SST_interaction            : 0.0535
   8. lat_zone                      : 0.0478
   6. day_of_year                   : 0.0435
  15. roughness_SST                 : 0.0402
  12. chl_squared                   : 0.0401
Saved: rf_feature_importance.png
Saved: rf_cv_scores.png
Saved: model_randomforest.pkl

=== All Done ===
