In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [2]:
# 1. Generate synthetic dataset
np.random.seed(42)
n_samples = 1000

# Simulated features (soil, weather, fertilizer, etc.)
X = pd.DataFrame({
    'soil_carbon': np.random.rand(n_samples),
    'rainfall': np.random.rand(n_samples) * 100,
    'fertilizer_amount': np.random.rand(n_samples) * 50,
    'crop_yield': np.random.rand(n_samples) * 10
})

# Add some missing values
X.loc[X.sample(frac=0.1).index, 'rainfall'] = np.nan
X.loc[X.sample(frac=0.05).index, 'fertilizer_amount'] = np.nan

# Target variable: simulated GHG emissions (kg CO₂-eq/ha)
y = (
    10 + 5 * X['soil_carbon'] +
    0.2 * X['rainfall'].fillna(50) +
    0.8 * X['fertilizer_amount'].fillna(25) +
    0.1 * X['crop_yield'] +
    np.random.normal(0, 2, n_samples)
)

In [3]:
# 2. Impute missing values
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

In [4]:
# 3. Optional: Feature scaling (not critical for RF but can help)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

In [5]:
# 4. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2)

In [6]:
# 5. Define parameter grid for tuning
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None],
    'max_features': ['auto', 'sqrt']
}

In [7]:
# 6. Grid Search with cross-validation
rf = RandomForestRegressor(random_state=42)
grid = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [5, 10, None],
                         'max_features': ['auto', 'sqrt'],
                         'n_estimators': [100, 200]},
             scoring='neg_mean_squared_error')

In [8]:
# 7. Best model evaluation
best_rf = grid.best_estimator_
y_pred = best_rf.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Best Parameters:", grid.best_params_)
print("Mean Squared Error:", mse)
print("R² Score:", r2)

Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 200}
Mean Squared Error: 5.7006362415471585
R² Score: 0.9672228579967123
